Java實現word,pdf轉html并保留格式
一、word轉html
依賴:
<properties> <poi.version>5.2.3</poi.version> <xhtml.version>2.0.4</xhtml.version> </properties> <!-- Word to HTML --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>${poi.version}</version> </dependency> <!-- Word to HTML --> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId> <version>${xhtml.version}</version> </dependency> <!-- Office document/spreadsheet handling, 2007+ formats --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>${poi.version}</version> </dependency> <!-- Office document/spreadsheet handling, 2003 formats --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>${poi.version}</version> </dependency>
代碼:
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager; import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter; import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.commons.codec.binary.Base64; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.xwpf.usermodel.XWPFDocument; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import java.io.*; import java.net.URL; public class WordUtil { public static String wordToHtml(String fileUrl,String fileSuffix) throws Exception { URL url = new URL(fileUrl); try (InputStream inputStream = url.openStream()) { if(fileSuffix.equals(".docx") || fileSuffix.equals(".DOCX")){ return word2007ToHtml(inputStream); } else if (fileSuffix.equals(".doc") || fileSuffix.equals(".DOC")) { return word2003ToHtml(inputStream); }else{ throw new RuntimeException("錯誤的文件后綴"); } } catch (RuntimeException e) { throw new RuntimeException(e.getMessage()); } } /** * word2007轉(zhuǎn)換成html * 對于docx,可以用下面這種方式: * @throws Exception */ public static String word2007ToHtml(InputStream inputStream) { try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream(); XWPFDocument docxDocument = new XWPFDocument(inputStream)) { XHTMLOptions options = XHTMLOptions.create(); // 是否忽略未使用的樣式 options.setIgnoreStylesIfUnused(false); // 設(shè)置片段模式,<div>標簽包裹 options.setFragment(true); // 圖片轉(zhuǎn)base64 options.setImageManager(new Base64EmbedImgManager()); // 轉(zhuǎn)換htm1 XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options); return htmlStream.toString(); } catch (Exception e) { System.out.println("Word轉(zhuǎn)Html過程出現(xiàn)異常!"); throw new RuntimeException(e.getMessage()); } } /** * word2003轉(zhuǎn)換成html * 對于doc,可以用下面這種方式: * 
@throws Exception */ public static String word2003ToHtml(InputStream inputStream ) throws Exception { try (StringWriter writer = new StringWriter(); HWPFDocument document = new HWPFDocument(inputStream)) { WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); //將圖片轉(zhuǎn)成base64的格式 wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes)); wordToHtmlConverter.processDocument(document); org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument(); DOMSource domSource = new DOMSource(htmlDocument); TransformerFactory factory = TransformerFactory.newInstance(); Transformer serializer = factory.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, new StreamResult(writer)); return writer.toString(); } catch (Exception e) { System.out.println("Word轉(zhuǎn)Html過程出現(xiàn)異常!"); throw new RuntimeException(e.getMessage()); } } }
二、pdf轉html
依賴:
<!-- Versions are stated explicitly so the snippet resolves without a parent POM or dependencyManagement. -->
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>2.0.3</version>
</dependency>
<dependency>
    <groupId>net.mabboud.fontverter</groupId>
    <artifactId>FontVerter</artifactId>
    <version>1.2.22</version>
</dependency>
<dependency>
    <groupId>org.reflections</groupId>
    <artifactId>reflections</artifactId>
    <version>0.10.2</version>
</dependency>
<!-- PDF to text -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.24</version>
</dependency>
代碼:
import org.apache.pdfbox.pdmodel.PDDocument; import org.fit.pdfdom.PDFDomTree; import java.io.*; import java.net.URL; public class PDFUtil { public static String pdfToHtml(String fileUrl) throws IOException { URL url = new URL(fileUrl); try (InputStream inputStream = url.openStream()){ return pdfToHtml(inputStream); }catch (Exception e){ throw new IOException(e.getMessage()); } } public static String pdfToHtml(InputStream inputStream) throws IOException { String outFilePath = "mypdf.html"; String pdfContent = ""; PDDocument document = PDDocument.load(inputStream); Writer writer = new PrintWriter(outFilePath, "UTF-8"); new PDFDomTree().writeText(document, writer); writer.close(); document.close(); // 獲取html內(nèi)容 try (BufferedReader reader = new BufferedReader(new FileReader(outFilePath))) { StringBuilder htmlContent = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { htmlContent.append(line).append("\n"); // 追加每一行內(nèi)容,并添加換行符 } pdfContent = String.valueOf(htmlContent); return pdfContent; } catch (IOException e) { e.printStackTrace(); System.err.println("讀取 HTML 文件時出錯。"); } return null; } }
三、方法補充
Java實現word轉html
1.引入maven依賴
<properties> <poi.version>5.2.3</poi.version> <xhtml.version>2.0.4</xhtml.version> </properties> <!-- Word to HTML --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>${poi.version}</version> </dependency> <!-- Word to HTML --> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId> <version>${xhtml.version}</version> </dependency> <!-- Office document/spreadsheet handling, 2007+ formats --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>${poi.version}</version> </dependency> <!-- Office document/spreadsheet handling, 2003 formats --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>${poi.version}</version> </dependency>
2.Java代碼
/** * Word2007(docx)格式轉(zhuǎn)html * @param filePath 文件路徑 * @return 返回轉(zhuǎn)成String類型的html字符串 * @throws IOException */ public static String docxToHtml(String filePath) { try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream(); XWPFDocument docxDocument = new XWPFDocument(Files.newInputStream(Paths.get(filePath)))) { XHTMLOptions options = XHTMLOptions.create(); // 是否忽略未使用的樣式 options.setIgnoreStylesIfUnused(false); // 設(shè)置片段模式,<div>標簽包裹 options.setFragment(true); // 圖片轉(zhuǎn)base64 options.setImageManager(new Base64EmbedImgManager()); // 轉(zhuǎn)換htm1 XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options); return htmlStream.toString(); } catch (Exception e) { log.error("Word轉(zhuǎn)Html過程出現(xiàn)異常!", e); } return null; } /** * Word2003(doc)格式轉(zhuǎn)html * @param filePath 文件路徑 * @return 返回轉(zhuǎn)成String類型的html字符串 * @throws Exception */ public static String docToHtml(String filePath) { try (StringWriter writer = new StringWriter(); HWPFDocument document = new HWPFDocument(Files.newInputStream(new File(filePath).toPath()))) { WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); //將圖片轉(zhuǎn)成base64的格式 wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes)); wordToHtmlConverter.processDocument(document); org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument(); DOMSource domSource = new DOMSource(htmlDocument); TransformerFactory factory = TransformerFactory.newInstance(); Transformer serializer = factory.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, new StreamResult(writer)); return writer.toString(); } catch (Exception e) { log.error("Word轉(zhuǎn)Html過程出現(xiàn)異常!", e); } return 
null; } /** * word 轉(zhuǎn) html * 自動檢測文件格式轉(zhuǎn)換 * @param filePath 文件本地路徑 * @return 成功返回轉(zhuǎn)換后的html字符串;失敗返回null */ public static String autoWord2Html(String filePath) { int lastIndexOf = filePath.lastIndexOf("."); String suffix = filePath.substring(lastIndexOf + 1); if ("doc".equalsIgnoreCase(suffix)) { return docToHtml(filePath); } else if ("docx".equalsIgnoreCase(suffix)) { return docxToHtml(filePath); } else { log.info("文件格式錯誤,只支持Docx和Doc格式的文檔!"); return null; } }
使用Java實現PDF到HTML的轉換
引入以下依賴
<dependency> <groupId>net.sf.cssbox</groupId> <artifactId>pdf2dom</artifactId> <version>2.0.3</version> </dependency> <dependency> <groupId>net.mabboud.fontverter</groupId> <artifactId>FontVerter</artifactId> <version>1.2.22</version> <!-- use the latest version as needed --> </dependency> <dependency> <groupId>org.reflections</groupId> <artifactId>reflections</artifactId> <version>0.10.2</version> <!-- use the latest version as needed --> </dependency> <!-- PDF to text --> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.24</version> </dependency>
實現關鍵代碼
File file = new File(pdfUrl); String localPdfFilePath = 要解析的PDF文件路徑(本地)+ file.getName(); String newPdfFilePath = 截取PDF后生成的PDF文件路徑+ file.getName(); String outFilePath = 生成的HTML文件.html"; String pdfContent = ""; PDDocument pdfDocument = PDDocument.load(new File(localPdfFilePath)); // 檢查文檔中是否有頁面 if (pdfDocument.getNumberOfPages() > 0) { // 移除第一頁 pdfDocument.removePage(0); } // 保存更改后的PDF到新文件 pdfDocument.save(new File(newPdfFilePath)); System.out.println("第一頁已被移除,新PDF保存在: " + newPdfFilePath); pdfDocument.close(); // 轉(zhuǎn)換成html格式文件 PDDocument document = PDDocument.load(new File(newPdfFilePath)); Writer writer = new PrintWriter(outFilePath, "UTF-8"); new PDFDomTree().writeText(document, writer); writer.close(); document.close(); // 獲取html內(nèi)容 try (BufferedReader reader = new BufferedReader(new FileReader(outFilePath))) { StringBuilder htmlContent = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { htmlContent.append(line).append("\n"); // 追加每一行內(nèi)容,并添加換行符 } pdfContent = String.valueOf(htmlContent); } catch (IOException e) { e.printStackTrace(); System.err.println("讀取 HTML 文件時出錯。"); }
到此這篇關于Java實現word,pdf轉html并保留格式的文章就介紹到這了,更多相關Java word,pdf轉html內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持腳本之家!
相關文章
Java docx4j高效處理Word文檔的實戰指南
對于需要在Java應用程序中生成、修改或處理Word文檔的開發者來說,docx4j是一個強大而專業的選擇,下面我們就來看看docx4j的具體使用吧
2025-07-07
java中lombok的@Data引發問題詳解
這篇文章主要給大家介紹了關于java中lombok的@Data引發問題的相關資料,文中通過圖文介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧
2020-09-09
JVM性能調優實戰:讓你的IntelliJ Idea縱享絲滑
這篇文章主要介紹了JVM性能調優實戰:讓你的IntelliJ Idea縱享絲滑的相關資料,本文給大家介紹的非常詳細,對大家的學習或工作具有一定的參考借鑒價值,需要的朋友可以參考下
2021-01-01
Java基于LoadingCache實現本地緩存的示例代碼
本文主要介紹了Java基于LoadingCache實現本地緩存的示例代碼,文中通過示例代碼介紹的非常詳細,具有一定的參考價值,感興趣的小伙伴們可以參考一下
2022-01-01
Java中構造器內部的多態方法的行為實例分析
這篇文章主要介紹了Java中構造器內部的多態方法的行為,結合實例形式分析了java構造器內部多態方法相關原理、功能及操作技巧,需要的朋友可以參考下
2019-10-10