首页 > 文章列表 > 如何使用Java将Word/PDF/TXT转换为HTML?

如何使用Java将Word/PDF/TXT转换为HTML?

java HTML
460 2023-04-24

Java怎么实现Word/PDF/TXT转HTML

一:Java实现将word转换为html

   1:引入依赖

 1 <dependency>

 2   <groupId>fr.opensagres.xdocreport</groupId>

 3   <artifactId>fr.opensagres.xdocreport.document</artifactId>

 4   <version>1.0.5</version>

 5 </dependency>

 6 <dependency> 

 7   <groupId>fr.opensagres.xdocreport</groupId> 

 8   <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId> 

 9   <version>1.0.5</version> 

10 </dependency>

11   <dependency>

12   <groupId>org.apache.poi</groupId>

13   <artifactId>poi</artifactId>

14   <version>3.12</version>

15 </dependency>

16 <dependency>

17   <groupId>org.apache.poi</groupId>

18   <artifactId>poi-scratchpad</artifactId>

19   <version>3.12</version>

20 </dependency>

  2:代码demo

  1 package com.svse.controller;

  2 

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

 24 /**

 25  * word 转换成html

 26  */

 27 public class TestWordToHtml {

 28 

 29     public static  final String STORAGEPATH="C://works//files//";

 30     public static  final String IP="192.168.30.222";

 31     public static  final String PORT="8010";

 32     public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {

 33         TestWordToHtml wt=new TestWordToHtml();

 34         //wt.Word2003ToHtml("甲骨文考证.doc");

 35         wt.Word2007ToHtml("甲骨文考证.docx");

 36 

 37     }

 38       

 39      /**

 40      * 2003版本word转换成html

 41      * @throws IOException

 42      * @throws TransformerException

 43      * @throws ParserConfigurationException

 44      */

 45     public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {

 46        

 47         final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片  图片会保存在此路径

 48         final String strRanString=getRandomNum();

 49         String filepath =STORAGEPATH;

 50         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";

 51         final String file = filepath + fileName;

 52         InputStream input = new FileInputStream(new File(file));

 53         HWPFDocument wordDocument = new HWPFDocument(input);

 54         WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

 55         //设置图片存放的位置

 56         wordToHtmlConverter.setPicturesManager(new PicturesManager() {

 57             public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {

 58                 File imgPath = new File(imagepath);

 59                 if(!imgPath.exists()){//图片目录不存在则创建

 60                     imgPath.mkdirs();

 61                 }

 62                 

 63                 File file = new File(imagepath +strRanString+suggestedName);

 64                 try {

 65                     OutputStream os = new FileOutputStream(file);

 66                     os.write(content);

 67                     os.close();

 68                 } catch (FileNotFoundException e) {

 69                     e.printStackTrace();

 70                 } catch (IOException e) {

 71                     e.printStackTrace();

 72                 }

 73                 

 74                 return  "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;

 75                // return imagepath +strRanString+suggestedName;

 76             }

 77         });

 78         

 79         //解析word文档

 80         wordToHtmlConverter.processDocument(wordDocument);

 81         Document htmlDocument = wordToHtmlConverter.getDocument();

 82         

 83         File htmlFile = new File(filepath +strRanString+htmlName);

 84         OutputStream outStream = new FileOutputStream(htmlFile);

 85         

 86 

 87         DOMSource domSource = new DOMSource(htmlDocument);

 88         StreamResult streamResult = new StreamResult(outStream);

 89 

 90         TransformerFactory factory = TransformerFactory.newInstance();

 91         Transformer serializer = factory.newTransformer();

 92         serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");

 93         serializer.setOutputProperty(OutputKeys.INDENT, "yes");

 94         serializer.setOutputProperty(OutputKeys.METHOD, "html");

 95         

 96         serializer.transform(domSource, streamResult);

 97         outStream.close();

 98         

 99         System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);

100     }

101 

102     /**

103      * 2007版本word转换成html

104      * @throws IOException

105      */

106     public void Word2007ToHtml(String fileName) throws IOException {

107         

108        final String strRanString=getRandomNum();

109         

110         String filepath = STORAGEPATH+strRanString;

111         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";

112         File f = new File(STORAGEPATH+fileName);  

113         if (!f.exists()) {  

114             System.out.println("Sorry File does not Exists!");  

115         } else {  

116             if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {  

117                 try {

118                     // 1) 加载word文档生成 XWPFDocument对象  

119                     InputStream in = new FileInputStream(f);  

120                     XWPFDocument document = new XWPFDocument(in);  

121       

122                     // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)  

123                     File imageFolderFile = new File(filepath);  

124                     XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));  

125                     options.setExtractor(new FileImageExtractor(imageFolderFile));  

126                     options.URIResolver(new IURIResolver() {

127                         public String resolve(String uri) {

128                             //http://192.168.30.222:8010//uploadFile/....

129                             return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;

130                         }

131                     });

132                     

133                     options.setIgnoreStylesIfUnused(false);  

134                     options.setFragment(true);  

135                       

136                     // 3) 将 XWPFDocument转换成XHTML  

137                     OutputStream out = new FileOutputStream(new File(filepath + htmlName));  

138                     IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();

139                     converter.convert(document,out, options);

140                     //XHTMLConverter.getInstance().convert(document, out, options);  

141                     System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);

142                 } catch (Exception e) {

143                     e.printStackTrace();

144                 }

145             

146             } else {  

147                 System.out.println("Enter only MS Office 2007+ files");  

148             }  

149         }  

150     }  

151 

152      /**

153      *功能说明:生成时间戳

154      *创建人:zsq

155      *创建时间:2019年12月7日 下午2:37:09

156      *

157      */

158      public static String getRandomNum(){

159          Date dt = new Date();

160          SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");  

161          String str=sdf.format(dt);

162          return str;

163      }

164      

165    }

二:Java实现将PDF转换为HTML

  1: 引入依赖

 1 <dependency>

 2             <groupId>net.sf.cssbox</groupId>

 3             <artifactId>pdf2dom</artifactId>

 4             <version>1.7</version>

 5         </dependency> 

 6         <dependency>

 7             <groupId>org.apache.pdfbox</groupId>

 8             <artifactId>pdfbox</artifactId>

 9             <version>2.0.12</version>

10         </dependency>

11         <dependency>

12             <groupId>org.apache.pdfbox</groupId>

13             <artifactId>pdfbox-tools</artifactId>

14             <version>2.0.12</version>

15  </dependency>

16

2:代码Demo

 1 public class PdfToHtml {

 2 

 3   /*

 4     pdf转换html

 5      */

 6     public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath)  {

 7        // String outputPath = "C:\\works\\files\\ZSQ保密知识测试题库.html";

 8     9        //try() 写在()里面会自动关闭流

10         try{

11             BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8"));

12             //加载PDF文档

13             //PDDocument document = PDDocument.load(bytes);

14             PDDocument document = PDDocument.load(new File(inPdfPath));

15             PDFDomTree pdfDomTree = new PDFDomTree();

16             pdfDomTree.writeText(document,out);

17         } catch (Exception e) {

18             e.printStackTrace();

19         }

20     }

21 

22     public static void main(String[] args) throws IOException {

23         PdfToHtml ph=new PdfToHtml();

24         String pdfPath="C:\\works\\files\\武研中心行政考勤制度.pdf";

25         String outputPath="C:\\works\\files\\武研中心行政考勤制度.html";

26         ph.pdfToHtmlTest(pdfPath,outputPath);

27   }

28 

29 }

三:Java实现将TXT转换为html

 1  /*

 2      * txt文档转html

 3        filePath:txt原文件路径

 4        htmlPosition:转化后生成的html路径

 5     */

 6     public static void txtToHtml(String filePath, String htmlPosition) {

 7         try {

 8             //String encoding = "GBK";

 9             File file = new File(filePath);

10             if (file.isFile() && file.exists()) { // 判断文件是否存在

11                 InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK");

12                 // 考虑到编码格式

13                 BufferedReader bufferedReader = new BufferedReader(read);

14                 // 写文件

15                 FileOutputStream fos = new FileOutputStream(new File(htmlPosition));

16                 OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK");

17                 BufferedWriter bw = new BufferedWriter(osw);

18                 String lineTxt = null;

19                 while ((lineTxt = bufferedReader.readLine()) != null) {

20                     bw.write("&nbsp&nbsp&nbsp"+lineTxt + "</br>");

21                 }

22                 bw.close();

23                 osw.close();

24                 fos.close();

25                 read.close();

26             } else {

27                 System.out.println("找不到指定的文件");

28             }

29         } catch (Exception e) {

30             System.out.println("读取文件内容出错");

31             e.printStackTrace();

32         }

33     }