依赖包版本:
compile("org.apache.poi:poi:3.17")
compile("org.apache.poi:poi-ooxml:3.17")
compile("org.apache.poi:poi-ooxml-schemas:3.17")
compile("org.apache.poi:poi-scratchpad:3.17")
compile("org.apache.poi:ooxml-schemas:1.4")
compile("fr.opensagres.xdocreport:fr.opensagres.xdocreport.converter.docx.xwpf:2.0.1")
maven库:
https://mvnrepository.com/artifact/fr.opensagres.xdocreport/fr.opensagres.xdocreport.converter.docx.xwpf
https://mvnrepository.com/artifact/org.apache.poi
import com.google.common.io.Files;
import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.io.FileUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.UUID;
/**
* word文档转html文件
* 兼容.doc、.docx格式转换
*/
public class WordToHtml {
private final static String SEPARATOR = "/";
public static String format(InputStream in, String wordFilename, String htmlRootPath) throws Exception {
if(!htmlRootPath.endsWith(SEPARATOR)){
htmlRootPath += SEPARATOR;
}
String uuid = UUID.randomUUID().toString().trim().replaceAll("-", "");
final String htmlName = uuid + Constants.SUFFIX_HTML;
OutputStream htmlOut = null;
OutputStreamWriter writer = null;
try{
File htmlFile = new File(htmlRootPath + htmlName);
Files.createParentDirs(htmlFile);
htmlOut = new FileOutputStream(htmlFile);
writer = new OutputStreamWriter(htmlOut, StandardCharsets.UTF_8);
if (wordFilename.toLowerCase().endsWith(Constants.SUFFIX_DOCX)) {
XWPFDocument document = new XWPFDocument(in);
//保存图片
XHTMLOptions options = XHTMLOptions.create();
options.setIgnoreStylesIfUnused(false);
options.setFragment(true);
//图片保存文件夹路径
options.setImageManager(new ImageManager(new File(htmlRootPath), uuid));
//解析word文档
XHTMLConverter instance = (XHTMLConverter) XHTMLConverter.getInstance();
instance.convert(document, writer, options);
} else if (wordFilename.toLowerCase().endsWith(Constants.SUFFIX_DOC)) {
HWPFDocument wordDocument = new HWPFDocument(in);
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
//设置图片位置,也可在此方法中保存图片
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
return uuid + SEPARATOR +suggestedName;
}
});
// 保存图片
List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
if (pics != null) {
for (Picture pic : pics) {
File imagePath = new File(htmlRootPath + uuid + SEPARATOR + pic.suggestFullFileName());
Files.createParentDirs(imagePath);
OutputStream imageOut = new FileOutputStream(imagePath);
pic.writeImageContent(imageOut);
if(null != imageOut){
imageOut.clode();
}
}
}
//解析word文档
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
Transformer serializer = TransformerFactory.newInstance().newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.name());
serializer.setOutputProperty(OutputKeys.INDENT, Boolean.toString(true));
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(new DOMSource(htmlDocument), new StreamResult(writer));
}
String wrapHtml = wrap(FileUtils.readFileToString(htmlFile, StandardCharsets.UTF_8), Files.getNameWithoutExtension(wordFilename), StandardCharsets.UTF_8);
FileUtils.write(htmlFile, wrapHtml, StandardCharsets.UTF_8);
}catch(Exception e){
throw e;
}finally {
if(null != writer){
writer.clode();
}
if(null != htmlOut){
htmlOut.clode();
}
}
return htmlName;
}
private static String wrap(String contentBody, String title, final Charset encoding) {
StringBuilder wrapHtmlStringBuilder = new StringBuilder("<!DOCTYPE html>\n<html>\n<head>\n\t<title>");
wrapHtmlStringBuilder.append(title).append("</title>\n\t<meta http-equiv="Content-Type" content="text/html; charset=").append(encoding).append("">\n</head>\n<body>\n\n").append(contentBody)
.append("</body>\n</html>");
return wrapHtmlStringBuilder.toString();
}
interface Constants{
/**
* 后缀:docx
*/
String SUFFIX_DOCX = ".docx";
/**
* 后缀:doc
*/
String SUFFIX_DOC = ".doc";
/**
* 后缀:html
*/
String SUFFIX_HTML = ".html";
}
}
import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.io.FileUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.UUID;
/**
* word文档转html文件
* 兼容.doc、.docx格式转换
*/
public class WordToHtml {
private final static String SEPARATOR = "/";
public static String format(InputStream in, String wordFilename, String htmlRootPath) throws Exception {
if(!htmlRootPath.endsWith(SEPARATOR)){
htmlRootPath += SEPARATOR;
}
String uuid = UUID.randomUUID().toString().trim().replaceAll("-", "");
final String htmlName = uuid + Constants.SUFFIX_HTML;
OutputStream htmlOut = null;
OutputStreamWriter writer = null;
try{
File htmlFile = new File(htmlRootPath + htmlName);
Files.createParentDirs(htmlFile);
htmlOut = new FileOutputStream(htmlFile);
writer = new OutputStreamWriter(htmlOut, StandardCharsets.UTF_8);
if (wordFilename.toLowerCase().endsWith(Constants.SUFFIX_DOCX)) {
XWPFDocument document = new XWPFDocument(in);
//保存图片
XHTMLOptions options = XHTMLOptions.create();
options.setIgnoreStylesIfUnused(false);
options.setFragment(true);
//图片保存文件夹路径
options.setImageManager(new ImageManager(new File(htmlRootPath), uuid));
//解析word文档
XHTMLConverter instance = (XHTMLConverter) XHTMLConverter.getInstance();
instance.convert(document, writer, options);
} else if (wordFilename.toLowerCase().endsWith(Constants.SUFFIX_DOC)) {
HWPFDocument wordDocument = new HWPFDocument(in);
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
//设置图片位置,也可在此方法中保存图片
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
return uuid + SEPARATOR +suggestedName;
}
});
// 保存图片
List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
if (pics != null) {
for (Picture pic : pics) {
File imagePath = new File(htmlRootPath + uuid + SEPARATOR + pic.suggestFullFileName());
Files.createParentDirs(imagePath);
OutputStream imageOut = new FileOutputStream(imagePath);
pic.writeImageContent(imageOut);
if(null != imageOut){
imageOut.clode();
}
}
}
//解析word文档
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
Transformer serializer = TransformerFactory.newInstance().newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.name());
serializer.setOutputProperty(OutputKeys.INDENT, Boolean.toString(true));
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(new DOMSource(htmlDocument), new StreamResult(writer));
}
String wrapHtml = wrap(FileUtils.readFileToString(htmlFile, StandardCharsets.UTF_8), Files.getNameWithoutExtension(wordFilename), StandardCharsets.UTF_8);
FileUtils.write(htmlFile, wrapHtml, StandardCharsets.UTF_8);
}catch(Exception e){
throw e;
}finally {
if(null != writer){
writer.clode();
}
if(null != htmlOut){
htmlOut.clode();
}
}
return htmlName;
}
private static String wrap(String contentBody, String title, final Charset encoding) {
StringBuilder wrapHtmlStringBuilder = new StringBuilder("<!DOCTYPE html>\n<html>\n<head>\n\t<title>");
wrapHtmlStringBuilder.append(title).append("</title>\n\t<meta http-equiv="Content-Type" content="text/html; charset=").append(encoding).append("">\n</head>\n<body>\n\n").append(contentBody)
.append("</body>\n</html>");
return wrapHtmlStringBuilder.toString();
}
interface Constants{
/**
* 后缀:docx
*/
String SUFFIX_DOCX = ".docx";
/**
* 后缀:doc
*/
String SUFFIX_DOC = ".doc";
/**
* 后缀:html
*/
String SUFFIX_HTML = ".html";
}
}