Apache POI - converting *.doc to *.html with images

user1134181 picture user1134181 · Dec 11, 2012 · Viewed 11.7k times · Source

I have a DOC file that contains an image. How can I convert it to HTML and keep the image?

I tried to use this example: Convert Word doc to HTML programmatically in Java

public class Converter {
    ...

    // Source Word (*.doc) file and the HTML file the conversion result is written to.
    private File docFile, htmlFile;

    // Converts docFile to HTML, stores the markup in htmlFile as UTF-8 and shows it
    // in a Swing window. The input stream is opened with try-with-resources so that
    // it is closed even when the conversion fails part-way.
    try (FileInputStream docStream = new FileInputStream(docFile.getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(docStream);
        Document newDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

        // NOTE: the stock WordToHtmlConverter does not create images or links to
        // them, so pictures in the document are missing from the output.
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(newDoc);
        wordToHtmlConverter.processDocument(doc);

        // Serialize the generated DOM tree to an HTML string.
        StringWriter stringWriter = new StringWriter();

        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        transformer.setOutputProperty(OutputKeys.METHOD, "html");
        transformer.transform(
                new DOMSource(wordToHtmlConverter.getDocument()),
                new StreamResult(stringWriter)
        );

        String html = stringWriter.toString();

        // Write the markup to disk; the writer is closed automatically, also when
        // write() throws.
        try (BufferedWriter out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(htmlFile), "UTF-8"))) {
            out.write(html);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Display the saved HTML file in a read-only editor pane.
        JEditorPane jEditorPane = new JEditorPane();
        jEditorPane.setContentType("text/html");
        jEditorPane.setEditable(false);
        jEditorPane.setPage(htmlFile.toURI().toURL());

        JScrollPane jScrollPane = new JScrollPane(jEditorPane);

        JFrame jFrame = new JFrame("display html file");
        jFrame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        jFrame.getContentPane().add(jScrollPane);
        jFrame.setSize(512, 342);
        jFrame.setVisible(true);

    } catch (Exception e) {
        e.printStackTrace();
    }
    ...
}

But the image is lost.

The documentation for the WordToHtmlConverter class says the following:

...this implementation doesn't create images or links to them. This can be changed by overriding AbstractWordConverter.processImage(Element, boolean, Picture) method.

How can I convert a DOC file to HTML together with its images?

Answer

raok1997 picture raok1997 · Mar 24, 2015

Extend WordToHtmlConverter and override processImageWithoutPicturesManager so that each picture is embedded in the HTML as a Base64 data URI.

 import java.util.Base64;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * A {@link WordToHtmlConverter} that embeds each picture of the Word document
 * directly in the generated HTML as an {@code <img>} element whose {@code src}
 * is a Base64 {@code data:} URI, so no separate image files are needed.
 */
public class InlineImageWordToHtmlConverter extends WordToHtmlConverter {

    /**
     * Creates the converter.
     *
     * @param document DOM document handed to the {@link WordToHtmlConverter}
     *                 constructor
     */
    public InlineImageWordToHtmlConverter(Document document) {
        super(document);
    }

    /**
     * Appends an {@code <img>} element carrying the picture as a data URI to
     * the given block.
     *
     * @param currentBlock element that receives the new {@code <img>} child
     * @param inlined      not used by this implementation
     * @param picture      picture whose bytes and MIME type are embedded
     */
    @Override
    protected void processImageWithoutPicturesManager(Element currentBlock,
            boolean inlined, Picture picture) {
        // Use the basic encoder: the MIME encoder breaks its output into lines
        // separated by CRLF, and line breaks do not belong in a data URI.
        // getContent() is used instead of getRawContent() because the raw bytes
        // are returned as stored in the Word file, i.e. possibly compressed.
        String encoded = Base64.getEncoder().encodeToString(picture.getContent());

        Element imgNode = currentBlock.getOwnerDocument().createElement("img");
        imgNode.setAttribute("src",
                "data:" + picture.getMimeType() + ";base64," + encoded);
        currentBlock.appendChild(imgNode);
    }

}

Use the new class when converting the document, as shown below:

// Load the Word file. The stream is only needed while loadDoc reads the
// document, so try-with-resources closes it right afterwards (and on failure).
HWPFDocumentCore wordDocument;
try (FileInputStream in = new FileInputStream("D:/temp/Temp.doc")) {
    wordDocument = WordToHtmlUtils.loadDoc(in);
}

// Convert with the image-inlining subclass instead of the stock converter.
WordToHtmlConverter wordToHtmlConverter = new InlineImageWordToHtmlConverter(
        DocumentBuilderFactory.newInstance().newDocumentBuilder()
                .newDocument());
wordToHtmlConverter.processDocument(wordDocument);