Hi guys,
The following code reads Microsoft Word documents (.doc and .docx) in Java using the Apache POI API.
/* | |
* To change this license header, choose License Headers in Project Properties. | |
* To change this template file, choose Tools | Templates | |
* and open the template in the editor. | |
*/ | |
package com.milind.mr.doc.test; | |
/** | |
* | |
* @author milind | |
*/ | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.util.List; | |
import org.apache.commons.io.FilenameUtils; | |
import org.apache.poi.hwpf.HWPFDocument; | |
import org.apache.poi.hwpf.extractor.WordExtractor; | |
import org.apache.poi.xwpf.usermodel.XWPFDocument; | |
import org.apache.poi.xwpf.usermodel.XWPFParagraph; | |
public class MicrosoftWordDocReader { | |
public static void readDocFile(String fileName) { | |
try { | |
File file = new File(fileName); | |
FileInputStream fis = new FileInputStream(file.getAbsolutePath()); | |
HWPFDocument doc = new HWPFDocument(fis); | |
WordExtractor we = new WordExtractor(doc); | |
String[] paragraphs = we.getParagraphText(); | |
System.out.println("Total no of paragraph " + paragraphs.length); | |
for (String para : paragraphs) { | |
System.out.println(para.toString()); | |
} | |
fis.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
public static void readDocxFile(String fileName) { | |
try { | |
File file = new File(fileName); | |
FileInputStream fis = new FileInputStream(file.getAbsolutePath()); | |
XWPFDocument document = new XWPFDocument(fis); | |
List<XWPFParagraph> paragraphs = document.getParagraphs(); | |
System.out.println("Total no of paragraph " + paragraphs.size()); | |
for (XWPFParagraph para : paragraphs) { | |
System.out.println(para.getText()); | |
} | |
fis.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
public static void main(String[] args) { | |
String ext = FilenameUtils.getExtension("D:\\test.docx"); | |
System.out.println("extension : " + ext); | |
if ("docx".equalsIgnoreCase(ext)) { | |
readDocxFile("D:\\Test.docx"); | |
} else if ("doc".equalsIgnoreCase(ext)) { | |
readDocFile("D:\\Test.doc"); | |
} else { | |
System.out.println("INVALID FILE TYPE. ONLY .doc and .docx are permitted."); | |
} | |
} | |
} |
The following are the contents of pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Word document reader example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.milind</groupId>
    <artifactId>mr-doc</artifactId>
    <version>1.0</version>
    <packaging>jar</packaging>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- HWPF (.doc) support. Kept at the same version as poi-ooxml:
             Apache POI modules are released together, and mixing versions
             puts incompatible core "poi" jars on the classpath. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.9</version>
        </dependency>
        <!-- XWPF (.docx) support. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.9</version>
        </dependency>
        <!-- FilenameUtils, used to obtain the file extension. -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
            <type>jar</type>
        </dependency>
    </dependencies>
</project>
The following are the contents of the Word file:

The following is the output of the code:

Thanks for reading.
Please leave a comment below if you have any questions.
How can I extract data from a specific page?
Did you mean a specific page in the Word document?
If so, I have not explored page-specific data extraction.
Please let me know if you find a solution for this.
Appreciate it.