POI使用HWPFDocumen编程频道|福州电脑网

2024年3月5日发(作者：)

POI使用

[摘要：自己在做项目过程中用到了解析分割word2003和2007,2010等文件内容，以下是代码：下面是部分代码，如果想要全部代码可以发我邮箱，***************************，接口类：]

自己在做项目过程中用到了解析分割word2003和2007,2010等文件内容，以下是代码：

下面是部分代码，如果想要全部代码可以发我邮箱，***************************，

接口类：

package ;

import java.util.Map;

/**
 * Contract for parsing the content of a WORD file with POI.
 *
 * <p>Dated 2015-2-9 in the original header.
 *
 * @param <T> the document type an implementation operates on; the
 *            implementations accompanying this interface use
 *            {@code HWPFDocument} (Word 2003) and {@code XWPFDocument}
 *            (Word 2007)
 */
public interface PoiExtractContent<T> {

    /**
     * Obtains the Document object for a file path.
     *
     * @param docPath path of the file
     * @return the Document
     */
    T getDocument(String docPath);

    /**
     * Parses the title of the word document.
     *
     * <p>The method name keeps its historical spelling so existing
     * implementations and callers continue to compile.
     *
     * @param doc the Document object
     * @return the title found in the word document
     */
    String getTilte(T doc);

    /**
     * Gets all text content of the word document (excluding pictures,
     * tables and other formatted content).
     *
     * @param doc the Document object
     * @return the complete textual part of the word document
     */
    String getContent(T doc);

    /**
     * Gets all text content of the word document (excluding pictures,
     * tables and other formatted content).
     *
     * @param docPath path of the doc file
     * @return the complete textual part of the word document
     */
    String getContent(String docPath);

    /**
     * Gets every picture in the word document and saves it under the
     * given directory.
     *
     * @param doc     the Document object
     * @param picPath path under which the pictures are saved
     * @param suffix  file-name suffix
     */
    void getPictures(T doc, String picPath, String suffix);

    /**
     * Gets every table in the word document.
     *
     * @param doc the Document object
     */
    void getTables(T doc);

    /**
     * Gets the largest font used in the word document.
     *
     * @param doc the Document object
     * @return the largest font size
     */
    int getMaxFontSize(T doc);

    /**
     * Gets the creation information of the word document.
     *
     * @param docPath path of the doc file
     * @return the document's creation information
     */
    Map<String, String> getInfo(String docPath);

}

2003实现：

package ;

import putStream;

import tputStream;

import tream;

import ist;

import p;

import t;

import or;

import ;

import yInformation;

import cument;

import tractor;

import esTable;

import escription;

import heet;

import terRun;

import aph;

import e;

import ;

import ell;

import terator;

import ow;

import ractContent;

import ;

import TextUtil;

/**

* 使用 POI 解析 DOC2003 文件的内容信息

* @author

* 2015-2-9

public

/**

* 根据文件路径获得 Document 对象

* @param docPath 路径

* @return Document

class PoiHwpfExtractContentImpl implements

PoiExtractContent {

public HWPFDocument getDocument(String docPath) {

// hwpfDocument 是专门处理 word 的，在 poi 中还有处理其他

office 文档的类

HWPFDocument doc = null;

try {

doc = new HWPFDocument(new FileInputStream(docPath));

} catch (Exception e) {

tackTrace();

}

return doc;

}

/**

* 解析 word 文档的标题

* @param doc Document 对象

* @return 标题

public String getTilte(HWPFDocument doc) {

String title = "";

Range range = ge();

Paragraph p = null;

for (int i = 0; i < agraphs(); i++) {

p = agraph(i);

if(() != null && !().equals("")

&& !().equals("r")){

title = ().trim();

break;

}

return title;

}

/**

* 获取 word 文档里所有文字内容（不包括图片、表格等格式的内容）

* @param doc Document 对象

* @return word 文档中文字部分全部内容

public String getContent(HWPFDocument doc){

String content = "";

try {

content = t().toString().trim();

} catch (Exception e) {

tackTrace();

}

return eAll("", "");

}

/**

* 获取 word 文档里所有文字内容（不包括图片、表格等格式的内容）

* @param docPath doc 对象路径

* @return word 文档中文字部分全部内容

public String getContent(String docPath) {

StringBuffer strBuff = new StringBuffer("");

try {

WordExtractor extractor = new WordExtractor(new

FileInputStream(docPath));

//tFromPieces();

String [] strArray = agraphText();

for(int i = 0; i < ; ++i) {

(strArray[i].trim());

}

} catch (Exception e) {

tackTrace();

}

return ng().replaceAll("", "");

}

/**

* 获取 word 文档里面所有图片并另存到指定目录下

* @param doc Document 对象

* @param picPath 保存图片路径

* @param suffix 后缀名

public void getPictures(HWPFDocument doc, String picPath,

String suffix) {

Range range = ge();

byte[] dataStream = aStream();

int numChar = racterRuns();

PicturesTable pTable = new PicturesTable(doc, dataStream,

dataStream);

for (int i = 0; i < numChar; ++i) {

CharacterRun cuRun = racterRun(i);

boolean hasPic = ture(cuRun);

if (hasPic) {

Picture picture = tPicture(cuRun, true);

try {

mageContent(new FileOutputStream(picPath +

i + suffix));

} catch (Exception e) {

tackTrace();

}

/**

* 获取word 文档里面所有表格

* @param doc Document 对象

public void getTables(HWPFDocument doc){

Range range = ge();

TableIterator tableIt = new TableIterator(range);

while (t()) {

Table table = (Table)();

for(int j=0;j

TableRow tr = (j);

String content = "";

for(int i=0;i

TableCell cell = l(i);

for(int m=0;m

Paragraph para = agraph(m);

content += ().trim() + ";";

}

n(content);

}

/**

* 获取文章中所有标题集合

* @param doc Document

* @return

public List getTitleList(HWPFDocument doc){

Range range = ge();

byte[] dataStream = aStream();

int numP = agraphs();

List titleList = new ArrayList();

PicturesTable pTable = new PicturesTable(doc, dataStream,

dataStream);

for(int i=0;i

Range curRange = agraph(i);

Paragraph paragraph = agraph(i);

CharacterRun cr = racterRun(0);

if(ture(cr)){ //图片

continue;

}else{

char currentChar = 0;

for(int k=0;k<().length();k++){

currentChar = ().charAt(k);

if(currentChar != _ASCII){

break;

}

if(currentChar == _ASCII){ //回车符

continue;

}else if(currentChar == _ASCII){ //空格符

continue;

}else if(currentChar == TION_ASCII){ //水平制表符

continue;

}

int numStyles = leSheet().numStyles();

int styleIndex = leIndex();

if (numStyles > styleIndex) {

StyleSheet style_sheet = leSheet();

StyleDescription style =

style_leDescription(styleIndex);

String styleName = e();

if(styleName!=null&&ns("标题")){

(().trim());

n(().trim());

}

return titleList;

}

/**

* 获取整篇文章中所有标题样式名称

* @param doc Document

* @return

public Set getTitleStyleNameSet(HWPFDocument

doc){

Range range = ge();

byte[] dataStream = aStream();

int numP = agraphs();

Set titNameSet = new HashSet();

PicturesTable pTable = new PicturesTable(doc, dataStream,

dataStream);

for(int i=0;i

Range curRange = agraph(i);

Paragraph paragraph = agraph(i);

CharacterRun cr = racterRun(0);

if(ture(cr)){ //图片

continue;

}else{

char currentChar = 0;

for(int k=0;k<().length();k++){

currentChar = ().charAt(k);

if(currentChar != _ASCII){

break;

}

if(currentChar == _ASCII){ //回车符

continue;

}else if(currentChar == _ASCII){ //空格符

continue;

}else if(currentChar == TION_ASCII){ //水平制表符

continue;

}

int numStyles = leSheet().numStyles();

int styleIndex = leIndex();

if (numStyles > styleIndex) {

StyleSheet style_sheet = leSheet();

StyleDescription style =

style_leDescription(styleIndex);

String styleName = e();

if(styleName!=null&&ns("标题")){

if(ns(",")){

styleName = getFirstStyleName(styleName);

}

(styleName);

}

return titNameSet;

}

/**

* 处理标题样式名称的特殊格式，如：“标题 3,标题 3 Char,标题

3 Char Char” ,只获取“标题 3”

* @param styleName 需进行处理的标题样式，如"标题 3,标题

3 Char,标题 3 Char Char”

* @return

private String getFirstStyleName(String styleName){

if ((styleName != null) && (() > 0)) {

int styleLeng = (",").length;

if(styleLeng>1){

int comma = f(",");

if(comma>-1&&(comma<())){

return ing(0,comma);

}

return styleName;

}

/**

* 获取当前文章中最大标题样式名称，如“标题1”

* @param doc Document

* @return

public String getMaxTitleStyleName(HWPFDocument doc){

Set titNameSet = getTitleStyleNameSet(doc);

Iterator it = or();

List tempLst = new ArrayList();

while(t()){

String titName = (); //得到“标题 1”、“标题 2”

try {

int curStyleName =

nt(ing(2).trim());

(curStyleName);

} catch (NumberFormatException e) {

continue;

}

int max = (()==0?0:(0));

for(int i=0;i<();i++){

int curSize = (i);

if(curSize

max = curSize;

}

if(max==0){

return ""; //文章中不包含任何标题

}

return "标题 "+max;

}

/**

* 获取word 文档中最大的字体

* @param doc Document 对象

public int getMaxFontSize(HWPFDocument doc) {

int fontSize = 0;

try {

Range range = ge();

for (int i = 0; i < agraphs(); i++) {

Paragraph poiPara = agraph(i);

int j = 0;

while (true) {

CharacterRun run = racterRun(j++);

if(fontSize < tSize()) {

fontSize = tSize();

}//字体大小

if (Offset() == Offset()) {

break;

}

} catch (Exception e) {

tackTrace();

}

return fontSize;

}

/**

* 获取 word 文档的创建信息

* @param docPath doc路径

* @return 创建文档的信息

public Map getInfo(String docPath) {

try {

InputStream is = new FileInputStream(docPath);

WordExtractor extractor = new WordExtractor(is);

SummaryInformation

maryInformation();

Map mapInfo = new HashMap

String>();

("author", hor()); // 作者

("title", le()); // 标题

("subject", ject()); // 主题

("keyword", words()); // 关键词

info =

("createdate",

rmat(ateDateTime())); // 创建时间

("updatedate",

rmat(tSaveDateTime())); // 修改时间

} catch (Exception e) {

}

return null;

}

2007实现类：

package ;

import putStream;

import tFoundException;

import tputStream;

import ption;

import tream;

import p;

import or;

import ;

import Document;

import TextExtractor;

import operties;

import kage;

import rdExtractor;

import cument;

import ragraph;

import ctureData;

import ble;

import bleCell;

import ractContent;

import TextUtil;

/**

* 使用 POI 解析 DOCX2007 文件的内容信息

* @author

* 2015-2-9

public

/**

* 根据文件路径获得 Document 对象

* @param docxPath 路径

* @return Document

public XWPFDocument getDocument(String docxPath) {

//xwpfDocument是专门处理word的，在poi中还有处理其他office文档的类

XWPFDocument docx = null;

try {

OPCPackage pack =

ckage(docxPath);

docx = new XWPFDocument(pack) ;

} catch (Exception e) {

tackTrace();

class PoiXwpfExtractContentImpl implements

PoiExtractContent {

}

return docx;

}

/**

* 解析 word 文档的标题

* @param docx Document 对象

* @return word 文档中标题

public String getTilte(XWPFDocument docx) {

String title = "";

List paras = agraphs();

XWPFParagraph p = null;

for (int i = 0; i < (); i++) {

if(t() != null && !t().equals("")

&& !t().equals("r")){

title = t().trim();

break;

}

return title;

}

/**

* 获取 word 文档里所有文字内容（不包括图片、表格等格式的内容）

* @param docx Document 对象

* @return word 文档中文字部分全部内容

public String getContent(XWPFDocument docx) {

String content = "";

try {

List paras = agraphs();

for (XWPFParagraph para : paras) {

content += t().trim();

}

} catch (Exception e) {

tackTrace();

}

return eAll("", "");

}

/**

* 获取 word 文档里所有文字内容（不包括图片、表格等格式的内容）

* @param docxPath docx 对象路径

* @return word 文档中文字部分全部内容

public String getContent(String docxPath) {

String content = "";

try {

OPCPackage

POIXMLTextExtractor

opcPackage

extractor =

new

ckage(docxPath);

XWPFWordExtractor(opcPackage);

content += t().trim();

} catch (Exception e) {

tackTrace();

}

return eAll("", "");

}

/**

* 获取 word 文档里面所有图片并另存到指定目录下

* @param docx Document 对象

* @param picPath 保存图片路径

* @param suffix 后缀名

public void getPictures(XWPFDocument docx, String picPath,

String suffix){

List wpdList = Pictures();

if(wpdList != null && () > 0){

for (int i = 0; i < (); i++) {

byte[] picByte = (i).getData(); //获取图片数据流

FileOutputStream fos = null;

try {

fos = new FileOutputStream(picPath + i + suffix);

} catch (FileNotFoundException e) {

tackTrace();

}finally{

try {

(picByte);

} catch (IOException e) {

tackTrace();

}

/**

* 获取word 文档里面所有表格

* @param doc Document 对象

public void getTables(XWPFDocument docx){

Iterator tableIt = lesIterator();

while (t()) {

XWPFTable table = ();

String rowInfo = "";

for(int j = 0; j < s().size(); j ++){

List cells = (j).getTableCells();

// 获得所有列

for (int k = 0; k < (); k++) {

rowInfo += (k).getText().trim() + ";";

}

n(rowInfo);

}

/**

* 获取word 文档中最大的字体

* @param doc Document 对象

public int getMaxFontSize(XWPFDocument docx) {

int fontSize = 0;

/* List paraGraph = agraphs();

for(XWPFParagraph para :paraGraph ){

List run = s();

for(XWPFRun r : run){

int i = 0;

n("字体颜色："+or());

n("字体名称:"+tFamily());

n("字体大小："+tSize());

n("Text:"+t(i++));

n("粗体？："+());

n("斜体？："+ic());

if(fontSize < tSize()){

fontSize = tSize();

}

}*/

return fontSize;

}

/**

* 获取 word 文档的创建信息

* @param docPath docx路径

* @return 创建文档的信息

public Map getInfo(String docxPath) {

try {

InputStream is = new FileInputStream(docxPath);

XWPFDocument docx = new XWPFDocument(is);

XWPFWordExtractor

XWPFWordExtractor(docx);

CoreProperties coreProps = eProperties();

Map mapInfo = new HashMap

String>();

//("category", egory()); //分类

extractor = new

("author", ator()); //创建者

("title", le()); //标题

("subject", ject()); // 主题

("keyword", words()); // 关键词

("createdate",

rmat(ated())); //创建时间

("updatedate",

rmat(tPrinted())); // 修改时间

} catch (Exception e) {

}

return null;

}

POI使用HWPFDocumen

发布评论取消回复

最近发表

相关推荐

标签列表

POI使用HWPFDocumen

发布评论 取消回复

最近发表

相关推荐

标签列表

发布评论取消回复