关于这七种文档,我相信应该是最常用的文档了
在以下的介绍中会提到POI,现介绍下POI吧
poi处理WORD,EXCEL比较好:http://poi.apache.org/
poi处理至少需要如下几个JAR包
PDFbox处理PDF比较好:http://pdfbox.apache.org/download.html
下面一一介绍了
第一和第二是只支持03版的word和excel文档
第一、首先来看WORD文档:
我这里用的是poi,相关jar包自己去下载,然后加到工程中(以下所要用的jar包也是,不再重复说)
Java代码 
<span ><span >public static String readWord(String path) throws Exception { 
String bodyText = null; 
try { 
FileInputStream is = new FileInputStream(path); 
bodyText = new WordExtractor(is).getText(); 
} catch (Exception e) { 
System.out.println("======="); 
return bodyText; 
}</span></span> 
第二、Excel的文档
Java代码 
<span ><span >public static String ReadExcel(String path) throws IOException { 
InputStream inputStream = null; 
String content = null; 
try { 
inputStream = new FileInputStream(path); 
HSSFWorkbook wb = new HSSFWorkbook(inputStream); 
ExcelExtractor extractor = new ExcelExtractor(wb); 
extractor.setFormulasNotResults(true); 
extractor.setIncludeSheetNames(false); 
content = Text(); 
} catch (FileNotFoundException e) { 
e.printStackTrace(); 
return content; 
}</span></span> 
下面是针对07版的word和excel的操作:
Java代码 
<span >   
/**   
* 需要的jar包:   
* poi-3.0.2-FINAL-20080204.jar   
* poi-contrib-3.0.2-FINAL-20080204.jar   
* poi-scratchpad-3.0.2-FINAL-20080204.jar   
* poi-3.5-beta6-20090622.jar   
* geronimo-stax-api_1.0_spec-1.0.jar   
* ooxml-schemas-1.0.jar   
* openxml4j-bin-beta.jar   
* poi-ooxml-3.5-beta6-20090622.jar   
* xmlbeans-2.3.0.jar   
* dom4j-1.6.1.jar   
*/   
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
public class WordAndExcelExtractor {     
public static void main(String[] args){     
try{     
String wordFile = "D:/松山血战.docx";     
String wordText2007 = actTextFromDOC2007(wordFile);     
System.out.println("wordText2007======="+wordText2007);     
InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");       
String excelText = actTextFromXLS(is);       
System.out.println("text2003==========" + excelText);     
String excelFile = "D:/Hello2007.xlsx";       
String excelText2007 = actTextFromXLS2007(excelFile);     
System.out.println("excelText2007==========" + excelText2007);     
}catch(Exception e ){     
e.printStackTrace();     
}     
}     
/
**   
* @Method: extractTextFromDOCX   
* @Description: 从word 2003文档中提取纯文本   
*   
* @param   
* @return String   
* @throws   
*/   
public static String extractTextFromDOC(InputStream is) throws IOException {     
WordExtractor ex = new WordExtractor(is); //is是WORD文件的InputStream     
Text();     
}     
/**   
* @Method: extractTextFromDOCX   
* @Description: 从word 2007文档中提取纯文本   
*   
* @param   
* @return String   
* @throws   
*/   
public static String extractTextFromDOC2007(String fileName) throws IOException, OpenXML4JException, XmlException {     
OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);     
POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);       
Text();     
}     
/**   
* @Method: extractTextFromXLS   
* @Description: 从excel 2003文档中提取纯文本   
*   
* @param   
* @return String   
* @throws   
*/   
@SuppressWarnings("deprecation")     
private static String extractTextFromXLS(InputStream is)     
throws IOException {     
StringBuffer content  = new StringBuffer();     
HSSFWorkbook workbook = new HSSFWorkbook(is); //创建对Excel工作簿文件的引用     
for (int numSheets = 0; numSheets < NumberOfSheets(); numSheets++) {     
if (null != SheetAt(numSheets)) {     
HSSFSheet aSheet = SheetAt(numSheets); //获得一个sheet     
for (int rowNumOfSheet = 0; rowNumOfSheet <= LastRowNum(); rowNumOfSheet++) {     
if (null != Row(rowNumOfSheet)) {     
HSSFRow aRow = Row(rowNumOfSheet); //获得一行     
for (short cellNum
OfRow = 0; cellNumOfRow <= LastCellNum(); cellNumOfRow++) {     
if (null != Cell(cellNumOfRow)) {     
HSSFCell aCell = Cell(cellNumOfRow); //获得列值     
CellType() == HSSFCell.CELL_TYPE_NUMERIC){     
content.NumericCellValue());     
}else CellType() == HSSFCell.CELL_TYPE_BOOLEAN){     
content.BooleanCellValue());     
}else {     
content.StringCellValue());     
}     
}     
}     
}     
}     
}     
}     
String();     
}     
/**   
* @Method: extractTextFromXLS2007   
* @Description: 从excel 2007文档中提取纯文本   
*   
* @param   
* @return String   
* @throws   
*/   
private static String extractTextFromXLS2007(String fileName) throws Exception{     
StringBuffer content = new StringBuffer();     
//构造 XSSFWorkbook 对象,strPath 传入文件路径         
XSSFWorkbook xwb = new XSSFWorkbook(fileName);     
//循环工作表Sheet     
for(int numSheet = 0; numSheet < NumberOfSheets(); numSheet++){     
XSSFSheet xSheet = SheetAt(numSheet);     
if(xSheet == null){     
continue;     
}     
//循环行Row     
for(int rowNum = 0; rowNum <= LastRowNum(); rowNum++){     
XSSFRow xRow = Row(rowNum);     
if(xRow == null){     
continue;     
}     
//循环列Cell     
for(int cellNum = 0; cellNum <= LastCellNum(); cellNum++){     
XSSFCell xCell = Cell(cellNum);     
if(xCell == null){     
continue;     
}     
CellType() == XSSFCell.CELL_TYPE_BOOLEAN){     
content.BooleanCellValue());     
}else CellType() == XSSFCell.CELL_TYPE_NUMERIC){     
content.NumericCellValue());     
}else{     
content.StringCellValue());     
}     
}     
}     
}     
String();     
}     
}     
</span> 
第三、PowerPoint的文档
Java代码 
<span ><span >public static String readPowerPoint(String path) { 
StringBuffer content = new StringBuffer("")
try { 
SlideShow ss = new SlideShow(new HSLFSlideShow(new FileInputStream( 
path)));// is 
// 为文件的InputStream,建立SlideShow 
jfinal jar包下载Slide[] slides = ss.getSlides();// 获得每一张幻灯片 
for (int i = 0; i < slides.length; i++) { 
TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun 
for (int j = 0; j < t.length; j++) { 
content.append(t[j].getText());// 这里会将文字内容加到content中去 
} catch (Exception ex) { 
System.out.String()); 
String(); 
}</span></span> 
第四、PDF的文档
Java代码 
<span ><span >public static String readPdf(String path) throws Exception { 
StringBuffer content = new StringBuffer(""); 
FileInputStream fis = new FileInputStream(path); 
PDFParser p = new PDFParser(fis); 
p.parse(); 
PDFTextStripper ts = new PDFTextStripper(); 
content.PDDocument())); 
fis.close(); 
String().trim(); 
}</span></span> 
第五、HTML的文档,要说明的是,HTML文档我们要获取其TITLE,BODY中的内容就要先获取源文件,然后再对源文件进行标签上的过滤,很麻烦
Html代码 
<span >public static String readHtml(String urlString) { 
StringBuffer content = new StringBuffer(""); 
File file = new File(urlString); 
FileInputStream fis = null; 
try { 
fis = new FileInputStream(file); 
BufferedReader reader = new BufferedReader(new InputStreamReader( 
fis, "utf-8")); 
String line = null; 
while ((line = adLine()) != null) { 
content.append(line + "\n"); 
reader.close(); 
} catch (Exception e) { 
e.printStackTrace(); 
String contentcontentString = String(); 
String htmlStr = contentString; // 含html标签的字符串 
String textStr = ""; 
Pattern p_script; 
Matcher m_script; 
Pattern p_style; 
Matcher m_style; 
Pattern p_html; 
Matcher m_html; 
try { 
String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\ 
String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]* 
String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式 
p_script = Patternpile(regEx_script, Pattern.CASE_INSENSITIVE); 
m_script = p_script.matcher(htmlStr); 
htmlStr = placeAll(""); // 过滤script标签 
p_style = Patternpile(regEx_style, Pattern.CASE_INSENSITIVE); 
m_style = p_style.matcher(htmlStr); 
htmlStr = placeAll(""); // 过滤style标签 
p_html = Patternpile(regEx_html, Pattern.CASE_INSENSITIVE); 
m_html = p_html.matcher(htmlStr); 
htmlStr = placeAll(""); // 过滤html标签 
textStr = htmlStr; 
} catch (Exception e) { 
Syst
return textStr;// 返回文本字符串 
}</span> 
第六、TXT的文档,给TXT文本建立索引时要注意
本项目实现了组合查询的功能
//这一步如果不设置为GBK,TXT内容将全部乱码 BufferedReader reader=new BufferedReader(new InputStreamReader(is,"GBK")); 具体代码如下
Java代码 
<span ><span >public static String readTxt(String path) throws IOException { 
StringBuffer sb = new StringBuffer(""); 
InputStream is = new FileInputStream(path); 
// 必须设置成GBK,否则将出现乱码 
BufferedReader reader = new BufferedReader(new InputStreamReader(is, 
"GBK")); 
try { 
String line = ""; 
while ((line = adLine()) != null) { 
sb.append(line + "\r"); 
} catch (FileNotFoundException e) { 
e.printStackTrace(); 
String().trim(); 
}</span></span> 
第七、RTF文档,RTF的转换在JDK自带的javax.swing.text.rtf包中就有
Java代码 
<span ><span >public static String readRtf(String path) { 
String result = null; 
File file = new File(path); 
try { 
DefaultStyledDocument styledDoc = new DefaultStyledDocument(); 
InputStream is = new FileInputStream(file); 
new RTFEditorKit().read(is, styledDoc, 0); 
result = new Text(0, Length()) 
.getBytes("iso8859-1"), "gbk"); 
// 提取文本,读取中文需要使用gbk编码,否则会出现乱码 
} catch (IOException e) { 
e.printStackTrace(); 
} catch (BadLocationException e) { 
e.printStackTrace(); 
return result; 
}</span></span> 

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。