webview打开各种文件--688IT编程网

关于这七种文档，我相信应该是最常用的文档了

在以下的介绍中会提到POI，现介绍下POI吧

poi处理WORD,EXCEL比较好:/poi/

poi处理至少需要如下几个JAR包

PDFbox处理PDF比较好：/download.html

下面一一介绍了

第一和第二是只支持03版的word和excel文档

第一、首先来看WORD文档：

我这里用的是poi，相关jar包自己去下载，然后加到工程中（以下所要用的jar包也是，不再重复说）

Java代码

/**
 * Extracts the plain text of a Word 97-2003 (.doc) file using POI's WordExtractor.
 *
 * @param path path of the .doc file to read
 * @return the document text, or {@code null} if the file could not be opened or parsed
 * @throws Exception if closing the input stream fails
 */
public static String readWord(String path) throws Exception {
    String bodyText = null;
    FileInputStream is = null;
    try {
        is = new FileInputStream(path);
        bodyText = new WordExtractor(is).getText();
    } catch (Exception e) {
        // Best-effort read: keep the original marker line, but also report what
        // actually went wrong instead of silently discarding the exception.
        System.out.println("=======");
        e.printStackTrace();
    } finally {
        // Release the file handle whether or not extraction succeeded.
        if (is != null) {
            is.close();
        }
    }
    return bodyText;
}

第二、Excel的文档

Java代码

/**
 * Extracts the text of an Excel 97-2003 (.xls) workbook using POI's ExcelExtractor.
 * The extractor is configured to emit formulas rather than their results and to
 * leave sheet names out of the output.
 *
 * @param path path of the .xls file to read
 * @return the extracted text, or {@code null} if the file does not exist
 * @throws IOException if the workbook cannot be read or the stream cannot be closed
 */
public static String ReadExcel(String path) throws IOException {
    InputStream inputStream = null;
    String content = null;
    try {
        inputStream = new FileInputStream(path);
        HSSFWorkbook wb = new HSSFWorkbook(inputStream);
        ExcelExtractor extractor = new ExcelExtractor(wb);
        extractor.setFormulasNotResults(true);
        extractor.setIncludeSheetNames(false);
        content = extractor.getText();
    } catch (FileNotFoundException e) {
        // A missing file is reported but not fatal; the caller receives null.
        e.printStackTrace();
    } finally {
        // Release the file handle on every path, including read failures.
        if (inputStream != null) {
            inputStream.close();
        }
    }
    return content;
}

针对07版的word和excel的操作

st;

Java代码

/**

* 需要的jar包：

* poi-3.0.2-FINAL-20080204.jar

* poi-contrib-3.0.2-FINAL-20080204.jar

* poi-scratchpad-3.0.2-FINAL-20080204.jar

* poi-3.5-beta6-20090622.jar

* geronimo-stax-api_1.0_spec-1.0.jar

* ooxml-schemas-1.0.jar

* openxml4j-bin-beta.jar

* poi-ooxml-3.5-beta6-20090622.jar

* xmlbeans-2.3.0.jar

* dom4j-1.6.1.jar

*/

import java.io.FileInputStream;

import java.io.IOException;

import java.io.InputStream;

import org.apache.poi.POIXMLDocument;

import org.apache.poi.POIXMLTextExtractor;

import org.apache.poi.hssf.usermodel.HSSFCell;

import org.apache.poi.hssf.usermodel.HSSFRow;

import org.apache.poi.hssf.usermodel.HSSFSheet;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;

import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.poi.openxml4j.exceptions.OpenXML4JException;

import org.apache.poi.openxml4j.opc.OPCPackage;

import org.apache.poi.xssf.usermodel.XSSFCell;

import org.apache.poi.xssf.usermodel.XSSFRow;

import org.apache.poi.xssf.usermodel.XSSFSheet;

import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

import org.apache.xmlbeans.XmlException;

public class WordAndExcelExtractor {

public static void main(String[] args){

try{

String wordFile = "D:/松山血战.docx";

String wordText2007 = actTextFromDOC2007(wordFile);

System.out.println("wordText2007======="+wordText2007);

InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");

String excelText = actTextFromXLS(is);

System.out.println("text2003==========" + excelText);

String excelFile = "D:/Hello2007.xlsx";

String excelText2007 = actTextFromXLS2007(excelFile);

System.out.println("excelText2007==========" + excelText2007);

}catch(Exception e ){

e.printStackTrace();

}

* @Method: extractTextFromDOCX

* @Description: 从word 2003文档中提取纯文本

* @param

* @return String

* @throws

public static String extractTextFromDOC(InputStream is) throws IOException {

WordExtractor ex = new WordExtractor(is); //is是WORD文件的InputStream

Text();

}

/**

* @Method: extractTextFromDOCX

* @Description: 从word 2007文档中提取纯文本

* @param

* @return String

* @throws

public static String extractTextFromDOC2007(String fileName) throws IOException, OpenXML4JException, XmlException {

OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);

POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);

Text();

}

/**

* @Method: extractTextFromXLS

* @Description: 从excel 2003文档中提取纯文本

* @param

* @return String

* @throws

@SuppressWarnings("deprecation")

private static String extractTextFromXLS(InputStream is)

throws IOException {

StringBuffer content = new StringBuffer();

HSSFWorkbook workbook = new HSSFWorkbook(is); //创建对Excel工作簿文件的引用

for (int numSheets = 0; numSheets < NumberOfSheets(); numSheets++) {

if (null != SheetAt(numSheets)) {

HSSFSheet aSheet = SheetAt(numSheets); //获得一个sheet

for (int rowNumOfSheet = 0; rowNumOfSheet <= LastRowNum(); rowNumOfSheet++) {

if (null != Row(rowNumOfSheet)) {

HSSFRow aRow = Row(rowNumOfSheet); //获得一行

for (short cellNum

OfRow = 0; cellNumOfRow <= LastCellNum(); cellNumOfRow++) {

if (null != Cell(cellNumOfRow)) {

HSSFCell aCell = Cell(cellNumOfRow); //获得列值

CellType() == HSSFCell.CELL_TYPE_NUMERIC){

content.NumericCellValue());

}else CellType() == HSSFCell.CELL_TYPE_BOOLEAN){

content.BooleanCellValue());

}else {

content.StringCellValue());

}

String();

}

/**

* @Method: extractTextFromXLS2007

* @Description: 从excel 2007文档中提取纯文本

* @param

* @return String

* @throws

private static String extractTextFromXLS2007(String fileName) throws Exception{

StringBuffer content = new StringBuffer();

//构造 XSSFWorkbook 对象，strPath 传入文件路径

XSSFWorkbook xwb = new XSSFWorkbook(fileName);

//循环工作表Sheet

for(int numSheet = 0; numSheet < NumberOfSheets(); numSheet++){

XSSFSheet xSheet = SheetAt(numSheet);

if(xSheet == null){

continue;

}

//循环行Row

for(int rowNum = 0; rowNum <= LastRowNum(); rowNum++){

XSSFRow xRow = Row(rowNum);

if(xRow == null){

continue;

}

//循环列Cell

for(int cellNum = 0; cellNum <= LastCellNum(); cellNum++){

XSSFCell xCell = Cell(cellNum);

if(xCell == null){

continue;

}

CellType() == XSSFCell.CELL_TYPE_BOOLEAN){

content.BooleanCellValue());

}else CellType() == XSSFCell.CELL_TYPE_NUMERIC){

content.NumericCellValue());

}else{

content.StringCellValue());

}

String();

}

第三、PowerPoint的文档

Java代码

public static String readPowerPoint(String path) {

StringBuffer content = new StringBuffer("")

;

try {

SlideShow ss = new SlideShow(new HSLFSlideShow(new FileInputStream(

path)));// is

// 为文件的InputStream，建立SlideShow

jfinal jar包下载Slide[] slides = ss.getSlides();// 获得每一张幻灯片

for (int i = 0; i < slides.length; i++) {

TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容，建立TextRun

for (int j = 0; j < t.length; j++) {

content.append(t[j].getText());// 这里会将文字内容加到content中去

}

} catch (Exception ex) {

System.out.String());

}

String();

}

第四、PDF的文档

Java代码

public static String readPdf(String path) throws Exception {

StringBuffer content = new StringBuffer("");

FileInputStream fis = new FileInputStream(path);

PDFParser p = new PDFParser(fis);

p.parse();

PDFTextStripper ts = new PDFTextStripper();

content.PDDocument()));

fis.close();

String().trim();

}

第五、HTML的文档，要说明的是，HTML文档我们要获取其TITLE，BODY中的内容就要先获取源文件，然后再对源文件进行标签上的过滤，很麻烦

Java代码

public static String readHtml(String urlString) {

StringBuffer content = new StringBuffer("");

File file = new File(urlString);

FileInputStream fis = null;

try {

fis = new FileInputStream(file);

BufferedReader reader = new BufferedReader(new InputStreamReader(

fis, "utf-8"));

String line = null;

while ((line = adLine()) != null) {

content.append(line + "\n");

}

reader.close();

} catch (Exception e) {

e.printStackTrace();

}

String contentcontentString = String();

String htmlStr = contentString; // 含html标签的字符串

String textStr = "";

Pattern p_script;

Matcher m_script;

Pattern p_style;

Matcher m_style;

Pattern p_html;

Matcher m_html;

try {

String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\

String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*

String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式

p_script = Patternpile(regEx_script, Pattern.CASE_INSENSITIVE);

m_script = p_script.matcher(htmlStr);

htmlStr = placeAll(""); // 过滤script标签

p_style = Patternpile(regEx_style, Pattern.CASE_INSENSITIVE);

m_style = p_style.matcher(htmlStr);

htmlStr = placeAll(""); // 过滤style标签

p_html = Patternpile(regEx_html, Pattern.CASE_INSENSITIVE);

m_html = p_html.matcher(htmlStr);

htmlStr = placeAll(""); // 过滤html标签

textStr = htmlStr;

} catch (Exception e) {

Syst

}

return textStr;// 返回文本字符串

}

第六、TXT的文档，给TXT文本建立索引时要注意

本项目实现了组合查询的功能

//这一步如果不设置为GBK，TXT内容将全部乱码 BufferedReader reader=new BufferedReader(new InputStreamReader(is,"GBK")); 具体代码如下

Java代码

/**
 * Reads a GBK-encoded text file and returns its content with every line
 * terminated by {@code "\r"}, trimmed of leading and trailing whitespace.
 *
 * @param path path of the text file to read
 * @return the trimmed file content
 * @throws IOException if the file cannot be opened, read, or closed
 */
public static String readTxt(String path) throws IOException {
    StringBuffer sb = new StringBuffer("");
    InputStream is = new FileInputStream(path);
    try {
        // The file must be decoded as GBK; another charset would garble the Chinese text.
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "GBK"));
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line + "\r");
        }
    } finally {
        // Release the file handle even when reading fails part-way.
        is.close();
    }
    return sb.toString().trim();
}

第七、RTF文档，rtf的转换则在javax中就有

Java代码

public static String readRtf(String path) {

String result = null;

File file = new File(path);

try {

DefaultStyledDocument styledDoc = new DefaultStyledDocument();

InputStream is = new FileInputStream(file);

new RTFEditorKit().read(is, styledDoc, 0);

result = new Text(0, Length())

.getBytes("iso8859-1"), "gbk");

// 提取文本，读取中文需要使用gbk编码，否则会出现乱码

} catch (IOException e) {

e.printStackTrace();

} catch (BadLocationException e) {

e.printStackTrace();

}

return result;

}

688IT编程网

webview打开各种文件

发表评论

推荐文章

java正则表达式选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符回溯引用和前后查匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式选择题

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

688IT编程网

webview打开各种文件

发表评论

推荐文章

java正则表达式 选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符 回溯引用和前后查 匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式 选择题

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

java正则表达式选择题

非零金额正则表达式

基本的元字符回溯引用和前后查匹配模式

java正则表达式选择题

非零金额正则表达式