JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容--688IT编程网

JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

首先引入Maven库

[html]

01. <dependency>

02. <groupId>org.apache.poi</groupId>

03. <artifactId>poi</artifactId>

04. <version>3.15</version>

05. </dependency>

06. <dependency>

07. <groupId>org.apache.poi</groupId>

08. <artifactId>poi-ooxml</artifactId>

09. <version>3.15</version>

10. </dependency>

11. <dependency>

12. <groupId>org.apache.poi</groupId>

13. <artifactId>poi-scratchpad</artifactId>

14. <version>3.15</version>

15. </dependency>

16. <dependency>

17. <groupId>org.apache.pdfbox</groupId>

18. <artifactId>pdfbox</artifactId>

19. <version>2.0.4</version>

20. </dependency>

[java]

01.

[java]

01. <pre name="code" class="java">public class ParseText {

02.

03. // 判断文档类型，调用不同的解析方法

04. public static String parse(byte[] buffer, String suffix) {

05. String text = "";

06. switch (suffix) {

07. case"doc":

08. text = getTextFromWord(buffer);

09. break;

10. case"docx":

11. text = getTextFromWord2007(buffer);

12. break;

13. case"xls":

14. text = getTextFromExcel(buffer);

15. break;

16. case"xlsx":

17. text = getTextFromExcel2007(buffer);

18. break;

19. case"ppt":

20. text = getTextFromPPT(buffer);

21. break;

22. case"pptx":

23. text = getTextFromPPT2007(buffer);

24. break;

25. case"pdf":

26. text = getTextFormPDF(buffer);

27. break;

28. case"txt":

29. text = getTextFormTxt(buffer);

30. break;

31. default:

32. System.out.println("不支持解析的文档类型");

33. }

34.

35. return text.replaceAll("\\s*", "");

36. }

37.

38. // 读取Word97-2003的全部内容 doc

39. private static String getTextFromWord(byte[] file) {

40. String text = "";

41. InputStream fis = null;

42. WordExtractor ex = null;

43. try {

44. // word 2003：图片不会被读取

45. fis = new ByteArrayInputStream(file);

46. ex = new WordExtractor(fis);

47. text = ex.getText();

48. ex.close();

49. } catch (Exception e) {

50. // TODO Auto-generated catch block

51. e.printStackTrace();

52. }

53. return text;

54. }

55.

56. // 读取Word2007+的全部内容 docx

57. private static String getTextFromWord2007(byte[] file) {

58. String text = "";

59. InputStream fis = null;

60. XWPFDocument doc = null;

61. XWPFWordExtractor workbook = null;

62. try {

63. fis = new ByteArrayInputStream(file);

64. doc = new XWPFDocument(fis);

65. workbook = new XWPFWordExtractor(doc);

66. text = workbook.getText();

67. workbook.close();

68. } catch (IOException e) {

69. // TODO Auto-generated catch block

70. e.printStackTrace();

71. }

72. return text;

73. }

74.

75. // 读取Excel97-2003的全部内容 xls

76. private static String getTextFromExcel(byte[] file) {

77. InputStream is = null;

78. HSSFWorkbook wb = null;

79. String text = "";

80. try {

81. is = new ByteArrayInputStream(file);

82. wb = new HSSFWorkbook(new POIFSFileSystem(is));

83. ExcelExtractor extractor = new ExcelExtractor(wb);

84. extractor.setFormulasNotResults(false);

85. extractor.setIncludeSheetNames(false);

86. text = extractor.getText();

87. extractor.close();

88. } catch (IOException e) {

89. e.printStackTrace();

90. }

91. return text;

92. }

93.

94. // 读取Excel2007+的全部内容 xlsx

95. private static String getTextFromExcel2007(byte[] file) {

96. InputStream is = null;

97. XSSFWorkbook workBook = null;

98. String text = "";

99. try {

100. is = new ByteArrayInputStream(file);

101. workBook = new XSSFWorkbook(is);

102. XSSFExcelExtractor extractor = new XSSFExcelExtractor(workBook);

103. extractor.setIncludeSheetNames(false);

104. text = extractor.getText();

105. extractor.close();

106. } catch (IOException e) {

107. e.printStackTrace();

108. }

109. return text;

110. }

111.

112. // 读取Powerpoint97-2003的全部内容 ppt

113. private static String getTextFromPPT(byte[] file) {

114. String text = "";

115. InputStream fis = null;

116. PowerPointExtractor ex = null;

117. try {

118. // ppt 2003：图片不会被读取

119. fis = new ByteArrayInputStream(file);

120. ex = new PowerPointExtractor(fis);

121. text = ex.getText();

122. ex.close();

123. } catch (Exception e) {

124. // TODO Auto-generated catch block

125. e.printStackTrace();

126. }

127. return text;

128. }

129.

130. // 抽取幻灯片2007+全部内容 pptx

131. private static String getTextFromPPT2007(byte[] file) {

132. InputStream is = null;

133. XMLSlideShow slide = null;

134. String text = "";

135. try {

136. is = new ByteArrayInputStream(file);

137. slide = new XMLSlideShow(is);

138. XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slide);

139. text = extractor.getText();

140. extractor.close();

141. } catch (IOException e) {

142. e.printStackTrace();

143. }

144. return text;

145. }

146.

147. // 读取pdf文件全部内容 pdf

148. private static String getTextFormPDF(byte[] file) {

149. String text = "";

150. PDDocument pdfdoc = null;

151. InputStream is = null;

152. try {

153. is = new ByteArrayInputStream(file);

154. pdfdoc = PDDocument.load(is);

155. PDFTextStripper stripper = new PDFTextStripper();

156. text = stripper.getText(pdfdoc);

157.

158. } catch (IOException e) {

159. e.printStackTrace();

160. } finally {

161. try {

162. if (pdfdoc != null) {

163. pdfdoc.close();

164. }

165. } catch (IOException e) {

166. // TODO Auto-generated catch block

167. e.printStackTrace();

168. }

169. }

170. return text;

171. }

172.

173. // 读取txt文件全部内容 txt

174. private static String getTextFormTxt(byte[] file) {

175. String text = "";

176. try {

177. String encoding = get_charset(file);

178. text = new String(file, encoding);

179. } catch (UnsupportedEncodingException e) {

180. e.printStackTrace();

181. } catch (IOException e1) {

182. e1.printStackTrace();

183. }

184. return text;

185. }

186.

187. // 获得txt文件编码方式

188. private static String get_charset(byte[] file) throws IOException {

189. String charset = "GBK";

190. byte[] first3Bytes = new byte[3];

191. InputStream bis = null;

192. try {

193. boolean checked = false;

194. bis = new ByteArrayInputStream(file);

195. bis.mark(0);

196. int read = bis.read(first3Bytes, 0, 3);

197. if (read == -1)

198. return charset;

199. if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {

200. charset = "UTF-16LE";

201. checked = true;

202. } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {

203. charset = "UTF-16BE";

204. checked = true;

205. } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB

206. && first3Bytes[2] == (byte) 0xBF) {

207. charset = "UTF-8";

208. checked = true;

209. }

210. bis.reset();

211. if (!checked) {

212. while ((read = bis.read()) != -1) {

213. if (read >= 0xF0)

214. break;

215. if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的，也算是GBK

216. break;

217. if (0xC0 <= read && read <= 0xDF) {

218. read = bis.read();

219. if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)

220. // (0x80 - 0xBF)，也可能在GB编码内

221. continue;

222. else

223. break;

224. } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错，但是几率较小

225. read = bis.read();

226. if (0x80 <= read && read <= 0xBF) {

227. read = bis.read();

228. if (0x80 <= read && read <= 0xBF) {

229. charset = "UTF-8";

230. break;

231. } else

232. break;

233. } else

234. break;

235. }

236. }

237. }

238. } catch (Exception e) {

239. e.printStackTrace();

240. } finally {

241. if (bis != null) {

242. bis.close();

243. }

244. }

245. return charset;

246. }

247. }</pre><br>

248. <br>

excel最强教科书完全版pdf

249. <p></p>

250. <pre></pre>

251. <br>

252. <p></p>

253. <div><pre name="code" class="java"> // 读取pdf文件

254. private static String getTextFormPDF(byte[] file) {

255. String text = "";

256. PDDocument pdfdoc = null;

257. InputStream is = null;

258. try {

259. is = new ByteArrayInputStream(file);

260. pdfdoc = PDDocument.load(is);

261. PDFTextStripper stripper = new PDFTextStripper();

262. text = stripper.getText(pdfdoc);

263.

264. } catch (IOException e) {

265. e.printStackTrace();

266. } finally {

267. try {

268. if (pdfdoc != null) {

269. pdfdoc.close();

270. }

271. } catch (IOException e) {

272. // TODO Auto-generated catch block

273. e.printStackTrace();

274. }

275. }

276. return text;

277. }

278.

279. // 读取txt文件

280. private static String getTextFormTxt(byte[] file) {

281. String text = "";

282. try {

283. String encoding = get_charset(file);

284. text = new String(file, encoding);

285. } catch (UnsupportedEncodingException e) {

286. e.printStackTrace();

287. } catch (IOException e1) {

288. e1.printStackTrace();

289. }

290. return text;

291. }</pre></div>

688IT编程网

JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

发表评论

推荐文章

java正则表达式选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符回溯引用和前后查匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式选择题

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

688IT编程网

JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

发表评论

推荐文章

java正则表达式 选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符 回溯引用和前后查 匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式 选择题

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

java正则表达式选择题

非零金额正则表达式

基本的元字符回溯引用和前后查匹配模式

java正则表达式选择题

非零金额正则表达式