JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容
首先引入Maven库:
[html]
01. <dependency>
02. <groupId>org.apache.poi</groupId>
03. <artifactId>poi</artifactId>
04. <version>3.15</version>
05. </dependency>
06. <dependency>
07. <groupId>org.apache.poi</groupId>
08. <artifactId>poi-ooxml</artifactId>
09. <version>3.15</version>
10. </dependency>
11. <dependency>
12. <groupId>org.apache.poi</groupId>
13. <artifactId>poi-scratchpad</artifactId>
14. <version>3.15</version>
15. </dependency>
16. <dependency>
17. <groupId>org.apache.pdfbox</groupId>
18. <artifactId>pdfbox</artifactId>
19. <version>2.0.4</version>
20. </dependency>
[java]
01.
[java]
01. <pre name="code"class="html">public class ParseText {
02.
03. // 判断⽂档类型,调⽤不同的解析⽅法
04. public static String parse(byte[] buffer, String suffix) {
05. String text = "";
06. switch (suffix) {
07. case"doc":
08. text = getTextFromWord(buffer);
09. break;
10. case"docx":
11. text = getTextFromWord2007(buffer);
12. break;
13. case"xls":
14. text = getTextFromExcel(buffer);
15. break;
16. case"xlsx":
17. text = getTextFromExcel2007(buffer);
18. break;
19. case"ppt":
20. text = getTextFromPPT(buffer);
21. break;
22. case"pptx":
23. text = getTextFromPPT2007(buffer);
24. break;
25. case"pdf":
26. text = getTextFormPDF(buffer);
27. break;
28. case"txt":
29. text = getTextFormTxt(buffer);
30. break;
31. default:
32. System.out.println("不⽀持解析的⽂档类型");
33. }
34.
35. placeAll("\\s*", "");
36. }
37.
38. // 读取Word97-2003的全部内容 doc
38. // 读取Word97-2003的全部内容 doc
39. private static String getTextFromWord(byte[] file) {
40. String text = "";
41. InputStream fis = null;
42. WordExtractor ex = null;
43. try {
44. // word 2003:图⽚不会被读取
45. fis = new ByteArrayInputStream(file);
46. ex = new WordExtractor(fis);
47. text = ex.getText();
48. ex.close();
49. } catch (Exception e) {
50. // TODO Auto-generated catch block
51. e.printStackTrace();
52. }
53. return text;
54. }
55.
56. // 读取Word2007+的全部内容 docx
57. private static String getTextFromWord2007(byte[] file) {
58. String text = "";
59. InputStream fis = null;
60. XWPFDocument doc = null;
61. XWPFWordExtractor workbook = null;
62. try {
63. fis = new ByteArrayInputStream(file);
64. doc = new XWPFDocument(fis);
65. workbook = new XWPFWordExtractor(doc);
66. text = Text();
67. workbook.close();
68. } catch (IOException e) {
69. // TODO Auto-generated catch block
70. e.printStackTrace();
71. }
72. return text;
73. }
74.
75. // 读取Excel97-2003的全部内容 xls
76. private static String getTextFromExcel(byte[] file) {
77. InputStream is = null;
78. HSSFWorkbook wb = null;
79. String text = "";
80. try {
81. is = new ByteArrayInputStream(file);
82. wb = new HSSFWorkbook(new POIFSFileSystem(is));
83. ExcelExtractor extractor = new ExcelExtractor(wb);
84. extractor.setFormulasNotResults(false);
85. extractor.setIncludeSheetNames(false);
86. text = Text();
87. extractor.close();
88. } catch (IOException e) {
89. e.printStackTrace();
90. }
91. return text;
92. }
93.
94. // 读取Excel2007+的全部内容 xlsx
95. private static String getTextFromExcel2007(byte[] file) {
96. InputStream is = null;
97. XSSFWorkbook workBook = null;
98. String text = "";
99. try {
100. is = new ByteArrayInputStream(file);
101. workBook = new XSSFWorkbook(is);
102. XSSFExcelExtractor extractor = new XSSFExcelExtractor(workBook); 103. extractor.setIncludeSheetNames(false);
104. text = Text();
105. extractor.close();
106. } catch (IOException e) {
106. } catch (IOException e) {
107. e.printStackTrace();
108. }
109. return text;
110. }
111.
112. // 读取Powerpoint97-2003的全部内容 ppt
113. private static String getTextFromPPT(byte[] file) {
114. String text = "";
115. InputStream fis = null;
116. PowerPointExtractor ex = null;
117. try {
118. // word 2003:图⽚不会被读取
119. fis = new ByteArrayInputStream(file);
120. ex = new PowerPointExtractor(fis);
121. text = ex.getText();
122. ex.close();
123. } catch (Exception e) {
124. // TODO Auto-generated catch block
125. e.printStackTrace();
126. }
127. return text;
128. }
129.
130. // 抽取幻灯⽚2007+全部内容 pptx
131. private static String getTextFromPPT2007(byte[] file) {
132. InputStream is = null;
133. XMLSlideShow slide = null;
134. String text = "";
135. try {
136. is = new ByteArrayInputStream(file);
137. slide = new XMLSlideShow(is);
138. XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slide); 139. text = Text();
140. extractor.close();
141. } catch (IOException e) {
142. e.printStackTrace();
143. }
144. return text;
145. }
146.
147. // 读取pdf⽂件全部内容 pdf
148. private static String getTextFormPDF(byte[] file) {
149. String text = "";
150. PDDocument pdfdoc = null;
151. InputStream is = null;
152. try {
153. is = new ByteArrayInputStream(file);
154. pdfdoc = PDDocument.load(is);
155. PDFTextStripper stripper = new PDFTextStripper();
156. text = Text(pdfdoc);
157.
158. } catch (IOException e) {
159. e.printStackTrace();
160. } finally {
161. try {
162. if (pdfdoc != null) {
163. pdfdoc.close();
164. }
165. } catch (IOException e) {
166. // TODO Auto-generated catch block
167. e.printStackTrace();
168. }
169. }
170. return text;
171. }
172.
173. // 读取txt⽂件全部内容 txt
174. private static String getTextFormTxt(byte[] file) {
174. private static String getTextFormTxt(byte[] file) {
175. String text = "";
176. try {
177. String encoding = get_charset(file);
178. text = new String(file, encoding);
179. } catch (UnsupportedEncodingException e) {
180. e.printStackTrace();
181. } catch (IOException e1) {
182. e1.printStackTrace();
183. }
184. return text;
185. }
186.
187. // 获得txt⽂件编码⽅式
188. private static String get_charset(byte[] file) throws IOException {
189. String charset = "GBK";
190. byte[] first3Bytes = new byte[3];
191. InputStream bis = null;
192. try {
193. boolean checked = false;
194. bis = new ByteArrayInputStream(file);
195. bis.mark(0);
196. int read = ad(first3Bytes, 0, 3);
197. if (read == -1)
198. return charset;
199. if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
200. charset = "UTF-16LE";
201. checked = true;
202. } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) { 203. charset = "UTF-16BE";
204. checked = true;
205. } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB 206. && first3Bytes[2] == (byte) 0xBF) {
207. charset = "UTF-8";
208. checked = true;
209. }
210. set();
211. if (!checked) {
212. while ((read = ad()) != -1) {
213. if (read >= 0xF0)
214. break;
215. if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK 216. break;
217. if (0xC0 <= read && read <= 0xDF) {
218. read = ad();
219. if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
220. // (0x80 - 0xBF),也可能在GB编码内
221. continue;
222. else
223. break;
224. } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是⼏率较⼩225. read = ad();
226. if (0x80 <= read && read <= 0xBF) {
227. read = ad();
228. if (0x80 <= read && read <= 0xBF) {
229. charset = "UTF-8";
230. break;
231. } else
232. break;
233. } else
234. break;
235. }
236. }
237. }
238. } catch (Exception e) {
239. e.printStackTrace();
240. } finally {
241. if (bis != null) {
242. bis.close();
242. bis.close();
243. }
244. }
245. return charset;
246. }
247. }</pre><br>
248. <br>
excel最强教科书完全版pdf249. <p></p>
250. <pre></pre>
251. <br>
252. <p></p>
253. <div ><pre name="code"class="java"> // 读取pdf⽂件254. private static String getTextFormPDF(byte[] file) {
255. String text = "";
256. PDDocument pdfdoc = null;
257. InputStream is = null;
258. try {
259. is = new ByteArrayInputStream(file);
260. pdfdoc = PDDocument.load(is);
261. PDFTextStripper stripper = new PDFTextStripper();
262. text = Text(pdfdoc);
263.
264. } catch (IOException e) {
265. e.printStackTrace();
266. } finally {
267. try {
268. if (pdfdoc != null) {
269. pdfdoc.close();
270. }
271. } catch (IOException e) {
272. // TODO Auto-generated catch block
273. e.printStackTrace();
274. }
275. }
276. return text;
277. }
278.
279. // 读取txt⽂件
280. private static String getTextFormTxt(byte[] file) {
281. String text = "";
282. try {
283. String encoding = get_charset(file);
284. text = new String(file, encoding);
285. } catch (UnsupportedEncodingException e) {
286. e.printStackTrace();
287. } catch (IOException e1) {
288. e1.printStackTrace();
289. }
290. return text;
291. }</pre></div>
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论