[Java Crawler] Fetching a web page and extracting its plain text
It's a bit messy; just stashing it here for now. The program fetches a blog page over HTTP, saves the raw HTML to a file, strips the script/style blocks and tags with regular expressions, and writes the remaining plain text to another file.
package aw;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HtmlText {
    public static String SRC = "FirstPage.html";

    public static void main(String[] args) throws IOException {
        // Fetch the page and write the raw HTML to a file
        InputStream is2 = UrlCrawBoke.doGet("https://blog.csdn.net/sinat_42483341/article/details/95988975");
        String pageStr = UrlCrawBoke.inputStreamToString(is2, "UTF-8");
        is2.close();
        FileWriter is1 = new FileWriter(SRC); // the original output path was blank; SRC is used as a stand-in
        is1.write(pageStr); // could also test with write("666")
        is1.close();
        // Regex extraction: pull the plain text out of the HTML
        String after = Html2Text(pageStr);
        FileWriter is = new FileWriter("FirstPageText.txt"); // the original output path was blank; this file name is a stand-in
        is.write(after); // could also test with write("666")
        is.close();
        System.out.println(after);
    }
    // Extract plain text from the HTML
    public static String Html2Text(String inputString) {
        String htmlStr = inputString; // string that still contains HTML tags
        String textStr = "";
        Pattern p_script;
        Matcher m_script;
        Pattern p_style;
        Matcher m_style;
        Pattern p_html;
        Matcher m_html;
        try {
            String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>"; // regex for script blocks (alternatively <script[^>]*?>[\\s\\S]*?<\\/script>)
            String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>"; // regex for style blocks (alternatively <style[^>]*?>[\\s\\S]*?<\\/style>)
            String regEx_html = "<[^>]+>"; // regex for HTML tags
            p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
            m_script = p_script.matcher(htmlStr);
            htmlStr = m_script.replaceAll(""); // strip script blocks
            p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
            m_style = p_style.matcher(htmlStr);
            htmlStr = m_style.replaceAll(""); // strip style blocks
            p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
            m_html = p_html.matcher(htmlStr);
            htmlStr = m_html.replaceAll(""); // strip the remaining HTML tags
            textStr = htmlStr;
        } catch (Exception e) {
            // ignore and fall through with whatever has been extracted so far
        }
        // Remove whitespace and blank lines
        textStr = textStr.replaceAll("[ ]+", "");
        textStr = textStr.replaceAll("[ ]+", "");
        textStr = textStr.replaceAll("1", "").replaceAll("2", "").replaceAll("3", "").replaceAll("4", "")
                .replaceAll("5", "").replaceAll("6", "").replaceAll("7", "").replaceAll("8", "").replaceAll("9", "")
                .replaceAll("0", ""); // drop all digits
        textStr = textStr.replaceAll("(?m)^\\s*$(\\n|\\r\\n)", ""); // drop empty lines
        textStr = textStr.replaceAll("\t", "");
        textStr = textStr.replaceAll(" ", "").replace(">", "").replace("—", ""); // look up what else may need removing
        textStr = textStr.replaceAll("\\\\", ""); // in a regex, matching one backslash takes four backslashes
        textStr = textStr.replaceAll("\r\n", "");
        textStr = textStr.replaceAll("\n", "");
        return textStr; // the extracted text
    }
}
class UrlCrawBoke {
    public static InputStream doGet(String urlstr) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
        InputStream inputStream = conn.getInputStream(); // open the response body stream
        return inputStream;
    }

    public static String inputStreamToString(InputStream is, String charset) throws IOException {
        byte[] bytes = new byte[1024];
        int byteLength = 0;
        StringBuffer sb = new StringBuffer();
        // Note: decoding chunk by chunk can split a multi-byte character across buffer boundaries
        while ((byteLength = is.read(bytes)) != -1) {
            sb.append(new String(bytes, 0, byteLength, charset));
        }
        return sb.toString();
    }
}
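
To sanity-check the tag-stripping logic without hitting the network, something like the class below can be dropped into the same package. This is only a minimal sketch: the class name Html2TextDemo and the sample HTML string are made up for illustration and assume the HtmlText class above compiles as-is.

package aw;

public class Html2TextDemo {
    public static void main(String[] args) {
        // A tiny HTML snippet containing script, style, and tag noise
        String html = "<html><head><style>body{color:red;}</style>"
                + "<script>console.log('hi');</script></head>"
                + "<body><h1>Title</h1><p>Hello, <b>world</b>!</p></body></html>";
        // Prints the visible text only; per the rules at the end of Html2Text,
        // spaces and digits are stripped as well, so this prints "TitleHello,world!"
        System.out.println(HtmlText.Html2Text(html));
    }
}

Because Html2Text also deletes spaces, digits, and newlines, the result is a single run of text; whether that trade-off is acceptable depends on what the extracted text is used for.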
