[Java Crawler] Fetching a web page and extracting its plain text
It's a bit messy; just stashing it here for now. The program fetches a blog page over HTTP, saves the raw HTML to a file, strips the script/style blocks and tags with regular expressions, and writes the remaining plain text to another file.
package aw;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HtmlText {
    public static String SRC = "FirstPage.html";

    public static void main(String[] args) throws IOException {
        // Fetch the page and write the raw HTML to a file
        InputStream is2 = UrlCrawBoke.doGet("https://blog.csdn.net/sinat_42483341/article/details/95988975");
        String pageStr = UrlCrawBoke.inputStreamToString(is2, "UTF-8");
        is2.close();
        FileWriter is1 = new FileWriter(SRC); // the original output path was blank; SRC is used as a stand-in
        is1.write(pageStr); // could also test with write("666")
        is1.close();
        // Regex extraction: pull the plain text out of the HTML
        String after = Html2Text(pageStr);
        FileWriter is = new FileWriter("FirstPageText.txt"); // the original output path was blank; this file name is a stand-in
        is.write(after); // could also test with write("666")
        is.close();
        System.out.println(after);
    }
    // Extract plain text from the HTML
    public static String Html2Text(String inputString) {
        String htmlStr = inputString; // string that still contains HTML tags
        String textStr = "";
        Pattern p_script;
        Matcher m_script;
        Pattern p_style;
        Matcher m_style;
        Pattern p_html;
        Matcher m_html;
        try {
            String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>"; // regex for script blocks (alternatively <script[^>]*?>[\\s\\S]*?<\\/script>)
            String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>"; // regex for style blocks (alternatively <style[^>]*?>[\\s\\S]*?<\\/style>)
            String regEx_html = "<[^>]+>"; // regex for HTML tags
            p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
            m_script = p_script.matcher(htmlStr);
            htmlStr = m_script.replaceAll(""); // strip script blocks
            p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
            m_style = p_style.matcher(htmlStr);
            htmlStr = m_style.replaceAll(""); // strip style blocks
            p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
            m_html = p_html.matcher(htmlStr);
            htmlStr = m_html.replaceAll(""); // strip the remaining HTML tags
            textStr = htmlStr;
        } catch (Exception e) {
            // ignore and fall through with whatever has been extracted so far
        }
        // Remove whitespace and blank lines
        textStr = textStr.replaceAll("[ ]+", "");
        textStr = textStr.replaceAll("[ ]+", "");
        textStr = textStr.replaceAll("1", "").replaceAll("2", "").replaceAll("3", "").replaceAll("4", "")
                .replaceAll("5", "").replaceAll("6", "").replaceAll("7", "").replaceAll("8", "").replaceAll("9", "")
                .replaceAll("0", ""); // drop all digits
        textStr = textStr.replaceAll("(?m)^\\s*$(\\n|\\r\\n)", ""); // drop empty lines
        textStr = textStr.replaceAll("\t", "");
        textStr = textStr.replaceAll(" ", "").replace(">", "").replace("—", ""); // look up what else may need removing
        textStr = textStr.replaceAll("\\\\", ""); // in a regex, matching one backslash takes four backslashes
        textStr = textStr.replaceAll("\r\n", "");
        textStr = textStr.replaceAll("\n", "");
        return textStr; // the extracted text
    }
}
class UrlCrawBoke {
    public static InputStream doGet(String urlstr) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
        InputStream inputStream = conn.getInputStream(); // open the response body stream
        return inputStream;
    }

    public static String inputStreamToString(InputStream is, String charset) throws IOException {
        byte[] bytes = new byte[1024];
        int byteLength = 0;
        StringBuffer sb = new StringBuffer();
        // Note: decoding chunk by chunk can split a multi-byte character across buffer boundaries
        while ((byteLength = is.read(bytes)) != -1) {
            sb.append(new String(bytes, 0, byteLength, charset));
        }
        return sb.toString();
    }
}
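
To sanity-check the tag-stripping logic without hitting the network, something like the class below can be dropped into the same package. This is only a minimal sketch: the class name Html2TextDemo and the sample HTML string are made up for illustration and assume the HtmlText class above compiles as-is.

package aw;

public class Html2TextDemo {
    public static void main(String[] args) {
        // A tiny HTML snippet containing script, style, and tag noise
        String html = "<html><head><style>body{color:red;}</style>"
                + "<script>console.log('hi');</script></head>"
                + "<body><h1>Title</h1><p>Hello, <b>world</b>!</p></body></html>";
        // Prints the visible text only; per the rules at the end of Html2Text,
        // spaces and digits are stripped as well, so this prints "TitleHello,world!"
        System.out.println(HtmlText.Html2Text(html));
    }
}

Because Html2Text also deletes spaces, digits, and newlines, the result is a single run of text; whether that trade-off is acceptable depends on what the extracted text is used for.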
