Java 抓取网页内容的三种方式
2011-12-05 11:23
一、GetURL.java
import java.io.*;
import java.*;
/**
 * Command-line utility that downloads the contents of a URL.
 *
 * <p>Usage: {@code java GetURL <URL> [<filename>]}. The bytes read from the URL are copied
 * unchanged to the named file, or to standard output when no file name is given.
 */
public class GetURL {
    /** Size in bytes of the buffer used while copying the stream. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Program entry point.
     *
     * <p>Any failure (wrong argument count, malformed URL, I/O error) is reported on standard
     * error together with a usage message; no exception escapes this method.
     *
     * @param args {@code args[0]} is the URL to fetch; the optional {@code args[1]} is the
     *     file to write the content to
     */
    public static void main(String[] args) {
        try {
            // Check the command-line arguments.
            if (args.length != 1 && args.length != 2) {
                throw new IllegalArgumentException("Wrong number of args");
            }

            // java.net.URL is written out in full so this class does not rely on the
            // file's import list bringing java.net into scope.
            java.net.URL url = new java.net.URL(args[0]);

            // Open the URL first so that a bad URL never creates or truncates the output file.
            try (InputStream in = url.openStream()) {
                if (args.length == 2) {
                    try (OutputStream out = new FileOutputStream(args[1])) {
                        copy(in, out);
                    }
                } else {
                    // System.out belongs to the JVM: flush it, but never close it.
                    copy(in, System.out);
                    System.out.flush();
                }
            }
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java GetURL <URL> [<filename>]");
        }
    }

    /** Copies every remaining byte of {@code in} to {@code out}. */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] buffer = new byte[BUFFER_SIZE];
        int bytesRead;
        while ((bytesRead = in.read(buffer)) != -1) {
            out.write(buffer, 0, bytesRead);
        }
    }
}

运行方法:
C:\java>java GetURL http://127.0.0.1:8080/kj/index.html index.html
二、geturl.jsp
<%@ page import="java.io.*" contentType="text/html;charset=gb2312" %>
<%@ page language="java" import="java.*"%>
java和jsp

<%
// Fetches /kj/morejava.jsp from the local server over HTTP and saves the response,
// with empty lines removed, as html/morejava.html under the web application root.
// java.net types are written out in full so the scriptlet does not depend on the
// page's import directives bringing java.net into scope.
// NOTE(review): reader and writer both use the platform default charset, as before —
// confirm that matches the encoding of the fetched page.
String htmpath = null;
BufferedReader in = null;
PrintWriter pw = null;
java.net.HttpURLConnection huc = null;
try {
    // Build the target path with File so the separator is correct on every platform.
    htmpath = new File(getServletContext().getRealPath("/"),
            "html" + File.separator + "morejava.html").getPath();
    pw = new PrintWriter(htmpath);

    // The scheme is required: without "http://" the URL constructor rejects the string.
    java.net.URL url = new java.net.URL("http://127.0.0.1:8080/kj/morejava.jsp");
    huc = (java.net.HttpURLConnection) url.openConnection();
    in = new BufferedReader(new InputStreamReader(huc.getInputStream()));

    String line;
    while ((line = in.readLine()) != null) {
        if (line.length() == 0) {
            continue; // skip empty lines
        }
        pw.println(line);
    }
} catch (Exception e) {
    getServletContext().log("geturl.jsp: failed to generate " + htmpath, e);
} finally {
    // Release each resource independently so that a null or failing one cannot
    // prevent the others from being released.
    if (in != null) {
        try {
            in.close();
        } catch (IOException ignored) {
            // Best-effort close; nothing useful can be done here.
        }
    }
    if (huc != null) {
        huc.disconnect();
    }
    if (pw != null) {
        pw.close();
    }
}
%>
OK--,创建文件成功
三、HttpClient.java
import java.io.*;
import java.*;
/**
 * Minimal HTTP client that talks to the server directly over a socket.
 *
 * <p>Usage: {@code java HttpClient <URL> [<filename>]}. Only {@code http:} URLs are accepted.
 * Whatever the server sends back is copied unchanged to the named file, or to standard output
 * when no file name is given.
 */
public class HttpClient {
    /** Port used when the URL does not name one. */
    private static final int DEFAULT_HTTP_PORT = 80;

    /** Size in bytes of the buffer used while copying the response. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Program entry point.
     *
     * <p>Any failure (wrong argument count, malformed or non-http URL, I/O error) is reported
     * on standard error together with a usage message; no exception escapes this method.
     *
     * @param args {@code args[0]} is the URL to fetch; the optional {@code args[1]} is the
     *     file to write the response to
     */
    public static void main(String[] args) {
        try {
            // Check the command-line arguments.
            if (args.length != 1 && args.length != 2) {
                throw new IllegalArgumentException("Wrong number of args");
            }

            // java.net types are written out in full so this class does not rely on the
            // file's import list bringing java.net into scope.
            java.net.URL url = new java.net.URL(args[0]);
            if (!"http".equals(url.getProtocol())) {
                throw new IllegalArgumentException("Must use 'http:' protocol");
            }
            String host = url.getHost();
            int port = url.getPort();
            if (port == -1) {
                port = DEFAULT_HTTP_PORT;
            }
            String filename = url.getFile();
            if (filename.isEmpty()) {
                filename = "/"; // "http://host" has no path; request the root document
            }

            // Everything is validated before any resource is opened, so a bad URL neither
            // creates the output file nor leaves a socket behind.
            try (java.net.Socket socket = new java.net.Socket(host, port)) {
                InputStream fromServer = socket.getInputStream();
                PrintWriter toServer = new PrintWriter(socket.getOutputStream());

                // Bare "GET <path>" request with no version or headers, as in the original.
                toServer.print("GET " + filename + "\n\n");
                toServer.flush(); // send it right now

                // Read the server's response and write it to the destination.
                if (args.length == 2) {
                    try (OutputStream toFile = new FileOutputStream(args[1])) {
                        copy(fromServer, toFile);
                    }
                } else {
                    // System.out belongs to the JVM: flush it, but never close it.
                    copy(fromServer, System.out);
                    System.out.flush();
                }
            }
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java HttpClient <URL> [<filename>]");
        }
    }

    /** Copies every remaining byte of {@code in} to {@code out}. */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] buffer = new byte[BUFFER_SIZE];
        int bytesRead;
        while ((bytesRead = in.read(buffer)) != -1) {
            out.write(buffer, 0, bytesRead);
        }
    }
}

运行方法:C:\java>java HttpClient http://127.0.0.1:8080/kj/index.html index.html
注意中文可能会显示乱码,在得到源码后,应该做相应的转码工作,例如:

public static String GetURLstr(String strUrl)
{
InputStream in = null;
OutputStream out = null;
String strdata = "";
try
{
URL url = new URL(strUrl); // 创建 URL
in = url.openStream(); // 打开到这个URL的流
out = System.out;
// 复制字节到输出流
byte[] buffer = new byte[4096];
int bytes_read;
while ((bytes_read = in.read(buffer)) != -1)
{
String reads = new String(buffer, 0, bytes_read, "UTF-8");
//System.out.print(reads);
strdata = strdata + reads;
// out.write(buffer, 0, bytes_read);
}
in.close();
out.close();
return strdata;
}
catch (Exception e)
{
println(e);
println("Usage: java GetURL <URL> [<filename>]");
return strdata;
}

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。