Java抓取网页内容的三种方式
2011-12-05 11:23
一、GetURL.java import java.io.*; import java.*; public class GetURL { public static void main(String[] args) { InputStream in = null; OutputStream out = null; try { // 检查命令行参数 if ((args.length != 1)&& (args.length != 2)) throw new IllegalArgumentException("Wrong number of args"); URL url = new URL(args[0]); //创建 URL in = url.openStream(); // 打开到这个URL的流 if (args.length == 2) // 创建一个适当的输出流 out = new FileOutputStream(args[1]); else out = System.out; // 复制字节到输出流 byte[] buffer = new byte[4096]; int bytes_read; while((bytes_read = in.read(buffer)) != -1) out.write(buffer, 0, bytes_read); } catch (Exception e) { println(e); println("Usage: java GetURL <URL> [<filename>]"); } finally { //无论如何都要关闭流 try { in.close(); out.close(); } catch (Exception e) {} } } } 运行方法: C:\java>java GetURL 127.0.0.1:8080/kj/index.html index.html 二、geturl.jsp <%@ page import="java.io.*" contentType="text/html;charset=gb2312" %> java和jsp<%@ page language="java" import="java.*"%> <% String htmpath=null; BufferedReader in = null; InputStreamReader isr = null; InputStream is = null; PrintWriter pw=null; HttpURLConnection huc = null; try{ htmpath=getServletContext().getRealPath("/")+"html\\morejava.html"; pw=new PrintWriter(htmpath); URL url = new URL("127.0.0.1:8080/kj/morejava.jsp"); //创建 URL huc = (HttpURLConnection)url.openConnection(); is = InputStream(); isr = new InputStreamReader(is); in = new BufferedReader(isr); String line = null; while(((line = in.readLine()) != null)) { if(line.length()==0) continue; pw.println(line); } } catch (Exception e) { println(e); } finally { //无论如何都要关闭流 try { is.close(); isr.close();in.close();huc.disconnect();pw.close(); } catch (Exception e) {} } %> OK--,创建文件成功 三、HttpClient.java import java.io.*; import java.*; public class HttpClient { public static void main(String[] args) { try { // 检查命令行参数 if ((args.length != 1) && (args.length != 2)) throw new IllegalArgumentException("Wrong number of args"); OutputStream to_file; if (args.length == 2) to_file = new 
FileOutputStream(args[1]);//输出到文件 else to_file = System.out;//输出到控制台 URL url = new URL(args[0]); String protocol = Protocol(); if (!protocol.equals("http")) throw new IllegalArgumentException("Must use 'http:' protocol"); String host = Host(); int port = Port(); if (port == -1) port = 80; String filename = File(); Socket socket = new Socket(host, port);//打开一个socket连接 InputStream from_server = InputStream();//获取输入流 PrintWriter to_server = new OutputStream());//获取输出流 to_server.print("GET " + filename + "\n\n");//请求服务器上的文件 to_server.flush(); // Send it right now! byte[] buffer = new byte[4096]; int bytes_read; //读服务器上的响应,并写入文件。 while((bytes_read = ad(buffer)) != -1) to_file.write(buffer, 0, bytes_read); socket.close(); to_file.close(); } catch (Exception e) { println(e); println("Usage: java HttpClient <URL> [<filename>]"); } } } 运行方法:C:\java>java HttpClient 127.0.0.1:8080/kj/index.html index.html 注意中文可能会显示乱码,在得到源码后,应该做相应的转码工作,例如: public static String GetURLstr(String strUrl) { InputStream in = null; OutputStream out = null; String strdata = ""; try { URL url = new URL(strUrl); // 创建 URL in = url.openStream(); // 打开到这个URL的流 out = System.out; // 复制字节到输出流 byte[] buffer = new byte[4096]; int bytes_read; while ((bytes_read = in.read(buffer)) != -1) { String reads = new String(buffer, 0, bytes_read, "UTF-8"); //System.out.print(reads); strdata = strdata + reads; // out.write(buffer, 0, bytes_read); } in.close(); out.close(); return strdata; } catch (Exception e) { println(e); println("Usage: java GetURL <URL> [<filename>]"); return strdata; } |
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论