Java 的 selenium 和 chromeDriver 爬取淘宝店铺商品列表失败,做个总结
因为公司需要爬取淘宝的店铺商品列表,所以研究了一下。最后的结果是失败的,技术不行没办法,在这里做一个记录,等待以后有大神搞定。
⼀、selenium的使⽤
引⼊jar包
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
</dependency>
同时还需要一个 chromedriver 驱动程序,直接放在项目的目录下(Windows 下放在项目目录的 chromedriver_win32 文件夹中)。
先说遇到的问题吧,代码在最后附上。淘宝的店铺商品列表需要在登录状态下才能看到,所以首先要有登录状态。有了登录状态,还是有问题:当你打开要爬取的店铺页面,仅在页面上采用点击的方式进行翻页时,3-5秒一次这样的频率是没问题的;但如果你以10秒或10秒以下的频率去打开不同的店铺页面爬取数据,就会出现频繁访问的弹窗。改成10秒到15秒随机等待的话,我试了试,貌似能支持很久。这个弹窗需要手动解除滑块,问题就卡在滑块上了:无论是登录的滑块,还是频繁访问的滑块,用驱动器操作都过不了。我也尝试了在滑块动作中改变速度、模拟人手动滑动,也不行。而且在测试的时候还发现了另一个问题:滑块出现的次数多了,会出现封号,甚至封ip的现象。
1、滑块解决不了;
2、滑块频繁出现后,会封号甚至封ip。
这两个要命的问题直接给我弄崩溃了。可能即使解决了滑块也还是解决不了问题,方向可能就是错的,应该想办法让它不出滑块。下面说一下前后过程吧。
开始的时候想用2台或多台 Windows 服务器做爬取服务器,登录靠人工,依靠 mq 发送消息,只要之后程序能保持登录状态就行。这个时候就需要解决滑块问题。代码如下:
/**
 * Starts a Chrome browser through ChromeDriver when the class is loaded and waits until
 * {@code isLogin()} reports a logged-in page.
 *
 * NOTE(review): several statements in this listing have lost text (for example the bare
 * {@code Property(...)}, {@code wArrayList(...)} and {@code <("...")} calls), so the class does
 * not compile as shown. The damaged lines are flagged below and left untouched.
 */
@Slf4j
public class BootStrap {
// Shared browser session, assigned in getDriver().
public static ChromeDriver driver;
static {
try {
// Launch the browser
getDriver();
} catch (InterruptedException e) {
// NOTE(review): the interruption is only printed; the interrupt flag is not restored.
e.printStackTrace();
}
}
/**
 * Program entry point. As listed, the body holds only a commented-out {@code start()} call.
 *
 * @param args command-line arguments (unused)
 * @throws InterruptedException declared by the signature; nothing in the visible body throws it
 */
public static void main(String[] args) throws InterruptedException {
// NOTE(review): the next two lines look like one line comment that was split in two
// (roughly "start the message listener") — confirm against the original post.
/
/启动消息监听
// start();
}
/**
 * Creates a Spring {@code ClassPathXmlApplicationContext} and then loops forever, logging
 * "keep session ..." every ten minutes.
 *
 * @throws InterruptedException if a sleep is interrupted
 */
@SuppressWarnings("resource")
public static void start() throws InterruptedException {
// NOTE(review): the configuration location "l" appears truncated — confirm the real path.
new ClassPathXmlApplicationContext("l");
// Wait for messages to be sent
Thread.sleep(1000);
while(true){
Thread.sleep(10 * 60 * 1000);
log.info("keep session ...");
// NOTE(review): garbled statement; presumably a page request that keeps the session alive.
<("all/");
}
}
/**
 * Obtains the ChromeDriver: sets the {@code webdriver.chrome.driver} system property according
 * to the OS name, creates the driver with the options below, then polls {@code isLogin()}
 * every two seconds until it returns true.
 *
 * @throws InterruptedException if a sleep is interrupted
 */
private static void getDriver() throws InterruptedException{
// NOTE(review): Property(...) presumably stands for System.getProperty(...) — confirm.
String os = Property("os.name");
if (os.toLowerCase().startsWith("win")) {
// NOTE(review): the path ends at the chromedriver_win32 directory; the executable name
// may have been lost from this literal.
System.setProperty("webdriver.chrome.driver", Property("user.dir") + "\\chromedriver_win32\\");
} else {
System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
}
ChromeOptions options = new ChromeOptions();
// Hide the "Chrome is being controlled by automated software" banner
// NOTE(review): no statement follows the comment above in this listing.
// Allow redirects (original author's description of the flag below)
options.addArguments("--disable-web-security");
// Maximize the window
options.addArguments("--start-maximized");
options.addArguments("--no-sandbox");
// NOTE(review): wArrayList(...) is garbled; presumably a list factory call — confirm.
List<String> excludeSwitches = wArrayList("enable-automation");
options.setExperimentalOption("excludeSwitches", excludeSwitches);
driver = new ChromeDriver(options);
// Implicit wait time for element lookups
driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
// NOTE(review): garbled statement; presumably opens the start page — confirm the URL.
<("all");
Thread.sleep(1000);
// Block until a login is detected, checking every two seconds.
while(true) {
if(isLogin()){
break;
}
Thread.sleep(2000);
}
}
/**
 * Checks whether the user is logged in.
 *
 * NOTE(review): both statements at the top of the body are garbled; the visible fragments
 * test for the texts "Hi," or "Hi!" — confirm what is parsed and which text is inspected.
 *
 * @return true when one of the two greeting texts is found, false otherwise
 */
private static boolean isLogin(){
Document doc = Jsoup.PageSource());
().contains("Hi,") || ().contains("Hi!")) {
return true;
}
return false;
}
}
上面是启动浏览器并打开登录页面的代码。这个时候淘宝有三个登录入口:一个是天猫的,一个是淘宝的,还有一个是淘宝登录页里面的微博登录窗口。淘宝和天猫的都有滑块,而微博登录的窗口是图片验证码,如果有道友对图片验证码有信心可以去试试。
这时候用人工扫码登录,但是爬取的时候出现了频繁访问的弹窗,尝试解锁发现过不去。这个时候发现换个账号就不会出现解锁弹窗,就想着如果出现弹窗就换个账号重新登录,这就需要解决自动登录。事实证明自己太年轻,想得太简单了:登录页面也有滑块。
但是发现用微博登录淘宝的页面是验证码,可以尝试一下。
先贴拟人滑块登录的代码吧:
public class Test {
public static ChromeDriver driver;
public static List<Integer> back_tracks = new ArrayList<>();
static {
    try {
        // Launch the browser as soon as the class is loaded.
        getDriver();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up the stack can still see that the
        // thread was interrupted while the browser was starting.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}
/**
 * Program entry point. The body holds only a commented-out {@code start()} call, so nothing
 * is executed here directly.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the signature; nothing in the visible body throws it
 * @throws InterruptedException declared by the signature; nothing in the visible body throws it
 */
public static void main(String[] args) throws IOException, InterruptedException {
// start();
}
/**
 * Obtains the ChromeDriver, fills in the login form, attempts a human-like drag of the slider
 * element {@code nc_1_n1z}, and then polls {@code isLogin()} every two seconds until it
 * returns true.
 *
 * NOTE(review): this listing has lost text in several places (flagged below) and does not
 * compile as shown.
 *
 * @throws InterruptedException if a sleep is interrupted
 */
private static void getDriver() throws InterruptedException{
// NOTE(review): Property(...) presumably stands for System.getProperty(...) — confirm.
String os = Property("os.name");
if (os.toLowerCase().startsWith("win")) {
// NOTE(review): the second argument (the Windows driver path) is missing from this call.
System.setProperty("webdriver.chrome.driver",
} else {
System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
}
ChromeOptions options = new ChromeOptions();
// Hide the "Chrome is being controlled by automated software" banner
options.addArguments("--disable-infobars");
// Allow redirects (original author's description of the flag below)
options.addArguments("--disable-web-security");
// Maximize the window
options.addArguments("--start-maximized");
// NOTE(review): wArrayList(...) is garbled; presumably a list factory call — confirm.
List<String> excludeSwitches = wArrayList("enable-automation");
options.setExperimentalOption("excludeSwitches", excludeSwitches);
driver = new ChromeDriver(options);
driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
// NOTE(review): garbled statement; presumably navigates to the Taobao login page — confirm
// the full URL.
<("login.taobao/member/login.jhtml");
// Click the element with id J_Quick2Static before looking up the form fields.
WebElement itemDiv = driver.findElement(By.id("J_Quick2Static"));
itemDiv.click();
WebElement tpl_username_1 = driver.findElement(By.id("TPL_username_1"));
WebElement TPL_password_1 = driver.findElement(By.id("TPL_password_1"));
// Slider handle that has to be dragged.
WebElement nc_1_n1z = driver.findElement(By.id("nc_1_n1z"));
WebElement usernameFeld = driver.findElement(By.className("username-field")); usernameFeld.click();
// "12" is a placeholder for the real account name and password.
tpl_username_1.sendKeys("12");
TPL_password_1.click();
TPL_password_1.sendKeys("12");
Actions actions = new Actions(driver);
// NOTE(review): the action is built but perform() is not called in the visible code.
actions.clickAndHold(nc_1_n1z);
List<Double> tacks = getTacks();
Random ra =new Random();
// Forward drag: one step per track entry with a short random pause between steps.
// NOTE(review): a and yoffset_random are computed but never used here; the statement that
// moves the pointer appears to be missing.
for (Double tack : tacks) {
int a = tack.intValue();
int yoffset_random = ra.nextInt(6) - 2;
Thread.sleep(20+ ra.nextInt(20));
}
Thread.sleep(200+ ra.nextInt(400));
// Backward correction steps recorded by getTacks().
// NOTE(review): the loop body only computes an unused offset; the move statement appears
// to be missing.
for (Integer back_track : back_tracks) {
int yoffset_random = ra.nextInt(4) - 2;
}
// NOTE(review): x and y are assigned twice with no use in between or afterwards; the move
// and release statements that consumed them appear to be missing.
int x = ra.nextInt(3) - 4;
int y = ra.nextInt(4) - 2;
x = ra.nextInt(2) +1;
y = ra.nextInt(4) - 2;
Thread.sleep(200+ ra.nextInt(400));
// Block until a login is detected, checking every two seconds.
while(true) {
if(isLogin()){
// NOTE(review): the non-code text fused in front of "break;" on the next line is page
// residue, not part of the program.
java做什么的break;
}
Thread.sleep(2000);
}
}
/**
 * Checks whether the user is logged in.
 *
 * NOTE(review): both statements at the top of the body are garbled; the visible fragment
 * tests whether some text contains "淘宝账户登录" — confirm what is parsed and which text is
 * inspected.
 *
 * @return false while that text is found, true otherwise
 */
private static boolean isLogin(){
Document doc = Jsoup.PageSource());
().contains("淘宝账户登录") ) {
return false;
}
return true;
}
/**
 * Builds a drag track whose speed varies over time so that the slide looks more human.
 *
 * <p>A slide of length 258 is simulated in steps of t = 0.2 through four phases, split at three
 * randomized distances: gentle acceleration, strong acceleration, hard braking (-70) and a
 * milder final deceleration. The rounded displacement of every step is appended to the result
 * until the accumulated distance reaches the slide length.
 *
 * <p>Side effects: {@code back_tracks} is emptied and, when the track overshoots the slide
 * length by more than 2, refilled with negative correction offsets. The generated track is
 * also printed to standard output.
 *
 * @return the per-step x displacements, in the order they were generated
 */
public static List<Double> getTacks(){
    List<Double> list = new ArrayList<>();
    Random ra = new Random();
    // Discard corrections from any earlier call; otherwise they would accumulate in the
    // shared static list and be replayed together with the new ones.
    back_tracks.clear();
    int length = 258;
    // Randomized phase boundaries: roughly 10-20%, 65-75% and 84-88% of the slide.
    double mid1 = Math.round(length * (Math.random() * 0.1 + 0.1));
    double mid2 = Math.round(length * (Math.random() * 0.1 + 0.65));
    double mid3 = Math.round(length * (Math.random() * 0.04 + 0.84));
    double current = 0;
    double a = 0;
    double v = 0;
    double t = 0.2;
    while (current < length) {
        if (current < mid1) {
            a = ra.nextInt(5) + 10;
        } else if (current < mid2) {
            a = ra.nextInt(10) + 30;
        } else if (current < mid3) {
            a = -70;
        } else {
            a = ra.nextInt(7) - 25;
        }
        double v0 = v;
        // The speed never goes negative: the forward track only moves forward.
        v = v0 + a * t;
        v = v > 0 ? v : 0;
        // Uniform-acceleration displacement for this step; a non-positive result is
        // replaced by a single unit so the loop always makes progress.
        double move = v0 * t + ((a * (t * t)) / 2);
        move = Math.round(move > 0 ? move : 1);
        current += move;
        list.add(move);
    }
    // Zero or negative: how far the track went past the end of the slide.
    double out_range = length - current;
    if (out_range < -8) {
        int sub = (int) (out_range + 8);
        back_tracks.add(-1);
        back_tracks.add(sub);
        back_tracks.add(-3);
        back_tracks.add(-1);
        back_tracks.add(-1);
        back_tracks.add(-1);
        back_tracks.add(-1);
    } else if (out_range < -2) {
        int sub = (int) (out_range + 3);
        back_tracks.add(-1);
        back_tracks.add(-1);
        back_tracks.add(sub);
    }
    System.out.println(list);
    return list;
}
}
下面贴验证码识别的代码。我用的是百度的图片识别,基本上没有能正确识别的;其他代码差不多,所以只贴截图和识别部分的代码。
/**
* 获取 ChromeDriver
* @throws InterruptedException
*/
private static void getDriver() throws InterruptedException{
// Variant of getDriver() that goes through the Weibo login window, captures the captcha
// image, sends it to Baidu OCR (AipOcr), and then polls isLogin() every two seconds.
// NOTE(review): this listing has lost text in several places (flagged below) and does not
// compile as shown.
// NOTE(review): Property(...) presumably stands for System.getProperty(...) — confirm.
String os = Property("os.name");
if (os.toLowerCase().startsWith("win")) {
// NOTE(review): the path ends at the chromedriver_win32 directory; the executable name
// may have been lost from this literal.
System.setProperty("webdriver.chrome.driver", Property("user.dir") + "\\chromedriver_win32\\");
} else {
System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
}
ChromeOptions options = new ChromeOptions();
// Hide the "Chrome is being controlled by automated software" banner
options.addArguments("--disable-infobars");
// Allow redirects (original author's description of the flag below)
options.addArguments("--disable-web-security");
// Maximize the window
options.addArguments("--start-maximized");
options.addArguments("--no-sandbox");
// NOTE(review): wArrayList(...) is garbled; presumably a list factory call — confirm.
List<String> excludeSwitches = wArrayList("enable-automation");
options.setExperimentalOption("excludeSwitches", excludeSwitches);
driver = new ChromeDriver(options);
// Implicit wait time for element lookups
driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
// ("all");
// NOTE(review): garbled statement; presumably navigates to the Taobao login page — confirm
// the full URL.
<("login.taobao/member/login.jhtml");
WebElement itemDiv = driver.findElement(By.id("J_Quick2Static"));
itemDiv.click();
// Open the Weibo login entry.
WebElement element = driver.findElement(By.className("weibo-login"));
element.click();
Thread.sleep(1000);
// Weibo login
// NOTE(review): the next two statements are garbled; the visible fragment branches on
// whether some text contains "快速登录".
Document doc = Jsoup.PageSource());
().contains("快速登录")) {
WebElement spanBg = driver.findElementByCssSelector(".btn_tip > .W_btn_g > span");
spanBg.click();
}else{
WebElement username = driver.findElement(By.name("username"));
WebElement password = driver.findElement(By.name("password"));
Actions action = new Actions(driver);
action.click(username).perform();
username.clear();
// Empty strings are placeholders for the real account name and password.
username.sendKeys("");
action.click(password).perform();
password.clear();
password.sendKeys("");
Thread.sleep(2000);
}
// Capture a screenshot of the whole page
// NOTE(review): the receiver of ScreenshotAs(...) is missing.
File screenShot = ScreenshotAs(OutputType.FILE);
try {
String name = "C:\\Users\\Administrator\\Desktop\\image\\"+System.currentTimeMillis()+".jpg";
// NOTE(review): nothing visible copies screenShot to the path in name before that path is
// read below — presumably that statement was lost; confirm.
// Locate the captcha and crop it out of the screenshot
WebElement code = driver.findElementByCssSelector(".code > img");
// NOTE(review): the receivers of Location(), Size() and DefaultToolkit() are missing.
Point location = Location();
Dimension size = Size();
Image img = DefaultToolkit().getImage(name);
BufferedImage read = toBufferedImage(img);
// NOTE(review): the crop expression on the next two lines is garbled.
BufferedImage subimage = X(), Y(), Width(),
tHeight()); String newName = "C:\\Users\\Administrator\\Desktop\\image\\"+System.currentTimeMillis()+".jpg";
System.out.println(newName);
ImageIO.write(subimage, "jpg", new File(newName));
// Recognize the captcha with Baidu image recognition
// "11", "22", "33" are placeholders for the three AipOcr constructor arguments.
AipOcr client = new AipOcr("11", "22", "33");
client.setConnectionTimeoutInMillis(2000);
client.setSocketTimeoutInMillis(60000);
HashMap<String, String> param = new HashMap<>(16);
param.put("detect_direction", "true");
param.put("probability", "true");
JSONObject res = client.basicAccurateGeneral(newName, param);
// NOTE(review): garbled statement; presumably prints the OCR response.
System.out.String(2));
} catch (IOException e) {
e.printStackTrace();
}
Thread.sleep(1000);
// Block until a login is detected, checking every two seconds.
while(true) {
if(isLogin()){
break;
}
Thread.sleep(2000);
}
}
public static BufferedImage toBufferedImage(Image image) {
if (image instanceof BufferedImage) {
return (BufferedImage)image;
}
image = new ImageIcon(image).getImage();
BufferedImage bimage = null;
GraphicsEnvironment ge = LocalGraphicsEnvironment();
try {
int transparency = Transparency.OPAQUE;
GraphicsDevice gs = ge.getDefaultScreenDevice();
GraphicsConfiguration gc = gs.getDefaultConfiguration();
bimage = gc.createCompatibleImage(
} catch (HeadlessException e) {
// The system does not have a screen
}
if (bimage == null) {
// Create a buffered image using the default color model
int type = BufferedImage.TYPE_INT_RGB;
bimage = new Width(null), Height(null), type);
}
Graphics g = ateGraphics();
g.drawImage(image, 0, 0, null);
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论