Selenium 是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE(7, 8, 9, 10, 11),Mozilla Firefox,Safari,Google Chrome,Opera等。这个工具的主要功能包括:测试与浏览器的兼容性——测试你的应用程序是否能够很好地工作在不同浏览器和操作系统之上;测试系统功能——创建回归测试,检验软件功能和用户需求;支持自动录制动作,并自动生成 .Net、Java、Perl等不同语言的测试脚本。
我们用Selenium做爬虫时使用的是谷歌浏览器(Chrome),因此需要下载与之对应的浏览器驱动(ChromeDriver);
驱动下载地址:http://chromedriver.storage.googleapis.com/index.html

注意:浏览器与selenium驱动的版本一定要相互匹配(如下例,二者同为 77.0.3865.x);
这里浏览器的版本是:版本 77.0.3865.120(正式版本)
Selenium浏览器驱动的版本是:/77.0.3865.40/
如果Selenium驱动的版本与浏览器版本不匹配(例如驱动版本过高),运行代码的时候会报错;
版本不对类似错误信息如下:
selenium os.name: 'Windows 7', os.arch: 'amd64', os.version: '6.1', java.version: '1.8.0_144'
相关pom依赖
<dependencies>
<!-- Selenium Java bindings; the demo classes in this article import org.openqa.selenium.* -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.0</version>
</dependency>
<!-- MySQL JDBC driver; not referenced by the code shown in this article -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
</dependencies>
Article.java
package com.javaxl.selenium.entity;
/**
* @author 小李飞刀
* @site www.javaxl.com
* @company
* @create 2019-10-17 11:05
*/
public class Article {
private Integer id;
private String name;
private String content;
private String include_date;
private String share_user;
private String share_url;
private String password;
private String share_date;
private boolean is_index;
private Integer state;
@Override
public String toString() {
return "Article{" +
"id=" + id +
", name='" + name + '\'' +
", content='" + content + '\'' +
", include_date='" + include_date + '\'' +
", share_user='" + share_user + '\'' +
", share_url='" + share_url + '\'' +
", password='" + password + '\'' +
", share_date='" + share_date + '\'' +
", is_index=" + is_index +
", state=" + state +
'}';
}
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getInclude_date() {
return include_date;
}
public void setInclude_date(String include_date) {
this.include_date = include_date;
}
public String getShare_user() {
return share_user;
}
public void setShare_user(String share_user) {
this.share_user = share_user;
}
public String getShare_url() {
return share_url;
}
public void setShare_url(String share_url) {
this.share_url = share_url;
}
public String getPassword() {
return password;
}
public void setPassword(String password) {
this.password = password;
}
public String getShare_date() {
return share_date;
}
public void setShare_date(String share_date) {
this.share_date = share_date;
}
public boolean isIs_index() {
return is_index;
}
public void setIs_index(boolean is_index) {
this.is_index = is_index;
}
public Integer getState() {
return state;
}
public void setState(Integer state) {
this.state = state;
}
}
StringUtil.java
package com.javaxl.selenium.utils;
/**
 * String helper methods.
 */
public class StringUtil {

    /**
     * Tells whether a string carries no visible content.
     *
     * @param str the string to inspect, may be {@code null}
     * @return {@code true} if {@code str} is {@code null}, empty, or consists
     *         only of characters removed by {@link String#trim()}
     */
    public static boolean isEmpty(String str) {
        return str == null || str.trim().isEmpty();
    }

    /**
     * Negation of {@link #isEmpty(String)}.
     *
     * @param str the string to inspect, may be {@code null}
     * @return {@code true} if {@code str} is non-null and still has content
     *         after trimming
     */
    public static boolean isNotEmpty(String str) {
        return !isEmpty(str);
    }

    /**
     * Strips HTML markup from a fragment.
     *
     * <p>Opening {@code <p>} tags (with or without attributes) and
     * {@code <br>} / {@code <br/>} tags become a {@code "\r\n"} line break,
     * every other tag is dropped, and finally all space characters are
     * removed. Tag matching is case-insensitive and tolerates tags that span
     * several lines.
     *
     * @param content the HTML fragment, may be {@code null}
     * @return the stripped text, or {@code null} if {@code content} is
     *         {@code null}
     */
    public static String stripHtml(String content) {
        if (content == null) {
            return null;
        }
        // Opening paragraph tags -> line break. The optional group makes a bare
        // "<p>" match as well; requiring whitespace right after "p" keeps tags
        // such as "<pre>" out of this rule.
        String text = content.replaceAll("(?i)<p(\\s[^>]*)?>", "\r\n");
        // <br>, <br/>, <br /> -> line break
        text = text.replaceAll("(?i)<br\\s*/?>", "\r\n");
        // Drop everything else between < and >; [^>] also crosses line breaks,
        // so tags whose attributes wrap onto the next line are removed too.
        text = text.replaceAll("<[^>]*>", "");
        // Remove spaces
        text = text.replace(" ", "");
        return text;
    }
}
DateUtil.java
package com.javaxl.selenium.utils;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Date helper methods built on {@link SimpleDateFormat}.
 */
public class DateUtil {

    /**
     * Formats a date with the given pattern.
     *
     * @param date   the date to format, may be {@code null}
     * @param format a {@link SimpleDateFormat} pattern
     * @return the formatted date, or an empty string when {@code date} is
     *         {@code null}
     */
    public static String formatDate(Date date, String format) {
        // The formatter is created before the null check, so a bad pattern is
        // rejected even when there is no date to format.
        SimpleDateFormat formatter = new SimpleDateFormat(format);
        return date == null ? "" : formatter.format(date);
    }

    /**
     * Parses a date string with the given pattern.
     *
     * @param str    the text to parse
     * @param format a {@link SimpleDateFormat} pattern
     * @return the parsed date, or {@code null} when {@code str} is empty
     *         according to {@code StringUtil.isEmpty}
     * @throws Exception if the text cannot be parsed with the pattern
     */
    public static Date formatString(String str, String format) throws Exception {
        return StringUtil.isEmpty(str) ? null : new SimpleDateFormat(format).parse(str);
    }

    /**
     * Returns the current date and time as text.
     *
     * @return the current moment formatted as {@code yyyy-MM-dd HH:mm:ss}
     */
    public static String getCurrentDateStr() {
        return formatDate(new Date(), "yyyy-MM-dd HH:mm:ss");
    }
}
案例一:打开百度首页
package com.javaxl.selenium.test;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.remote.RemoteWebDriver;
/**
 * Selenium getting-started example 1: opens a page in Chrome and prints its title.
 *
 * @author 小李飞刀
 * @site www.javaxl.com
 * @company
 * @create 2019-10-17 11:04
 */
public class Demo1 {

    /**
     * Starts Chrome through the chromedriver executable, loads the page,
     * prints the page title to standard output and then shuts the browser down.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", "D:\\initPath\\chromedriver.exe");
        WebDriver driver = new ChromeDriver();
        try {
            String url = "http://www.baidu.com";
            // Open the given site; driver.navigate().to(url) is an alternative.
            driver.get(url);
            System.out.println(driver.getTitle());
        } finally {
            // Always end the session, otherwise the browser window and the
            // chromedriver process are left running after main returns.
            driver.quit();
        }
    }
}
运行结果如下

案例二:获取百度云分享链接内部信息
package com.javaxl.selenium.test;
import com.javaxl.selenium.entity.Article;
import com.javaxl.selenium.utils.DateUtil;
import com.javaxl.selenium.utils.StringUtil;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import java.util.HashMap;
import java.util.Map;
/**
* @author 小李飞刀
* @site www.javaxl.com
* @company
* @create 2019-10-17 12:06
*
* 百度云链接内部数据获取
* 链接地址:https://pan.baidu.com/s/1rVgdoLTqsfz9mXZlUqYjSg
* 提取密码:lt5k
*/
public class Demo2 {
public static void main(String[] args) {
String shareUrl="https://pan.baidu.com/s/1rVgdoLTqsfz9mXZlUqYjSg";
String password="lt5k";
System.setProperty("webdriver.chrome.driver", "D:\\initPath\\chromedriver.exe");
// 排除掉图片的加载,提升性能
Map<String,Object> preferences=new HashMap<String,Object>();
ChromeOptions options=new ChromeOptions();
preferences.put("profile.managed_default_content_settings.images",2);
options.setExperimentalOption("prefs",preferences);
WebDriver driver=new ChromeDriver(options);
driver.get(shareUrl);
// 超时等待
WebDriverWait wait = new WebDriverWait(driver, 5);
wait.until(new ExpectedCondition<Boolean>() {
public Boolean apply(WebDriver d) {
boolean loadcomplete = d.findElement(By.tagName("body")).isDisplayed();
return loadcomplete;
}
});
boolean hasPassword=false; // 是否有密码
String title = driver.findElement(By.cssSelector(".pickpw.clearfix")).getText();
if(StringUtil.isNotEmpty(title) && title.contains("请输入提取码")){
hasPassword=true;
}
if(hasPassword){
WebElement pInput = driver.findElement(By.cssSelector(".QKKaIE.LxgeIt"));
WebElement btn = driver.findElement(By.cssSelector(".g-button-right"));
pInput.sendKeys(password);
btn.click();
}
Article article=new Article();
article.setShare_url(shareUrl);
article.setPassword(password);
try {
genPageData(driver,article);
} catch (Exception e) {
e.printStackTrace();
}
System.out.println(article);
// driver.close(); // 浏览器关闭
// driver.quit(); // 释放资源
}
/**
* 生成基本数据
* @param driver
* @param article
* @throws Exception
*/
public static void genPageData(WebDriver driver, Article article)throws Exception{
try{
Thread.sleep(1000);
}catch(Exception e){
e.printStackTrace();
}
WebElement fileNameEle = driver.findElement(By.cssSelector(".filename"));
article.setName(fileNameEle.getText());
WebElement shareDateEle = driver.findElement(By.cssSelector(".share-file-info span"));
article.setShare_date(shareDateEle.getText());
WebElement shareUserEle = driver.findElement(By.cssSelector(".share-person-data-top a.share-person-username.global-ellipsis"));
article.setShare_user(shareUserEle.getAttribute("textContent"));
article.setContent(fileNameEle.getText()); // 预先设置
article.setState(1);
article.setInclude_date(DateUtil.getCurrentDateStr());
}
}

代码运行结果是:
1、selenium模拟了谷歌浏览器手动输入百度云分享链接
2、打开了需要输入提取码的页面,selenium再次模拟了人工输入百度云链接提取码
3、Selenium再次模拟了人工点击“提取文件”按钮
4、Selenium最后能够如同jsoup一样,分析提取个人所需有用信息;
over......
备案号:湘ICP备19000029号
Copyright © 2018-2019 javaxl晓码阁 版权所有