本篇博客的目标是利用selenium自动化技术,将百度云链接内部包含的信息内容生成列表;
有点类似于这篇博客的最终结果
http://www.javaxl.com/blog/articles/220
实现思路
1、从数据库中读取到要解析的某一个百度云分享链接以及提取码,然后通过selenium技术,模拟打开浏览器,填写百度云分享链接以及提取码,然后模拟点击提取资源按钮;接下来就开始解析百度云分享链接内部的信息了
2、首先我们需要通过样式判断用户是分享的目录还是文件,如果是文件,那么很好处理,直接就可以提取到文件的文件名,都无需递归
3、如果是目录,那么需要处理的目录个数加1;如果是第一层级,catalogNumber值肯定是1;如果是第二三层级,catalogNumber会根据实际遍历层次目录数依次+1,而文件会添加到fileEleList,文件名会追加到treeInfo;
4、追加到fileEleList容器中的文件,实际上已经被selenium处理过了,我们需要将其从elements中移除,这样elements中剩下的元素就都是还没有被selenium处理的(即待遍历的目录)
5、如果没有目录要处理了,那么利用selenium模拟返回上一层。如果当前层级还有目录要处理,那么将当前层级处理的状态信息添加到allTreeInfo中;
6、从allTreeInfo获取要处理的treeLevel,先将其信息添加到treeInfo,然后对这个目录进行递归处理
7、当第三层级的最后一个目录处理完,selenium会模拟返回上一级,到第二层级的最后一个目录,selenium继续模拟返回上一级,到第一层级,最后整个递归结束,程序也就结束了;
核心代码中的重要变量含义:
treeInfo:存放最后打印信息的可变字符串容器;
allElement:百度云分享链接内部行信息对象(文件名、大小、修改日期...)
作用:可以用来区分,这一行是目录还是文件
elements:百度云分享链接内部第一列信息对象(只有文件名)
catalogNumber:用来记录当前层级中目录的个数
fileEleList:用来存放已经被selenium解析过的文件的容器
allTreeInfo:用来存放每一层级元素遍历的状态
所需实体类(用来标记被selenium正在处理的节点对象)
package com.javaxl.selenium.entity;
import java.util.List;
/**
 * Traversal state of a single level of the directory tree: the names of the
 * directories found on that level and the index of the one being processed.
 *
 * <p>Example share layout:
 * <pre>
 * level1
 *   level2.1
 *     level2.1_3.1
 *     level2.1_3.2
 *   level2.2
 *     level2.2_3.1
 *     level2.2_3.2
 * </pre>
 *
 * <p>While the first level is being processed:
 * <pre>
 * currentIndex: 0
 * levelCatalog: level1
 * </pre>
 *
 * <p>While the second level is being processed:
 * <pre>
 * node level2.1 -> currentIndex: 0, levelCatalog: level2.1, level2.2
 * node level2.2 -> currentIndex: 1, levelCatalog: level2.1, level2.2
 * </pre>
 *
 * <p>While the third level is being processed:
 * <pre>
 * node level2.1_3.1 -> currentIndex: 0, levelCatalog: level2.1_3.1, level2.1_3.2
 * node level2.1_3.2 -> currentIndex: 1, levelCatalog: level2.1_3.1, level2.1_3.2
 * </pre>
 *
 * <p>... and so on.
 *
 * @author 小李飞刀
 * @site www.javaxl.com
 * @company
 * @create 2019-10-17 15:37
 */
public class TreeLevel {

    /** Index of the directory being processed on this level (levels one to three). */
    private Integer currentIndex;

    /** Node names of all directories on this level (levels one to three). */
    private List<String> levelCatalog;

    /** Creates a level whose index and directory names are both unset ({@code null}). */
    public TreeLevel() {
        this(null, null);
    }

    /**
     * Creates a level with the given state.
     *
     * @param currentIndex index of the directory being processed
     * @param levelCatalog node names of the directories on this level
     */
    public TreeLevel(Integer currentIndex, List<String> levelCatalog) {
        this.levelCatalog = levelCatalog;
        this.currentIndex = currentIndex;
    }

    /** @return index of the directory being processed, or {@code null} if unset */
    public Integer getCurrentIndex() {
        return this.currentIndex;
    }

    /** @return node names of the directories on this level, or {@code null} if unset */
    public List<String> getLevelCatalog() {
        return this.levelCatalog;
    }

    /** @param currentIndex new index of the directory being processed */
    public void setCurrentIndex(Integer currentIndex) {
        this.currentIndex = currentIndex;
    }

    /** @param levelCatalog new list of directory node names for this level */
    public void setLevelCatalog(List<String> levelCatalog) {
        this.levelCatalog = levelCatalog;
    }
}
核心代码:
package com.javaxl.selenium.test;
import com.javaxl.selenium.entity.Article;
import com.javaxl.selenium.entity.TreeLevel;
import com.javaxl.selenium.utils.StringUtil;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
* @author 小李飞刀
* @site www.javaxl.com
* @company
* @create 2019-10-17 15:46
*
* 给个百度云链接,最多列出其中三层节点信息
*/
public class SeleniumTest3Level {
private static Integer currentLevel=1; // 当前处理的层次
private static StringBuffer treeInfo=null; // 树形结构内容信息
private static Map<String, TreeLevel> allTreeInfo=new HashMap<String,TreeLevel>(); // 记录所有层次的所有结构信息
private static boolean forward=true; // 执行方向
public static void main(String[] args) {
SeleniumTest3Level test3Level=new SeleniumTest3Level();
String shareUrl="https://pan.baidu.com/s/1vcYHHlG4izblfsqMZ_WsMw";
String password="rtg5";
System.setProperty("webdriver.chrome.driver", "D:\\initPath\\chromedriver.exe");
Map<String,Object> preferences=new HashMap<String,Object>();
ChromeOptions options=new ChromeOptions();
preferences.put("profile.managed_default_content_settings.images",2);
options.setExperimentalOption("prefs",preferences);
WebDriver driver=new ChromeDriver(options);
driver.get(shareUrl);
WebDriverWait wait = new WebDriverWait(driver, 5);
wait.until(new ExpectedCondition<Boolean>() {
public Boolean apply(WebDriver d) {
boolean loadcomplete = d.findElement(By.tagName("body")).isDisplayed();
return loadcomplete;
}
});
boolean hasPassword=false; // 是否有密码
String title = driver.findElement(By.cssSelector(".pickpw.clearfix")).getText();
if(StringUtil.isNotEmpty(title) && title.contains("请输入提取码")){
hasPassword=true;
}
if(hasPassword){
WebElement pInput = driver.findElement(By.cssSelector(".QKKaIE.LxgeIt"));
WebElement btn = driver.findElement(By.cssSelector(".g-button-right"));
pInput.sendKeys(password);
btn.click();
}
Article article=new Article();
article.setShare_url(shareUrl);
article.setPassword(password);
try {
// 这里才真正开始去提取百度云链接内部的信息
test3Level.generate3Level(driver,article);
} catch (Exception e) {
e.printStackTrace();
}
// System.out.println(article);
driver.close(); // 浏览器关闭
driver.quit(); // 释放资源
}
/**
* 生成三层结构的树结构资源内容信息
* @param driver
* @param article
* @throws Exception
*/
public static void generate3Level(WebDriver driver, Article article)throws Exception{
treeInfo=new StringBuffer();
try {
Thread.sleep(2000);
// 只有能够加载出文件夹的才会出现这个样式,这行代码才不会出现异常
WebElement element = driver.findElement(By.cssSelector(".EgMMec"));
// System.out.println("是目录");
dealCatalog(driver);
}catch(Exception e){
// System.out.println("是文件");
dealFile(driver);
}
}
/**
* 处理文件
*/
public static void dealFile(WebDriver driver){
WebDriverWait wait = new WebDriverWait(driver, 5);
wait.until(new ExpectedCondition<Boolean>() {
public Boolean apply(WebDriver d) {
boolean loadcomplete = d.findElement(By.cssSelector(".file-name")).isDisplayed();
return loadcomplete;
}
});
WebElement fileNameEle = driver.findElement(By.cssSelector(".file-name"));
treeInfo.append(fileNameEle.getText());
}
/**
* 打印层次
*/
private static void printLine(int n) {
for(int i=2;i<=n;i++) {
if(i<=n-1){
System.out.print(" ");
treeInfo.append(" ");
}else{
System.out.print("|____");
treeInfo.append("|____");
}
}
}
private static boolean backParent(WebDriver driver){
if(currentLevel==1) {
return true;
}else{
--currentLevel;
forward=false;
driver.navigate().back();
dealCatalog(driver);
}
return false;
}
/**
* 处理目录
*/
public static void dealCatalog(WebDriver driver) {
WebDriverWait wait = new WebDriverWait(driver, 5);
try {
wait.until(new ExpectedCondition<Boolean>() {
public Boolean apply(WebDriver d) {
boolean loadcomplete = d.findElement(By.cssSelector(".g-clearfix.AuPKyz")).isDisplayed();
return loadcomplete;
}
});
} catch (Exception e) {
allTreeInfo.remove(String.valueOf(currentLevel));
// 超时处理,返回上一层
if (backParent(driver)) return;
return;
}
// 所有的文件夹及文件元素,那一行对应的所有信息
List<WebElement> allElement = driver.findElements(By.cssSelector(".g-clearfix.AuPKyz"));
// 某一行对应的文件或文件夹名称
List<WebElement> elements = driver.findElements(By.cssSelector(".filename"));
// 有几个目录需要被遍历
int catalogNumber = 0; // 目录数据
// fileEleList存放所有等待遍历的文件夹目录
List<WebElement> fileEleList = new LinkedList<WebElement>();
for (int i = 0; i < allElement.size(); i++) {
WebElement webElement = allElement.get(i);
WebElement element = null;
try {
// 能获取到,说明是目录,不能获取到,说明是文件
element = webElement.findElement(By.cssSelector(".JS-fileicon.dir-small"));
} catch (Exception e) {
} finally {
// 是文件或到了目录遍历的第三层
if (element == null || currentLevel == 3) {
// 左侧单个文件对象
WebElement webElement1 = elements.get(i);
if (forward || currentLevel == 3) {
printLine(currentLevel);
System.out.println(webElement1.getText());
treeInfo.append(webElement1.getText() + "<br/>");
}
fileEleList.add(webElement1);
} else {
catalogNumber++;
}
continue;
}
}
for (WebElement ele : fileEleList) {
elements.remove(ele);
}
if (catalogNumber == 0 && currentLevel == 1) {
return;
}
if (catalogNumber == 0) {
allTreeInfo.remove(String.valueOf(currentLevel));
--currentLevel;
forward = false;
driver.navigate().back();
dealCatalog(driver);
} else {
if (allTreeInfo.get(String.valueOf(currentLevel)) == null) {
List<String> allInfo = new LinkedList<String>();
for (WebElement e : elements) {
String text = e.getText();
allInfo.add(text);
}
// 某一层级,存放所有节点信息,包括节点的索引(从0开始),节点的名称
allTreeInfo.put(String.valueOf(currentLevel), new TreeLevel(0, allInfo));
}
}
TreeLevel treeLevel = allTreeInfo.get(String.valueOf(currentLevel));
if (treeLevel == null) {
return;
}
Integer currentIndex = treeLevel.getCurrentIndex();
if(currentIndex<elements.size()){
WebElement webElement = elements.get(currentIndex);
printLine(currentLevel);
treeInfo.append(webElement.getText()+"<br/>");
System.out.println(webElement.getText());
String winHandleBefore = driver.getWindowHandle();
webElement.click();
// for(String winhandle:driver.getWindowHandles()){
// if(winhandle.equals(winHandleBefore)){
// continue;
// }
// driver.switchTo().window(winhandle);
// break;
// }
treeLevel.setCurrentIndex(currentIndex+1);
allTreeInfo.put(String.valueOf(currentLevel),treeLevel);
++currentLevel;
forward=true;
dealCatalog(driver);
}else{
allTreeInfo.remove(String.valueOf(currentLevel));
if(currentLevel==1){
return;
}else{
--currentLevel;
forward=false;
driver.navigate().back();
dealCatalog(driver);
}
}
}
}
稍微改改用到页面上去
package com.javaxl.selenium.test;
import com.javaxl.selenium.entity.Article;
import com.javaxl.selenium.entity.TreeLevel;
import com.javaxl.selenium.utils.StringUtil;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Servlet mapped to {@code /yunParse}: opens the submitted Baidu Cloud share link in
 * Chrome, collects its tree text and forwards the result to {@code index.jsp}.
 *
 * @author 小李飞刀
 * @site www.javaxl.com
 * @company
 * @create 2019-10-22 15:42
 */
@WebServlet("/yunParse")
public class SeleniumServlet extends HttpServlet {

    /** Seconds to wait for the share page body to be displayed. */
    private static final long WAIT_SECONDS = 5;

    /** Handles GET exactly like POST. */
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        doPost(req, resp);
    }

    /**
     * Reads the {@code shareUrl} and {@code password} request parameters, parses the share
     * and forwards to {@code index.jsp} with the attributes {@code msg} (tree text, or
     * {@code null} when parsing failed), {@code shareUrl} and {@code password}.
     * When either parameter is empty the request is forwarded straight to {@code index.jsp}.
     *
     * @param req  current request
     * @param resp current response
     * @throws ServletException if forwarding fails
     * @throws IOException      if forwarding fails
     */
    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        String shareUrl = req.getParameter("shareUrl");
        String password = req.getParameter("password");
        if (StringUtil.isEmpty(shareUrl) || StringUtil.isEmpty(password)) {
            req.getRequestDispatcher("index.jsp").forward(req, resp);
            return;
        }

        // Keep the original location as the default but honour -Dwebdriver.chrome.driver=...
        if (System.getProperty("webdriver.chrome.driver") == null) {
            System.setProperty("webdriver.chrome.driver", "D:\\initPath\\chromedriver.exe");
        }
        Map<String, Object> preferences = new HashMap<String, Object>();
        preferences.put("profile.managed_default_content_settings.images", 2);
        ChromeOptions options = new ChromeOptions();
        options.setExperimentalOption("prefs", preferences);

        String treeInfo = null;
        WebDriver driver = new ChromeDriver(options);
        try {
            treeInfo = parseShare(driver, shareUrl, password);
        } catch (Exception e) {
            // Report through the container log; the page is still rendered, without a result.
            log("Failed to parse the Baidu Cloud share link", e);
        } finally {
            // quit() closes every window and ends the session, even when parsing failed,
            // so no Chrome process is left behind per failed request.
            driver.quit();
        }

        req.setAttribute("msg", treeInfo);
        req.setAttribute("shareUrl", shareUrl);
        req.setAttribute("password", password);
        req.getRequestDispatcher("index.jsp").forward(req, resp);
    }

    /**
     * Opens the share, submits the extraction code when the page asks for one, and
     * collects the tree text.
     *
     * <p>NOTE(review): {@code shareUrl} comes straight from the request and is opened by a
     * browser running on the server; consider restricting it to the expected Baidu hosts.
     */
    private String parseShare(WebDriver driver, String shareUrl, String password) throws Exception {
        driver.get(shareUrl);
        new WebDriverWait(driver, WAIT_SECONDS).until(new ExpectedCondition<Boolean>() {
            @Override
            public Boolean apply(WebDriver d) {
                return d.findElement(By.tagName("body")).isDisplayed();
            }
        });

        if (needsPassword(driver)) {
            WebElement pInput = driver.findElement(By.cssSelector(".QKKaIE.LxgeIt"));
            WebElement btn = driver.findElement(By.cssSelector(".g-button-right"));
            pInput.sendKeys(password);
            btn.click();
        }

        Article article = new Article();
        article.setShare_url(shareUrl);
        article.setPassword(password);
        // The extraction of the share's content starts here.
        return SeleniumTest3Level.generate3Level(driver, article);
    }

    /** Tells whether the page currently shown asks for an extraction code. */
    private boolean needsPassword(WebDriver driver) {
        // findElements yields nothing (instead of throwing) when the prompt is absent.
        for (WebElement prompt : driver.findElements(By.cssSelector(".pickpw.clearfix"))) {
            String title = prompt.getText();
            return StringUtil.isNotEmpty(title) && title.contains("请输入提取码");
        }
        return false;
    }
}
Index.jsp页面
index.jsp
最终运行结果

over......
备案号:湘ICP备19000029号
Copyright © 2018-2019 javaxl晓码阁 版权所有