这里我用两个案例来讲解jsoup爬虫
案例1:完整版的爬虫案例(CSDN博客的增量爬取)
涉及到的点有以下几个:
1、httpclient获取网页内容
2、Jsoup解析网页内容
3、要达到增量爬取的效果,那么需要利用缓存ehcache对重复URL判重
4、将爬取到的数据存入数据库
5、为解决某些网站防盗链的问题,那么需要将对方网站的静态资源(这里只处理了图片)本地化
案例2:为了辅助上篇博客Python的学习,我们需要爬取电影网站的数据
以下是案例1的相关代码及结果图:
Pom依赖
<!-- JDBC driver: 5.1.x matches the legacy com.mysql.jdbc.Driver class name
     configured in crawler.properties (do not bump to 8.x without changing it) -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.44</version>
</dependency>
<!-- Apache HttpClient: fetches the pages being crawled -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>
<!-- jsoup: parses the fetched HTML -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.1</version>
</dependency>
<!-- log4j logging -->
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.16</version>
</dependency>
<!-- ehcache: disk-backed cache used to deduplicate URLs for incremental crawling -->
<dependency>
    <groupId>net.sf.ehcache</groupId>
    <artifactId>ehcache</artifactId>
    <version>2.10.3</version>
</dependency>
<!-- commons-io: used to save downloaded images to disk -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.5</version>
</dependency>
<!-- fastjson: bumped from 1.2.47, which is affected by well-known
     autoType deserialization RCE vulnerabilities; 1.2.83 contains the fix -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.83</version>
</dependency>
crawler.properties
# Crawler configuration.
# NOTE: each key=value pair must be on its own line; the collapsed
# single-line form is invalid .properties syntax (everything after the
# first space would be parsed as part of dbUrl's value).

# JDBC connection settings
dbUrl=jdbc:mysql://localhost:3306/mybatis_ssm?autoReconnect=true
dbUserName=mybatis_ssm
# NOTE(review): plaintext credentials -- keep this file out of version control
dbPassword=xiaoli
dbName=com.mysql.jdbc.Driver
jdbcName=com.mysql.jdbc.Driver

# ehcache configuration file used for incremental-crawl URL deduplication
ehcacheXmlPath=C://blogCrawler/ehcache.xml
# local directory where downloaded blog images are stored
blogImages=C://blogCrawler/blogImages/
DbUtil.java
package com.javaxl.util; import java.sql.Connection; import java.sql.DriverManager; /** * 数据库工具类 * @author user * */ public class DbUtil { /** * 获取连接 * @return * @throws Exception */ public Connection getCon()throws Exception{ Class.forName(PropertiesUtil.getValue("jdbcName")); Connection con=DriverManager.getConnection(PropertiesUtil.getValue("dbUrl"), PropertiesUtil.getValue("dbUserName"), PropertiesUtil.getValue("dbPassword")); return con; } /** * 关闭连接 * @param con * @throws Exception */ public void closeCon(Connection con)throws Exception{ if(con!=null){ con.close(); } } public static void main(String[] args) { DbUtil dbUtil=new DbUtil(); try { dbUtil.getCon(); System.out.println("数据库连接成功"); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); System.out.println("数据库连接失败"); } } }
PropertiesUtil.java
package com.javaxl.util; import java.io.IOException; import java.io.InputStream; import java.util.Properties; /** * properties工具类 * @author user * */ public class PropertiesUtil { /** * 根据key获取value值 * @param key * @return */ public static String getValue(String key){ Properties prop=new Properties(); InputStream in=new PropertiesUtil().getClass().getResourceAsStream("/crawler.properties"); try { prop.load(in); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return prop.getProperty(key); } }
BlogCrawlerStarter.java(核心代码)
package com.javaxl.crawler; import java.io.File; import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; import org.apache.commons.io.FileUtils; import org.apache.http.HttpEntity; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.javaxl.util.DateUtil; import com.javaxl.util.DbUtil; import com.javaxl.util.PropertiesUtil; import net.sf.ehcache.Cache; import net.sf.ehcache.CacheManager; import net.sf.ehcache.Status; /** * @author Administrator * */ public class BlogCrawlerStarter { private static Logger logger = Logger.getLogger(BlogCrawlerStarter.class); private static String HOMEURL = "https://www.csdn.net/nav/newarticles"; private static CloseableHttpClient httpClient; private static Connection con; private static CacheManager cacheManager; private static Cache cache; /** * httpclient解析首页,获取首页内容 */ public static void parseHomePage() { logger.info("开始爬取首页:" + HOMEURL); cacheManager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath")); cache = cacheManager.getCache("cnblog"); httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(HOMEURL); RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build(); httpGet.setConfig(config); CloseableHttpResponse response = null; try { response = httpClient.execute(httpGet); if (response == null) { logger.info(HOMEURL + ":爬取无响应"); return; } 
if (response.getStatusLine().getStatusCode() == 200) { HttpEntity entity = response.getEntity(); String homePageContent = EntityUtils.toString(entity, "utf-8"); // System.out.println(homePageContent); parseHomePageContent(homePageContent); } } catch (ClientProtocolException e) { logger.error(HOMEURL + "-ClientProtocolException", e); } catch (IOException e) { logger.error(HOMEURL + "-IOException", e); } finally { try { if (response != null) { response.close(); } if (httpClient != null) { httpClient.close(); } } catch (IOException e) { logger.error(HOMEURL + "-IOException", e); } } if(cache.getStatus() == Status.STATUS_ALIVE) { cache.flush(); } cacheManager.shutdown(); logger.info("结束爬取首页:" + HOMEURL); } /** * 通过网络爬虫框架jsoup,解析网页类容,获取想要数据(博客的连接) * * @param homePageContent */ private static void parseHomePageContent(String homePageContent) { Document doc = Jsoup.parse(homePageContent); Elements aEles = doc.select("#feedlist_id .list_con .title h2 a"); for (Element aEle : aEles) { // 这个是首页中的博客列表中的单个链接URL String blogUrl = aEle.attr("href"); if (null == blogUrl || "".equals(blogUrl)) { logger.info("该博客未内容,不再爬取插入数据库!"); continue; } if(cache.get(blogUrl) != null) { logger.info("该数据已经被爬取到数据库中,数据库不再收录!"); continue; } // System.out.println("************************"+blogUrl+"****************************"); parseBlogUrl(blogUrl); } } /** * 通过博客地址获取博客的标题,以及博客的类容 * * @param blogUrl */ private static void parseBlogUrl(String blogUrl) { logger.info("开始爬取博客网页:" + blogUrl); httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(blogUrl); RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build(); httpGet.setConfig(config); CloseableHttpResponse response = null; try { response = httpClient.execute(httpGet); if (response == null) { logger.info(blogUrl + ":爬取无响应"); return; } if (response.getStatusLine().getStatusCode() == 200) { HttpEntity entity = response.getEntity(); String blogContent = EntityUtils.toString(entity, "utf-8"); 
parseBlogContent(blogContent, blogUrl); } } catch (ClientProtocolException e) { logger.error(blogUrl + "-ClientProtocolException", e); } catch (IOException e) { logger.error(blogUrl + "-IOException", e); } finally { try { if (response != null) { response.close(); } } catch (IOException e) { logger.error(blogUrl + "-IOException", e); } } logger.info("结束爬取博客网页:" + HOMEURL); } /** * 解析博客类容,获取博客中标题以及所有内容 * * @param blogContent */ private static void parseBlogContent(String blogContent, String link) { Document doc = Jsoup.parse(blogContent); Elements titleEles = doc .select("#mainBox main .blog-content-box .article-header-box .article-header .article-title-box h1"); if (titleEles.size() == 0) { logger.info("博客标题为空,不插入数据库!"); return; } String title = titleEles.get(0).html(); Elements blogContentEles = doc.select("#content_views"); if (blogContentEles.size() == 0) { logger.info("博客内容为空,不插入数据库!"); return; } String blogContentBody = blogContentEles.get(0).html(); // Elements imgEles = doc.select("img"); // List<String> imgUrlList = new LinkedList<String>(); // if(imgEles.size() > 0) { // for (Element imgEle : imgEles) { // imgUrlList.add(imgEle.attr("src")); // } // } // // if(imgUrlList.size() > 0) { // Map<String, String> replaceUrlMap = downloadImgList(imgUrlList); // blogContent = replaceContent(blogContent,replaceUrlMap); // } String sql = "insert into `t_jsoup_article` values(null,?,?,null,now(),0,0,null,?,0,null)"; try { PreparedStatement pst = con.prepareStatement(sql); pst.setObject(1, title); pst.setObject(2, blogContentBody); pst.setObject(3, link); if(pst.executeUpdate() == 0) { logger.info("爬取博客信息插入数据库失败"); }else { cache.put(new net.sf.ehcache.Element(link, link)); logger.info("爬取博客信息插入数据库成功"); } } catch (SQLException e) { logger.error("数据异常-SQLException:",e); } } /** * 将别人博客内容进行加工,将原有图片地址换成本地的图片地址 * @param blogContent * @param replaceUrlMap * @return */ private static String replaceContent(String blogContent, Map<String, String> replaceUrlMap) { 
for(Map.Entry<String, String> entry: replaceUrlMap.entrySet()) { blogContent = blogContent.replace(entry.getKey(), entry.getValue()); } return blogContent; } /** * 别人服务器图片本地化 * @param imgUrlList * @return */ private static Map<String, String> downloadImgList(List<String> imgUrlList) { Map<String, String> replaceMap = new HashMap<String, String>(); for (String imgUrl : imgUrlList) { CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(imgUrl); RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build(); httpGet.setConfig(config); CloseableHttpResponse response = null; try { response = httpClient.execute(httpGet); if (response == null) { logger.info(HOMEURL + ":爬取无响应"); }else { if (response.getStatusLine().getStatusCode() == 200) { HttpEntity entity = response.getEntity(); String blogImagesPath = PropertiesUtil.getValue("blogImages"); String dateDir = DateUtil.getCurrentDatePath(); String uuid = UUID.randomUUID().toString(); String subfix = entity.getContentType().getValue().split("/")[1]; String fileName = blogImagesPath + dateDir + "/" + uuid + "." 
+ subfix; FileUtils.copyInputStreamToFile(entity.getContent(), new File(fileName)); replaceMap.put(imgUrl, fileName); } } } catch (ClientProtocolException e) { logger.error(imgUrl + "-ClientProtocolException", e); } catch (IOException e) { logger.error(imgUrl + "-IOException", e); } catch (Exception e) { logger.error(imgUrl + "-Exception", e); } finally { try { if (response != null) { response.close(); } } catch (IOException e) { logger.error(imgUrl + "-IOException", e); } } } return replaceMap; } public static void start() { while(true) { DbUtil dbUtil = new DbUtil(); try { con = dbUtil.getCon(); parseHomePage(); } catch (Exception e) { logger.error("数据库连接势失败!"); } finally { try { if (con != null) { con.close(); } } catch (SQLException e) { logger.error("数据关闭异常-SQLException:",e); } } try { Thread.sleep(1000*60); } catch (InterruptedException e) { logger.error("主线程休眠异常-InterruptedException:",e); } } } public static void main(String[] args) { start(); } }
控制台效果图:
以下是案例2的相关代码及结果图:
package com.javaxl.crawler; import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.LinkedList; import java.util.List; import java.util.UUID; import org.apache.http.HttpEntity; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.zking.util.DbUtil; import com.zking.util.PropertiesUtil; import net.sf.ehcache.Cache; import net.sf.ehcache.CacheManager; import net.sf.ehcache.Status; /** * 第一页 http://www.8gw.com/8gli/index8.html 第二页 * http://www.8gw.com/8gli/index8_2.html 图片地址 * http://img.dphydh.com/uploadimg/2013-3/20105141822720399.jpg * * @author Administrator * */ public class MovieCrawlerStarter { private static Logger logger = Logger.getLogger(MovieCrawlerStarter.class); private static String URL = "http://www.8gw.com/"; private static String PROJECT_URL = "http://www.8gw.com"; private static Connection con; private static CacheManager manager; private static Cache cache; private static CloseableHttpClient httpClient; private static long total = 0; /** * 等待爬取的52个链接的数据 * * @return */ private static List<String> getUrls() { List<String> list = new LinkedList<String>(); list.add("http://www.8gw.com/8gli/index8.html"); for (int i = 2; i < 53; i++) { list.add("http://www.8gw.com/8gli/index8_" + i + ".html"); } return list; } /** * 获取URL主体类容 * * @param url */ private static void parseUrl(String url) { logger.info("开始爬取系列列表::" + url); HttpGet httpGet = new HttpGet(url); RequestConfig config = 
RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build(); httpGet.setConfig(config); CloseableHttpResponse response = null; try { response = httpClient.execute(httpGet); if (response == null) { logger.info("链接超时!"); } else { if (response.getStatusLine().getStatusCode() == 200) { HttpEntity entity = response.getEntity(); String pageContent = EntityUtils.toString(entity, "GBK"); parsePageContent(pageContent, url); } } } catch (ClientProtocolException e) { logger.error(url + "-解析异常-ClientProtocolException", e); } catch (IOException e) { logger.error(url + "-解析异常-IOException", e); } finally { try { if (response != null) { response.close(); } } catch (IOException e) { logger.error(url + "-解析异常-IOException", e); } } logger.info("结束爬取系列列表::" + url); } /** * 获取当前页中的具体影片的链接 * @param pageContent * @param url */ private static void parsePageContent(String pageContent, String url) { // System.out.println("****************" + url + "***********************"); Document doc = Jsoup.parse(pageContent); Elements liEles = doc.select(".span_2_800 #list_con li"); for (Element liEle : liEles) { String movieUrl = liEle.select(".info a").attr("href"); if (null == movieUrl || "".equals(movieUrl)) { logger.info("该影片未内容,不再爬取插入数据库!"); continue; } if(cache.get(movieUrl) != null) { logger.info("该数据已经被爬取到数据库中,数据库不再收录!"); continue; } parseSingleMovieUrl(PROJECT_URL+movieUrl); } } /** * 解析单个影片链接 * @param movieUrl */ private static void parseSingleMovieUrl(String movieUrl) { logger.info("开始爬取影片网页:" + movieUrl); httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(movieUrl); RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build(); httpGet.setConfig(config); CloseableHttpResponse response = null; try { response = httpClient.execute(httpGet); if (response == null) { logger.info(movieUrl + ":爬取无响应"); return; } if (response.getStatusLine().getStatusCode() == 200) { HttpEntity entity = response.getEntity(); String 
blogContent = EntityUtils.toString(entity, "GBK"); parseSingleMovieContent(blogContent, movieUrl); } } catch (ClientProtocolException e) { logger.error(movieUrl + "-ClientProtocolException", e); } catch (IOException e) { logger.error(movieUrl + "-IOException", e); } finally { try { if (response != null) { response.close(); } } catch (IOException e) { logger.error(movieUrl + "-IOException", e); } } logger.info("结束爬取影片网页:" + movieUrl); } /** * 解析页面主体类容(影片名字、影片描述、影片地址) * @param pageContent * @param movieUrl */ private static void parseSingleMovieContent(String pageContent, String movieUrl) { // System.out.println("****************" + movieUrl + "***********************"); Document doc = Jsoup.parse(pageContent); Elements divEles = doc.select(".wrapper .main .moviedteail"); // .wrapper .main .moviedteail .moviedteail_tt h1 // .wrapper .main .moviedteail .moviedteail_list .moviedteail_list_short a // .wrapper .main .moviedteail .moviedteail_img img Elements h1Eles = divEles.select(".moviedteail_tt h1"); if (h1Eles.size() == 0) { logger.info("影片名字为空,不插入数据库!"); return; } String mname = h1Eles.get(0).html(); Elements aEles = divEles.select(".moviedteail_list .moviedteail_list_short a"); if (aEles.size() == 0) { logger.info("影片描述为空,不插入数据库!"); return; } String mdesc = aEles.get(0).html(); Elements imgEles = divEles.select(".moviedteail_img img"); if (null == imgEles || "".equals(imgEles)) { logger.info("影片描述为空,不插入数据库!"); return; } String mimg = imgEles.attr("src"); String sql = "insert into movie(mname,mdesc,mimg,mlink) values(?,?,?,99)"; try { System.out.println("****************" + mname + "***********************"); System.out.println("****************" + mdesc + "***********************"); System.out.println("****************" + mimg + "***********************"); PreparedStatement pst = con.prepareStatement(sql); pst.setObject(1, mname); pst.setObject(2, mdesc); pst.setObject(3, mimg); if(pst.executeUpdate() == 0) { logger.info("爬取影片信息插入数据库失败"); }else { cache.put(new 
net.sf.ehcache.Element(movieUrl, movieUrl)); logger.info("爬取影片信息插入数据库成功"); } } catch (SQLException e) { logger.error("数据异常-SQLException:",e); } } public static void main(String[] args) { manager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath")); cache = manager.getCache("8gli_movies"); httpClient = HttpClients.createDefault(); DbUtil dbUtil = new DbUtil(); try { con = dbUtil.getCon(); List<String> urls = getUrls(); for (String url : urls) { try { parseUrl(url); } catch (Exception e) { // urls.add(url); } } } catch (Exception e1) { logger.error("数据库连接势失败!"); } finally { try { if (httpClient != null) { httpClient.close(); } if (con != null) { con.close(); } } catch (IOException e) { logger.error("网络连接关闭异常-IOException:",e); } catch (SQLException e) { logger.error("数据关闭异常-SQLException:",e); } } // 最终将数据缓存到硬盘中 if (cache.getStatus() == Status.STATUS_ALIVE) { cache.flush(); } manager.shutdown(); } }
over......
备案号:湘ICP备19000029号
Copyright © 2018-2019 javaxl晓码阁 版权所有