java 使用webmagic 爬虫框架爬取博客园数据
生活随笔
收集整理的這篇文章主要介紹了
java 使用webmagic 爬虫框架爬取博客园数据
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
java 使用 webmagic 爬蟲框架爬取博客園數據並存入數據庫
學習記錄
?
webmagic簡介:
WebMagic是一個簡單靈活的Java爬蟲框架。你可以快速開發出一個高效、易維護的爬蟲。
http://webmagic.io/
?
準備工作:
Maven 依賴(我這裡用 Maven 創建的 web 項目做測試):
<dependencies>
    <!-- JUnit (unit tests only) -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>

    <!-- Logging API -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.12</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>1.2.3</version>
    </dependency>
    <!-- Logback implementation of the SLF4J API -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
    </dependency>

    <!-- Database -->
    <!--
        Default (compile) scope on purpose: the crawler code imports
        com.mysql.jdbc.Connection directly, so the driver has to be on the
        compile classpath. With runtime scope the project does not compile.
    -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.34</version>
    </dependency>
    <!-- c3p0 connection pool -->
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>

    <!-- DAO framework: MyBatis -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.4.0</version>
    </dependency>
    <!-- MyBatis / Spring integration -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis-spring</artifactId>
        <version>1.3.0</version>
    </dependency>

    <!-- Servlet / web dependencies -->
    <dependency>
        <groupId>taglibs</groupId>
        <artifactId>standard</artifactId>
        <version>1.1.2</version>
    </dependency>
    <dependency>
        <groupId>jstl</groupId>
        <artifactId>jstl</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.5.1</version>
    </dependency>
    <dependency>
        <groupId>javax.servlet</groupId>
        <artifactId>javax.servlet-api</artifactId>
        <version>3.1.0</version>
    </dependency>

    <!-- Spring -->
    <!-- 1. Spring core -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-core</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-beans</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-context</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- 2. Spring DAO layer -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-tx</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring web -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-web</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-webmvc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring test support; same version as the other Spring modules -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-test</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>

    <!-- WebMagic web crawler -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
</dependencies>
數據庫表 SQL:
-- Stores one row per crawled blog post.
-- Note: the column name `linke` (sic) is what the crawler's INSERT statement uses.
-- No AUTO_INCREMENT start value is given, so a fresh table numbers rows from 1.
CREATE TABLE `Boke` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title` varchar(255) DEFAULT NULL COMMENT '標題',
  `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
  `author` varchar(255) DEFAULT NULL COMMENT '作者',
  `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主頁',
  `summary` varchar(1000) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
數據庫連接工具類:
import java.sql.DriverManager; import java.sql.SQLException;import com.mysql.jdbc.Connection;public class MySqlJdbcUtils {private static String driver = "com.mysql.jdbc.Driver";private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";private static String name="tradingbp";private static String pwd="123456";/*** * 獲取鏈接** @date 2017年8月31日* @return*/public static Connection getOpenConnection(){Connection conn= null;try {//加載驅動 Class.forName(driver);conn=(Connection) DriverManager.getConnection(url, name, pwd);System.out.println("獲得數據庫鏈接");} catch (ClassNotFoundException e) {e.printStackTrace();}catch (SQLException e) {e.printStackTrace();}return conn;}public static void main(String[] args) {getOpenConnection();}} View Code?
實體類:
/**
 * Plain bean holding the data of one blog post: title, link, author,
 * author home page and summary.
 *
 * <p>All properties are {@code null} until set. The spelling of the
 * {@code linke} property is kept as is because it matches the {@code linke}
 * column of the {@code Boke} table.
 */
public class JavaBokeModel {

    /** Post title. */
    private String title;

    /** URL of the post. */
    private String linke;

    /** Author name. */
    private String author;

    /** URL of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the summary, or {@code null} if not set */
    public String getSummary() {
        return summary;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the title, or {@code null} if not set */
    public String getTitle() {
        return title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the post URL, or {@code null} if not set */
    public String getLinke() {
        return linke;
    }

    /** @param linke the post URL to store */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author name, or {@code null} if not set */
    public String getAuthor() {
        return author;
    }

    /** @param author the author name to store */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the author home page URL, or {@code null} if not set */
    public String getAuthorUrl() {
        return authorUrl;
    }

    /** @param authorUrl the author home page URL to store */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }
}
使用 webmagic 框架爬取數據並保存:
?
import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.Date; import java.util.List;import org.jsoup.Jsoup; import org.jsoup.nodes.Document;import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor;import com.mysql.jdbc.Connection; import com.nio.webmagic.jdbc.MySqlJdbcUtils; import com.nio.webmagic.model.JavaBokeModel; /*** * 爬蟲** @version [VCES V201R001, 2017年10月12日]** @see 方法實現 PageProcessor * @since [產品/模塊版本]*/ public class JavaBoKePageProcessor implements PageProcessor {private static Connection conn=null;private static PreparedStatement ps =null;//標題和鏈接獲取private static String TITLEQUERY="div.post_item_body h3 a.titlelnk";//作者private static String AUTHORQUERY="div.post_item_foot a.lightblue ";//簡介private static String SUMMARYQUERY="div.post_item_body p.post_item_summary";//插入sql語句private static String insertSql ="INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)";//初始鏈接private static Connection getConnection(){if (conn==null) {conn = MySqlJdbcUtils.getOpenConnection();}return conn;}/*** * insert操作** @date 2017年8月31日* @return*/private synchronized void insertDb(List<JavaBokeModel> javaBokes){try {ps = conn.prepareStatement(insertSql);for (JavaBokeModel javaBoke:javaBokes) {ps.setString(1, javaBoke.getTitle().toString());ps.setString(2, javaBoke.getLinke().toString());ps.setString(3, javaBoke.getAuthor().toString());ps.setString(4, javaBoke.getAuthorUrl().toString());ps.setString(5, javaBoke.getSummary().toString());ps.executeUpdate();}} catch (SQLException e) {// TODO Auto-generated catch block e.printStackTrace();}}//初始化帶爬取網頁地址private static List<String> urls(){List<String> listUrl =new ArrayList<String>();for (int i = 2; i <=200; i++) {//listUrl.add("http://www.cnblogs.com/cate/java/"+i);listUrl.add("http://www.cnblogs.com/cate/java/"+i);}listUrl.toArray(new 
String[listUrl.size()]);return listUrl;}/*** * jsoup根據 html 字符串和語法獲取內容;* @date 2017年8月31日* @param htmlText* @return*/private static String seletDocumentText(String htmlText,String Query){Document doc = Jsoup.parse(htmlText);String select = doc.select(Query).text();return select;}/*** * jsoup根據 html 字符串和語法獲取鏈接地址;* @date 2017年8月31日* @param htmlText* @return*/private static String seletDocumentLink(String htmlText,String Query){Document doc = Jsoup.parse(htmlText);String select = doc.select(Query).attr("href");return select;}/*** process是定制爬蟲邏輯的核心接口,在這里編寫抽取邏輯* @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page)*/@Overridepublic void process(Page page) {// page.addTargetRequests(urls());//div[@class='post_item']//div[@class='post_item_body']//h3//a[@class='titlelnk']/text()'// 定義如何抽取頁面信息,并保存下來List<String> htmls =page.getHtml().xpath("//div[@class='post_item']/html()").all();List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>();for (String html:htmls) {JavaBokeModel javaBoke =new JavaBokeModel();//標題和鏈接String title =seletDocumentText(html,TITLEQUERY);String linke =seletDocumentLink(html,TITLEQUERY);//作者和作者主頁String author=seletDocumentText(html, AUTHORQUERY);String authorUrl=seletDocumentLink(html, AUTHORQUERY);//簡介String summary=seletDocumentText(html, SUMMARYQUERY);javaBoke.setTitle(title);javaBoke.setAuthor(author);javaBoke.setAuthorUrl(authorUrl);javaBoke.setLinke(linke);javaBoke.setSummary(summary);javaBokes.add(javaBoke);}insertDb(javaBokes);}@Overridepublic Site getSite() {//抓去網站的相關配置包括:編碼、重試次數、抓取間隔return Site.me().setSleepTime(1000).setRetryTimes(10);}public static void main(String[] args) {long startTime ,endTime;System.out.println("========小爬蟲【啟動】嘍!=========");getConnection();startTime = new Date().getTime();//入口Spider create = Spider.create(new JavaBoKePageProcessor());//定義入口地址create.addUrl("http://www.cnblogs.com/cate/java/").thread(5).run(); try {ps.close();conn.close();} catch (Exception e) {// TODO: handle 
exception }endTime = new Date().getTime();System.out.println("========小爬蟲【結束】嘍!=========");System.out.println("用時為:"+(endTime-startTime)/1000+"s");}}?
數據:
?
?
?
?
?
?
轉載于:https://www.cnblogs.com/cjbbk/p/7655233.html
總結
以上是生活随笔為你收集整理的java 使用webmagic 爬虫框架爬取博客园数据的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: SQL查询交集、并集、差集
- 下一篇: 一个老程序猿的焦虑