java房源信息管理的代码_crawler4j源码学习(2):Ziroom租房网房源信息采集爬虫
/*** @date 2016年8月20日 下午6:13:24
*@version*@sinceJDK 1.8*/
public class ZiroomCrawler extendsWebCrawler {/**爬取匹配原則*/
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|ico"
+ "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");/**爬取數(shù)據(jù)保存文件路徑*/
private final static String DATA_PATH = "data/crawl/ziroom.csv";/**爬取link文件路徑*/
private final static String LINK_PATH = "data/crawl/link.csv";//private static final Logger logger =//LoggerFactory.getLogger(ZiroomCrawler.class);
private final static String URL_PREFIX = "http://sh.ziroom.com/z/nl/";private finalFile fLinks;private finalFile fDatas;privateCsvWriter csvLinks;privateCsvWriter csvDatas;/*** You should implement this function to specify whether the given url
* should be crawled or not (based on your crawling logic).*/ZiroomCrawlStat myCrawlStat;public ZiroomCrawler() throwsIOException {
myCrawlStat= newZiroomCrawlStat();
fLinks= newFile(DATA_PATH);
fDatas= newFile(LINK_PATH);if(fLinks.isFile()) {
fLinks.delete();
}if(fDatas.isFile()) {
fDatas.delete();
}
csvDatas= new CsvWriter(new FileWriter(fDatas, true), ',');
csvDatas.write("請(qǐng)求路徑");
csvDatas.endRecord();
csvDatas.close();
csvLinks= new CsvWriter(new FileWriter(fLinks, true), ',');
csvLinks.write("圖片");
csvLinks.write("價(jià)格");
csvLinks.write("地址");
csvLinks.write("說明");
csvLinks.endRecord();
csvLinks.close();
}public voiddumpMyData() {final int id =getMyId();//You can configure the log to output to file
logger.info("Crawler {} > Processed Pages: {}", id, myCrawlStat.getTotalProcessedPages());
logger.info("Crawler {} > Total Links Found: {}", id, myCrawlStat.getTotalLinks());
logger.info("Crawler {} > Total Text Size: {}", id, myCrawlStat.getTotalTextSize());
}
@OverridepublicObject getMyLocalData() {returnmyCrawlStat;
}
@Overridepublic voidonBeforeExit() {
dumpMyData();
}/** 這個(gè)方法決定了要抓取的URL及其內(nèi)容,例子中只允許抓取“http://sh.ziroom.com/z/nl/”這個(gè)域的頁面,
* 不允許.css、.js和多媒體等文件
*
* @see edu.uci.ics.crawler4j.crawler.WebCrawler#shouldVisit(edu.uci.ics.
* crawler4j.crawler.Page, edu.uci.ics.crawler4j.url.WebURL)*/@Overridepublic booleanshouldVisit(Page referringPage, WebURL url) {final String href =url.getURL().toLowerCase();if (FILTERS.matcher(href).matches() || !href.startsWith(URL_PREFIX)) {return false;
}return true;
}/** 當(dāng)URL下載完成會(huì)調(diào)用這個(gè)方法。你可以輕松獲取下載頁面的url, 文本, 鏈接, html,和唯一id等內(nèi)容。
*
* @see
* edu.uci.ics.crawler4j.crawler.WebCrawler#visit(edu.uci.ics.crawler4j.
* crawler.Page)*/@Overridepublic voidvisit(Page page) {final String url =page.getWebURL().getURL();
logger.info("爬取路徑:" +url);
myCrawlStat.incProcessedPages();if (page.getParseData() instanceofHtmlParseData) {final HtmlParseData htmlParseData =(HtmlParseData) page.getParseData();final Set links =htmlParseData.getOutgoingUrls();try{
linkToCsv(links);
}catch (finalIOException e2) {//TODO Auto-generated catch block
e2.printStackTrace();
}
myCrawlStat.incTotalLinks(links.size());try{
myCrawlStat.incTotalTextSize(htmlParseData.getText().getBytes("UTF-8").length);
}catch (finalUnsupportedEncodingException e1) {//TODO Auto-generated catch block
e1.printStackTrace();
}final String html =htmlParseData.getHtml();final Document doc =Jsoup.parse(html);final Elements contents = doc.select("li[class=clearfix]");for (finalElement c : contents) {//圖片
final String img = c.select(".img img").first().attr("src");
logger.debug("圖片:" +img);//地址
final Element txt = c.select("div[class=txt]").first();final String arr1 = txt.select("h3 a").first().text();final String arr2 = txt.select("h4 a").first().text();final String arr3 = txt.select("div[class=detail]").first().text();final String arr = arr1.concat(arr1 + ",").concat(arr2 + ",").concat(arr3);
logger.debug("地址:" +arr);//說明
final String rank = txt.select("p").first().text();
logger.debug("說明:" +rank);//價(jià)格
final String pirce = c.select("p[class=price]").first().text();try{
csvLinks= new CsvWriter(new FileWriter(fLinks, true), ',');
csvLinks.write(img);
csvLinks.write(pirce);
csvLinks.write(arr);
csvLinks.write(rank);
csvLinks.endRecord();
csvLinks.flush();
csvLinks.close();
}catch (finalIOException e) {
e.printStackTrace();
}
}
}
}private void linkToCsv(Set links) throwsIOException {
csvDatas= new CsvWriter(new FileWriter(fDatas, true), ',');for (finalWebURL webURL : links) {
csvDatas.write(webURL.getURL());
}
csvDatas.flush();
csvDatas.endRecord();
csvDatas.close();
}
}
总结
以上是生活随笔为你收集整理的java房源信息管理的代码_crawler4j源码学习(2):Ziroom租房网房源信息采集爬虫的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 小米路由器Mini安装设置方法
- 下一篇: 竹枝词教学设计一等奖