小爬简单模式
學習兩周python之后,就學著寫了個爬蟲,實現對百度百科關鍵字python搜索100個相關鏈接頁面的標題和摘要。
項目結構如下:
源代碼如下:
html_downloader.py
?
# coding:utf-8
import urllib2


class HtmlDownloader(object):
    """Downloads the raw HTML body of a single URL."""

    def download(self, url):
        """Return the page body for `url`, or None on any failure.

        Failures covered: missing url, network/HTTP errors, and
        non-200 status codes.  The original let urllib2 exceptions
        propagate (crashing the crawl loop) and never closed the
        response, leaking the socket.
        """
        if url is None:
            return None
        try:
            response = urllib2.urlopen(url)
        except urllib2.URLError:
            # HTTPError subclasses URLError, so this catches both
            # connection problems and 4xx/5xx responses.
            return None
        try:
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # Release the underlying socket promptly.
            response.close()
html_outputer.py
?
# coding:utf-8


class HtmlOutputer(object):
    """Collects crawled records and renders them to an HTML table file."""

    def __init__(self):
        # Each record is a dict with "url", "title" and "summary" keys.
        self.datas = []

    def collect_data(self, new_data):
        """Store one crawled record; silently ignore None."""
        if new_data is None:
            return
        self.datas.append(new_data)

    def output_html(self):
        """Write all collected records to spider_output.html as a table.

        Fixes vs. the original: `finally: file.close()` raised
        NameError when open() itself failed, the bare `except:` hid
        every error type, and `file` shadowed the builtin.
        """
        try:
            out = open("spider_output.html", "w")
        except IOError:
            # Could not create the report file; nothing to clean up.
            print("output_html error!")
            return
        try:
            out.write("<html><head><meta charset='UTF-8'></head><body><table>")
            for data in self.datas:
                out.write("<tr>")
                out.write("<td>%s</td>" % data["url"])
                # Title/summary come from bs4 as unicode; encode for
                # the byte-oriented file handle.
                out.write("<td>%s</td>" % data["title"].encode("utf-8"))
                out.write("<td>%s</td>" % data["summary"].encode("utf-8"))
                out.write("</tr>")
            out.write("</table></body></html>")
        except (IOError, KeyError, AttributeError):
            print("output_html error!")
        finally:
            # Close only the handle we actually managed to open.
            out.close()
?
?
# coding:utf-8
import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extracts follow-up links and lemma data from a Baike HTML page."""

    def parse(self, page_url, html_cont):
        """Parse one downloaded page.

        Returns (new_urls, new_data): a set of absolute /item/ links
        and a dict with "url"/"title"/"summary" keys; returns None
        when either argument is missing.
        """
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        new_urls = self.get_new_urls(page_url, soup)
        new_data = self.get_new_data(page_url, soup)
        return new_urls, new_data

    def get_new_urls(self, page_url, soup):
        """Collect an absolute URL for every encyclopedia link (/item/...)."""
        new_urls = set()
        links = soup.find_all("a", href=re.compile(r"/item/"))
        for link in links:
            # Hrefs are site-relative; resolve against the current page.
            new_urls.add(urlparse.urljoin(page_url, link["href"]))
        return new_urls

    def get_new_data(self, page_url, soup):
        """Extract the lemma title and summary of the current page.

        Missing nodes now yield empty strings; the original chained
        .find(...).find("h1") / .get_text() and raised AttributeError
        on any page without a title or summary block.
        """
        res_data = {"url": page_url}
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_dd = soup.find("dd", class_="lemmaWgt-lemmaTitle-title")
        title_node = title_dd.find("h1") if title_dd is not None else None
        res_data["title"] = title_node.get_text() if title_node is not None else u""
        # <div class="lemma-summary" label-module="lemmaSummary"></div>
        summary_node = soup.find("div", class_="lemma-summary")
        res_data["summary"] = summary_node.get_text() if summary_node is not None else u""
        return res_data
# coding:utf-8
# url_manager.py


class UrlManager(object):
    """Tracks two URL sets: already-crawled (old) and pending (new)."""

    def __init__(self):
        self.new_urls = set()  # pending, not yet crawled
        self.old_urls = set()  # already handed out by get_new_url

    def add_new_url(self, url):
        """Queue one url unless it is None or already known (either set)."""
        if url is None:
            return
        if url not in self.old_urls and url not in self.new_urls:
            self.new_urls.add(url)

    def add_new_urls(self, new_urls):
        """Queue a batch of urls with the same dedup rules as add_new_url.

        The original added batch urls straight into new_urls, bypassing
        the old_urls check, so already-crawled pages were re-queued and
        crawled again.
        """
        if new_urls is None:
            return
        for url in new_urls:
            self.add_new_url(url)

    def has_new_url(self):
        """True while at least one url is pending."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop an arbitrary pending url and mark it as visited."""
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
?
?
# coding:utf-8
# spider_main.py
import html_downloader
import html_outputer
import html_parser
import url_manager


class SpiderMan(object):
    """Wires the url manager, downloader, parser and outputer together."""

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Crawl up to 100 pages breadth-first starting from root_url,
        then write the collected data to the HTML report."""
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            new_url = self.urls.get_new_url()
            print("current url is %d , url = %s" % (count, new_url))
            html_cont = self.downloader.download(new_url)
            if html_cont is None:
                # Download failed: skip this page.  The original fed
                # None into parse(), whose bare `return` then made
                # `new_urls, new_data = None` raise TypeError.
                continue
            parsed = self.parser.parse(new_url, html_cont)
            if parsed is None:
                continue
            new_urls, new_data = parsed
            self.urls.add_new_urls(new_urls)
            self.outputer.collect_data(new_data)
            count += 1
            # Stop after 100 pages; the original broke at count == 100,
            # which collected only 99.
            if count > 100:
                break
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_spider = SpiderMan()
    obj_spider.craw(root_url)
?
?
轉載于:https://www.cnblogs.com/jasonhaven/p/7355001.html
總結
- 上一篇: WebDriver自动化测试工具(3)-
- 下一篇: HttpInvoker GET/POST