import requests
from bs4 import BeautifulSoup

# First attempt: dump every <a> href on the components index page to a file.
# (As the text below notes, this grabs too much — it is filtered in the next version.)
with open(r'foobar2000_components.txt', 'w', encoding='utf-8') as fp:
    url = 'http://www.foobar2000.org/components'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    for item in soup.find_all('a'):
        k = item.get('href')
        # Tag.get('href') returns None for anchors without an href attribute;
        # guard it, since None + '\n' would raise TypeError.
        if k is not None:
            fp.write(k + '\n')
獲取的文本如下。額,好像多了點東西,需要加一個判斷語句來過濾:
import requests
from bs4 import BeautifulSoup

# Second attempt: keep only component detail-page links and de-duplicate them.
with open(r'foobar2000_components.txt', 'w', encoding='utf-8') as fp:
    url = 'http://www.foobar2000.org/components'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    components = []
    for item in soup.find_all('a'):
        k = item.get('href')
        if k is None:
            # anchors without an href would crash the 'in' membership tests below
            continue
        # keep component detail pages, skip tag-listing links
        if 'tag' not in k and 'components/view' in k:
            if k + '\n' not in components:  # de-duplicate, preserving order
                components.append(k + '\n')
    fp.writelines(components)
如圖:
這樣就差不多了,但這只是插件介紹頁的網址,所以還要逐個解析這些網址來獲取下載鏈接
# Read the component page URLs collected above and extract the actual
# download ("getcomponent") links into a second file.
with open(r'foobar2000_components.txt', 'r', encoding='utf-8') as fp1, \
     open(r'foobar2000_components_download.txt', 'w', encoding='utf-8') as fp2:
    lines = fp1.readlines()
    urls = ['http://www.foobar2000.org' + line for line in lines]
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'}
    for url in urls:
        # url[0:-1] strips the trailing newline kept by readlines()
        r = requests.get(url[0:-1], headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        for item in soup.find_all('a'):
            k = item.get('href')
            # skip anchors without href (None); keep only download links
            if k is not None and 'getcomponent' in k:
                # k[1:] drops the leading '.' of the relative link
                fp2.write(k[1:] + '\n')
這樣就獲得了全部的下載鏈接。最后把這兩個過程整合一下:
import requests
from bs4 import BeautifulSoup

# Combined pass: collect the component detail-page URLs from the index,
# then visit each one and write every download link to a file.
with open(r'foobar2000_components_get.txt', 'w', encoding='utf-8') as fp:
    r = requests.get('http://www.foobar2000.org/components')
    soup = BeautifulSoup(r.text, 'lxml')
    urls = []
    for item in soup.find_all('a'):
        k = item.get('href')
        if k is None:
            # anchors without an href would crash the 'in' tests below
            continue
        if 'tag' not in k and 'components/view' in k:
            if k not in urls:  # de-duplicate, preserving order
                urls.append(k)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'}
    for url in urls:
        r = requests.get('http://www.foobar2000.org' + url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        for item in soup.find_all('a'):
            k = item.get('href')
            if k is not None and 'getcomponent' in k:
                fp.write('http://www.foobar2000.org' + k + '\n')
from urllib import request

# Download every component listed in the link file into d:\components\.
with open(r'foobar2000_components_get.txt', 'r') as fp:
    urls = fp.readlines()
local = 'd:\\components\\'
for url in urls:
    i = url.rfind('/')  # the file name starts after the last slash
    # [0:-1] / [i+1:-1] strip the trailing newline kept by readlines()
    request.urlretrieve(url[0:-1], local + url[i + 1:-1])
最后整合后的全部代碼如下:
import requests
from bs4 import BeautifulSoup
from urllib import request

# Step 1: collect the component detail-page URLs from the index page.
r = requests.get('http://www.foobar2000.org/components')
soup = BeautifulSoup(r.text, 'lxml')
urls = []
for item in soup.find_all('a'):
    k = item.get('href')
    if k is None:
        # anchors without an href would crash the 'in' tests below
        continue
    if 'tag' not in k and 'components/view' in k:
        if k not in urls:  # de-duplicate, preserving order
            urls.append(k)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'}

# Step 2: visit each detail page and collect the real download links.
download_urls = []
for url in urls:
    try:
        r = requests.get('http://www.foobar2000.org' + url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        for item in soup.find_all('a'):
            k = item.get('href')
            if k is not None and 'getcomponent' in k:
                download_urls.append('http://www.foobar2000.org' + k)
    except requests.RequestException:
        # network failure for one page: skip it and keep going
        # (narrowed from the original bare `except:`, which also hid real bugs)
        continue

# Step 3: download each file into d:\components\.
local = 'd:\\components\\'
for url in download_urls:
    i = url.rfind('/')  # the file name starts after the last slash
    request.urlretrieve(url, local + url[i + 1:])  # download the file