百度文库API免费下载百度文库收费资料【python】
生活随笔
收集整理的這篇文章主要介紹了
百度文库API免费下载百度文库收费资料【python】
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
? ? ?
"""Command-line downloader for Baidu Wenku documents.

Usage: ``<script> URL TYPE`` where TYPE is one of DOC, PPT, TXT or PDF
(case-insensitive).  DOC and TXT documents are appended to ``<doc_id>.txt``
in the current directory; PPT and PDF documents are saved as one JPEG per
page inside a ``<doc_id>`` directory.
"""
import argparse
import json
import os
import re
import sys

import requests

# Seconds to wait for any single HTTP request before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 30


def _doc_id_from_url(url):
    """Return the document id taken from a ``.../view/<id>.html`` URL.

    Raises:
        ValueError: if *url* does not contain a ``view/<id>.html`` part.
    """
    found = re.findall('view/(.*).html', url)
    if not found:
        raise ValueError("no document id found in URL: " + url)
    return found[0]


def _fetch_text(url):
    """GET *url* and return the response body as text."""
    return requests.get(url, timeout=REQUEST_TIMEOUT).text


def _collect_text_chunks(node):
    """Return every string stored under a ``"c"`` key in *node*, in order.

    *node* is a value produced by ``json.loads``; dicts and lists are walked
    recursively, everything else is ignored.
    """
    chunks = []
    if isinstance(node, dict):
        for key, value in node.items():
            if key == 'c' and isinstance(value, str):
                chunks.append(value)
            else:
                chunks.extend(_collect_text_chunks(value))
    elif isinstance(node, list):
        for item in node:
            chunks.extend(_collect_text_chunks(item))
    return chunks


def _save_page_images(url, label):
    """Download every page image of the document at *url*.

    Images are written as ``img<N>.jpg`` inside a directory named after the
    document id.  *label* is the document kind shown in the final message.

    Raises:
        ValueError: if the URL has no document id or no image URLs are found.
    """
    doc_id = _doc_id_from_url(url)
    listing = _fetch_text(
        "https://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id
        + "&pn=1&rn=99999&type=ppt"
    )
    # The image URLs arrive with backslash-escaped slashes; strip the escapes.
    image_urls = [
        escaped.replace("\\", '')
        for escaped in re.findall('{"zoom":"(.*?)","page"', listing)
    ]
    if not image_urls:
        # Reporting success with nothing downloaded would be misleading.
        raise ValueError("no page images found for document " + doc_id)

    os.makedirs(doc_id, exist_ok=True)
    for index, image_url in enumerate(image_urls):
        data = requests.get(image_url, timeout=REQUEST_TIMEOUT).content
        # os.path.join instead of a hard-coded '\' so the file lands inside
        # the directory on every platform.
        with open(os.path.join(doc_id, 'img' + str(index) + '.jpg'), 'wb') as out:
            out.write(data)
    print(label + "圖片保存在" + doc_id + "文件夾")


def DOC(url):
    """Download a DOC-type document and append its text to ``<doc_id>.txt``.

    The document page is fetched, the page-data URLs embedded in it are
    extracted, and the text fragments of each page are printed and written
    to the output file.  A newline is emitted whenever a fragment's ``y``
    value differs from the previous fragment's.

    Raises:
        ValueError: if the URL has no document id or no page data is found.
    """
    doc_id = _doc_id_from_url(url)
    html = _fetch_text(url)
    escaped_urls = re.findall('(https.*?0.json.*?)\\\\x22}', html)
    # Only the first half of the matches is used.
    # NOTE(review): presumably the page lists every URL twice - confirm.
    page_urls = [
        escaped.replace('\\', '')
        for escaped in escaped_urls[:len(escaped_urls) // 2]
    ]
    if not page_urls:
        raise ValueError("no page data found for document " + doc_id)

    filename = doc_id + '.txt'
    last_y = None  # "y" value of the previously written fragment
    with open(filename, 'a', encoding='utf-8') as f:
        for page_url in page_urls:
            page_json = _fetch_text(page_url)
            for raw_text, y_value in re.findall('"c":"(.*?)".*?"y":(.*?),', page_json):
                # Fragments carry \uXXXX escapes; turn them into characters.
                decoded = raw_text.encode('utf-8').decode('unicode_escape', 'ignore')
                print(decoded)
                if y_value != last_y:
                    last_y = y_value
                    f.write('\n')
                f.write(decoded.replace('\\', ''))
    print("文檔保存在" + filename)


def PPT(url):
    """Download a PPT-type document as page images (see _save_page_images)."""
    _save_page_images(url, "PPT")


def TXT(url):
    """Download a TXT-type document and append its text to ``<doc_id>.txt``.

    The document-info endpoint supplies the ``md5sum``, ``totalPageNum`` and
    ``rsign`` values needed to build the text URL; the JSON it returns is
    searched for ``"c"`` string values, which are printed and written out.

    Raises:
        ValueError: if the URL has no document id.
        IndexError: if the document info lacks one of the required fields.
    """
    doc_id = _doc_id_from_url(url)
    info = _fetch_text(
        "https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=" + doc_id
    )
    md5 = re.findall('"md5sum":"(.*?)"', info)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', info)[0]
    rsign = re.findall('"rsign":"(.*?)"', info)[0]
    # NOTE(review): md5 is appended with no separator - presumably the API
    # returns it already prefixed with its own "&..." query fragment; confirm.
    text_url = (
        'https://wkretype.bdimg.com/retype/text/' + doc_id
        + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
    )
    # Walk the parsed JSON directly rather than regex-matching its repr, so
    # quotes or escape sequences inside the text cannot corrupt the output.
    texts = _collect_text_chunks(json.loads(_fetch_text(text_url)))
    print(texts)
    filename = doc_id + '.txt'
    with open(filename, 'a', encoding='utf-8') as f:
        f.writelines(texts)
    print("文檔保存在" + filename)


def PDF(url):
    """Download a PDF-type document as page images (see _save_page_images)."""
    _save_page_images(url, "PDF")


# Explicit dispatch table: the document type comes from the command line, so
# it must never be passed to eval().
_HANDLERS = {'DOC': DOC, 'PPT': PPT, 'TXT': TXT, 'PDF': PDF}


def main(argv=None):
    """Parse the command line, run the matching downloader, return exit status.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 on success, 1 if the download failed or the type is unknown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="Target Url,你所需要文檔的URL", type=str)
    parser.add_argument('type', help="Target Type,你所需要文檔的的類型(DOC|PPT|TXT|PDF)", type=str)
    args = parser.parse_args(argv)

    print(""" ###Athor:52pojie ###TIPS:PDF|PPT只能下載圖片""")
    try:
        handler = _HANDLERS[args.type.upper()]
        handler(args.url)
    except Exception as err:
        # Top-level boundary: show the usage hint, but also surface the real
        # cause instead of discarding it.
        print("獲取出錯,可能URL錯誤\n使用格式name.exe url type\n請使用--help查看幫助")
        print("error: " + repr(err), file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())

# Author's note: if you know of a better API endpoint, please share it in the
# comments below - thank you all for your support!
另外分享自己做的,爬取專利技術的文章庫:http://zhimo.yuanzhumuban.cc/blog/
總結
以上是生活随笔為你收集整理的百度文库API免费下载百度文库收费资料【python】的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: centos6下的vsftpd配置
- 下一篇: websocket python爬虫_p