Crawling the historical financial-report summaries of individual stocks from Sina Finance
The content of the web page:
The content we want to extract:
The matplotlib plot:
The code:
# -*- coding: utf-8 -*-
from collections import OrderedDict as Odict
from datetime import datetime

import numpy as np
import pandas as pd

import crawler  # the author's custom helper module: wraps page fetching + lxml parsing


def get_gg_fin_abs(code='000911'):
    u'''Fetch the historical financial-summary table of one stock from Sina Finance.

    Notes
    -----
    - xpath expressions: 'td[1]' means the first <td> tag; the '[n]' predicate selects
      the n-th tag and is 1-based.
    - etree element access: reading an element's own text via the .text attribute is much
      simpler than .xpath('text()')[0], provided the element actually has a text attribute.
    - .find(path) and .findall(path) are also handy; they return a single etree._Element
      and a list of etree._Element objects respectively.
    - The <tbody> tag inside <table> is usually added by the browser and is not in the
      actual HTML source, so you cannot .find() or .xpath() it; work with its parent
      <table> tag instead.

    Ref
    ---
    - Get all td content inside tr of tbody in python using lxml - Stack Overflow -
      http://stackoverflow.com/questions/37080910/get-all-td-content-inside-tbody-of-tr-in-python-using-lxml
    '''
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_FinanceSummary/stockid/%s.html'
    url = url % (code)
    craw = crawler.Crawler(url)
    craw.idom()
    tr_path = '//table[@id="FundHoldSharesTable"]//tr'
    trs = craw.dom.xpath(tr_path)
    print len(trs)

    # -------- notes on extracting the report-date rows --------
    # etree.tostring() can be used to inspect the raw html of a node:
    #   print etree.tostring(trs[53].xpath('td')[1])
    #   <td align="left" class="tdr"><strong>2015-09-30</strong></td>
    # i.e. the date text is bold and sits under <strong>, so the path needs the /strong suffix:
    #   k_date = tr_nodes[53].xpath('td[1]/strong').text
    #   v_date = tr_nodes[53].xpath('td[2]/strong').text
    # //*[@id="FundHoldSharesTable"]/tbody/tr[1] :
    #   this xpath was copied from Chrome F12 "Inspect element"; note the <tbody> caveat above.

    fdata = Odict()

    # -------- handle the header row --------
    header = trs[0].find('th').text
    header = header.strip()

    # -------- handle the body rows --------
    for i, tr in enumerate(trs[1:]):                        # start from the 2nd <tr>, i.e. the data rows
        if tr.xpath('td[@height="5px"]') != []:             # empty spacer row
            continue
        if tr.find('td[1]').find('strong') is not None:     # report-date row
            vdate = tr.find('td[2]/strong').text            # text of the second <td>: the date
            dict2 = Odict()                                 # new empty dict for this quarter's summary data
            continue
        else:
            k2 = tr.find('td[1]').text
            # do not test tr.find('td[2]/a').text directly: find() may return None,
            # so check the element first and only then read .text
            v2 = tr.find('td[2]/a').text if tr.find('td[2]/a') is not None else \
                tr.find('td[2]').text
            dict2[k2] = v2
            if k2 == u'凈利潤':                              # "net profit" is the last row of a quarter:
                fdata[vdate] = dict2                        # save the small dict into the big dict
                continue

    df = pd.DataFrame(fdata.values(), index=fdata.keys())
    df.index.name = header

    def mapper_strdatetime10_2_datetime(s):
        u'''Map a 10-char date string such as '1998-12-31' to datetime(1998, 12, 31, 0, 0).'''
        y = int(s[0:4])
        m = int(s[5:7])
        d = int(s[8:10])
        return datetime(y, m, d)

    def mapper_html_table_td_2_float(td):
        u'''Map a per-share cell such as u'0.25元' to a float; empty cells become NaN.'''
        if td.strip() != u'':
            td = td.strip().replace(u'元', '')   # strip the yuan unit
            td = float(td)
        else:
            td = np.nan
        return td

    def mapper_html_table_td_with_comma_2_float(td):
        u'''Map a thousands-separated cell such as u'1,234,567元' to a float in millions of yuan.'''
        if td.strip() != u'':
            td = td.strip().replace(u'元', '')
            if td.find(',') > 0:
                td = td.replace(',', '')
            return float(td) / 1000000.0
        else:
            return np.nan

    df.index = map(mapper_strdatetime10_2_datetime, df.index)
    df.index.name = header
    for i in np.arange(4):                       # first 4 columns: per-share figures in yuan
        df.ix[:, i] = map(mapper_html_table_td_2_float, df.ix[:, i])
    for i in np.arange(4, len(df.columns)):      # remaining columns: large amounts, converted to millions
        df.ix[:, i] = map(mapper_html_table_td_with_comma_2_float, df.ix[:, i])
    return df.sort_index()
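The function above depends on a custom crawler module that the post does not show. As a rough stand-in (an assumption, not the author's implementation), something like the following, built on requests and lxml, would provide the Crawler(url), .idom() and .dom interface that get_gg_fin_abs uses:

# -*- coding: utf-8 -*-
# Hypothetical stand-in for the unshown `crawler` module: just enough to provide
# Crawler(url), .idom() and .dom as used in get_gg_fin_abs(). The real module may differ.
import requests
from lxml import html


class Crawler(object):
    def __init__(self, url):
        self.url = url
        self.dom = None

    def idom(self):
        # Fetch the page and parse it into an lxml tree exposed as .dom
        resp = requests.get(self.url, timeout=10)
        resp.encoding = resp.apparent_encoding   # Sina's corp pages are typically GBK-encoded (assumption)
        self.dom = html.fromstring(resp.text)
        return self.dom

With this stand-in importable as crawler, get_gg_fin_abs('000911') should fetch and parse the page; the author's real module may handle headers, retries or encoding differently.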
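To make the lxml notes in the docstring concrete, here is a minimal self-contained sketch; the HTML snippet and the id "demo" are invented for illustration:

# -*- coding: utf-8 -*-
# Small illustration of the lxml behaviours noted in the docstring above.
# The HTML snippet is made up for demonstration; it is not the Sina Finance page.
from lxml import html

snippet = '''
<table id="demo">
  <tr><th>report date</th></tr>
  <tr><td align="left"><strong>2015-09-30</strong></td><td class="tdr"><strong>2015-09-30</strong></td></tr>
  <tr><td>eps</td><td><a href="#">0.25</a></td></tr>
</table>
'''
# Even when the browser's inspector shows a <tbody> inside <table>, it is usually not
# present in the HTML source, so address the rows through <table> and <tr> directly.
doc = html.fromstring(snippet)
trs = doc.xpath('//table[@id="demo"]//tr')       # .xpath() returns a list of elements

print trs[0].find('th').text                     # .text is simpler than .xpath('text()')[0]
print trs[1].find('td[1]/strong').text           # 'td[1]' is 1-based: the first <td>
print trs[2].findall('td')[1].find('a').text     # .findall() returns a list of elements

These are the same .find()/.xpath() patterns the function applies to the real FundHoldSharesTable table.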
The commented-out block below shows how the returned DataFrame can be inspected and plotted (it assumes matplotlib.pyplot is imported as plt):

#==============================================================================
# print df.tail()
# print df.columns
# print df.index.name
#
# fig, (ax1, ax2) = plt.subplots(2, 1)
# fig.set_figheight(fig.get_figheight() * 2)
# df.ix[:, (0, 3)].plot(ax=ax1)    # per-share columns
# df.ix[:, 5:7].plot(ax=ax2)       # large-amount columns
# ax1.set_ylabel(u'(元)')          # unit: yuan
# ax2.set_ylabel(u'(百萬元)')      # unit: million yuan
#==============================================================================

Reposted from: https://www.cnblogs.com/duan-qs/p/6740525.html