import urllib2
from bs4 import BeautifulSoup
from IPython.display import display_html, HTML
HTML('<iframe src="http://www.hprc.org.cn/wxzl/wxysl/lczf/" width="1000" height="500"></iframe>')
# the webpage we would like to crawl; each report link sits in a table cell like:
<td width="274" class="bl">· <a href="./d12qgrdzfbg/201603/t20160318_369509.html" target="_blank" title="2016年政府工作报告">2016年政府工作报告</a></td>
# get the link for each year
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/"
content = urllib2.urlopen(url).read().decode('gb18030')
soup = BeautifulSoup(content, 'html.parser')
# links = soup.find_all('td', {'class': 'bl'})
links = soup.select('.bl a')
links[0]['href']
u'./d12qgrdzfbg/201603/t20160318_369509.html'
print len(links)
47
print links[0]['href']
./d12qgrdzfbg/201603/t20160318_369509.html
# note: select('.bl a') returns the <a> tags directly, so there is no
# need to descend with links[0].a; that extra step only applies to the
# td elements returned by find_all('td', {'class': 'bl'})
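For comparison, the td-based route needs that extra .a hop (a quick sketch using the commented-out find_all form above):
tds = soup.find_all('td', {'class': 'bl'})
print tds[0].a['href']  # -> ./d12qgrdzfbg/201603/t20160318_369509.html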
print links[0]['href'].split('./')[1]
d12qgrdzfbg/201603/t20160318_369509.html
print url + links[0]['href'].split('./')[1]
http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html
# strip the leading './' and prepend the base url to get absolute links
hyperlinks = [url + i['href'].split('./')[1] for i in links]
hyperlinks[:5]
[u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html', u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201503/t20150318_319434.html', u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201403/t20140315_270863.html', u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201402/t20140214_266528.html', u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201402/t20140214_266527.html']
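The split on './' works because every href on this page begins with './'. A more general alternative is a sketch with urljoin from the standard library, which resolves any relative href against the base URL:
from urlparse import urljoin  # urllib.parse in Python 3
hyperlinks = [urljoin(url, i['href']) for i in links]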
hyperlinks[9] # the 2007 report is split across several pages
u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'
HTML('<iframe src="http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html" width="1000" height="500"></iframe>')
# the 2007 report is paginated across several pages
url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'
content = urllib2.urlopen(url_i).read().decode('gb18030')
soup = BeautifulSoup(content, 'html.parser')
#scripts = soup.find_all('script')
#scripts[0]
scripts = soup.select('td script')[0]  # the pagination script sits inside a <td>
scripts
<script>\n\tvar currentPage = 0;//\u6240\u5728\u9875\u4ece0\u5f00\u59cb\n\tvar prevPage = currentPage-1//\u4e0a\u4e00\u9875\n\tvar \u4e0b\u4e00\u9875Page = currentPage+1//\u4e0b\u4e00\u9875\n\tvar countPage = 4//\u5171\u591a\u5c11\u9875\n\t//document.write("\u5171"+countPage+"\u9875 ");\n\t\n\t//\u5faa\u73af\n\tvar num = 17;\n\tfor(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i<countPage) ; i++){\n\t\tif(countPage >1){\n\t\t\tif(currentPage==i)\n\t\t\t\tdocument.write("\u3010<span style=\\"color:#FF0000;\\" class=\\"hui14_30_h\\">"+(i+1)+"</span>\u3011 ");\n\t\t\telse if(i==0)\n\t\t\t\tdocument.write("<a href=\\"t20090818_27775.html\\" class=\\"hui14_30_h\\">\u3010"+(i+1)+"\u3011</a> ");\n\t\t\telse\n\t\t\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + i + "."+"html\\" class=\\"hui14_30_h\\">\u3010"+(i+1)+"\u3011</a> ");\n\t\t}\t\n\t}\n\t\n\tdocument.write("<br><br>");\n\t//\u8bbe\u7f6e\u4e0a\u4e00\u9875\u4ee3\u7801\n\tif(countPage>1&&currentPage!=0&&currentPage!=1)\n\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + prevPage + "."+"html\\"><span style=\\"color:#0033FF;font-weight:bold\\">\u4e0a\u4e00\u9875</span></a> ");\n\telse if(countPage>1&&currentPage!=0&&currentPage==1)\n\t\tdocument.write("<a href=\\"t20090818_27775.html\\"><span style=\\"color:#0033FF;font-weight:bold\\">\u4e0a\u4e00\u9875</span></a> ");\n\t//else\n\t//\tdocument.write("\u4e0a\u4e00\u9875 ");\n\t\n\t\n\t//\u8bbe\u7f6e\u4e0b\u4e00\u9875\u4ee3\u7801 \n\tif(countPage>1&&currentPage!=(countPage-1))\n\t\tdocument.write("<a href=\\"t20090818_27775"+"_" + \u4e0b\u4e00\u9875Page + "."+"html\\" ><span style=\\"color:#0033FF;font-weight:bold\\">\u4e0b\u4e00\u9875</span></a> ");\n\t//else\n\t//\tdocument.write("\u4e0b\u4e00\u9875 ");\n\t\t\t\t\t \n\t</script>
print scripts.text
var currentPage = 0;//所在页从0开始
var prevPage = currentPage-1//上一页
var 下一页Page = currentPage+1//下一页
var countPage = 4//共多少页
//document.write("共"+countPage+"页 ");

//循环
var num = 17;
for(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i<countPage) ; i++){
	if(countPage >1){
		if(currentPage==i)
			document.write("【<span style=\"color:#FF0000;\" class=\"hui14_30_h\">"+(i+1)+"</span>】 ");
		else if(i==0)
			document.write("<a href=\"t20090818_27775.html\" class=\"hui14_30_h\">【"+(i+1)+"】</a> ");
		else
			document.write("<a href=\"t20090818_27775"+"_" + i + "."+"html\" class=\"hui14_30_h\">【"+(i+1)+"】</a> ");
	}
}

document.write("<br><br>");
//设置上一页代码
if(countPage>1&&currentPage!=0&&currentPage!=1)
	document.write("<a href=\"t20090818_27775"+"_" + prevPage + "."+"html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a> ");
else if(countPage>1&&currentPage!=0&&currentPage==1)
	document.write("<a href=\"t20090818_27775.html\"><span style=\"color:#0033FF;font-weight:bold\">上一页</span></a> ");
//else
//	document.write("上一页 ");

//设置下一页代码
if(countPage>1&&currentPage!=(countPage-1))
	document.write("<a href=\"t20090818_27775"+"_" + 下一页Page + "."+"html\" ><span style=\"color:#0033FF;font-weight:bold\">下一页</span></a> ");
//else
//	document.write("下一页 ");
# pull the page count out of the inline script
countPage = int(''.join(scripts).split('countPage = ')[1].split('//')[0])
countPage
4
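So page 0 sits at the base URL and page i follows the pattern <base>_<i>.html. As a sketch, a regex is a slightly sturdier way to pull countPage out of the script than string splitting, and the child-page URLs can then be built directly:
import re
countPage = int(re.search(r'countPage = (\d+)', scripts.text).group(1))
base = url_i.split('.html')[0]
child_urls = [base + '_' + str(i) + '.html' for i in range(1, countPage)]
print child_urls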
def crawler(url_i):
    # fetch and parse one report page
    content = urllib2.urlopen(url_i).read().decode('gb18030')
    soup = BeautifulSoup(content, 'html.parser')
    # the title span holds the year, e.g. "2007年政府工作报告"
    year = soup.find('span', {'class': 'huang16c'}).text[:4]
    year = int(year)
    report = ''.join(s.text for s in soup('p'))
    # find the pagination info in the inline script
    scripts = soup.find_all('script')
    countPage = int(''.join(scripts[1]).split('countPage = ')[1].split('//')[0])
    # follow the remaining pages, if any, and append their text
    for i in range(1, countPage):
        url_child = url_i.split('.html')[0] + '_' + str(i) + '.html'
        content = urllib2.urlopen(url_child).read().decode('gb18030')
        soup = BeautifulSoup(content, 'html.parser')
        report_child = ''.join(s.text for s in soup('p'))
        report = report + report_child
    return year, report
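Hitting ~50 pages in a tight loop can trip transient network errors; a hypothetical fetch helper (the name and backoff policy are my own, not part of the notebook) could wrap the urlopen calls above:
import time
def fetch(url_i, retries=3):
    # retry transient failures with a simple exponential backoff
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url_i).read().decode('gb18030')
        except urllib2.URLError:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)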
# crawl all 47 government work reports (1954-2016)
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    print year
    reports[year] = report
2016 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000 1999 1998 1997 1996 1995 1994 1993 1992 1991 1990 1989 1988 1987 1986 1985 1984 1983 1982 1981 1980 1979 1978 1975 1964 1959 1960 1957 1956 1955 1954
# also fetch the 2016 report from Xinhua; no explicit decode here,
# BeautifulSoup detects the page encoding itself
url2016 = 'http://news.xinhuanet.com/fortune/2016-03/05/c_128775704.htm'
content = urllib2.urlopen(url2016).read()
soup = BeautifulSoup(content, 'html.parser')
report2016 = ''.join(s.text for s in soup('p'))
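Note that report2016 is never merged into reports; if the Xinhua copy is meant to replace the hprc one, a single assignment would do it (an assumption on my part, since the notebook leaves the variable unused):
reports[2016] = report2016  # assumption: prefer the Xinhua copy for 2016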
# dump the reports, one tab-separated line per year
with open('/Users/chengjun/github/cjc2016/data/gov_reports1954-2016.txt', 'wb') as f:
    for r in reports:
        line = str(r) + '\t' + reports[r].replace('\n', '\t') + '\n'
        f.write(line.encode('utf-8'))
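A quick sanity check that the dump reads back cleanly (a sketch reusing the same path):
with open('/Users/chengjun/github/cjc2016/data/gov_reports1954-2016.txt', 'rb') as f:
    lines = [line.decode('utf-8') for line in f]
print len(lines)  # expect one line per year crawled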