import urllib2
from bs4 import BeautifulSoup
The first list page is http://bbs.tianya.cn/list.jsp?item=free&nextid=0&order=8&k=%E9%9B%BE%E9%9C%BE and the second page is http://bbs.tianya.cn/list.jsp?item=free&nextid=1&order=8&k=%E9%9B%BE%E9%9C%BE — only the nextid parameter changes.
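The keyword parameter k is percent-encoded UTF-8; a quick sketch (assuming Python 2's urllib) to check what it stands for:

import urllib
# decode the percent-encoded search keyword in the list URL
print urllib.unquote('%E9%9B%BE%E9%9C%BE')   # -> 雾霾 ("smog"); the cells below switch to k=PX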
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX width=1000 height=500></iframe>')
# the webpage we would like to crawl
page_num = 0
url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX" % page_num
content = urllib2.urlopen(url).read() # fetch the raw HTML of the page
soup = BeautifulSoup(content, "lxml")
articles = soup.find_all('tr')
print articles[0]
<tr> <th scope="col"> 标题</th> <th scope="col">作者</th> <th scope="col">点击</th> <th scope="col">回复</th> <th scope="col">发表时间</th> </tr>
print articles[1]
<tr class="bg"> <td class="td-title "> <span class="face" title=""> </span> <a href="/post-free-2849477-1.shtml" target="_blank"> 【民间语文第161期】宁波px启示:船进港湾人应上岸<span class="art-ico art-ico-3" title="内有0张图片"></span> </a> </td> <td><a class="author" href="http://www.tianya.cn/50499450" target="_blank">贾也</a></td> <td>194699</td> <td>2703</td> <td title="2012-10-29 07:59">10-29 07:59</td> </tr>
len(articles[1:])
50
http://bbs.tianya.cn/list.jsp?item=free&nextid=0&order=8&k=PX
for t in articles[1].find_all('td'): print t
<td class="td-title "> <span class="face" title=""> </span> <a href="/post-free-2849477-1.shtml" target="_blank"> 【民间语文第161期】宁波px启示:船进港湾人应上岸<span class="art-ico art-ico-3" title="内有0张图片"></span> </a> </td> <td><a class="author" href="http://www.tianya.cn/50499450" target="_blank">贾也</a></td> <td>194699</td> <td>2703</td> <td title="2012-10-29 07:59">10-29 07:59</td>
td = articles[1].find_all('td')
print td[0]
<td class="td-title "> <span class="face" title=""> </span> <a href="/post-free-2849477-1.shtml" target="_blank"> 【民间语文第161期】宁波px启示:船进港湾人应上岸<span class="art-ico art-ico-3" title="内有0张图片"></span> </a> </td>
print td[0].text
【民间语文第161期】宁波px启示:船进港湾人应上岸
print td[0].text.strip()
【民间语文第161期】宁波px启示:船进港湾人应上岸
print td[0].a['href']
/post-free-2849477-1.shtml
print td[1]
<td><a class="author" href="http://www.tianya.cn/50499450" target="_blank">贾也</a></td>
print td[2]
<td>194699</td>
print td[3]
<td>2703</td>
print td[4]
<td title="2012-10-29 07:59">10-29 07:59</td>
records = []
for i in articles[1:]:
    td = i.find_all('td')
    title = td[0].text.strip()
    title_url = td[0].a['href']
    author = td[1].text
    author_url = td[1].a['href']
    views = td[2].text
    replies = td[3].text
    date = td[4]['title']
    record = title + '\t' + title_url + '\t' + author + '\t' + author_url + '\t' + views + '\t' + replies + '\t' + date
    records.append(record)
print records[2]
宁波准备停止PX项目了,元芳,你怎么看? /post-free-2848797-1.shtml 牧阳光 http://www.tianya.cn/36535656 82888 625 2012-10-28 19:11
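Concatenating fields with '\t' works here, but it breaks silently if a field ever contains a tab or newline. A minimal alternative sketch using Python 2's csv module (the output file name is illustrative):

import csv
with open('tianya_bbs_threads_list.txt', 'ab') as f:  # append in binary mode for csv on Python 2
    writer = csv.writer(f, delimiter='\t')
    for i in articles[1:]:
        td = i.find_all('td')
        row = [td[0].text.strip(), td[0].a['href'], td[1].text,
               td[1].a['href'], td[2].text, td[3].text, td[4]['title']]
        writer.writerow([x.encode('utf-8') for x in row])  # csv on Python 2 wants byte strings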
def crawler(page_num, file_name):
    try:
        # open the list page
        url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX" % page_num
        content = urllib2.urlopen(url).read() # fetch the raw HTML
        soup = BeautifulSoup(content, "lxml")
        articles = soup.find_all('tr')
        # write down the thread info
        for i in articles[1:]:
            td = i.find_all('td')
            title = td[0].text.strip()
            title_url = td[0].a['href']
            author = td[1].text
            author_url = td[1].a['href']
            views = td[2].text
            replies = td[3].text
            date = td[4]['title']
            record = title + '\t' + title_url + '\t' + author + '\t' + \
                author_url + '\t' + views + '\t' + replies + '\t' + date
            with open(file_name, 'a') as p: # note: append mode, run only once!
                p.write(record.encode('utf-8') + "\n") # encode to utf-8 to avoid encoding errors when writing
    except Exception, e:
        print e
        pass
# crawl all pages
for page_num in range(10):
    print(page_num)
    crawler(page_num, '/Users/chengjun/bigdata/tianya_bbs_threads_list.txt')
0 1 2 3 4 5 6 7 8 9
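A hedged aside not in the original run: pausing between requests is gentler on the server and makes blocking less likely; the post crawler later in this notebook does this with time.sleep. The same idea applied here:

import time, random
for page_num in range(10):
    print(page_num)
    crawler(page_num, '/Users/chengjun/bigdata/tianya_bbs_threads_list.txt')
    time.sleep(1 + random.random())  # wait 1-2 seconds between list pages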
import pandas as pd
df = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df[:2]
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41 |
len(df)
467
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:2]
title | link | author | author_page | click | reply | time | |
---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41 |
len(df.link)
467
df.author_page[:5]
0    http://www.tianya.cn/50499450
1    http://www.tianya.cn/74341835
2    http://www.tianya.cn/36535656
3    http://www.tianya.cn/36959960
4    http://www.tianya.cn/53134970
Name: author_page, dtype: object
# user_info
url = df.author_page[1]
content = urllib2.urlopen(url).read() # fetch the raw HTML
soup1 = BeautifulSoup(content, "lxml")
user_info = soup1.find('div', {'class': 'userinfo'})('p')
area, nid, freq_use, last_login_time, reg_time = [i.get_text()[4:] for i in user_info]
print area, nid, freq_use, last_login_time, reg_time
浙江杭州市 259643 5832 2016-04-16 16:53:46 2011-04-14 20:49:00
link_info = soup1.find_all('div', {'class': 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print followed_num, fans_num
activity = soup1.find_all('span', {'class': 'subtitle'})
post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
print post_num, reply_num
2 5
print activity[2]
<span class="subtitle"> <a href="http://blog.tianya.cn/blog-3644295-1.shtml" target="_blank">贾也的博客</a> </span>
link_info = soup.find_all('div', {'class': 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print followed_num, fans_num
152 27451
link_info[0].a.text
u'152'
# user_info = soup.find('div', {'class', 'userinfo'})('p')
# user_infos = [i.get_text()[4:] for i in user_info]
def author_crawler(url, file_name):
    try:
        content = urllib2.urlopen(url).read() # fetch the raw HTML
        soup = BeautifulSoup(content, "lxml")
        link_info = soup.find_all('div', {'class': 'link-box'})
        followed_num, fans_num = [i.a.text for i in link_info]
        try:
            activity = soup.find_all('span', {'class': 'subtitle'})
            post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
        except:
            post_num, reply_num = '1', '0' # strings, so the '\t'.join below does not fail on ints
        record = '\t'.join([url, followed_num, fans_num, post_num, reply_num])
        with open(file_name, 'a') as p: # note: append mode, run only once!
            p.write(record.encode('utf-8') + "\n") # encode to utf-8 to avoid encoding errors
    except Exception, e:
        print e, url
        record = '\t'.join([url, 'na', 'na', 'na', 'na'])
        with open(file_name, 'a') as p: # note: append mode, run only once!
            p.write(record.encode('utf-8') + "\n")
        pass
for k, url in enumerate(df.author_page):
    if k % 10 == 0:
        print k
    author_crawler(url, '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_author_info2.txt')
0 10 20 30 40 need more than 0 values to unpack http://www.tianya.cn/67896263 need more than 0 values to unpack http://www.tianya.cn/42330613 sequence item 3: expected string or Unicode, int found http://www.tianya.cn/26517664 50 need more than 0 values to unpack http://www.tianya.cn/75591747 60 need more than 0 values to unpack http://www.tianya.cn/24068399 70 80 90 need more than 0 values to unpack http://www.tianya.cn/67896263 sequence item 3: expected string or Unicode, int found http://www.tianya.cn/62237033 100 110 120 130 140 150 160 170 180 190 need more than 0 values to unpack http://www.tianya.cn/67896263 200 need more than 0 values to unpack http://www.tianya.cn/85353911 210 220 230 240 250 260 270 280 need more than 0 values to unpack http://www.tianya.cn/67896263 290 need more than 0 values to unpack http://www.tianya.cn/67896263 300 310 320 need more than 0 values to unpack http://www.tianya.cn/67896263 330 340 350 360 370 need more than 0 values to unpack http://www.tianya.cn/67896263 380 390 400 410 420 430 440 450 460
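Two failure modes show up in this log: 'need more than 0 values to unpack' when a profile page has no link-box divs to unpack (e.g. a deleted or empty profile), and 'sequence item 3: expected string or Unicode, int found' when integers reach the '\t'.join. A hedged sketch of a more defensive parse that degrades to 'na' instead of raising:

def parse_profile(soup):
    # return (followed, fans, posts, replies), falling back to 'na' when a section is missing
    link_info = soup.find_all('div', {'class': 'link-box'})
    followed_num, fans_num = [i.a.text for i in link_info] if len(link_info) == 2 else ('na', 'na')
    activity = soup.find_all('span', {'class': 'subtitle'})
    nums = [j.text[2:] for i in activity[:1] for j in i('a')]
    post_num, reply_num = nums if len(nums) == 2 else ('na', 'na')
    return followed_num, fans_num, post_num, reply_num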
http://www.tianya.cn/50499450/follow
We could also crawl each user's following and follower lists from pages like the one above.
df.link[0]
'/post-free-2849477-1.shtml'
url = 'http://bbs.tianya.cn' + df.link[2]
url
'http://bbs.tianya.cn/post-free-2848797-1.shtml'
from IPython.display import display_html, HTML
HTML('<iframe src=http://bbs.tianya.cn/post-free-2848797-1.shtml width=1000 height=500></iframe>')
# the webpage we would like to crawl
post = urllib2.urlopen(url).read() # fetch the raw HTML of the post
post_soup = BeautifulSoup(post, "lxml")
#articles = soup.find_all('tr')
print (post_soup.prettify())[:5000]
<!DOCTYPE HTML> <html> <head> <meta charset="utf-8"/> <title> 宁波准备停止PX项目了,元芳,你怎么看?_天涯杂谈_天涯论坛 </title> <meta content="宁波准备停止PX项目了,元芳,你怎么看? 从宁波市政府新闻发言人处获悉,宁波市经与项目投资方研究决定:(1)坚决不上PX项目;(2)炼化一体化项目前期工作停止推进,再作科学论证。..." name="description"/> <meta content="IE=EmulateIE9" http-equiv="X-UA-Compatible"/> <meta content="牧阳光" name="author"/> <meta content="format=xhtml; url=http://bbs.tianya.cn/m/post-free-2848797-1.shtml" http-equiv="mobile-agent"/> <link href="http://static.tianyaui.com/global/ty/TY.css" rel="stylesheet" type="text/css"/> <link href="http://static.tianyaui.com/global/bbs/web/static/css/bbs_article_c55fffc.css" rel="stylesheet" type="text/css"/> <link href="http://static.tianyaui.com/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/> <link href="http://bbs.tianya.cn/post-free-2848797-2.shtml" rel="next"/> <script type="text/javascript"> var bbsGlobal = { isEhomeItem : false, isNewArticle : false, authorId : "36535656", authorName : "牧阳光", blocktype : "主版", itemType : "主版", laibaType : "null", page : "1", permission : true, permissionStatus : "开放", itemPermission : 1, itemCategory : "社会", isWeiLun : false, isSealItem : false, item : "free", itemName : "天涯杂谈", artId : 2848797, media : 0, subType : "", ad : 0, adshow : 0, adblock : 0, words : [{"link":"http://ebook.tianya.cn/html2/chapter.aspx?bookid=75204&comefrom=tianya ","word":"养鬼"},{"link":"http://ebook.tianya.cn/html2/chapter.aspx?bookid=78822&comefrom=tianya ","word":"爱情"},{"link":"http://ebook.tianya.cn/html2/chapter.aspx?bookid=78829&comefrom=tianya ","word":"离婚"},{"link":"http://zan.tianya.cn/","word":"原创"},{"link":"http://bbs.tianya.cn/list-410-1.shtml","word":"主播"},{"link":"http://groups.tianya.cn/list-163029-1.shtml","word":"日记"},{"link":"http://groups.tianya.cn/list-163907-1.shtml","word":"吐糟"},{"link":"http://groups.tianya.cn/list-65695-1.shtml","word":"青春"},{"link":"http://groups.tianya.cn/list-86723-1.shtml","word":"EXO"},{"link":"http://groups.tianya.cn/list-81007-1.shtml","word":"李易峰"},{"link":"http://groups.tianya.cn/list-9999-1.shtml","word":"乔振宇"},{"link":"http://groups.tianya.cn/list-9619-1.shtml","word":"陈晓"},{"link":"http://groups.tianya.cn/list-9214-1.shtml","word":"历史"},{"link":"http://groups.tianya.cn/list-163222-1.shtml","word":"哲学"},{"link":"http://groups.tianya.cn/list-163503-1.shtml","word":"感动"},{"link":"http://bbs.tianya.cn/list-730-1.shtml","word":"中山大学"},{"link":"http://groups.tianya.cn/list-14270-1.shtml","word":"雨后"},{"link":"http://groups.tianya.cn/list-9797-1.shtml","word":"钟汉良"}], pageCount : 7, dashang : { merId : "%E5%A4%A9%E6%B6%AF%E8%AE%BA%E5%9D%9B", merNum : "free-2848797", getName : "%E7%89%A7%E9%98%B3%E5%85%89", time : "1463208857581", ext1 : "2848797", ext2 : "free", ext4 : "%E5%AE%81%E6%B3%A2%E5%87%86%E5%A4%87%E5%81%9C%E6%AD%A2PX%E9%A1%B9%E7%9B%AE%E4%BA%86%EF%BC%8C%E5%85%83%E8%8A%B3%EF%BC%8C%E4%BD%A0%E6%80%8E%E4%B9%88%E7%9C%8B%EF%BC%9F", sign : "4fd8fdddad1703e34ab83df926976edb", amount : 0, zhiyinCount : 0, newestRecords : null }, isWenda : false, isSlide : true, trueName : "", artProtectedTips : "" }; var adsGlobal = { itemId : "free", pageType : "02", popWinId: "16" }; </script> <script charset="utf-8" src="http://static.tianyaui.com/global/ty/TY.js" type="text/javascript"> </script> <script charset="utf-8" src="http://static.tianyaui.com/global/bbs/web/static/js/main_e88627e.js" type="text/javascript"> </script> </head> <body> <div id="top_nav_wrap"> <div class="clearfix" id="top_nav"> <div class="top-nav-logo"> <a _tystat="新版顶导航/Logo" href="http://focus.tianya.cn/"> 
</a> </div> <div class="top-nav-main clearfix"> <div class="top-nav-menu clearfix"> <div class="top-nav-fl clearfix"> <ul class="top-nav-menu-list clearfix"> <li class="top-nav-menu-li top-nav-menu-li-first"> <a _checklocation="1" _tystat="新版顶导航/论坛" appstr="bbs" class="top-nav-main-menu" href="http://bbs.tianya.cn/"> 论坛 </a> </li> <li class="top-nav-menu-li"> <a _checklocation="1" _tystat="新版顶导航/分社区/聚焦" appstr="focus" class="top-nav-main-menu" href="http://focus.tianya.cn/"> 聚焦 </a> </li> <li class="top-nav-menu-li"> <a _tystat="新版顶导航/部落" class="top-nav-main-menu" href="http://groups.tianya.cn"> 部落 </a> </li> <li class="top-nav-menu-li"> <a _checklocation="1" _tystat="新版顶导航/博客" appstr="blog" class="top-nav-main-menu" href="http://blog.tianya.cn/"> 博客 </a> </li> <li class="top-nav-menu-li"> <a _checklocation="1" _tystat="新版顶导航/问答" appstr="wenda" class="top-nav-main-menu" href="http://wenda.tianya.cn/"> 问答 </a> </li> <!-- <li class="top-nav-menu-li"><a _tystat="新版顶导航/天涯客" href="http://travel.tianya.cn/" cl
pa = post_soup.find_all('div', {'class': 'atl-item'})
len(pa)
90
print pa[0]
<div _host="%E7%89%A7%E9%98%B3%E5%85%89" class="atl-item host-item"> <div class="atl-content"> <div class="atl-con-hd clearfix"> <div class="atl-con-hd-l"></div> <div class="atl-con-hd-r"></div> </div> <div class="atl-con-bd clearfix"> <div class="bbs-content clearfix"> <br/> 从宁波市政府新闻发言人处获悉,宁波市经与项目投资方研究决定:(1)坚决不上PX项目;(2)炼化一体化项目前期工作停止推进,再作科学论证。<br/> <br/> </div> <div class="clearfix" id="alt_action"></div> <div class="clearfix"> <div class="host-data"> <span>楼主发言:11次</span> <span>发图:0张</span> </div> <div class="atl-reply" id="alt_reply"> <a author="牧阳光" authorid="36535656" class="reportme a-link" href="javascript:void(0);" replyid="0" replytime="2012-10-28 19:11:00"> 举报</a> | <a class="a-link acl-share" href="javascript:void(0);">分享</a> | <a class="a-link acl-more" href="javascript:void(0);">更多</a> | <span><a class="a-link" name="0">楼主</a></span> <a _name="牧阳光" _time="2012-10-28 19:11:00" class="a-link2 replytop" href="#fabu_anchor">回复</a> </div> </div> <div id="ds-quick-box" style="display:none;"></div> </div> <div class="atl-con-ft clearfix"> <div class="atl-con-ft-l"></div> <div class="atl-con-ft-r"></div> <div id="niuren_ifm"></div> </div> </div> </div>
print pa[1]
<div _host="%E6%80%A8%E9%AD%82%E9%AC%BC" class="atl-item" id="1" js_restime="2012-10-28 19:17:56" js_username="%E6%80%A8%E9%AD%82%E9%AC%BC" replyid="92725226"> <div class="atl-head" id="ea93038aa568ef2bf7a8cf6b6853b744"> <div class="atl-head-reply"></div> <div class="atl-info"> <span>作者:<a class="js-vip-check" href="http://www.tianya.cn/73157063" target="_blank" uid="73157063" uname="怨魂鬼">怨魂鬼</a> </span> <span>时间:2012-10-28 19:17:56</span> </div> </div> <div class="atl-content"> <div class="atl-con-hd clearfix"> <div class="atl-con-hd-l"></div> <div class="atl-con-hd-r"></div> </div> <div class="atl-con-bd clearfix"> <div class="bbs-content"> 图片分享<img original="http://img3.laibafile.cn/p/m/122161321.jpg" src="http://static.tianyaui.com/img/static/2011/imgloading.gif"/><br/><br/> </div> <div class="atl-reply"> 来自 <a _stat="/stat/bbs/post/来自" class="a-link" href="http://www.tianya.cn/mobile/" rel="nofollow" target="_blank">天涯社区(微论)客户端</a> | <a author="怨魂鬼" authorid="73157063" class="reportme a-link" href="javascript:void(0);" replyid="92725226" replytime="2012-10-28 19:17:56">举报</a> | <span>1楼</span> | <a class="a-link-2 ir-shang" floor="1" href="javascript:void(0);" title="打赏层主"> 打赏 </a> | <a class="a-link-2 reply" href="#fabu_anchor" title="引用回复">回复</a> | <a _stat="/stat/bbs/post/评论" class="a-link-2 ir-remark" floor="1" href="javascript:void(0);" title="插入评论"> 评论 </a> </div> </div> <div class="atl-con-ft clearfix"> <div class="atl-con-ft-l"></div> <div class="atl-con-ft-r"></div> </div> </div> </div>
print pa[89]
<div _host="%E5%B1%B1%E9%9B%A8%E6%AC%B2%E6%BB%A1%E6%A5%BC" class="atl-item" id="100" js_restime="2012-10-28 21:34:21" js_username="%E5%B1%B1%E9%9B%A8%E6%AC%B2%E6%BB%A1%E6%A5%BC" replyid="92725457"> <div class="atl-head" id="8cc4c81381b126cc08dd65759233d6de"> <div class="atl-head-reply"></div> <div class="atl-info"> <span>作者:<a class="js-vip-check" href="http://www.tianya.cn/74980506" target="_blank" uid="74980506" uname="山雨欲满楼">山雨欲满楼</a> </span> <span>时间:2012-10-28 21:34:21</span> </div> </div> <div class="atl-content"> <div class="atl-con-hd clearfix"> <div class="atl-con-hd-l"></div> <div class="atl-con-hd-r"></div> </div> <div class="atl-con-bd clearfix"> <div class="bbs-content"> 围观也是力量,得道多助失道寡助 </div> <div class="atl-reply"> <a author="山雨欲满楼" authorid="74980506" class="reportme a-link" href="javascript:void(0);" replyid="92725457" replytime="2012-10-28 21:34:21">举报</a> | <span>100楼</span> | <a class="a-link-2 ir-shang" floor="100" href="javascript:void(0);" title="打赏层主"> 打赏 </a> | <a class="a-link-2 reply" href="#fabu_anchor" title="引用回复">回复</a> | <a _stat="/stat/bbs/post/评论" class="a-link-2 ir-remark" floor="100" href="javascript:void(0);" title="插入评论"> 评论 </a> </div> </div> <div class="atl-con-ft clearfix"> <div class="atl-con-ft-l"></div> <div class="atl-con-ft-r"></div> </div> </div> </div>
Replies often quote an earlier post above a dashed divider, as in this example:
作者:柠檬在追逐 时间:2012-10-28 21:33:55
@lice5 2012-10-28 20:37:17
作为宁波人 还是说一句:革命尚未成功 同志仍需努力
-----------------------------
对 现在说成功还太乐观,就怕说一套做一套
作者:lice5 时间:2012-10-28 20:37:17
作为宁波人 还是说一句:革命尚未成功 同志仍需努力
4 /post-free-4242156-1.shtml 2014-04-09 15:55:35 61943225 野渡自渡人 @Y雷政府34楼2014-04-0422:30:34 野渡君雄文!支持是必须的。 ----------------------------- @清坪过客16楼2014-04-0804:09:48 绝对的权力导致绝对的腐败! ----------------------------- @T大漠鱼T35楼2014-04-0810:17:27 @周丕东@普欣@拾月霜寒2012@小摸包@姚文嚼字@四號@凌宸@乔志峰@野渡自渡人@曾兵2010@缠绕夜色@曾颖@风青扬请关注
print pa[0].find('div', {'class': 'bbs-content'}).text.strip()
从宁波市政府新闻发言人处获悉,宁波市经与项目投资方研究决定:(1)坚决不上PX项目;(2)炼化一体化项目前期工作停止推进,再作科学论证。
print pa[87].find('div', {'class': 'bbs-content'}).text.strip()
@lice5 2012-10-28 20:37:17 作为宁波人 还是说一句:革命尚未成功 同志仍需努力 ----------------------------- 对 现在说成功还太乐观,就怕说一套做一套
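As the two outputs show, a reply that quotes someone embeds the quoted text above a dashed divider. A hedged sketch for separating the quote from the reply's own words (the divider length is taken from the samples above and may vary):

import re
def split_quote(content):
    # split a reply into (quoted_text, own_text) on a run of dashes; quoted_text is None without a divider
    parts = re.split(r'-{10,}', content, maxsplit=1)
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return None, content.strip()

quoted, own = split_quote(pa[87].find('div', {'class': 'bbs-content'}).text)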
pa[1].a
<a class="js-vip-check" href="http://www.tianya.cn/73157063" target="_blank" uid="73157063" uname="\u6028\u9b42\u9b3c">\u6028\u9b42\u9b3c</a>
print pa[0].find('a', class_ = 'reportme a-link')
<a author="牧阳光" authorid="36535656" class="reportme a-link" href="javascript:void(0);" replyid="0" replytime="2012-10-28 19:11:00"> 举报</a>
print pa[0].find('a', class_ = 'reportme a-link')['replytime']
2012-10-28 19:11:00
print pa[0].find('a', class_ = 'reportme a-link')['author']
牧阳光
for i in pa[:10]:
    p_info = i.find('a', class_ = 'reportme a-link')
    p_time = p_info['replytime']
    p_author_id = p_info['authorid']
    p_author_name = p_info['author']
    p_content = i.find('div', {'class': 'bbs-content'}).text.strip()
    p_content = p_content.replace('\t', '')
    print p_time, '--->', p_author_id, '--->', p_author_name, '--->', p_content, '\n'
2012-10-28 19:11:00 ---> 36535656 ---> 牧阳光 ---> 从宁波市政府新闻发言人处获悉,宁波市经与项目投资方研究决定:(1)坚决不上PX项目;(2)炼化一体化项目前期工作停止推进,再作科学论证。 2012-10-28 19:17:56 ---> 73157063 ---> 怨魂鬼 ---> 图片分享 2012-10-28 19:18:17 ---> 73157063 ---> 怨魂鬼 ---> @怨魂鬼 2012-10-28 19:17:56 图片分享 [发自掌中天涯客户端 ] ----------------------------- 2楼我的天下! 2012-10-28 19:18:46 ---> 36535656 ---> 牧阳光 ---> 。。。沙发板凳这么快就被坐了~~ 2012-10-28 19:19:04 ---> 41774471 ---> zgh0213 ---> 元芳你怎么看 2012-10-28 19:19:37 ---> 73157063 ---> 怨魂鬼 ---> @牧阳光 2012-10-28 19:18:46 。。。沙发板凳这么快就被坐了~~ ----------------------------- 运气、 2012-10-28 19:20:04 ---> 36535656 ---> 牧阳光 ---> @怨魂鬼 5楼 运气、 ----------------------------- 哈哈。。。 2012-10-28 19:20:07 ---> 54060837 ---> 帆小叶 ---> 八卦的被和谐了。帖个链接http://api.pwnz.org/0/?url=bG10aC4 wOTIyNzQvNzIvMDEvMjEvc3dlbi9tb2MuYW5 paGN0ZXJjZXMud3d3Ly9BMyVwdHRo 2012-10-28 19:20:33 ---> 36535656 ---> 牧阳光 ---> @怨魂鬼 2楼 2楼我的天下! ----------------------------- 。。。还是掌中天涯,NB的~~ 2012-10-28 19:25:22 ---> 36535656 ---> 牧阳光 ---> 消息来源,官方微博@宁波发布
http://bbs.tianya.cn/post-free-2848797-1.shtml
http://bbs.tianya.cn/post-free-2848797-2.shtml
http://bbs.tianya.cn/post-free-2848797-3.shtml
post_soup.find('div', {'class': 'atl-pages'}) # the pagination widget; the page count sits in form['onsubmit']
<div class="atl-pages"><form action="" method="get" onsubmit="return goPage(this,'free',2848797,7);">\n<span>\u4e0a\u9875</span>\n<strong>1</strong>\n<a href="/post-free-2848797-2.shtml">2</a>\n<a href="/post-free-2848797-3.shtml">3</a>\n<a href="/post-free-2848797-4.shtml">4</a>\n\u2026\n<a href="/post-free-2848797-7.shtml">7</a>\n<a class="js-keyboard-next" href="/post-free-2848797-2.shtml">\u4e0b\u9875</a>\n\xa0\u5230<input class="pagetxt" name="page" type="text"/>\u9875\xa0<input class="pagebtn" maxlength="6" name="gopage" type="submit" value="\u786e\u5b9a"/></form></div>
post_pages = post_soup.find('div', {'class': 'atl-pages'})
post_pages = post_pages.form['onsubmit'].split(',')[-1].split(')')[0]
post_pages
#post_soup.select('.atl-pages')[0].select('form')[0].select('onsubmit')
'7'
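Splitting on ',' and ')' works, but it is brittle against formatting changes; a hedged alternative is to pull the page count out of the goPage(...) call with a regular expression:

import re
onsubmit = post_soup.find('div', {'class': 'atl-pages'}).form['onsubmit']
m = re.search(r"goPage\(this,'\w+',\d+,(\d+)\)", onsubmit)
page_count = int(m.group(1)) if m else 1  # -> 7 for this thread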
url = 'http://bbs.tianya.cn' + df.link[2]
url_base = '-'.join(url.split('-')[:-1]) + '-%d.shtml'
url_base
'http://bbs.tianya.cn/post-free-2848797-%d.shtml'
def parsePage(pa):
    records = []
    for i in pa:
        p_info = i.find('a', class_ = 'reportme a-link')
        p_time = p_info['replytime']
        p_author_id = p_info['authorid']
        p_author_name = p_info['author']
        p_content = i.find('div', {'class': 'bbs-content'}).text.strip()
        p_content = p_content.replace('\t', '').replace('\n', '') #.replace(' ', '')
        record = p_time + '\t' + p_author_id + '\t' + p_author_name + '\t' + p_content
        records.append(record)
    return records
import sys
def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
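flushPrint uses the carriage return to overwrite the same console line, so progress updates do not flood the notebook output. A quick usage sketch:

import time
for i in range(5):
    flushPrint('crawling page %d' % i)  # each call overwrites the previous message
    time.sleep(0.2)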
url_1 = 'http://bbs.tianya.cn' + df.link[10]
content = urllib2.urlopen(url_1).read() # fetch the raw HTML
post_soup = BeautifulSoup(content, "lxml")
pa = post_soup.find_all('div', {'class': 'atl-item'})
b = post_soup.find('div', class_= 'atl-pages')
b
<div class="atl-pages host-pages"></div>
url_1 = 'http://bbs.tianya.cn' + df.link[0]
content = urllib2.urlopen(url_1).read() # fetch the raw HTML
post_soup = BeautifulSoup(content, "lxml")
pa = post_soup.find_all('div', {'class': 'atl-item'})
a = post_soup.find('div', {'class': 'atl-pages'})
a
<div class="atl-pages"><form action="" method="get" onsubmit="return goPage(this,'free',2849477,28);">\n<span>\u4e0a\u9875</span>\n<strong>1</strong>\n<a href="/post-free-2849477-2.shtml">2</a>\n<a href="/post-free-2849477-3.shtml">3</a>\n<a href="/post-free-2849477-4.shtml">4</a>\n\u2026\n<a href="/post-free-2849477-28.shtml">28</a>\n<a class="js-keyboard-next" href="/post-free-2849477-2.shtml">\u4e0b\u9875</a>\n\xa0\u5230<input class="pagetxt" name="page" type="text"/>\u9875\xa0<input class="pagebtn" maxlength="6" name="gopage" type="submit" value="\u786e\u5b9a"/></form></div>
a.form
<form action="" method="get" onsubmit="return goPage(this,'free',2849477,28);">\n<span>\u4e0a\u9875</span>\n<strong>1</strong>\n<a href="/post-free-2849477-2.shtml">2</a>\n<a href="/post-free-2849477-3.shtml">3</a>\n<a href="/post-free-2849477-4.shtml">4</a>\n\u2026\n<a href="/post-free-2849477-28.shtml">28</a>\n<a class="js-keyboard-next" href="/post-free-2849477-2.shtml">\u4e0b\u9875</a>\n\xa0\u5230<input class="pagetxt" name="page" type="text"/>\u9875\xa0<input class="pagebtn" maxlength="6" name="gopage" type="submit" value="\u786e\u5b9a"/></form>
if b.form:
    print 'true'
else:
    print 'false'
false
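So a multi-page thread carries its page count in the form's onsubmit, while a single-page thread has an empty atl-pages div with no form at all. A small helper sketch combining both cases:

def get_page_count(post_soup):
    # single-page threads have an empty atl-pages div (no form inside)
    pages_div = post_soup.find('div', {'class': 'atl-pages'})
    if pages_div and pages_div.form:
        return int(pages_div.form['onsubmit'].split(',')[-1].split(')')[0])
    return 1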
import random
import time

def crawler(url, file_name):
    try:
        # open the first page of the post
        url_1 = 'http://bbs.tianya.cn' + url
        content = urllib2.urlopen(url_1).read() # fetch the raw HTML
        post_soup = BeautifulSoup(content, "lxml")
        # how many pages are in this post?
        post_form = post_soup.find('div', {'class': 'atl-pages'})
        if post_form.form:
            post_pages = post_form.form['onsubmit'].split(',')[-1].split(')')[0]
            post_pages = int(post_pages)
            url_base = '-'.join(url_1.split('-')[:-1]) + '-%d.shtml'
        else:
            post_pages = 1
        # the first page
        pa = post_soup.find_all('div', {'class': 'atl-item'})
        records = parsePage(pa)
        with open(file_name, 'a') as p: # note: append mode, run only once!
            for record in records:
                p.write('1' + '\t' + url + '\t' + record.encode('utf-8') + "\n")
        # the 2nd and later pages
        if post_pages > 1:
            for page_num in range(2, post_pages + 1):
                time.sleep(random.random())
                flushPrint(page_num)
                url2 = url_base % page_num
                content = urllib2.urlopen(url2).read() # fetch the raw HTML
                post_soup = BeautifulSoup(content, "lxml")
                pa = post_soup.find_all('div', {'class': 'atl-item'})
                records = parsePage(pa)
                with open(file_name, 'a') as p: # note: append mode, run only once!
                    for record in records:
                        p.write(str(page_num) + '\t' + url + '\t' + record.encode('utf-8') + "\n")
        else:
            pass
    except Exception, e:
        print e
        pass
url = df.link[2]
file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_2test.txt'
crawler(url, file_name)
7
for k, link in enumerate(df.link):
    flushPrint(link)
    if k % 10 == 0:
        print 'This is the post of: ' + str(k)
    file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt'
    crawler(link, file_name)
/post-free-2849477-1.shtmlThis is the post of: 0 /post-free-2842180-1.shtmlThis is the post of: 10 /post-free-3316698-1.shtmlThis is the post of: 20 /post-free-923387-1.shtmlThis is the post of: 30 /post-free-4236026-1.shtmlThis is the post of: 40 /post-free-2850721-1.shtmlThis is the post of: 50 /post-free-5054821-1.shtmlThis is the post of: 60 /post-free-3326274-1.shtmlThis is the post of: 70 /post-free-4236793-1.shtmlThis is the post of: 80 /post-free-4239792-1.shtmlThis is the post of: 90 /post-free-5042110-1.shtmlThis is the post of: 100 /post-free-2241144-1.shtmlThis is the post of: 110 /post-free-3324561-1.shtmlThis is the post of: 120 /post-free-3835452-1.shtmlThis is the post of: 130 /post-free-5045950-1.shtmlThis is the post of: 140 /post-free-2848818-1.shtmlThis is the post of: 150 /post-free-3281916-1.shtmlThis is the post of: 160 /post-free-949151-1.shtmlThis is the post of: 170 /post-free-2848839-1.shtmlThis is the post of: 180 /post-free-3228423-1.shtmlThis is the post of: 190 /post-free-2852970-1.shtmlThis is the post of: 200 /post-free-3325388-1.shtmlThis is the post of: 210 /post-free-3835748-1.shtmlThis is the post of: 220 /post-free-3833431-1.shtmlThis is the post of: 230 /post-free-3378998-1.shtmlThis is the post of: 240 /post-free-3359022-1.shtmlThis is the post of: 250 /post-free-3365791-1.shtmlThis is the post of: 260 /post-free-3396378-1.shtmlThis is the post of: 270 /post-free-3835212-1.shtmlThis is the post of: 280 /post-free-4248593-1.shtmlThis is the post of: 290 /post-free-3833373-1.shtmlThis is the post of: 300 /post-free-3847600-1.shtmlThis is the post of: 310 /post-free-3832970-1.shtmlThis is the post of: 320 /post-free-4076130-1.shtmlThis is the post of: 330 /post-free-3835673-1.shtmlThis is the post of: 340 /post-free-3835434-1.shtmlThis is the post of: 350 /post-free-3368554-1.shtmlThis is the post of: 360 /post-free-3832938-1.shtmlThis is the post of: 370 /post-free-3835075-1.shtmlThis is the post of: 380 /post-free-3832963-1.shtmlThis is the post of: 390 /post-free-4250604-1.shtmlThis is the post of: 400 /post-free-3834828-1.shtmlThis is the post of: 410 /post-free-3835007-1.shtmlThis is the post of: 420 /post-free-3838253-1.shtmlThis is the post of: 430 /post-free-3835167-1.shtmlThis is the post of: 440 /post-free-3835898-1.shtmlThis is the post of: 450 /post-free-3835123-1.shtmlThis is the post of: 460 /post-free-3835031-1.shtml
dtt = []
with open('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt', 'r') as f:
    for line in f:
        # name the field reply_time so it does not shadow the time module imported earlier
        pnum, link, reply_time, author_id, author, content = line.replace('\n', '').split('\t')
        dtt.append([pnum, link, reply_time, author_id, author, content])
len(dtt)
8079
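An equivalent, hedged shortcut: since the file is plain tab-separated text, pandas can parse it directly instead of the manual loop above (column names chosen to match the rename applied below; this assumes no field contains stray quote characters):

dt = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt',
                 sep='\t', header=None,
                 names=['page_num', 'link', 'time', 'author', 'author_name', 'reply'])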
dt = pd.DataFrame(dtt)
dt[:5]
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 1 | /post-free-2849477-1.shtml | 2012-10-29 07:59:00 | 50499450 | 贾也 | 先生是一位真爷们!第161期导语:人人宁波,面朝大海,春暖花开! 宁波的事,怎谈?无从谈,... |
1 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:13:54 | 22122799 | 三平67 | 我们中国人都在一条船,颠簸已久,我们都想做宁波人,希望有一个风平浪静的港湾,面朝大海,春暖花... |
2 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:27:02 | 39027950 | 赶浪头 | 默默围观~ |
3 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:43:15 | 53986501 | m408833176 | 不能收藏? |
4 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:55:52 | 39073643 | 兰质薰心 | 楼主好文! 相信政府一定有能力解决好这些问题. |
dt=dt.rename(columns = {0:'page_num', 1:'link', 2:'time', 3:'author',4:'author_name', 5:'reply'})
dt[:5]
page_num | link | time | author | author_name | reply | |
---|---|---|---|---|---|---|
0 | 1 | /post-free-2849477-1.shtml | 2012-10-29 07:59:00 | 50499450 | 贾也 | 先生是一位真爷们!第161期导语:人人宁波,面朝大海,春暖花开! 宁波的事,怎谈?无从谈,... |
1 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:13:54 | 22122799 | 三平67 | 我们中国人都在一条船,颠簸已久,我们都想做宁波人,希望有一个风平浪静的港湾,面朝大海,春暖花... |
2 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:27:02 | 39027950 | 赶浪头 | 默默围观~ |
3 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:43:15 | 53986501 | m408833176 | 不能收藏? |
4 | 1 | /post-free-2849477-1.shtml | 2012-10-29 08:55:52 | 39073643 | 兰质薰心 | 楼主好文! 相信政府一定有能力解决好这些问题. |
dt.reply[:100]
0 先生是一位真爷们!第161期导语:人人宁波,面朝大海,春暖花开! 宁波的事,怎谈?无从谈,... 1 我们中国人都在一条船,颠簸已久,我们都想做宁波人,希望有一个风平浪静的港湾,面朝大海,春暖花... 2 默默围观~ 3 不能收藏? 4 楼主好文! 相信政府一定有能力解决好这些问题. 5 人民在觉醒。 6 理性的文字,向楼主致敬! 7 呼唤变革,人民需要的是服务型政府! 8 顶贾兄!让我们携手努力保卫家园! 9 围观就是力量,顶起就有希望. 10 文章写得太有力量了,支持你! 11 @贾也 2012-10-29 7:59:00 导语:人人宁波,面朝大海,春暖花开 ...... 12 中国人从文盲走向民粹,实在是太快了。 13 杀死娘胎里的,毒死已出生的,这个社会怎么了? 14 3 15 环境比什么都可贵,每一次呼吸,每一顿粮食,都息息相关,若任其恶化,而无从改观,那遑谈国家之未... 16 写的很好 17 未来这里将是全球最大的垃圾场,而他们早已放浪西方。苟活的将面临数不清的癌症,无助的死亡。悲哀... 18 媒体失声,高压维稳,只保留微博和论坛可以说这件事!因为那些人知道,网上的人和事就只能热几天,... 19 说的太好了,看的我泪流满面! 20 “我相信官场中,许多官员应该葆有社会正能量” 通篇好文,顶!唯这句,不说也罢.... 21 先占一环,然后看帖 22 说的太好了 23 我上的小学,隔壁就是一家水泥厂,到处飞扬的水泥灰是我最熟悉的颜色;坐一站地车,就是一家造纸厂... 24 我们中国人都在一条船,颠簸已久,我们都想做宁波人,希望有一个风平浪静的港湾,面朝大海,春暖花开! 25 前排占座~~ 26 贾也先生是一位真爷们! 27 28 为什么我的眼里常含着泪水?因为我对这片土地爱得深沉! 29 又是因为环保的群体事件,影响面大,危害严重,理由叫的响,取得阶段性胜利。 那些拆迁的、城管... ... 70 这是我近几年看过的写的最好的文章,!!!不多说了,危险 71 @shdsb 2012-10-29 10:17:43 媒体失声,高压维稳,只保留微博和论坛... 72 @pals2009 48楼 每次看到这样的消息,都很痛心,很堵很堵。为什么在经济发展的同... 73 成都啊成都 74 是不得人心呀 75 真爷们。。顶 我是宁。波人,楼主说的是我们的心声。。。 现在看到人民警察,不是有安全感,... 76 作者:弱水三千chen 回复日期:2012-10-29 11:43:18 回复 @兰... 77 泣血之作! 谢。 78 作者:文蛮子 时间:2012-10-29 11:42:58 摆明了,带路党们难以煽动民众... 79 字字真切! 80 @曾开贵 2012-10-29 11:40:09 没有ZF,哪来新ZG,没有新ZG,你们吃... 81 好文,顶一下,为了我的故乡 82 0 83 好文章,顶一个! 84 作者:文蛮子 时间:2012-10-29 11:42:58 摆明了,带路党们难以煽动民众... 85 一定要顶。在被和谐前让多点人知道吧 86 围观也是一种力量 天涯,也是好样的 87 很沉重 88 生不逢国 89 很好的文章。 90 我们中国人都在一条船,颠簸已久,我们都想做宁波人,希望有一个风平浪静的港湾,面朝大海,春暖花开! 91 路过 92 民间语文,狗屁有点通。 似是而非,点风扇火,实是不该。 排污排毒,环境污染,确实严重。 ... 93 @横冲节度使 2012-10-29 12:11:50 楼主这种汉奸、带路党,成天就做梦盼着... 94 @赶浪头 2楼 默默围观~ ---------------------------... 95 午休时间静静看完了,心中莫名地压抑。 96 好文!必须顶起! 97 扎口了。 98 谢谢分享 楼主辛苦了 99 看不到我的回复哦 Name: reply, dtype: object
The search page http://search.tianya.cn/bbs?q=PX reports 18459 matching posts in total.
18459/50
369
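Note that 18459/50 is Python 2 integer division, so it floors to 369; covering every result actually needs 370 pages. A ceiling-division one-liner:

n_pages = -(-18459 // 50)  # ceiling division without floats
print n_pages              # 370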
In practice the listing stops at page 10 (http://bbs.tianya.cn/list.jsp?item=free&order=1&nextid=9&k=PX): it covers only the Tianya main forum, while there are other sections as well, such as Tianya Focus (http://focus.tianya.cn/) and so on.