#!/usr/bin/env python
# coding: utf-8
# ***
# # 数据抓取:
#
# > # 使用Python编写网络爬虫
# ***
#
# 王成军
#
# wangchengjun@nju.edu.cn
#
# 计算传播网 http://computational-communication.com
# # 需要解决的问题
#
# - 页面解析
# - 获取Javascript隐藏源数据
# - 自动翻页
# - 自动登录
# - 连接API接口
#
# In[1]:
import requests
from bs4 import BeautifulSoup
# - 一般的数据抓取,使用urllib2和beautifulsoup配合就可以了。
# - 尤其是对于翻页时url出现规则变化的网页,只需要处理规则化的url就可以了。
# - 以简单的例子是抓取天涯论坛上关于某一个关键词的帖子。
# - 在天涯论坛,关于雾霾的帖子的第一页是:
# http://bbs.tianya.cn/list.jsp?item=free&nextid=0&order=8&k=雾霾
# - 第二页是:
# http://bbs.tianya.cn/list.jsp?item=free&nextid=1&order=8&k=雾霾
#
# ***
# # 数据抓取:
# > # 抓取天涯回帖网络
# ***
#
# 王成军
#
# wangchengjun@nju.edu.cn
#
# 计算传播网 http://computational-communication.com
# In[2]:
from IPython.display import display_html, HTML
HTML('')
# the webpage we would like to crawl
# In[3]:
page_num = 0
url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX"% page_num
content = requests.get(url).text
soup = BeautifulSoup(content, "lxml")
articles = soup.find_all('tr')
#td[0].find('a', {'class', ""})
# In[4]:
articles[0]
# In[6]:
articles[1]
# In[7]:
len(articles[1:])
# ![](./img/inspect.png)
# http://bbs.tianya.cn/list.jsp?item=free&nextid=0&order=8&k=PX
#
# # 通过分析帖子列表的源代码,使用inspect方法,会发现所有要解析的内容都在‘td’这个层级下
#
# In[10]:
for t in articles[1].find_all('td'):
print(t)
# In[11]:
td = articles[1].find_all('td')
# In[27]:
td
# In[12]:
td[0]
# In[13]:
td[0].text
# In[14]:
td[0].text.strip()
# In[15]:
td[0].a['href']
# In[21]:
td[1]
# In[23]:
td[1].find('a', {'class', "author"})['href']
# In[28]:
td[2]
# In[29]:
td[3]
# In[30]:
td[4]
# In[32]:
records = []
for k, i in enumerate(articles[1:]):
td = i.find_all('td')
title = td[0].text.strip()
try:
title_url = td[0].find('a', {'class', "author"})['href']
except:
title_url = td[0].a['href']
author = td[1].text
author_url = td[1].a['href']
views = td[2].text
replies = td[3].text
date = td[4]['title']
record = '\t'.join([title, title_url, author, author_url, views, replies, date])
# record = title + '\t' + title_url+ '\t' + author +
# '\t'+ author_url + '\t' + views+ '\t' + replies+ '\t'+ date
records.append(record)
# In[34]:
records[:3]
# In[35]:
print(records[2])
# # 抓取天涯论坛PX帖子列表
#
# 回帖网络(Thread network)的结构
# - 列表
# - 主帖
# - 回帖
# In[45]:
articles[2]
# In[36]:
def crawler(page_num, file_name):
try:
# open the browser
url = "http://bbs.tianya.cn/list.jsp?item=free&nextid=%d&order=8&k=PX" % page_num
content = requests.get(url).text #获取网页的html文本
soup = BeautifulSoup(content, "lxml")
articles = soup.find_all('tr')
# write down info
for i in articles[1:]:
td = i.find_all('td')
title = td[0].text.strip()
title_url = td[0].a['href']
author = td[1].text
author_url = td[1].a['href']
views = td[2].text
replies = td[3].text
date = td[4]['title']
record = title + '\t' + title_url+ '\t' + author + '\t'+ \
author_url + '\t' + views+ '\t' + replies+ '\t'+ date
with open(file_name,'a') as p: # '''Note''':Append mode, run only once!
p.write(record+"\n") ##!!encode here to utf-8 to avoid encoding
except Exception as e:
print(e)
pass
# In[37]:
# crawl all pages
for page_num in range(10):
print(page_num)
crawler(page_num, '../data/tianya_bbs_threads_list2018.txt')
# In[38]:
import pandas as pd
df = pd.read_csv('../data/tianya_bbs_threads_list2018.txt',
sep = "\t", names = ['title', 'link', 'author', \
'author_page', 'click', 'reply', 'time'])
df[:3]
# In[39]:
len(df)
# In[40]:
len(df.link)
# # 抓取作者信息
# In[41]:
df.author_page[:5]
# http://www.tianya.cn/62237033
#
# http://www.tianya.cn/67896263
#
# In[42]:
# user_info
# In[43]:
url = df.author_page[10]
content = requests.get(url).text #获取网页的html文本
soup = BeautifulSoup(content, "lxml")
# In[44]:
soup
# In[49]:
print(url)
# In[47]:
# In[45]:
user_info = soup.find('div', {'class', 'userinfo'})('p')
score, reg_time = [i.get_text()[4:] for i in user_info]
print(score, reg_time )
link_info = soup.find_all('div', {'class', 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print(followed_num, fans_num)
# In[51]:
#activity = soup.find_all('span', {'class', 'subtitle'})
#post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
#print(post_num, reply_num)
#activity
# In[53]:
#activity[0]
# In[54]:
link_info = soup.find_all('div', {'class', 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
print(followed_num, fans_num)
# In[55]:
link_info[0].a.text
# In[56]:
# user_info = soup.find('div', {'class', 'userinfo'})('p')
# user_infos = [i.get_text()[4:] for i in user_info]
def author_crawler(url, file_name):
try:
content = requests.get(url).text #获取网页的html文本
soup = BeautifulSoup(content, "lxml")
link_info = soup.find_all('div', {'class', 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
try:
activity = soup.find_all('span', {'class', 'subtitle'})
post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
except:
post_num, reply_num = '1', '0'
record = '\t'.join([url, followed_num, fans_num, post_num, reply_num])
with open(file_name,'a') as p: # '''Note''':Append mode, run only once!
p.write(record+"\n") ##!!encode here to utf-8 to avoid encoding
except Exception as e:
print(e, url)
record = '\t'.join([url, 'na', 'na', 'na', 'na'])
with open(file_name,'a') as p: # '''Note''':Append mode, run only once!
p.write(record+"\n") ##!!encode here to utf-8 to avoid encoding
pass
# In[58]:
#soup
# In[137]:
url = df.author_page[10]
content = requests.get(url).text #获取网页的html文本
soup = BeautifulSoup(content, "lxml")
link_info = soup.find_all('div', {'class', 'link-box'})
followed_num, fans_num = [i.a.text for i in link_info]
try:
activity = soup.find_all('span', {'class', 'subtitle'})
post_num, reply_num = [j.text[2:] for i in activity[:1] for j in i('a')]
except:
post_num, reply_num = '1', '0'
record = '\t'.join([url, followed_num, fans_num, post_num, reply_num])
# In[74]:
import sys
def flushPrint(s):
sys.stdout.write('\r')
sys.stdout.write('%s' % s)
sys.stdout.flush()
# In[61]:
import time, random
for k, url in enumerate(df.author_page[:15]):
time.sleep(random.random()) # 天涯存在反抓取机制,需要降低抓取速度!
#flushPrint(k)
author_crawler(url, '../data/tianya_bbs_threads_author_info2018.txt')
# http://www.tianya.cn/50499450/follow
#
# 还可抓取他们的关注列表和粉丝列表
# ***
# ***
# # 数据抓取:
# > # 使用Python抓取回帖
# ***
# ***
#
# 王成军
#
# wangchengjun@nju.edu.cn
#
# 计算传播网 http://computational-communication.com
# In[62]:
df.link[2]
# In[64]:
url = 'http://bbs.tianya.cn' + df.link[2]
print(url)
# In[65]:
from IPython.display import display_html, HTML
HTML('')
# the webpage we would like to crawl
# In[66]:
post = requests.get(url).text #获取网页的html文本
post_soup = BeautifulSoup(post, "lxml")
#articles = soup.find_all('tr')
# In[67]:
print (post_soup.prettify())[:5000]
# In[68]:
pa = post_soup.find_all('div', {'class', 'atl-item'})
len(pa)
# In[69]:
print(pa[0])
# In[70]:
pa[1]
# In[71]:
pa[70]
# 作者:柠檬在追逐 时间:2012-10-28 21:33:55
#
# @lice5 2012-10-28 20:37:17
#
# 作为宁波人 还是说一句:革命尚未成功 同志仍需努力
#
# -----------------------------
#
# 对 现在说成功还太乐观,就怕说一套做一套
# 作者:lice5 时间:2012-10-28 20:37:17
#
# 作为宁波人 还是说一句:革命尚未成功 同志仍需努力
# 4 /post-free-4242156-1.shtml 2014-04-09 15:55:35 61943225 野渡自渡人 @Y雷政府34楼2014-04-0422:30:34 野渡君雄文!支持是必须的。 ----------------------------- @清坪过客16楼2014-04-0804:09:48 绝对的权力导致绝对的腐败! ----------------------------- @T大漠鱼T35楼2014-04-0810:17:27 @周丕东@普欣@拾月霜寒2012@小摸包@姚文嚼字@四號@凌宸@乔志峰@野渡自渡人@曾兵2010@缠绕夜色@曾颖@风青扬请关注
# In[233]:
pa[0].find('div', {'class', 'bbs-content'}).text.strip()
# In[76]:
print(pa[0].text.strip())#.find('div', {'class', 'marg'}).text.strip()
# In[77]:
pa[67].find('div', {'class', 'bbs-content'}).text.strip()
# In[78]:
pa[1].a
# In[79]:
pa[0].find('a', class_ = 'reportme a-link')
# In[80]:
pa[0].find('a', class_ = 'reportme a-link')['replytime']
# In[81]:
pa[0].find('a', class_ = 'reportme a-link')['author']
# In[82]:
for i in pa[:10]:
p_info = i.find('a', class_ = 'reportme a-link')
p_time = p_info['replytime']
p_author_id = p_info['authorid']
p_author_name = p_info['author']
p_content = i.find('div', {'class', 'bbs-content'}).text.strip()
p_content = p_content.replace('\t', '')
print(p_time, '--->', p_author_id, '--->', p_author_name,'--->', p_content, '\n')
# # 如何翻页
#
# http://bbs.tianya.cn/post-free-2848797-1.shtml
#
# http://bbs.tianya.cn/post-free-2848797-2.shtml
#
# http://bbs.tianya.cn/post-free-2848797-3.shtml
# In[83]:
post_soup.find('div', {'class', 'atl-pages'})#.['onsubmit']
# In[84]:
post_pages = post_soup.find('div', {'class', 'atl-pages'})
post_pages = post_pages.form['onsubmit'].split(',')[-1].split(')')[0]
post_pages
#post_soup.select('.atl-pages')[0].select('form')[0].select('onsubmit')
# In[85]:
url = 'http://bbs.tianya.cn' + df.link[2]
url_base = ''.join(url.split('-')[:-1]) + '-%d.shtml'
url_base
# In[86]:
def parsePage(pa):
records = []
for i in pa:
p_info = i.find('a', class_ = 'reportme a-link')
p_time = p_info['replytime']
p_author_id = p_info['authorid']
p_author_name = p_info['author']
p_content = i.find('div', {'class', 'bbs-content'}).text.strip()
p_content = p_content.replace('\t', '').replace('\n', '')#.replace(' ', '')
record = p_time + '\t' + p_author_id+ '\t' + p_author_name + '\t'+ p_content
records.append(record)
return records
import sys
def flushPrint(s):
sys.stdout.write('\r')
sys.stdout.write('%s' % s)
sys.stdout.flush()
# In[87]:
url_1 = 'http://bbs.tianya.cn' + df.link[10]
content = requests.get(url_1).text #获取网页的html文本
post_soup = BeautifulSoup(content, "lxml")
pa = post_soup.find_all('div', {'class', 'atl-item'})
b = post_soup.find('div', class_= 'atl-pages')
b
# In[88]:
url_0 = 'http://bbs.tianya.cn' + df.link[2]
content = requests.get(url_0).text #获取网页的html文本
post_soup = BeautifulSoup(content, "lxml")
pa = post_soup.find_all('div', {'class', 'atl-item'})
a = post_soup.find('div', class_= 'atl-pages')
a
# In[89]:
a.form
# In[90]:
if b.form:
print('true')
else:
print('false')
# In[91]:
import random
import time
def crawler(url, file_name):
try:
# open the browser
url_1 = 'http://bbs.tianya.cn' + url
content = requests.get(url_0).text #获取网页的html文本
post_soup = BeautifulSoup(content, "lxml")
# how many pages in a post
post_form = post_soup.find('div', {'class', 'atl-pages'})
if post_form.form:
post_pages = post_form.form['onsubmit'].split(',')[-1].split(')')[0]
post_pages = int(post_pages)
url_base = '-'.join(url_1.split('-')[:-1]) + '-%d.shtml'
else:
post_pages = 1
# for the first page
pa = post_soup.find_all('div', {'class', 'atl-item'})
records = parsePage(pa)
with open(file_name,'a') as p: # '''Note''':Append mode, run only once!
for record in records:
p.write('1'+ '\t' + url + '\t' + record+"\n")
# for the 2nd+ pages
if post_pages > 1:
for page_num in range(2, post_pages+1):
time.sleep(random.random())
flushPrint(page_num)
url2 =url_base % page_num
content = requests.get(url2).text #获取网页的html文本
post_soup = BeautifulSoup(content, "lxml")
pa = post_soup.find_all('div', {'class', 'atl-item'})
records = parsePage(pa)
with open(file_name,'a') as p: # '''Note''':Append mode, run only once!
for record in records:
p.write(str(page_num) + '\t' +url + '\t' + record+"\n")
else:
pass
except Exception as e:
print(e)
pass
# # 测试
# In[92]:
url = df.link[2]
file_name = '../data/tianya_bbs_threads_2018test.txt'
crawler(url, file_name)
# # 正式抓取!
# In[94]:
for k, link in enumerate(df.link):
flushPrint(link)
if k % 10== 0:
print('This it the post of : ' + str(k))
file_name = '../data/tianya_bbs_threads_network_2018.txt'
crawler(link, file_name)
# # 读取数据
# In[95]:
dtt = []
with open('../data/tianya_bbs_threads_network_2018.txt', 'r') as f:
for line in f:
pnum, link, time, author_id, author, content = line.replace('\n', '').split('\t')
dtt.append([pnum, link, time, author_id, author, content])
len(dtt)
# In[96]:
dt = pd.DataFrame(dtt)
dt[:5]
# In[97]:
dt=dt.rename(columns = {0:'page_num', 1:'link', 2:'time', 3:'author',4:'author_name', 5:'reply'})
dt[:5]
# In[98]:
dt.reply[:100]
# ## 总帖数是多少?
# http://search.tianya.cn/bbs?q=PX 共有18459 条内容
# In[93]:
18459/50
# 实际上到第10页就没有了 http://bbs.tianya.cn/list.jsp?item=free&order=1&nextid=9&k=PX, 原来那只是天涯论坛,还有其它各种版块,如天涯聚焦: http://focus.tianya.cn/ 等等。
#
# - 娱乐八卦 512
# - 股市论坛 187
# - 情感天地 242
# - 天涯杂谈 1768
# 在天涯杂谈搜索雾霾,有41页 http://bbs.tianya.cn/list.jsp?item=free&order=20&nextid=40&k=%E9%9B%BE%E9%9C%BE
# # 天涯SDK
# http://open.tianya.cn/wiki/index.php?title=SDK%E4%B8%8B%E8%BD%BD
# In[ ]: