#!/usr/bin/env python
# coding: utf-8

# ***
# # Data cleaning: Twitter data
# ***
# Wang Chengjun — wangchengjun@nju.edu.cn
# Computational Communication: http://computational-communication.com
#
# ## Data cleaning
# Data cleaning is a key step of data analysis: it turns messy raw data into
# data that can be analysed directly, usually shaped as a data frame.
# This chapter uses tweet text cleaning as the running example:
# - repairing wrongly broken lines
# - splitting columns correctly
# - extracting the content to analyse
# - pre-processing large data line-by-line or chunk-by-chunk

# # 1. Draw a sample of the raw tweets for experiments
# (students may skip this section)

# In[23]:
# NOTE(review): paths below are machine-specific; adjust before running.
# Fix: the original left `bigfile` open forever — use a context manager.
with open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-raw.txt', 'r') as bigfile:
    chunkSize = 1000000
    # readlines(hint) reads whole lines until ~hint bytes have been consumed.
    chunk = bigfile.readlines(chunkSize)
print(len(chunk))
with open("/Users/chengjun/GitHub/cjc/data/ows_tweets_sample.txt", 'w') as f:
    for i in chunk:
        f.write(i)

# # Lazy method for reading a big file in Python
# In[6]:
# https://stackoverflow.com/questions/519633/lazy-method-for-reading-big-file-in-python?lq=1
import csv
with open('/Users/datalab/bigdata/cjc/ows-raw.txt', 'r') as bigfile:  # fix: close the handle
    chunkSize = 10**8
    chunk = bigfile.readlines(chunkSize)
    num, num_lines = 0, 0
    while chunk:
        # Strip NUL bytes before csv parsing; csv.reader chokes on '\x00'.
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        # do sth.
        num_lines += len(list(lines))
        print(num, num_lines)
        num += 1
        chunk = bigfile.readlines(chunkSize)  # read another chunk

# # Byte
# The byte is the common unit of storage capacity; one byte is eight bits.
# - 1B (byte) = 8 bit
# - 1KB = 1000B; 1MB = 1000KB = 1000×1000B, where 1000 = 10^3
# - 1KB (kilobyte) = 10^3 B
# - 1MB (megabyte) = 10^6 B
# - 1GB (gigabyte) = 10^9 B

# ## Processing hundreds of millions of rows with pandas' get_chunk
# > Only beyond roughly 5TB of data does Hadoop become a reasonable choice.

# In[ ]:
import pandas as pd

with open('../bigdata/OWS/ows-raw.txt', encoding='utf-8') as f:  # fix: close the handle
    # error_bad_lines=False skips malformed rows.
    # NOTE(review): this keyword was removed in pandas >= 2.0 (use on_bad_lines).
    reader = pd.read_table(f, sep=',', iterator=True, error_bad_lines=False)
    loop = True
    chunkSize = 100000
    data = []
    while loop:
        try:
            chunk = reader.get_chunk(chunkSize)
            # NOTE(review): data_cleaning_funtion is not defined in this file —
            # it is a placeholder for a per-chunk cleaning step.
            dat = data_cleaning_funtion(chunk)  # do sth.
            data.append(dat)
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
df = pd.concat(data, ignore_index=True)
# # 2. Repairing wrongly broken lines

# In[7]:
with open("../data/ows_tweets_sample.txt", 'r') as f:
    lines = f.readlines()

# In[8]:
len(lines)  # total number of lines

# In[9]:
lines[15]  # inspect one raw line

# In[40]:
help(lines[1].split)

# # Problem: the first line holds the variable names.
# > ## 1. How do we strip the newline character?
# > ## 2. How do we extract every variable name?

# In[10]:
varNames = lines[0].replace('\n', '').split(',')
varNames

# In[11]:
len(varNames)

# In[12]:
lines[1344]

# # How to handle wrongly broken lines?
# In[7]:
with open("../data/ows_tweets_sample_clean.txt", 'w') as f:
    right_line = ''   # buffer accumulating a logically complete record
    blocks = []       # records confirmed complete are appended here
    for line in lines:
        right_line += line.replace('\n', ' ')
        # A complete record has at least as many comma-separated fields
        # as there are variable names (14).
        line_length = len(right_line.split(','))
        if line_length >= 14:
            blocks.append(right_line)
            right_line = ''
    # NOTE(review): a trailing incomplete record left in right_line is
    # silently dropped — confirm this is acceptable.
    for i in blocks:
        f.write(i + '\n')

# In[8]:
len(blocks)

# In[9]:
blocks[1344]

# # Considering both the delimiter and the quote character
# - delimiter 🔥 separator: sep, delimiter
# - quote character ☁️: quotechar

# In[13]:
import re
re.split(',"|",', lines[15])

# In[14]:
import re
with open("../data/ows_tweets_sample.txt", 'r') as f:
    lines = f.readlines()
for i in range(35, 50):
    i_ = re.split(',"|",', lines[i])
    print('line =', i, ' length =', len(i_))

# In[15]:
with open("../data/ows_tweets_sample_clean4.txt", 'w') as f:
    right_line = ''   # buffer accumulating a logically complete record
    blocks = []       # records confirmed complete are appended here
    for line in lines:
        right_line += line.replace('\n', ' ').replace('\r', ' ')
        # line_length = len(right_line.split(','))
        # Split on quoted-field boundaries instead of bare commas.
        i_ = re.split(',"|",', right_line)
        line_length = len(i_)
        if line_length >= 6:
            blocks.append(right_line)
            right_line = ''
    # for i in blocks:
    #     f.write(i + '\n')

# In[16]:
len(blocks)
# # 3. Reading the data and splitting the columns correctly

# In[18]:
# Hint: you may need to change the path below.
with open("../data/ows_tweets_sample.txt", 'r') as f:
    chunk = f.readlines()

# In[19]:
len(chunk)

# In[20]:
chunk[:3]

# In[21]:
import csv
lines_csv = csv.reader(chunk, delimiter=',', quotechar='"')
print(len(list(lines_csv)))
# next(lines_csv)
# next(lines_csv)

# In[27]:
import re
import csv
from collections import defaultdict


def extract_rt_user(tweet):
    """Return the screen name being retweeted in *tweet*, or None.

    Matches 'RT @name' / 'via @name' (case-insensitive) and returns the
    mention text after the first marker, with leading '@'/' ' stripped.
    """
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_user_name = rt_patterns.findall(tweet)
    if rt_user_name:
        rt_user_name = rt_user_name[0][1].strip(' @')
    else:
        rt_user_name = None
    return rt_user_name


rt_network = defaultdict(int)
# Fix: the original opened the file with a bare open() and never closed it.
with open("../data/ows_tweets_sample.txt", 'r') as f:
    chunk = f.readlines(100000)
    while chunk:
        # lines = csv.reader(chunk, delimiter=',', quotechar='"')
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        # NOTE(review): the header row is not skipped here, so one spurious
        # ('From User', None) edge is counted — confirm intended.
        for line in lines:
            tweet = line[1]      # Text column
            from_user = line[8]  # 'From User' column
            rt_user = extract_rt_user(tweet)
            rt_network[(from_user, rt_user)] += 1
        chunk = f.readlines(100000)

# In[22]:
import pandas as pd
df = pd.read_csv("../data/ows_tweets_sample.txt", sep=',', quotechar='"')
df[:3]

# In[23]:
len(df)

# In[24]:
df.Text[0]

# In[25]:
df['From User'][:10]
# # 4. Counting

# ### Distribution of the number of users per posting count
# > How users are distributed over posting counts

# In[26]:
from collections import defaultdict
data_dict = defaultdict(int)
for i in df['From User']:
    data_dict[i] += 1

# In[27]:
list(data_dict.items())[:5]
# data_dict

# In[28]:
get_ipython().run_line_magic('matplotlib', 'inline')  # notebook-export artifact
import matplotlib.pyplot as plt

# ### Installing the Microsoft YaHei font
# Install msyh.ttf from the /data/ folder so Chinese plot labels render
# correctly.  See common questions: 0.common_questions.ipynb

# In[29]:
plt.hist(data_dict.values())
# plt.yscale('log')
# plt.xscale('log')
plt.xlabel(u'发帖数', fontsize=20)
plt.ylabel(u'人数', fontsize=20)
plt.show()

# In[30]:
tweet_dict = defaultdict(int)
for i in data_dict.values():
    tweet_dict[i] += 1
plt.loglog(tweet_dict.keys(), tweet_dict.values(), 'ro')  # linewidth=2)
plt.xlabel(u'推特数', fontsize=20)
plt.ylabel(u'人数', fontsize=20)
plt.show()

# In[31]:
import numpy as np
import statsmodels.api as sm


def powerPlot(d_value, d_freq, color, marker):
    """Fit an OLS line in log-log space to (value, frequency) pairs and plot
    both the empirical probabilities and the fitted power law."""
    d_freq = [i + 1 for i in d_freq]  # +1 to avoid log(0)
    d_prob = [float(i) / sum(d_freq) for i in d_freq]
    # d_rank = ss.rankdata(d_value).astype(int)
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, linestyle='', color=color, marker=marker)
    plt.plot(d_value, np.exp(constant + x * beta), "red")
    plt.xscale('log')
    plt.yscale('log')
    plt.text(max(d_value) / 2, max(d_prob) / 10,
             r'$\beta$ = ' + str(round(beta, 2)) + '\n' +
             r'$R^2$ = ' + str(round(r2, 2)), fontsize=20)


# In[31]:
histo, bin_edges = np.histogram(list(data_dict.values()), 15)
bin_center = 0.5 * (bin_edges[1:] + bin_edges[:-1])
powerPlot(bin_center, histo, 'r', '^')
# lg = plt.legend(labels=[u'Tweets', u'Fit'], loc=3, fontsize=20)
plt.ylabel(u'概率', fontsize=20)
plt.xlabel(u'推特数', fontsize=20)
plt.show()

# In[32]:
import statsmodels.api as sm
from collections import defaultdict
import numpy as np


def powerPlot2(data):
    """Plot the distribution of *data* on log-log axes with an OLS fit."""
    d = sorted(data, reverse=True)
    d_table = defaultdict(int)
    for k in d:
        d_table[k] += 1
    d_value = sorted(d_table)
    d_value = [i + 1 for i in d_value]
    # NOTE(review): d_table is indexed with the shifted (i+1) values here, so
    # most lookups hit defaultdict's 0 default — confirm this is intended.
    d_freq = [d_table[i] + 1 for i in d_value]
    d_prob = [float(i) / sum(d_freq) for i in d_freq]
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, 'ro')
    plt.plot(d_value, np.exp(constant + x * beta), "red")
    plt.xscale('log')
    plt.yscale('log')
    plt.text(max(d_value) / 2, max(d_prob) / 5,
             'Beta = ' + str(round(beta, 2)) + '\n' +
             'R squared = ' + str(round(r2, 2)))
    plt.title('Distribution')
    plt.ylabel('P(K)')
    plt.xlabel('K')
    plt.show()


# In[33]:
powerPlot2(data_dict.values())

# In[34]:
import powerlaw


def plotPowerlaw(data, ax, col, xlab):
    """Fit a discrete power law (xmin fixed at 2) with the powerlaw package
    and draw the empirical pdf plus the fitted pdf on *ax*."""
    fit = powerlaw.Fit(data, xmin=2)
    # fit = powerlaw.Fit(data)
    fit.plot_pdf(color=col, linewidth=2)
    a, x = (fit.power_law.alpha, fit.power_law.xmin)
    fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax,
                           label=r"$\alpha = %d \:\:, x_{min} = %d$" % (a, x))
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$Probability$', fontsize=20)
    plt.legend(loc=0, frameon=False)


# In[35]:
from collections import defaultdict
data_dict = defaultdict(int)
for i in df['From User']:
    data_dict[i] += 1

# In[36]:
import matplotlib.cm as cm
cmap = cm.get_cmap('rainbow_r', 6)  # NOTE(review): deprecated in matplotlib >= 3.7
fig = plt.figure(figsize=(6, 4), facecolor='white')
ax = fig.add_subplot(1, 1, 1)
plotPowerlaw(list(data_dict.values()), ax, cmap(1), '$Tweets$')

# # 5. Cleaning the tweet text
# In[1]:
tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''

# In[32]:
import re
import twitter_text
# https://github.com/dryan/twitter-text-py/issues/21
# Macintosh HD ▸ Users ▸ datalab ▸ Applications ▸ anaconda ▸ lib ▸ python3.5 ▸ site-packages

# # Installing twitter_text
# twitter-text-py could not be used under Python 3; Glyph debugged the
# problem and published a Python-3 fork (twitter-text-py3):
# > pip install twitter-text
# If installation fails, open a terminal inside Spyder and run:
#   pip install twitter-text

# In[35]:
import re
tweet = '''RT @AnonKitsu: @who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')  # .split(':')[0]
rt_user_name

# In[28]:
import re
tweet = '''RT @AnonKitsu: @who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @').split(':')[0]
rt_user_name

# In[36]:
import re
tweet = '''@chengjun:@who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
rt_user_name = rt_patterns.findall(tweet)
print(rt_user_name)
if rt_user_name:
    print('it exists.')  # fix: original message read 'it exits.'
else:
    print('None')


# In[37]:
import re


def extract_rt_user(tweet):
    """Return the retweeted screen name in *tweet*, or None.

    Like the earlier version, but also drops anything after a ':' so that
    'RT @name: text' yields just 'name'.
    """
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_user_name = rt_patterns.findall(tweet)
    if rt_user_name:
        rt_user_name = rt_user_name[0][1].strip(' @').split(':')[0]
    else:
        rt_user_name = None
    return rt_user_name


# In[38]:
tweet = '''RT @chengjun: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''
extract_rt_user(tweet)

# In[39]:
tweet = '''@chengjun: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''
print(extract_rt_user(tweet))

# In[40]:
import csv
with open("../data/ows_tweets_sample.txt", 'r') as f:
    chunk = f.readlines()
rt_network = []
lines = csv.reader(chunk[1:], delimiter=',', quotechar='"')  # chunk[1:] skips the header
tweet_user_data = [(i[1], i[8]) for i in lines]
tweet_user_data[:3]

# In[41]:
from collections import defaultdict
rt_network = []
rt_dict = defaultdict(int)
for k, i in enumerate(tweet_user_data):
    tweet, user = i
    rt_user = extract_rt_user(tweet)
    if rt_user:
        rt_network.append((user, rt_user))
        # (rt_user, ' ', user, end='\n')
        rt_dict[(user, rt_user)] += 1
# rt_network[:5]
list(rt_dict.items())[:3]

# # Getting the cleaned tweet text
# (without user names, urls, and markers such as RT @)

# In[42]:
def extract_tweet_text(tweet, at_names, urls):
    """Strip mentions, urls and marker characters from *tweet*.

    NOTE(review): the marker list includes ' ', so every space is removed
    from the result — confirm this is intended.
    """
    for i in at_names:
        tweet = tweet.replace(i, '')
    for j in urls:
        tweet = tweet.replace(j, '')
    marks = ['RT @', '@', '"', '#', '\n', '\t', ' ']
    for k in marks:
        tweet = tweet.replace(k, '')
    return tweet


# In[43]:
import twitter_text
tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com http://ccc.nju.edu.cn RT !!HELP!!!!'''
ex = twitter_text.Extractor(tweet)
at_names = ex.extract_mentioned_screen_names()
urls = ex.extract_urls()
hashtags = ex.extract_hashtags()
rt_user = extract_rt_user(tweet)
# tweet_text = extract_tweet_text(tweet, at_names, urls)
print(at_names, urls, hashtags, rt_user, '-------->')  # , tweet_text)

# In[44]:
import csv
lines = csv.reader(chunk, delimiter=',', quotechar='"')
tweets = [i[1] for i in lines]

# In[45]:
for tweet in tweets[:5]:
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    rt_user = extract_rt_user(tweet)
    # tweet_text = extract_tweet_text(tweet, at_names, urls)
    print(at_names, urls, hashtags, rt_user)
    # print(tweet_text)

# # Exercise:
# ### Extract the rt_user -> user retweet network from the raw tweets
# ## Format:
#   rt_user1, user1, 3
#   rt_user2, user3, 2
#   rt_user2, user4, 1
#   ...
# Save the result in csv format.

# # Further reading

# In[ ]: