#!/usr/bin/env python
# coding: utf-8

# ***
# # Data cleaning
# > Cleaning the Occupy Central (占中) news corpus
# ***
# ***
# # 王成军
# # wangchengjun@nju.edu.cn
# # 计算传播网 http://computational-communication.com

# In[70]:

# Read every line of the raw RTF export using with open
with open("/Users/chengjun/github/cjc2016/data/occupycentral/zz-hk-2014-10.rtf") as f:
    news = f.readlines()


# In[71]:

# How many lines are there in total?
len(news)


# In[73]:

# Note: there is a blank line between the title and the page info,
# so the title is the 4th element of each article block.
for i in range(1, 8):
    print news[i].decode('gb18030')[:500]


# In[74]:

# Chinese-encoded bytes must be decoded with a matching Chinese codec (gb18030)
print news[17].decode('gb18030')[:500]


# In[75]:

# Define a function that decodes, re-encodes to UTF-8, and strips the RTF control words
def stringclean(s):
    s = s.decode('gb18030').encode('utf8')
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b\cf6 ', '')
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b0\cf0 ', '')
    s = s.replace('\par', '').replace('\n', '')
    return s


# In[87]:

# A quick aside: str.strip only removes characters (from the given set) at the ends of a string
'aabbccdd ee'.strip('a')


# In[94]:

'aabbccdd ee'.strip('ab')


# In[103]:

# str.replace, by contrast, removes every occurrence of the exact substring
'aabbccdd ee'.replace('ab', '')


# In[76]:

# Call the stringclean function
print stringclean(news[17])


# In[77]:

# A for loop inside a list: a list comprehension
news_clean = [stringclean(n) for n in news]
len(news_clean)


# In[85]:

print news_clean[17][:120]


# In[79]:

# Define two helper functions: deletetab strips tab characters
# (which would break the tab-separated output), and flushPrint
# prints progress on a single, constantly refreshed line.
def deletetab(s):
    return s.replace('\t', '')

import sys

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()


# In[102]:

help(sys.stdout)


# In[95]:

# Call deletetab
deletetab('\ta')


# In[97]:

# Demo: flushPrint keeps overwriting the same output line
import time, random
for i in range(10):
    time.sleep(random.random())
    flushPrint(i)


# In[80]:

from collections import defaultdict

def readblocks(data):
    """Group the cleaned lines into articles.

    A line of '~' characters marks the start of an article, and the line
    containing '文章编号:' (article id) marks its end. Each article is stored
    as one tab-separated record: id, source, info, title, body.
    """
    copy = False
    n = 0
    block = []
    chunk = defaultdict(lambda: [])
    for i in data:
        try:
            if "~~~~~~~~~~~~~~~~~~~~~~~~~~ #" in i:
                copy = True
            elif "文章编号:" in i:
                id = i.replace('文章编号: ', '')
                source = block[0].split('|')[0]
                info = block[1]
                # a blank line separates title and page info, so the title is block[3]
                title = deletetab(block[3])
                body = [j for j in block[6:] if j != '\n']
                body = ' '.join(body)
                body = deletetab(body)
                body = '"' + body + '"'
                line = '\t'.join([id, source, info, title, body])
                chunk[id] = line
                block = []
                n += 1
                if n % 10 == 0:
                    flushPrint(n)
                copy = False
            elif copy:
                block.append(i)
        except Exception as e:
            print i, e
    return chunk


# In[81]:

# Clean the news reports block by block
news_result = readblocks(news_clean)


# In[82]:

# Number of news articles
len(news_result)


# In[83]:

# Inspect the dictionary keys
news_result.keys()[:5]


# In[84]:

# Inspect the dictionary values
print news_result.values()[10]


# In[39]:

# Save the data: write the records to disk.
# Note the append mode ('a'): re-running this cell appends duplicate records.
with open('/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt', 'a') as p:
    for record in news_result.values():
        p.write(record + "\n")


# In[58]:

# Read the data back with pandas and inspect it
import pandas as pd
file_path = '/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt'
df = pd.read_csv(file_path, sep="\t", header=None)
df[:10]


# In[59]:

# Use os to change the default working directory
import os
os.chdir('/Users/chengjun/github/cjc2016/data/occupycentral/')

# Use glob to list all file names of a given type
import glob
filenames = glob.glob('*.rtf')
filenames


# In[60]:

# Clean every monthly file and append its records to one combined output file
for i in filenames:
    print i
    with open(i) as f:
        news = f.readlines()
    news = [stringclean(n) for n in news]
    news_result = readblocks(news)
    with open('/Users/chengjun/github/cjc2016/data/zz-hk-all-clean2.txt', 'a') as p:
        for record in news_result.values():
            p.write(record + "\n")


# # This is the End.
# > Thank you for your attention.
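
# # Appendix: loading the combined file
# > A minimal sketch of reading the combined output back with pandas.
# > The column names are an assumption based on the field order used in
# > readblocks (id, source, info, title, body); they are not stored in the file itself.

# In[ ]:

import pandas as pd

# header=None because readblocks writes no header row; names are assumed from the record layout
df_all = pd.read_csv('/Users/chengjun/github/cjc2016/data/zz-hk-all-clean2.txt',
                     sep='\t', header=None,
                     names=['id', 'source', 'info', 'title', 'body'])
df_all.head()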