#!/usr/bin/env python
# coding: utf-8

# ***
# # Data cleaning
# > Cleaning the Occupy Central (占中) news corpus
# ***
# ***
# # 王成军
# # wangchengjun@nju.edu.cn
# # 计算传播网 http://computational-communication.com

# In[70]:

# Read every line of the raw RTF export using with open
with open("/Users/chengjun/github/cjc2016/data/occupycentral/zz-hk-2014-10.rtf") as f:
    news = f.readlines()


# In[71]:

# How many lines are there in total?
len(news)


# In[73]:

# Note: there is a blank line between the title and the page info,
# so the title is the 4th element of each article block.
for i in range(1, 8):
    print news[i].decode('gb18030')[:500]


# In[74]:

# Chinese-encoded bytes must be decoded with a matching Chinese codec (gb18030)
print news[17].decode('gb18030')[:500]


# In[75]:

# Define a function that decodes, re-encodes to UTF-8, and strips the RTF control words
def stringclean(s):
    s = s.decode('gb18030').encode('utf8')
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b\cf6 ', '')
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b0\cf0 ', '')
    s = s.replace('\par', '').replace('\n', '')
    return s


# In[87]:

# A quick aside: str.strip only removes characters (from the given set) at the ends of a string
'aabbccdd ee'.strip('a')


# In[94]:

'aabbccdd ee'.strip('ab')


# In[103]:

# str.replace, by contrast, removes every occurrence of the exact substring
'aabbccdd ee'.replace('ab', '')


# In[76]:

# Call the stringclean function
print stringclean(news[17])


# In[77]:

# A for loop inside a list: a list comprehension
news_clean = [stringclean(n) for n in news]
len(news_clean)


# In[85]:

print news_clean[17][:120]


# In[79]:

# Define two helper functions: deletetab strips tab characters
# (which would break the tab-separated output), and flushPrint
# prints progress on a single, constantly refreshed line.
def deletetab(s):
    return s.replace('\t', '')

import sys

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()


# In[102]:

help(sys.stdout)


# In[95]:

# Call deletetab
deletetab('\ta')


# In[97]:

# Demo: flushPrint keeps overwriting the same output line
import time, random
for i in range(10):
    time.sleep(random.random())
    flushPrint(i)


# In[80]:

from collections import defaultdict

def readblocks(data):
    """Group the cleaned lines into articles.

    A line of '~' characters marks the start of an article, and the line
    containing '文章编号:' (article id) marks its end. Each article is stored
    as one tab-separated record: id, source, info, title, body.
    """
    copy = False
    n = 0
    block = []
    chunk = defaultdict(lambda: [])
    for i in data:
        try:
            if "~~~~~~~~~~~~~~~~~~~~~~~~~~ #" in i:
                copy = True
            elif "文章编号:" in i:
                id = i.replace('文章编号: ', '')
                source = block[0].split('|')[0]
                info = block[1]
                # a blank line separates title and page info, so the title is block[3]
                title = deletetab(block[3])
                body = [j for j in block[6:] if j != '\n']
                body = ' '.join(body)
                body = deletetab(body)
                body = '"' + body + '"'
                line = '\t'.join([id, source, info, title, body])
                chunk[id] = line
                block = []
                n += 1
                if n % 10 == 0:
                    flushPrint(n)
                copy = False
            elif copy:
                block.append(i)
        except Exception as e:
            print i, e
    return chunk


# In[81]:

# Clean the news reports block by block
news_result = readblocks(news_clean)


# In[82]:

# Number of news articles
len(news_result)


# In[83]:

# Inspect the dictionary keys
news_result.keys()[:5]


# In[84]:

# Inspect the dictionary values
print news_result.values()[10]


# In[39]:

# Save the data: write the records to disk.
# Note the append mode ('a'): re-running this cell appends duplicate records.
with open('/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt', 'a') as p:
    for record in news_result.values():
        p.write(record + "\n")


# In[58]:

# Read the data back with pandas and inspect it
import pandas as pd
file_path = '/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt'
df = pd.read_csv(file_path, sep="\t", header=None)
df[:10]


# In[59]:

# Use os to change the default working directory
import os
os.chdir('/Users/chengjun/github/cjc2016/data/occupycentral/')

# Use glob to list all file names of a given type
import glob
filenames = glob.glob('*.rtf')
filenames


# In[60]:

# Clean every monthly file and append its records to one combined output file
for i in filenames:
    print i
    with open(i) as f:
        news = f.readlines()
    news = [stringclean(n) for n in news]
    news_result = readblocks(news)
    with open('/Users/chengjun/github/cjc2016/data/zz-hk-all-clean2.txt', 'a') as p:
        for record in news_result.values():
            p.write(record + "\n")


# # This is the End.
# > Thank you for your attention.
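
# # Appendix: loading the combined file
# > A minimal sketch of reading the combined output back with pandas.
# > The column names are an assumption based on the field order used in
# > readblocks (id, source, info, title, body); they are not stored in the file itself.

# In[ ]:

import pandas as pd

# header=None because readblocks writes no header row; names are assumed from the record layout
df_all = pd.read_csv('/Users/chengjun/github/cjc2016/data/zz-hk-all-clean2.txt',
                     sep='\t', header=None,
                     names=['id', 'source', 'info', 'title', 'body'])
df_all.head()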