# -*- coding: utf-8 -*-
# Load the review comments from the database into a DataFrame.
#
# The encoding declaration above must stay on the first line: this file
# contains non-ASCII string literals, and Python 2 refuses to parse such a
# file unless the source encoding is declared (PEP 263).
import pandas as pd
from sqlalchemy import create_engine

# NOTE(review): the credentials are embedded in the connection URL;
# presumably placeholders -- confirm before real ones are committed.
engine = create_engine('oracle://user:password@bi_data')

# NOTE(review): `str_sql` is not assigned anywhere in the visible source. It
# has to hold the query text before this line runs, otherwise this raises
# NameError. The result must expose a `good_cntnt` column, which the
# segmentation step below reads.
df = pd.read_sql_query(str_sql, engine)

# Notebook-style inspection: these bare expressions display nothing when the
# file is executed as a script.
df.shape

df.head()

import jieba
def cutword(x):
    """Segment one comment with jieba and return the tokens joined by spaces.

    Args:
        x: The raw cell value. UTF-8 encoded bytes are decoded to text; any
            other non-text value (e.g. a number) is converted to its text
            form. ``None`` and NaN are treated as a missing comment.

    Returns:
        The tokens yielded by ``jieba.cut`` joined with single spaces, or
        ``''`` when the comment is missing.
    """
    # Missing values would otherwise be stringified and segmented as the
    # literal words "None" / "nan". NaN is detected by the fact that it
    # compares unequal to itself.
    if x is None or x != x:
        return ''
    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3
    if isinstance(x, bytes):  # `bytes` is an alias of `str` on Python 2
        x = x.decode('utf8')
    elif not isinstance(x, text_type):
        x = text_type(x)  # e.g. numbers -> text
    return ' '.join(jieba.cut(x))
# cutword(string)

# Round-trip demo: decode external input to text for processing inside
# Python, then encode it back to UTF-8 bytes for output.
x = '中国'  # external input: a byte string on Python 2, already text on Python 3
# Only byte strings can be decoded; Python 3 text has no .decode() method,
# so it is used as-is.
y = x.decode('utf8') if isinstance(x, bytes) else x  # text, for in-Python processing
z = y.encode('utf8')  # UTF-8 bytes again, ready for output

# Store the space-separated segmentation of every comment in a new column.
df['seg_word'] = df['good_cntnt'].map(cutword)
df.head()

# Split each segmented comment on whitespace: one token list per comment.
txt = df['seg_word'].values
txtlist = [segmented.split() for segmented in txt]

import logging

# Emit INFO-level (and above) records, prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Settings handed to word2vec.Word2Vec(...) when the model is trained below.
num_features = 300     # passed as `size`: dimensionality of the word vectors
min_word_count = 10    # passed as `min_count`: rarer words are ignored
num_workers = 4        # passed as `workers`: number of training threads
context = 10           # passed as `window`: context window size
downsampling = 0.001   # passed as `sample`: downsampling threshold for frequent words

from gensim.models import word2vec

# Train a word2vec model on the tokenised comments, save it, then list the
# nearest neighbours of one query word.
#
# print is called with a single, pre-formatted argument so the same line is
# valid on Python 2 and Python 3 and writes the same text on both.
#
# NOTE(review): the `size=` keyword, `init_sims` and `model.most_similar`
# follow the pre-4.0 gensim API -- confirm the installed gensim version
# before upgrading.
print("traing model...")
model = word2vec.Word2Vec(
    txtlist,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
# NOTE(review): per the gensim docs, replace=True keeps only the normalised
# vectors to save memory, after which the model cannot be trained further --
# confirm that is intended.
model.init_sims(replace=True)
model_name = 'allcomword2vec'
model.save(model_name)

# Print the 20 words most similar to the query word, with their scores.
for word, word_simi in model.most_similar(u'丰满', topn=20):
    print("%s %s" % (word, word_simi))