#!/usr/bin/env python
# coding: utf-8

# ## Анализ данных в задаче Сбербанк (sdsj)
# 
# примеры визуализации
# 
# *Александр Дьяконов (2016)*

# In[4]:


# подгружаем все нужные пакеты
import pandas as pd
import numpy as np


# Notebook-wide display and plotting setup.

# Inline figures.  %pylab also star-imports numpy/matplotlib helpers into the
# namespace (figsize, mean, median, ...), which later cells call directly.
get_ipython().run_line_magic('pylab', 'inline')

import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt

# Slightly nicer-looking figures.  The old pandas option 'display.mpl_style'
# no longer exists in current pandas (setting it raises an error), so use a
# matplotlib style sheet instead.
plt.style.use('ggplot')
figsize(12, 9)

warnings.filterwarnings("ignore")

#plt.rcParams['figure.figsize'] = 10, 7.5
#plt.rcParams['axes.grid'] = True
pd.set_option('display.max_columns', None)

mpl.rcParams['font.family'] = 'Ubuntu'

# NOTE: the font family set just above is overridden by the next rc call.
plt.rc('text', usetex=False)
plt.rc('font', family='serif')
plt.rc('font', weight='bold')
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)


# In[19]:


# Select a font so that the Russian (Cyrillic) labels render in figures.
from matplotlib import rc

font = dict(family='Verdana', weight='normal')
rc('font', **font)


# In[99]:


# ## Загрузка данных

# In[5]:


# Transactions table (the author's local copy lived in d:\\Competitions\\sdsj\\).
transactions = pd.read_csv('transactions.csv')
print(transactions.shape)
transactions.head()


# In[7]:


# MCC code dictionary (';'-separated); also show how many distinct codes
# actually occur in the transactions.
tr_mcc_codes = pd.read_csv('tr_mcc_codes.csv', sep=';')
print(tr_mcc_codes.shape, transactions['mcc_code'].nunique())
tr_mcc_codes.head()


# In[10]:


# Transaction type dictionary (';'-separated), with the number of distinct
# types present in the transactions.
tr_types = pd.read_csv('tr_types.csv', sep=';')
print(tr_types.shape, transactions['tr_type'].nunique())
tr_types.head()


# In[12]:


# Gender labels for the training customers.
customers_gender_train = pd.read_csv('customers_gender_train.csv')
print(customers_gender_train.shape)
customers_gender_train.head()


# ## Формирование таблицы

# In[13]:


# Attach the gender label to every transaction.
# A left join keeps all transactions (gender is NaN where the customer is not
# in the training labels) and never adds rows.  The previous outer join could
# add label-only customers with NaN 'tr_datetime', which would break the
# string parsing of that column in the next cell.
transactions = pd.merge(transactions, customers_gender_train, on='customer_id', how='left')
print (transactions.shape)
transactions[:5]


# In[14]:


# Split 'tr_datetime' ("<day number> <H:M:S>") into numeric columns.
# Two string splits replace the five row-by-row Python apply() passes.
day_and_clock = transactions['tr_datetime'].str.split(' ', expand=True)
transactions['num_day'] = day_and_clock[0].astype(int)
transactions['datetime'] = day_and_clock[1]
clock_parts = transactions['datetime'].str.split(':', expand=True)
transactions['hour'] = clock_parts[0].astype(int)
transactions['minute'] = clock_parts[1].astype(int)
transactions['second'] = clock_parts[2].astype(int)
del day_and_clock, clock_parts  # release the temporary split frames
del transactions['tr_datetime']
transactions[:5]


# In[15]:


# Day of week as a code 0..6 (the plots below label 0 as Monday).
# NOTE(review): the +4 offset presumably aligns day 0 with its real weekday
# - confirm against the data description.
transactions['dayofweek'] = (transactions['num_day'] + 4) % 7


# In[16]:


transactions[:5]


# # Визуализации

# In[24]:


figsize(14, 5)
# One unit-wide bin centred on each weekday code 0..6.  With bins=7 the bins
# span the data range [0, 6] and are 6/7 wide, so bars drift off the ticks.
weekday_edges = np.arange(8) - 0.5
p = transactions.dayofweek.hist(bins=weekday_edges)
p.set_xlabel('dayofweek')
p.set_ylabel('count')
p.set_xlim([-0.5, 6.5])
# Fix the tick positions before naming them; set_xticklabels on its own
# relabels whatever ticks the automatic locator happened to pick.
p.set_xticks(np.arange(7))
p.set_xticklabels([u'пн', u'вт', u'ср', u'чт', u'птн', u'сб', u'вск'])
p.set_title(u'Число транзакций в разные дни недели')
transactions.dayofweek.value_counts()


# In[60]:


figsize(12,5)
# Overlaid weekday histograms: all rows, then gender == 0 (legend 'ж'), then
# gender == 1 (legend 'м').  Later layers use narrower, more opaque bars.
weekday = transactions.dayofweek
weekday_layers = (
    (weekday, '#770000', u'все', 0.3, 0.8),
    (weekday[transactions.gender==0], '#007700', u'ж', 0.6, 0.7),
    (weekday[transactions.gender==1], '#000077', u'м', 1.0, 0.6),
)
for layer, colour, tag, opacity, bar_width in weekday_layers:
    plt.hist(layer.values, bins=7, color=colour, label=tag, alpha=opacity, width=bar_width)
# Tick positions are hand-tuned to the 7 bins that span [0, 6].
weekday_names = [u'пн', u'вт', u'ср', u'чт', u'птн', u'сб', u'вск']
plt.xticks(0.25+ 0.85*np.arange(7), weekday_names)
plt.title(u'Число транзакций в разные дни недели')
plt.legend()


# In[72]:


figsize(12,5)
# Histogram of the 'second' component: all rows, gender == 0 ('ж') and
# gender == 1 ('м').  Author's note: the value 60 occurs, hence 61 bins.
second = transactions.second
second_layers = (
    (second, '#770000', u'все', 0.3, 0.8),
    (second[transactions.gender==0], '#007700', u'ж', 0.6, 0.7),
    (second[transactions.gender==1], '#000077', u'м', 1.0, 0.6),
)
for layer, colour, tag, opacity, bar_width in second_layers:
    plt.hist(layer.values, bins=61, color=colour, label=tag, alpha=opacity, width=bar_width)
plt.xlabel('second')
plt.ylabel('count')
plt.title(u'Вхождения секунд во времена транзакций')
plt.legend()
plt.xlim([0, 60.5])
# Five most frequent second values.
second.value_counts().head(5)


# In[70]:


# Five most frequent 'second' values for gender == 0, then for gender == 1.
# Author's remark on the second output: this happens more often for men.
for gender_code in (0, 1):
    print(transactions[transactions.gender == gender_code].second.value_counts()[:5])


# In[ ]:


# In[73]:


figsize(12,5)
# Histogram of the 'minute' component: all rows, gender == 0 ('ж') and
# gender == 1 ('м'), drawn as overlaid layers.
minute = transactions.minute
minute_layers = (
    (minute, '#770000', u'все', 0.3, 0.8),
    (minute[transactions.gender==0], '#007700', u'ж', 0.6, 0.7),
    (minute[transactions.gender==1], '#000077', u'м', 1.0, 0.6),
)
for layer, colour, tag, opacity, bar_width in minute_layers:
    plt.hist(layer.values, bins=60, color=colour, label=tag, alpha=opacity, width=bar_width)
plt.xlabel('minute')
plt.ylabel('count')
plt.title(u'Вхождения минут во времена транзакций')
plt.legend()
plt.xlim([0, 60.5])
# Five most frequent minute values.
minute.value_counts().head(5)


# In[89]:


# Flag transactions stamped exactly 00:00:00 and cross-tabulate the flag
# against gender.
transactions['new'] = (transactions[['hour', 'minute', 'second']] == 0).all(axis=1)
pd.crosstab(index=transactions['new'], columns=transactions['gender'])


# In[99]:


# What is this zero time?  Look at every 50000th flagged row.
transactions.loc[transactions['new']].iloc[::50000]


# In[93]:


# transactions[transactions.new==True]['tr_type'].unique()


# In[78]:


figsize(12,5)
# Histogram of the 'hour' component: all rows, gender == 0 ('ж') and
# gender == 1 ('м'), drawn as overlaid layers with 24 bins.
hour = transactions.hour
hour_layers = (
    (hour, '#770000', u'все', 0.3, 0.8),
    (hour[transactions.gender==0], '#007700', u'ж', 0.6, 0.7),
    (hour[transactions.gender==1], '#000077', u'м', 1.0, 0.6),
)
for layer, colour, tag, opacity, bar_width in hour_layers:
    plt.hist(layer.values, bins=24, color=colour, label=tag, alpha=opacity, width=bar_width)
plt.xlabel('hour')
plt.ylabel('count')
plt.title(u'Вхождения часов во времена транзакций')
plt.legend()
plt.xlim([0, 24])
# Five busiest hours.
hour.value_counts().head(5)


# In[79]:


# Number of distinct days in the data.
transactions['num_day'].nunique()


# In[87]:


figsize(12,5)
# Transactions per day number: all rows, gender == 0 ('ж') and
# gender == 1 ('м'), overlaid with 457 bins each.
num_day = transactions.num_day
day_layers = (
    (num_day, '#770000', u'все', 0.3, 0.9),
    (num_day[transactions.gender==0], '#007700', u'ж', 0.6, 0.7),
    (num_day[transactions.gender==1], '#000077', u'м', 1.0, 0.5),
)
for layer, colour, tag, opacity, bar_width in day_layers:
    plt.hist(layer.values, bins=457, color=colour, label=tag, alpha=opacity, width=bar_width)
plt.xlabel('day')
plt.ylabel('count')
plt.title(u'Вхождения дней во времена транзакций')
plt.legend()
plt.xlim([0, 457])
# plt.xlim([153-10, 153+(120-1) + 4 + 10])
# Five busiest days.
num_day.value_counts().head(5)


# In[88]:


# Scratch arithmetic left by the author.
457-153


# In[101]:


# Per-customer aggregates: f1 = number of amounts, f2 = mean amount,
# g = mean of the 'gender' column over the customer's rows.
tmp = transactions.groupby('customer_id')
f1 = tmp['amount'].count().values
f2 = tmp['amount'].mean().values
g = tmp['gender'].mean().values


# In[112]:


# log(count + 1) against a signed log of the mean amount, one colour per
# gender code (0 drawn first, then 1).
for gender_code, colour in ((0, '#990000'), (1, '#000099')):
    chosen = g == gender_code
    mean_amount = f2[chosen]
    signed_log_mean = np.sign(mean_amount) * np.log(np.abs(mean_amount) + 1)
    plt.scatter(np.log(f1[chosen] + 1), signed_log_mean, c=colour)
plt.xlabel(u'~ log число транзакций')
plt.ylabel(u'~ log средняя транзакция')


# In[17]:


# Per customer: f1 = sum of positive amounts, f2 = sum of negative amounts
# (so f2 <= 0), g = mean of 'gender'.
tmp = transactions.groupby('customer_id')
f1 = tmp['amount'].apply(lambda amounts: sum(amounts[amounts > 0])).values
f2 = tmp['amount'].apply(lambda amounts: sum(amounts[amounts < 0])).values
g = tmp['gender'].mean().values


# In[22]:


figsize(7, 6)
# log(1 + income) against log(1 + |spend|), one translucent cloud per gender.
for gender_code, colour, tag in ((0, '#990000', u'ж'), (1, '#000099', u'м')):
    chosen = g == gender_code
    plt.scatter(np.log(f1[chosen] + 1.0), np.log(1.0 - f2[chosen]),
                c=colour, s=20, alpha=0.3, label=tag)
plt.xlabel(u'~ log сумма начислений')
plt.ylabel(u'~ log сумма трат')
plt.xlim([-0.5, 25])
plt.ylim([-0.5, 25])
plt.title(u'Общий баланс', fontsize=12)
plt.legend()


# In[34]:


figsize(12, 5)
# Normalised histograms of log(1 + total income) per gender.
# `density=True` replaces the `normed=True` keyword, which current matplotlib
# no longer accepts.
plt.hist(np.log(f1[g==1] + 1.0), bins=100, color='#000077', label=u'м', alpha=0.6, density=True)
plt.hist(np.log(f1[g==0] + 1.0), bins=100, color='#007700', label=u'ж', alpha=0.3, density=True)
plt.legend()
plt.xlabel(u'~ log сумма начислений')
plt.show()


# In[43]:


figsize(12, 5)
# Normalised histograms of log(1 + income) - log(1 + |spend|) per gender
# (f2 holds sums of negative amounts, hence the minus sign).
plt.hist(np.log(f1[g==1] + 1.0) - np.log(-f2[g==1] + 1.0), bins=100, color='#000077', label=u'м', alpha=0.6, density=True)
plt.hist(np.log(f1[g==0] + 1.0) - np.log(-f2[g==0] + 1.0), bins=100, color='#007700', label=u'ж', alpha=0.3, density=True)
plt.legend()
plt.xlabel(u'~ log сумма начислений - log сумма трат')
plt.show()


# In[41]:


# Range of log(1 + income) - log(1 + |spend|) for gender == 1.
# f2 holds sums of negative amounts (f2 <= 0), so it must be negated before
# the log, as in the histogram cell above; log(f2 + 1) is NaN whenever
# f2 < -1.
tmp = np.log(f1[g==1] + 1.0) - np.log(-f2[g==1] + 1.0)
tmp.max(), tmp.min()


# In[132]:


# Per customer: f1 = mean of positive amounts, f2 = mean of negative amounts,
# g = mean of 'gender'.
tmp = transactions.groupby('customer_id')
f1 = tmp['amount'].apply(lambda amounts: amounts[amounts > 0].mean()).values
f2 = tmp['amount'].apply(lambda amounts: amounts[amounts < 0].mean()).values
g = tmp['gender'].mean().values


# In[140]:


figsize(7, 6)
# log(1 + mean income) against log(1 + |mean spend|), one cloud per gender.
for gender_code, colour, tag in ((0, '#990000', u'ж'), (1, '#000099', u'м')):
    chosen = g == gender_code
    plt.scatter(np.log(f1[chosen] + 1.0), np.log(1.0 - f2[chosen]),
                c=colour, s=5, alpha=0.5, label=tag)
plt.xlabel(u'~ log среднее начисление')
plt.ylabel(u'~ log средняя трата')
plt.xlim([-0.5, 20])
plt.ylim([-0.5, 20])
plt.title(u'Общий баланс средних')
plt.legend()


# In[148]:


# Just an experiment: for each value of 'a', count the 'b' entries equal to 1.
df = pd.DataFrame({
    'a': [1] * 6 + [2] * 5,
    'b': [3, 2, 4, 2, 3, 1, 2, 3, 4, 3, 2],
})
df.groupby('a').b.apply(lambda grp: (grp == 1).sum())


# In[183]:


# Most popular tr_types: number of transactions per type, largest first.
tmp = transactions.groupby('tr_type').amount.count()
# Series.sort() no longer exists in pandas; sort_values() returns the sorted
# copy instead of sorting in place.
tmp = tmp.sort_values(ascending=False)
figsize(12, 5)
tmp.plot(kind='bar')
# plt.bar(np.arange(len(f)), f) f = tmp.values
plt.xlabel('tr_type')
plt.ylabel('count')
plt.title(u'Число транзакций с разными tr_type')
# Top 15 types joined with the type dictionary (on the shared columns).
pd.merge(tmp[:15].reset_index(), tr_types)
#plt.plot(np.sort(tmp.amount.sum().values))


# In[189]:


# Gender-specific tr_types.

# Per-type counts for each gender code (crosstab columns 0.0 and 1.0),
# joined with the type dictionary.
tmp = pd.merge(pd.crosstab(transactions.tr_type, transactions.gender).reset_index(), tr_types)
# k = |n0 - n1| / (n0 + n1): 0 when both genders use the type equally,
# 1 when only one gender uses it.
tmp['k'] = (abs(tmp[0.0] - tmp[1.0])/(tmp[0.0] + tmp[1.0]))

tmp = tmp[(tmp[0.0] + tmp[1.0]) > 50] # i.e. keep only types with enough statistics

# DataFrame.sort() no longer exists in pandas; sort_values() also returns a
# new frame, which avoids sorting a filtered slice in place.
tmp = tmp.sort_values('k', ascending=False)
#del tmp['gender']
# Columns are renamed by position: type, gender-0 count, gender-1 count,
# description, k.
tmp.columns = [u'tr_type', u'ж', u'м', u'tr_type_description', u'k']
tmp = tmp.set_index('tr_type')
tmp[:10]


# In[186]:


# The 50 most gender-skewed transaction types.
p = tmp[:50].k.plot(kind='bar')
p.set_ylabel(u'коэффициент различаемости')
p.set_title(u'tr_type мужчин и женщин')


# In[ ]:


# In[214]:


# Popular mcc_code values: number of transactions per code, largest first.

tmp = transactions.groupby('mcc_code').amount.count()
# Series.sort() no longer exists in pandas; sort_values() returns the sorted
# copy instead of sorting in place.
tmp = tmp.sort_values(ascending=False)
figsize(15, 5)
tmp[:50].plot(kind='bar')  # only the first 50 codes are drawn
# plt.bar(np.arange(len(f)), f) f = tmp.values
plt.xlabel('mcc_code')
plt.ylabel('count')
plt.title('Число транзакций с разными mcc_code (выведены первые 50)')
# Top 15 codes joined with the MCC dictionary (on the shared columns).
pd.merge(tmp[:15].reset_index(), tr_mcc_codes)
#plt.plot(np.sort(tmp.amount.sum().values))


# 

# In[187]:


# Gender-specific mcc_code values.
# Per-code counts for each gender code (crosstab columns 0.0 and 1.0),
# joined with the MCC dictionary.
tmp = pd.merge(pd.crosstab(transactions.mcc_code, transactions.gender).reset_index(), tr_mcc_codes)
# k = |n0 - n1| / (n0 + n1): 0 when both genders use the code equally,
# 1 when only one gender uses it.
# NOTE(review): unlike the tr_type analysis, codes with very few
# transactions are not filtered out here - confirm whether that is intended.
tmp['k'] = (abs(tmp[0.0] - tmp[1.0])/(tmp[0.0] + tmp[1.0]))

# DataFrame.sort() no longer exists in pandas; use sort_values().
tmp = tmp.sort_values('k', ascending=False)
#del tmp['gender']
# Columns are renamed by position: code, gender-0 count, gender-1 count,
# description, k.
tmp.columns = [u'mcc_code', u'ж', u'м', u'mcc_description', u'k']
tmp = tmp.set_index('mcc_code')
tmp[:10]


# In[182]:


# The 50 most gender-skewed MCC codes.
p = tmp[:50].k.plot(kind='bar')
p.set_ylabel(u'коэффициент различаемости')
# Title typo fixed: the column is 'mcc_code', not 'mmc_code'.
p.set_title(u'mcc_code мужчин и женщин')


# In[51]:


figsize(12,5)
# Distribution of amounts with |amount| < 70: all rows, gender == 0 ('ж')
# and gender == 1 ('м').
# The subset is taken once and the gender masks are built on that subset;
# the previous chained form transactions[mask][transactions.gender == ...]
# indexed the subset with a mask built on the full frame's index.
small = transactions[transactions.amount.abs() < 70]
plt.hist(small.amount.values, bins=(69*2+1), color='#770000', label=u'все', alpha=0.3, width=0.8) # , histtype='stepfilled', normed=True,
plt.hist(small[small.gender==0].amount.values, bins=(69*2+1), color='#007700', label=u'ж', alpha=0.6, width=0.7)
plt.hist(small[small.gender==1].amount.values, bins=(69*2+1), color='#000077', label=u'м', alpha=1.0, width=0.6)
plt.xlabel('сумма')
plt.ylabel('число транзакций')
plt.title(u'Распределение сумм транзакций')
plt.legend()
#plt.xlim([0, 24])
#transactions.amount.value_counts()[:5]


# In[ ]:


# In[239]:


# Raw amounts as a numpy array (not used by the plot below).
tmp = transactions['amount'].values


# In[271]:


# Histogram of amounts with |amount| < 1000, 200 bins.
amount = transactions['amount']
p = amount[amount.abs() < 1000].hist(bins=200)
p.set_xlabel('сумма')
p.set_ylabel('число транзакций')
p.set_title('Распределение сумм транзакций (небольшие суммы)')


# In[264]:


# Histogram of log|amount| for 0 < |amount| < 1000.
# Zero amounts are excluded because log(0) = -inf, which the histogram
# cannot bin; the log is applied to the whole column at once rather than
# row by row.
abs_amount = transactions.amount.abs()
abs_amount = abs_amount[(abs_amount > 0) & (abs_amount < 1000)]
p = np.log(abs_amount).hist(bins=100)
# The x axis is logarithmic here, so the label says so.
p.set_xlabel('log |сумма|')
p.set_ylabel('число транзакций')
p.set_title('Распределение сумм транзакций (небольшие суммы)')


# In[56]:


# Number of transactions for every (day, customer) pair; histogram of the
# pairs with at most 30 transactions.
tmp = transactions.groupby(['num_day', 'customer_id'])['amount'].count()
at_most_30 = tmp[tmp <= 30]
p = at_most_30.hist(bins=30)
p.set_xlabel('число транзакций в день')
p.set_ylabel('число таких случаев (пар "клиент, день")')
p.set_title('Встречаемость определённого количества транзакций в день')
# The ten most common per-day transaction counts.
tmp.value_counts().head(10)


# In[64]:


# Transactions per (day, customer) pair: all rows, gender == 0, gender == 1.
tmp = transactions.groupby(['num_day', 'customer_id']).amount.count()
tmp1 = transactions[transactions.gender==0].groupby(['num_day', 'customer_id']).amount.count()
tmp2 = transactions[transactions.gender==1].groupby(['num_day', 'customer_id']).amount.count()

figsize(12,5)
# Each series is filtered with its own mask.  The previous tmp1[tmp<31] and
# tmp2[tmp<31] used a mask built on a different (larger) index and relied on
# pandas silently re-aligning it.
plt.hist(tmp[tmp<31], bins=32, color='#770000', label=u'все', alpha=0.3, width=0.8) # , histtype='stepfilled', normed=True,
plt.hist(tmp1[tmp1<31], bins=32, color='#007700', label=u'ж', alpha=0.6, width=0.7)
plt.hist(tmp2[tmp2<31], bins=32, color='#000077', label=u'м', alpha=1.0, width=0.6)
plt.xlabel(u'число транзакций в день')
plt.ylabel(u'число таких случаев (пар "клиент, день")')
plt.title(u'Встречаемость определённого количества транзакций в день')
plt.legend()
plt.xlim([0, 30])


# In[66]:


# Day x customer table of transaction counts: rows are num_day, columns are
# customer_id, NaN where a customer has no transactions on that day.
tmp = (
    transactions
    .groupby(['num_day', 'customer_id'])['amount']
    .count()
    .unstack()
)


# In[68]:


tmp.head()


# In[ ]:


# In[ ]:


# In[221]:


# Take one customer (the one at position 100 among the distinct ids, in
# order of first appearance) and look at their first 20 transactions.
distinct_customers = transactions['customer_id'].unique()
user = distinct_customers[100]

tr = transactions[transactions['customer_id'] == user]
tr.head(20)


# In[276]:


# Share of the selected user's transactions falling on each MCC code, over
# all MCC codes present in the full data (0 for codes the user never used).
tmp = tr.mcc_code.value_counts()

df_mcc = pd.DataFrame({'mcc':transactions.mcc_code.unique()})
# Map and fill in one assignment.  The previous
# df_mcc.new.fillna(0, inplace=True) filled in place through attribute
# access, which newer pandas may apply to a temporary copy instead.
df_mcc['new'] = df_mcc['mcc'].map(tmp.to_dict()).fillna(0)

#df_mcc.new.tolist()
# Normalise the counts so they sum to 1.
df_mcc['new'] = df_mcc['new']/df_mcc['new'].sum()
df_mcc


# In[146]:


# Net amount per day for the selected user, as a bar chart limited to the
# first 100 bar positions.
tmp = tr.groupby('num_day')['amount'].sum()
#lt.plot(tr.d)
p = tmp.plot.bar()
p.set_xlim(0, 100)


# In[128]:


# Mean and median number of transactions, followed by the number of
# transactions of the selected user.
# At this point `tmp` is the per-day sum Series of one user, so the previous
# `tmp.amount.count()` raised AttributeError.
# NOTE(review): the counts are taken per customer, which is presumably what
# the original out-of-order cell used - confirm.
# np.mean / np.median are called explicitly instead of the bare names that
# only exist through %pylab.
tx_per_customer = transactions.groupby('customer_id').amount.count().values
print ('Среднее число транзакций:')
print (np.mean(tx_per_customer))
print (np.median(tx_per_customer))
print (tr.shape[0])