# подгружаем все нужные пакеты
import pandas as pd
import numpy as np
# для встроенных картинок
%pylab inline
# чуть покрасивше картинки:
pd.set_option('display.mpl_style', 'default')
figsize(12, 9)
import warnings
warnings.filterwarnings("ignore")
#plt.rcParams['figure.figsize'] = 10, 7.5
#plt.rcParams['axes.grid'] = True
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.family'] = 'Ubuntu'
plt.rc('text', usetex=False)
plt.rc('font', family='serif')
plt.rc('font', weight='bold')
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
Populating the interactive namespace from numpy and matplotlib
# чтобы был русский шрифт
from matplotlib import rc
font = {'family': 'Verdana',
'weight': 'normal'}
rc('font', **font)
transactions = pd.read_csv('transactions.csv') # d:\\Competitions\\sdsj\\
print (transactions.shape)
transactions[:5]
(6849346, 6)
customer_id | tr_datetime | mcc_code | tr_type | amount | term_id | |
---|---|---|---|---|---|---|
0 | 39026145 | 0 10:23:26 | 4814 | 1030 | -2245.92 | NaN |
1 | 39026145 | 1 10:19:29 | 6011 | 7010 | 56147.89 | NaN |
2 | 39026145 | 1 10:20:56 | 4829 | 2330 | -56147.89 | NaN |
3 | 39026145 | 1 10:39:54 | 5499 | 1010 | -1392.47 | NaN |
4 | 39026145 | 2 15:33:42 | 5499 | 1010 | -920.83 | NaN |
tr_mcc_codes = pd.read_csv('tr_mcc_codes.csv', sep=';')
print (tr_mcc_codes.shape, transactions.mcc_code.nunique())
tr_mcc_codes[:5]
(184, 2) 184
mcc_code | mcc_description | |
---|---|---|
0 | 742 | Ветеринарные услуги |
1 | 1711 | Генеральные подрядчики по вентиляции, теплосна... |
2 | 1731 | Подрядчики по электричеству |
3 | 1799 | Подрядчики, специализированная торговля — нигд... |
4 | 2741 | Разнообразные издательства/печатное дело |
tr_types = pd.read_csv('tr_types.csv', sep=';')
print (tr_types.shape, transactions.tr_type.nunique())
tr_types[:5]
(155, 2) 77
tr_type | tr_description | |
---|---|---|
0 | 3200 | Плата за предоставление услуг посредством моби... |
1 | 3210 | Плата за предоставление отчета по счету карты ... |
2 | 3800 | Плата за обслуживание банковской карты (за пер... |
3 | 4000 | Плата за получение наличных в Сбербанке |
4 | 4001 | Плата за получение наличных в Сбербанке (в дру... |
customers_gender_train = pd.read_csv('customers_gender_train.csv')
print (customers_gender_train.shape)
customers_gender_train[:5]
(12000, 2)
customer_id | gender | |
---|---|---|
0 | 75562265 | 0 |
1 | 10928546 | 1 |
2 | 69348468 | 1 |
3 | 84816985 | 1 |
4 | 61009479 | 0 |
# присоедниение пола
transactions = pd.merge(transactions, customers_gender_train, on='customer_id', how='outer')
print (transactions.shape)
transactions[:5]
(6849346, 7)
customer_id | tr_datetime | mcc_code | tr_type | amount | term_id | gender | |
---|---|---|---|---|---|---|---|
0 | 39026145 | 0 10:23:26 | 4814 | 1030 | -2245.92 | NaN | 1.0 |
1 | 39026145 | 1 10:19:29 | 6011 | 7010 | 56147.89 | NaN | 1.0 |
2 | 39026145 | 1 10:20:56 | 4829 | 2330 | -56147.89 | NaN | 1.0 |
3 | 39026145 | 1 10:39:54 | 5499 | 1010 | -1392.47 | NaN | 1.0 |
4 | 39026145 | 2 15:33:42 | 5499 | 1010 | -920.83 | NaN | 1.0 |
transactions['num_day'] = transactions['tr_datetime'].apply(lambda x: int(x.split(' ')[0]))
transactions['datetime'] = transactions['tr_datetime'].apply(lambda x: x.split(' ')[1])
transactions['hour'] = transactions['datetime'].apply(lambda x: int(x.split(':')[0]))
transactions['minute'] = transactions['datetime'].apply(lambda x: int(x.split(':')[1]))
transactions['second'] = transactions['datetime'].apply(lambda x: int(x.split(':')[2]))
del transactions['tr_datetime']
transactions[:5]
customer_id | mcc_code | tr_type | amount | term_id | gender | num_day | datetime | hour | minute | second | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39026145 | 4814 | 1030 | -2245.92 | NaN | 1.0 | 0 | 10:23:26 | 10 | 23 | 26 |
1 | 39026145 | 6011 | 7010 | 56147.89 | NaN | 1.0 | 1 | 10:19:29 | 10 | 19 | 29 |
2 | 39026145 | 4829 | 2330 | -56147.89 | NaN | 1.0 | 1 | 10:20:56 | 10 | 20 | 56 |
3 | 39026145 | 5499 | 1010 | -1392.47 | NaN | 1.0 | 1 | 10:39:54 | 10 | 39 | 54 |
4 | 39026145 | 5499 | 1010 | -920.83 | NaN | 1.0 | 2 | 15:33:42 | 15 | 33 | 42 |
# день недели
transactions['dayofweek'] = transactions['num_day'].apply(lambda x: (x + 4) % 7)
transactions[:5]
customer_id | mcc_code | tr_type | amount | term_id | gender | num_day | datetime | hour | minute | second | dayofweek | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39026145 | 4814 | 1030 | -2245.92 | NaN | 1.0 | 0 | 10:23:26 | 10 | 23 | 26 | 4 |
1 | 39026145 | 6011 | 7010 | 56147.89 | NaN | 1.0 | 1 | 10:19:29 | 10 | 19 | 29 | 5 |
2 | 39026145 | 4829 | 2330 | -56147.89 | NaN | 1.0 | 1 | 10:20:56 | 10 | 20 | 56 | 5 |
3 | 39026145 | 5499 | 1010 | -1392.47 | NaN | 1.0 | 1 | 10:39:54 | 10 | 39 | 54 | 5 |
4 | 39026145 | 5499 | 1010 | -920.83 | NaN | 1.0 | 2 | 15:33:42 | 15 | 33 | 42 | 6 |
figsize(14, 5)
p = transactions.dayofweek.hist(bins=7) # там есть 60!
p.set_xlabel('dayofweek')
p.set_ylabel('count')
p.set_xlim([0, 6])
p.set_xticklabels([u'пн', u'вт', u'ср', u'чт', u'птн', u'сб', u'вск'])
p.set_title(u'Число транзакций в разные дни недели')
transactions.dayofweek.value_counts()
4 1100489 3 1033376 2 1028562 1 1014304 0 965424 5 923670 6 783521 Name: dayofweek, dtype: int64
figsize(12,5)
plt.hist(transactions.dayofweek.values, bins=7, color='#770000', label=u'все', alpha=0.3, width=0.8) # , histtype='stepfilled', normed=True,
plt.hist(transactions[transactions.gender==0].dayofweek.values, bins=7, color='#007700', label=u'ж', alpha=0.6, width=0.7)
plt.hist(transactions[transactions.gender==1].dayofweek.values, bins=7, color='#000077', label=u'м', alpha=1.0, width=0.6)
plt.xticks(0.25+ 0.85*np.arange(7), [u'пн', u'вт', u'ср', u'чт', u'птн', u'сб', u'вск']) # , rotation='vertical'
plt.title(u'Число транзакций в разные дни недели')
plt.legend()
<matplotlib.legend.Legend at 0xdfbcaef0>
figsize(12,5)
# там есть 60!
plt.hist(transactions.second.values, bins=61, color='#770000', label=u'все', alpha=0.3, width=0.8) # , histtype='stepfilled', normed=True,
plt.hist(transactions[transactions.gender==0].second.values, bins=61, color='#007700', label=u'ж', alpha=0.6, width=0.7)
plt.hist(transactions[transactions.gender==1].second.values, bins=61, color='#000077', label=u'м', alpha=1.0, width=0.6)
plt.xlabel('second')
plt.ylabel('count')
plt.title(u'Вхождения секунд во времена транзакций')
plt.legend()
plt.xlim([0, 60.5])
transactions.second.value_counts()[:5]
0 895545 29 100111 14 99811 47 99775 18 99760 Name: second, dtype: int64
print (transactions[transactions.gender==0].second.value_counts()[:5])
print (transactions[transactions.gender==1].second.value_counts()[:5]) # у мужчин это чаще!!!
0 323133 23 42586 16 42445 51 42390 33 42368 Name: second, dtype: int64 0 390156 47 36365 44 36331 14 36282 18 36281 Name: second, dtype: int64
figsize(12,5)
# там есть 60!
plt.hist(transactions.minute.values, bins=60, color='#770000', label=u'все', alpha=0.3, width=0.8) # , histtype='stepfilled', normed=True,
plt.hist(transactions[transactions.gender==0].minute.values, bins=60, color='#007700', label=u'ж', alpha=0.6, width=0.7)
plt.hist(transactions[transactions.gender==1].minute.values, bins=60, color='#000077', label=u'м', alpha=1.0, width=0.6)
plt.xlabel('minute')
plt.ylabel('count')
plt.title(u'Вхождения минут во времена транзакций')
plt.legend()
plt.xlim([0, 60.5])
transactions.minute.value_counts()[:5]
0 896413 13 102054 12 102011 16 101938 25 101922 Name: minute, dtype: int64
transactions['new'] = (transactions.minute==0) & (transactions.second==0) & (transactions.hour==0)
pd.crosstab(transactions.new, transactions.gender)
gender | 0.0 | 1.0 |
---|---|---|
new | ||
False | 2563868 | 2193579 |
True | 281466 | 354424 |
# что это за нулевое время...
transactions[transactions.new==True][::50000]
customer_id | mcc_code | tr_type | amount | term_id | gender | num_day | datetime | hour | minute | second | dayofweek | new | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
28 | 39026145 | 5331 | 1110 | -6288.56 | NaN | 1 | 15 | 00:00:00 | 0 | 0 | 0 | 5 | True |
351586 | 1521146 | 4814 | 1100 | -2245.92 | 22866601 | 1 | 448 | 00:00:00 | 0 | 0 | 0 | 4 | True |
864613 | 99289665 | 7995 | 1100 | -8085.30 | 77062921 | NaN | 321 | 00:00:00 | 0 | 0 | 0 | 3 | True |
1237730 | 60105418 | 4814 | 1100 | -10541.21 | 808444 | NaN | 262 | 00:00:00 | 0 | 0 | 0 | 0 | True |
1633848 | 95088517 | 5411 | 1110 | -8792.76 | 549433 | 0 | 285 | 00:00:00 | 0 | 0 | 0 | 2 | True |
2043057 | 15042508 | 5812 | 1110 | -3627.15 | 60002172 | 1 | 259 | 00:00:00 | 0 | 0 | 0 | 4 | True |
2469154 | 51301561 | 5814 | 1110 | -8197.59 | 26021804 | 0 | 235 | 00:00:00 | 0 | 0 | 0 | 1 | True |
2929955 | 64756863 | 5964 | 1200 | -16219.33 | NaN | 1 | 18 | 00:00:00 | 0 | 0 | 0 | 1 | True |
3398608 | 79492634 | 5441 | 1110 | -17095.91 | 25006972 | 0 | 264 | 00:00:00 | 0 | 0 | 0 | 2 | True |
3796662 | 63823503 | 7832 | 1110 | -7636.11 | 21479734 | NaN | 320 | 00:00:00 | 0 | 0 | 0 | 2 | True |
4285633 | 49495176 | 7311 | 1100 | -2245.92 | 808276 | 1 | 433 | 00:00:00 | 0 | 0 | 0 | 3 | True |
4752562 | 11341232 | 5533 | 1110 | -4491.83 | NaN | 1 | 134 | 00:00:00 | 0 | 0 | 0 | 5 | True |
5182846 | 63801566 | 5411 | 1110 | -16597.32 | 23697538 | 0 | 443 | 00:00:00 | 0 | 0 | 0 | 6 | True |
5604376 | 70192661 | 6051 | 1100 | -8961.20 | 03170966 | 1 | 345 | 00:00:00 | 0 | 0 | 0 | 6 | True |
6025278 | 6916077 | 5045 | 1110 | -55923.30 | 00723019 | 1 | 385 | 00:00:00 | 0 | 0 | 0 | 4 | True |
6457435 | 91437038 | 5735 | 1200 | -2358.21 | NaN | 1 | 90 | 00:00:00 | 0 | 0 | 0 | 3 | True |
# transactions[transactions.new==True]['tr_type'].unique()
figsize(12,5)
# там есть 60!
plt.hist(transactions.hour.values, bins=24, color='#770000', label=u'все', alpha=0.3, width=0.8) # , histtype='stepfilled', normed=True,
plt.hist(transactions[transactions.gender==0].hour.values, bins=24, color='#007700', label=u'ж', alpha=0.6, width=0.7)
plt.hist(transactions[transactions.gender==1].hour.values, bins=24, color='#000077', label=u'м', alpha=1.0, width=0.6)
plt.xlabel('hour')
plt.ylabel('count')
plt.title(u'Вхождения часов во времена транзакций')
plt.legend()
plt.xlim([0, 24])
transactions.hour.value_counts()[:5]
0 842343 13 449586 12 443782 15 432270 14 430720 Name: hour, dtype: int64
# число дней
transactions.num_day.nunique()
457
figsize(12,5)
# там есть 60!
plt.hist(transactions.num_day.values, bins=457, color='#770000', label=u'все', alpha=0.3, width=0.9) # , histtype='stepfilled', normed=True,
plt.hist(transactions[transactions.gender==0].num_day.values, bins=457, color='#007700', label=u'ж', alpha=0.6, width=0.7)
plt.hist(transactions[transactions.gender==1].num_day.values, bins=457, color='#000077', label=u'м', alpha=1.0, width=0.5)
plt.xlabel('day')
plt.ylabel('count')
plt.title(u'Вхождения дней во времена транзакций')
plt.legend()
plt.xlim([0, 457])
# plt.xlim([153-10, 153+(120-1) + 4 + 10])
transactions.num_day.value_counts()[:5]
314 20552 272 20368 441 20203 448 20178 425 19979 Name: num_day, dtype: int64
457-153
304
tmp = transactions.groupby('customer_id')
f1 = tmp.amount.count().values
f2 = tmp.amount.mean().values
g = tmp.gender.mean().values
plt.scatter(np.log(f1[g==0]+1), np.sign(f2[g==0])*np.log(np.abs(f2[g==0])+1), c='#990000')
plt.scatter(np.log(f1[g==1]+1), np.sign(f2[g==1])*np.log(np.abs(f2[g==1])+1), c='#000099')
plt.xlabel(u'~ log число транзакций')
plt.ylabel(u'~ log средняя транзакция')
<matplotlib.text.Text at 0xdff5b5f8>
tmp = transactions.groupby('customer_id')
f1 = tmp.amount.apply(lambda x: sum(x[x>0])).values
f2 = tmp.amount.apply(lambda x: sum(x[x<0])).values
g = tmp.gender.mean().values
figsize(7, 6)
plt.scatter(np.log(f1[g==0] + 1.0), np.log(1.0 - f2[g==0]), c='#990000', s=20, alpha = 0.3, label=u'ж')
plt.scatter(np.log(f1[g==1] + 1.0), np.log(1.0 - f2[g==1]), c='#000099', s=20, alpha = 0.3, label=u'м')
plt.xlabel(u'~ log сумма начислений')
plt.ylabel(u'~ log сумма трат')
plt.xlim([-0.5, 25])
plt.ylim([-0.5, 25])
plt.title(u'Общий баланс', fontsize=12)
plt.legend()
<matplotlib.legend.Legend at 0x40952da0>