データ解析¶

傾向の可視化

pd.read_csv は parse_dates 引数に列名を渡すと，その列を自動的に日付型（datetime）として読み込んでくれる

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.plotly as py
from datetime import date
import random
import warnings
import gc
import math
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

# Suppress every warning raised from here on and initialise plotly's
# offline notebook mode so that iplot() renders inline.
warnings.filterwarnings("ignore")
init_notebook_mode()

# Columns of the periods files that are parsed into datetimes on load.
period_date_cols = ["activation_date", "date_from", "date_to"]

# Main train / test tables; only `activation_date` is parsed as a date.
train_df = pd.read_csv("input/train.csv", parse_dates=["activation_date"])
test_df = pd.read_csv("input/test.csv", parse_dates=["activation_date"])

# Period tables for train / test.
pr_train = pd.read_csv("input/periods_train.csv", parse_dates=period_date_cols)
pr_test = pd.read_csv("input/periods_test.csv", parse_dates=period_date_cols)

C:\Users\tmy19\Miniconda3\envs\tensorflow\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning:

This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

曜日，月，日の列を追加

In [2]:

# Split the activation date into calendar features.
# pandas numbers weekdays from Monday=0 to Sunday=6.
activation = train_df['activation_date'].dt
for part in ('weekday', 'month', 'day'):
    train_df[part] = getattr(activation, part)

NaNを空白に変換

apply で列の各要素に対して関数を適用

In [3]:

# For each free-text column: replace missing values with a single space,
# then store the number of whitespace-separated tokens in `<column>_len`.
for text_col in ('description', 'title'):
    train_df[text_col] = train_df[text_col].fillna(' ')
    train_df[text_col + '_len'] = train_df[text_col].apply(
        lambda text: len(text.split()))

In [4]:

# Two-way label for the target. A missing probability compares False against
# 0.5 and therefore falls into "<0.5".
train_df['deal_class'] = np.where(
    train_df['deal_probability'] >= 0.5, ">=0.5", "<0.5")

# Span between the two period dates, kept as a timedelta.
pr_train['total_period'] = pr_train['date_to'].sub(pr_train['date_from'])

In [5]:

# Russian -> English labels for the values of `parent_category_name`.
# Used to build the `parent_category_name_en` column; a value missing from
# this dict raises KeyError at that point.
parent_category_name_map = {"Личные вещи" : "Personal belongings",
                            "Для дома и дачи" : "For the home and garden",
                            "Бытовая электроника" : "Consumer electronics",
                            "Недвижимость" : "Real estate",
                            "Хобби и отдых" : "Hobbies & leisure",
                            "Транспорт" : "Transport",
                            "Услуги" : "Services",
                            "Животные" : "Animals",
                            "Для бизнеса" : "For business"}

# Russian -> English labels for the values of `region`.
# Used to build the `region_en` column; a value missing from this dict
# raises KeyError at that point.
region_map = {"Свердловская область" : "Sverdlovsk oblast",
            "Самарская область" : "Samara oblast",
            "Ростовская область" : "Rostov oblast",
            "Татарстан" : "Tatarstan",
            "Волгоградская область" : "Volgograd oblast",
            "Нижегородская область" : "Nizhny Novgorod oblast",
            "Пермский край" : "Perm Krai",
            "Оренбургская область" : "Orenburg oblast",
            "Ханты-Мансийский АО" : "Khanty-Mansi Autonomous Okrug",
            "Тюменская область" : "Tyumen oblast",
            "Башкортостан" : "Bashkortostan",
            "Краснодарский край" : "Krasnodar Krai",
            "Новосибирская область" : "Novosibirsk oblast",
            "Омская область" : "Omsk oblast",
            "Белгородская область" : "Belgorod oblast",
            "Челябинская область" : "Chelyabinsk oblast",
            "Воронежская область" : "Voronezh oblast",
            "Кемеровская область" : "Kemerovo oblast",
            "Саратовская область" : "Saratov oblast",
            "Владимирская область" : "Vladimir oblast",
            "Калининградская область" : "Kaliningrad oblast",
            "Красноярский край" : "Krasnoyarsk Krai",
            "Ярославская область" : "Yaroslavl oblast",
            "Удмуртия" : "Udmurtia",
            "Алтайский край" : "Altai Krai",
            "Иркутская область" : "Irkutsk oblast",
            "Ставропольский край" : "Stavropol Krai",
            "Тульская область" : "Tula oblast"}


# Russian -> English labels for the values of `category_name`.
# Used to build the `category_name_en` column; a value missing from this
# dict raises KeyError at that point.
category_map = {"Одежда, обувь, аксессуары":"Clothing, shoes, accessories",
"Детская одежда и обувь":"Children's clothing and shoes",
"Товары для детей и игрушки":"Children's products and toys",
"Квартиры":"Apartments",
"Телефоны":"Phones",
"Мебель и интерьер":"Furniture and interior",
"Предложение услуг":"Offer services",
"Автомобили":"Cars",
"Ремонт и строительство":"Repair and construction",
"Бытовая техника":"Appliances",
"Товары для компьютера":"Products for computer",
"Дома, дачи, коттеджи":"Houses, villas, cottages",
"Красота и здоровье":"Health and beauty",
"Аудио и видео":"Audio and video",
"Спорт и отдых":"Sports and recreation",
"Коллекционирование":"Collecting",
"Оборудование для бизнеса":"Equipment for business",
"Земельные участки":"Land",
"Часы и украшения":"Watches and jewelry",
"Книги и журналы":"Books and magazines",
"Собаки":"Dogs",
"Игры, приставки и программы":"Games, consoles and software",
"Другие животные":"Other animals",
"Велосипеды":"Bikes",
"Ноутбуки":"Laptops",
"Кошки":"Cats",
"Грузовики и спецтехника":"Trucks and buses",
"Посуда и товары для кухни":"Tableware and goods for kitchen",
"Растения":"Plants",
"Планшеты и электронные книги":"Tablets and e-books",
"Товары для животных":"Pet products",
"Комнаты":"Room",
"Фототехника":"Photo",
"Коммерческая недвижимость":"Commercial property",
"Гаражи и машиноместа":"Garages and Parking spaces",
"Музыкальные инструменты":"Musical instruments",
"Оргтехника и расходники":"Office equipment and consumables",
"Птицы":"Birds",
"Продукты питания":"Food",
"Мотоциклы и мототехника":"Motorcycles and bikes",
"Настольные компьютеры":"Desktop computers",
"Аквариум":"Aquarium",
"Охота и рыбалка":"Hunting and fishing",
"Билеты и путешествия":"Tickets and travel",
"Водный транспорт":"Water transport",
"Готовый бизнес":"Ready business",
"Недвижимость за рубежом":"Property abroad"}

In [6]:

# Add an English `<column>_en` twin for each translated categorical column.
# Direct dict indexing is kept on purpose: an untranslated value raises
# KeyError instead of silently becoming NaN.
translations = [
    ('region', region_map),
    ('parent_category_name', parent_category_name_map),
    ('category_name', category_map),
]
for source_col, mapping in translations:
    train_df[source_col + '_en'] = train_df[source_col].apply(
        mapping.__getitem__)

In [7]:

# Preview the first rows of train_df, including the columns added above.
train_df.head()

Out[7]:

	item_id	user_id	region	city	parent_category_name	category_name	param_1	param_2	param_3	title	...	deal_probability	weekday	month	day	description_len	title_len	deal_class	region_en	parent_category_name_en	category_name_en
0	b912c3c6a6ad	e00f8ff2eaf9	Свердловская область	Екатеринбург	Личные вещи	Товары для детей и игрушки	Постельные принадлежности	NaN	NaN	Кокоби(кокон для сна)	...	0.12789	1	3	28	7	3	<0.5	Sverdlovsk oblast	Personal belongings	Children's products and toys
1	2dac0150717d	39aeb48f0017	Самарская область	Самара	Для дома и дачи	Мебель и интерьер	Другое	NaN	NaN	Стойка для Одежды	...	0.00000	6	3	26	7	3	<0.5	Samara oblast	For the home and garden	Furniture and interior
2	ba83aefab5dc	91e2f88dd6e3	Ростовская область	Ростов-на-Дону	Бытовая электроника	Аудио и видео	Видео, DVD и Blu-ray плееры	NaN	NaN	Philips bluray	...	0.43177	0	3	20	17	2	<0.5	Rostov oblast	Consumer electronics	Audio and video
3	02996f1dd2ea	bf5cccea572d	Татарстан	Набережные Челны	Личные вещи	Товары для детей и игрушки	Автомобильные кресла	NaN	NaN	Автокресло	...	0.80323	5	3	25	3	1	>=0.5	Tatarstan	Personal belongings	Children's products and toys
4	7c90be56d2ab	ef50846afc0b	Волгоградская область	Волгоград	Транспорт	Автомобили	С пробегом	ВАЗ (LADA)	2110	ВАЗ 2110, 2003	...	0.20797	3	3	16	4	3	<0.5	Volgograd oblast	Transport	Cars

5 rows × 27 columns

In [8]:

# Preview the first rows of pr_train, including the derived `total_period`.
pr_train.head()

Out[8]:

	item_id	activation_date	date_from	date_to	total_period
0	8f5caef7afb0	2017-02-14	2017-03-15	2017-03-16	1 days
1	66218ff526d1	2017-02-16	2017-03-15	2017-03-18	3 days
2	b237d9539b21	2017-03-01	2017-03-15	2017-03-28	13 days
3	80bf58082ad3	2017-03-19	2017-03-19	2017-03-28	9 days
4	67a9944a7373	2017-03-14	2017-03-15	2017-03-28	13 days

データ分布を可視化する¶

In [9]:

def generate_histgram_plot(df, col, title):
    """Display an interactive plotly histogram of ``df[col]``.

    Args:
        df: Table indexable by ``col`` (a DataFrame in this notebook).
        col: Name of the column whose values are binned.
        title: Title shown above the figure.

    Returns:
        None. The figure is rendered inline through ``iplot``.
    """
    figure = go.Figure(
        data=[go.Histogram(x=df[col])],
        layout=go.Layout(title=title, height=400,
                         legend=dict(orientation='h')),
    )
    iplot(figure)

Target:Deal Probability(広告が実際に何かを売ったと見込める確率)¶

In [10]:

# Distribution of the target, drawn on an explicitly created axes.
dist_fig, dist_ax = plt.subplots(figsize=(15, 5))
sns.distplot(train_df['deal_probability'].values, bins=120, color='#ff201e',
             ax=dist_ax)
dist_ax.set_xlabel('Deal Probability', fontsize=14)
dist_ax.set_title("Distribution of Deal Probability", fontsize=14)
plt.show()

In [11]:

def _generate_bar_plot_hor(df, col, title, color, w, h, lm=0, limit=100,
                          need_trace=False):
    """Horizontal bar chart of the most frequent values of ``df[col]``.

    Args:
        df: DataFrame holding the data.
        col: Column whose value frequencies are plotted.
        title: Figure title (ignored when ``need_trace`` is True).
        color: Bar colour, forwarded to the plotly marker.
        w: Figure width (ignored when ``need_trace`` is True).
        h: Figure height (ignored when ``need_trace`` is True).
        lm: Left margin of the figure (ignored when ``need_trace`` is True).
        limit: Maximum number of distinct values to keep, most frequent first.
        need_trace: If True, return the ``go.Bar`` trace instead of drawing,
            so the caller can place it in a subplot grid.

    Returns:
        The ``go.Bar`` trace when ``need_trace`` is True; otherwise None,
        after the figure has been displayed with ``iplot``.
    """
    # Use .iloc so the cut is always positional. With plain `[:limit]`,
    # pandas (< 3.0) slices a float-dtype index by label, so on a float
    # column `limit` would be looked up as a value rather than a row count.
    cnt_srs = df[col].value_counts().iloc[:limit]
    # Order is reversed, presumably so the most frequent value ends up at
    # the top of the horizontal chart.
    trace = go.Bar(y=cnt_srs.index[::-1], x=cnt_srs.values[::-1],
                   orientation='h', marker=dict(color=color))
    if need_trace:
        return trace
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
    
def _generate_bar_plot_ver(df, col, title, color, w, h, lm=0, limit=100,
                         need_trace=False):
    """Vertical bar chart of the most frequent values of ``df[col]``.

    Args:
        df: DataFrame holding the data.
        col: Column whose value frequencies are plotted.
        title: Figure title (ignored when ``need_trace`` is True).
        color: Bar colour (or list of colours), forwarded to the plotly marker.
        w: Figure width (ignored when ``need_trace`` is True).
        h: Figure height (ignored when ``need_trace`` is True).
        lm: Left margin of the figure (ignored when ``need_trace`` is True).
        limit: Maximum number of distinct values to keep, most frequent first.
        need_trace: If True, return the ``go.Bar`` trace instead of drawing,
            so the caller can place it in a subplot grid.

    Returns:
        The ``go.Bar`` trace when ``need_trace`` is True; otherwise None,
        after the figure has been displayed with ``iplot``.
    """
    # Use .iloc so the cut is always positional. With plain `[:limit]`,
    # pandas (< 3.0) slices a float-dtype index by label, so on a float
    # column `limit` would be looked up as a value rather than a row count.
    cnt_srs = df[col].value_counts().iloc[:limit]
    trace = go.Bar(y=list(cnt_srs.values), x=list(cnt_srs.index),
                   marker=dict(color=color))
    if need_trace:
        return trace
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)

In [12]:

# Number of ads in each of the two deal_class buckets.
_generate_bar_plot_ver(
    df=train_df, col='deal_class',
    title="Value of Deal Probabilities(<0.5 or >=0.5)",
    color=['#f32465', '#83f216'], w=700, h=400, lm=200,
)

特徴量の分布¶

In [13]:

# Feature columns plotted in the cells below; those cells refer to the
# entries by position, so the order here matters.
cols = [
    'parent_category_name_en',  # 0
    'category_name_en',         # 1
    'region_en',                # 2
    'city',                     # 3
    'param_1',                  # 4
    'param_2',                  # 5
    'param_3',                  # 6
    'weekday',                  # 7
    'day',                      # 8
    'title_len',                # 9
    'description_len',          # 10
    'image_top_1',              # 11
    'user_id',                  # 12
]

In [14]:

# Ads per parent category (English labels).
_generate_bar_plot_hor(
    df=train_df, col=cols[0], title="Distribution of Parent Category",
    color="#ff8888", w=600, h=400, lm=200,
)

In [15]:

# Ads per category (English labels).
_generate_bar_plot_hor(
    df=train_df, col=cols[1], title="Distribution of Category",
    color="#acff88", w=600, h=500, lm=200,
)

In [16]:

# Ads per region (English labels), limited to the 30 most frequent.
_generate_bar_plot_hor(
    df=train_df, col=cols[2], title="Distribution of Region",
    color="#7484f3", w=600, h=600, lm=200, limit=30,
)

In [17]:

# Ads per city, limited to the 30 most frequent.
_generate_bar_plot_hor(
    df=train_df, col=cols[3], title="Distribution of City",
    color="#c4f7c3", w=600, h=600, lm=200, limit=30,
)

In [18]:

# Most frequent values of param_1 / param_2 / param_3, stacked in one figure.
param_titles = ['Param 1 Values', 'Param 2 Values', 'Param 3 Values']
param_cols = [cols[4], cols[5], cols[6]]
trace1, trace2, trace3 = [
    _generate_bar_plot_hor(train_df, param_col, param_title, "#77f4e9",
                           700, 400, 200, limit=20, need_trace=True)
    for param_col, param_title in zip(param_cols, param_titles)
]

fig = tools.make_subplots(rows=3, cols=1, print_grid=False,
                          subplot_titles=param_titles)
# One subplot row per parameter column.
for row, param_trace in enumerate((trace1, trace2, trace3), start=1):
    fig.append_trace(param_trace, row, 1)

fig['layout'].update(height=1000,
                     title='Top of Values in Param 1,2,3 columns',
                     showlegend=False)
iplot(fig)

In [19]:

# Ads per weekday and per day of month, side by side.
day_titles = ['Week Days', 'Month Days']
day_cols = [cols[7], cols[8]]
trace1, trace2 = [
    _generate_bar_plot_ver(train_df, day_col, day_title, "#e994ff",
                           700, 400, 200, limit=30, need_trace=True)
    for day_col, day_title in zip(day_cols, day_titles)
]

fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
                          subplot_titles=day_titles)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)

fig['layout'].update(height=300,
                     title='Ads Posted in different Week/Month Days',
                     showlegend=False)
iplot(fig)

In [20]:

# Most frequent title / description word counts, side by side.
word_count_specs = [
    (cols[9], "Title Word Count"),
    (cols[10], "Description Count"),
]
trace1, trace2 = [
    _generate_bar_plot_ver(train_df, count_col, count_title, "#ffb341",
                           700, 400, 200, limit=30, need_trace=True)
    for count_col, count_title in word_count_specs
]

fig = tools.make_subplots(
    rows=1, cols=2, print_grid=False,
    subplot_titles=['Title Word Count', 'Description Word Count'])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)

fig['layout'].update(height=400, title='', showlegend=False)
iplot(fig)

In [21]:

# Most frequent image_top_1 values and most active user ids, side by side.
id_titles = ['Image Top 1', 'User Id']
id_cols = [cols[11], cols[12]]
trace1, trace2 = [
    _generate_bar_plot_ver(train_df, id_col, id_title, '#e2dfd9',
                           700, 400, 200, limit=30, need_trace=True)
    for id_col, id_title in zip(id_cols, id_titles)
]

fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
                          subplot_titles=id_titles)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)

fig['layout'].update(height=400,
                     title='Ads having different ImageTop1 and User Id',
                     showlegend=False)
iplot(fig);

In [22]:

# Run length of each period in whole days.
# It is computed directly from the date columns with `.dt.days` rather than
# by parsing the day count out of str(Timedelta). That makes the cell
# idempotent on re-run, independent of how an earlier cell left
# `total_period`, and tolerant of missing dates (they become NaN instead of
# raising ValueError on int('NaT')).
pr_train['total_period'] = (
    pr_train['date_to'] - pr_train['date_from']).dt.days

# Share of periods for each run length (value_counts ignores NaN).
t = pr_train['total_period'].value_counts()
labels = list(t.index)
values = list(t.values)
layout = go.Layout(title='For How Much Days Ads are Run', width=600, height=400,
                   margin=dict(l=100))
trace = go.Pie(labels=labels, values=values,
               marker=dict(colors=['#f9c968', '#75e575', '#d693b4']))
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [23]:

def _create_pie_chart(df, col):
    """Build a plotly pie trace from the value frequencies of ``df[col]``.

    Args:
        df: DataFrame holding the data.
        col: Column whose distinct values become the pie slices.

    Returns:
        A ``go.Pie`` trace with one slice per distinct value, sized by its
        count. The figure itself is not created or displayed here.
    """
    counts = df[col].value_counts()
    return go.Pie(
        labels=list(counts.index),
        values=list(counts.values),
        marker=dict(colors=['#f9c968', '#75e575', '#d693b3']),
    )
# Pie chart of the share of ads per user_type.
layout = go.Layout(title='Distribution of User Type',
                   width=600, height=400, margin=dict(l=100))
trace1 = _create_pie_chart(train_df, 'user_type')
data = [trace1]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

多変量解析¶

In [24]:

# Pairwise correlations of the numeric (and boolean) columns.
# They are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns in DataFrame.corr() and raises on string columns.
corr = train_df.select_dtypes(include=[np.number, 'bool']).corr()

# Hide the upper triangle and the diagonal, since the matrix is symmetric.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.set(style='white')

f, ax = plt.subplots(figsize=(10, 8))
# Hues 220 (blue) and 10 (red) give visibly different colours to negative
# and positive correlations around center=0. The previous pair (20, 10) used
# almost the same hue at both ends of the "diverging" palette.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.1, cbar_kws={"shrink": .5}, ax=ax)
# Render the figure explicitly so the Axes repr is not echoed as cell output.
plt.show()

Out[24]:

<matplotlib.axes._subplots.AxesSubplot at 0x13f0c404d68>

In [ ]: