import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.plotly as py
from datetime import date
import random
import warnings
import gc
import math
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
init_notebook_mode()
# Load the ad tables and their activity-period tables.  Every date-like
# column is parsed up front so the .dt accessor and date arithmetic work
# later on.
train_df = pd.read_csv(
    "input/train.csv",
    parse_dates=["activation_date"],
)
test_df = pd.read_csv(
    'input/test.csv',
    parse_dates=["activation_date"],
)
pr_train = pd.read_csv(
    "input/periods_train.csv",
    parse_dates=["activation_date", "date_from", "date_to"],
)
pr_test = pd.read_csv(
    "input/periods_test.csv",
    parse_dates=["activation_date", "date_from", "date_to"],
)
C:\Users\tmy19\Miniconda3\envs\tensorflow\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
# Add weekday, month, and day-of-month columns derived from activation_date.
# Calendar features taken from the parsed activation date.
# `dayofweek` is the same attribute as `weekday` (Monday == 0).
train_df['weekday'] = train_df['activation_date'].dt.dayofweek
train_df['month'] = train_df['activation_date'].dt.month
train_df['day'] = train_df['activation_date'].dt.day
# Convert `NaN` to a blank string, then apply a function to each row.
# Missing text becomes a single blank so that word counting never sees NaN.
train_df['description'] = train_df['description'].fillna(' ')
train_df['description_len'] = train_df['description'].str.split().str.len()
train_df['title'] = train_df['title'].fillna(' ')
train_df['title_len'] = train_df['title'].str.split().str.len()

# Two-way bucket of the target; a NaN probability compares False against
# 0.5 and therefore lands in "<0.5".
train_df['deal_class'] = (train_df['deal_probability'] >= 0.5).map(
    {True: ">=0.5", False: "<0.5"})

# Length of each activity period, as a timedelta.
pr_train['total_period'] = pr_train['date_to'].sub(pr_train['date_from'])
# Russian -> English labels for the `parent_category_name` column.
parent_category_name_map = {
    "Личные вещи": "Personal belongings",
    "Для дома и дачи": "For the home and garden",
    "Бытовая электроника": "Consumer electronics",
    "Недвижимость": "Real estate",
    "Хобби и отдых": "Hobbies & leisure",
    "Транспорт": "Transport",
    "Услуги": "Services",
    "Животные": "Animals",
    "Для бизнеса": "For business",
}
# Russian -> English labels for the `region` column.
region_map = {
    "Свердловская область": "Sverdlovsk oblast",
    "Самарская область": "Samara oblast",
    "Ростовская область": "Rostov oblast",
    "Татарстан": "Tatarstan",
    "Волгоградская область": "Volgograd oblast",
    "Нижегородская область": "Nizhny Novgorod oblast",
    "Пермский край": "Perm Krai",
    "Оренбургская область": "Orenburg oblast",
    "Ханты-Мансийский АО": "Khanty-Mansi Autonomous Okrug",
    "Тюменская область": "Tyumen oblast",
    "Башкортостан": "Bashkortostan",
    "Краснодарский край": "Krasnodar Krai",
    "Новосибирская область": "Novosibirsk oblast",
    "Омская область": "Omsk oblast",
    "Белгородская область": "Belgorod oblast",
    "Челябинская область": "Chelyabinsk oblast",
    "Воронежская область": "Voronezh oblast",
    "Кемеровская область": "Kemerovo oblast",
    "Саратовская область": "Saratov oblast",
    "Владимирская область": "Vladimir oblast",
    "Калининградская область": "Kaliningrad oblast",
    "Красноярский край": "Krasnoyarsk Krai",
    "Ярославская область": "Yaroslavl oblast",
    "Удмуртия": "Udmurtia",
    "Алтайский край": "Altai Krai",
    "Иркутская область": "Irkutsk oblast",
    "Ставропольский край": "Stavropol Krai",
    "Тульская область": "Tula oblast",
}
# Russian -> English labels for the `category_name` column.
category_map = {
    "Одежда, обувь, аксессуары": "Clothing, shoes, accessories",
    "Детская одежда и обувь": "Children's clothing and shoes",
    "Товары для детей и игрушки": "Children's products and toys",
    "Квартиры": "Apartments",
    "Телефоны": "Phones",
    "Мебель и интерьер": "Furniture and interior",
    "Предложение услуг": "Offer services",
    "Автомобили": "Cars",
    "Ремонт и строительство": "Repair and construction",
    "Бытовая техника": "Appliances",
    "Товары для компьютера": "Products for computer",
    "Дома, дачи, коттеджи": "Houses, villas, cottages",
    "Красота и здоровье": "Health and beauty",
    "Аудио и видео": "Audio and video",
    "Спорт и отдых": "Sports and recreation",
    "Коллекционирование": "Collecting",
    "Оборудование для бизнеса": "Equipment for business",
    "Земельные участки": "Land",
    "Часы и украшения": "Watches and jewelry",
    "Книги и журналы": "Books and magazines",
    "Собаки": "Dogs",
    "Игры, приставки и программы": "Games, consoles and software",
    "Другие животные": "Other animals",
    "Велосипеды": "Bikes",
    "Ноутбуки": "Laptops",
    "Кошки": "Cats",
    "Грузовики и спецтехника": "Trucks and buses",
    "Посуда и товары для кухни": "Tableware and goods for kitchen",
    "Растения": "Plants",
    "Планшеты и электронные книги": "Tablets and e-books",
    "Товары для животных": "Pet products",
    "Комнаты": "Room",
    "Фототехника": "Photo",
    "Коммерческая недвижимость": "Commercial property",
    "Гаражи и машиноместа": "Garages and Parking spaces",
    "Музыкальные инструменты": "Musical instruments",
    "Оргтехника и расходники": "Office equipment and consumables",
    "Птицы": "Birds",
    "Продукты питания": "Food",
    "Мотоциклы и мототехника": "Motorcycles and bikes",
    "Настольные компьютеры": "Desktop computers",
    "Аквариум": "Aquarium",
    "Охота и рыбалка": "Hunting and fishing",
    "Билеты и путешествия": "Tickets and travel",
    "Водный транспорт": "Water transport",
    "Готовый бизнес": "Ready business",
    "Недвижимость за рубежом": "Property abroad",
}
# Add English versions of the Russian label columns.
#
# Series.map(dict) does the lookup without a Python-level lambda per row and
# yields NaN for a label missing from the dictionary instead of raising
# KeyError.  The fillna() then falls back to the original label, so an
# unmapped value stays visible rather than aborting the run.
train_df['region_en'] = (
    train_df['region'].map(region_map).fillna(train_df['region']))
train_df['parent_category_name_en'] = (
    train_df['parent_category_name']
    .map(parent_category_name_map)
    .fillna(train_df['parent_category_name']))
train_df['category_name_en'] = (
    train_df['category_name']
    .map(category_map)
    .fillna(train_df['category_name']))
train_df.head()
item_id | user_id | region | city | parent_category_name | category_name | param_1 | param_2 | param_3 | title | ... | deal_probability | weekday | month | day | description_len | title_len | deal_class | region_en | parent_category_name_en | category_name_en | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | b912c3c6a6ad | e00f8ff2eaf9 | Свердловская область | Екатеринбург | Личные вещи | Товары для детей и игрушки | Постельные принадлежности | NaN | NaN | Кокоби(кокон для сна) | ... | 0.12789 | 1 | 3 | 28 | 7 | 3 | <0.5 | Sverdlovsk oblast | Personal belongings | Children's products and toys |
1 | 2dac0150717d | 39aeb48f0017 | Самарская область | Самара | Для дома и дачи | Мебель и интерьер | Другое | NaN | NaN | Стойка для Одежды | ... | 0.00000 | 6 | 3 | 26 | 7 | 3 | <0.5 | Samara oblast | For the home and garden | Furniture and interior |
2 | ba83aefab5dc | 91e2f88dd6e3 | Ростовская область | Ростов-на-Дону | Бытовая электроника | Аудио и видео | Видео, DVD и Blu-ray плееры | NaN | NaN | Philips bluray | ... | 0.43177 | 0 | 3 | 20 | 17 | 2 | <0.5 | Rostov oblast | Consumer electronics | Audio and video |
3 | 02996f1dd2ea | bf5cccea572d | Татарстан | Набережные Челны | Личные вещи | Товары для детей и игрушки | Автомобильные кресла | NaN | NaN | Автокресло | ... | 0.80323 | 5 | 3 | 25 | 3 | 1 | >=0.5 | Tatarstan | Personal belongings | Children's products and toys |
4 | 7c90be56d2ab | ef50846afc0b | Волгоградская область | Волгоград | Транспорт | Автомобили | С пробегом | ВАЗ (LADA) | 2110 | ВАЗ 2110, 2003 | ... | 0.20797 | 3 | 3 | 16 | 4 | 3 | <0.5 | Volgograd oblast | Transport | Cars |
5 rows × 27 columns
# Preview the first rows of the periods table.
pr_train.head()
item_id | activation_date | date_from | date_to | total_period | |
---|---|---|---|---|---|
0 | 8f5caef7afb0 | 2017-02-14 | 2017-03-15 | 2017-03-16 | 1 days |
1 | 66218ff526d1 | 2017-02-16 | 2017-03-15 | 2017-03-18 | 3 days |
2 | b237d9539b21 | 2017-03-01 | 2017-03-15 | 2017-03-28 | 13 days |
3 | 80bf58082ad3 | 2017-03-19 | 2017-03-19 | 2017-03-28 | 9 days |
4 | 67a9944a7373 | 2017-03-14 | 2017-03-15 | 2017-03-28 | 13 days |
def generate_histgram_plot(df, col, title):
    """Render an interactive plotly histogram of ``df[col]``.

    Args:
        df: Table holding the column to plot.
        col: Name of the column whose values are binned.
        title: Text shown as the figure title.

    Returns:
        None. The figure is displayed through ``iplot``.
    """
    figure = go.Figure(
        data=[go.Histogram(x=df[col])],
        layout=go.Layout(title=title, height=400,
                         legend={'orientation': 'h'}),
    )
    iplot(figure)
# Histogram + density estimate of the target variable.
plt.figure(figsize=(15, 5))
sns.distplot(
    train_df['deal_probability'].values,
    bins=120,
    color='#ff201e',
)
plt.title("Distribution of Deal Probability", fontsize=14)
plt.xlabel('Deal Probability', fontsize=14)
plt.show()
def _generate_bar_plot_hor(df, col, title, color, w, h, lm=0, limit=100,
                           need_trace=False):
    """Build a horizontal bar chart of the most frequent values of ``df[col]``.

    Args:
        df: Table holding the column to summarise.
        col: Column whose value frequencies are plotted.
        title: Figure title (only used when the figure is drawn here).
        color: Bar colour passed to the plotly marker.
        w: Figure width (only used when the figure is drawn here).
        h: Figure height (only used when the figure is drawn here).
        lm: Left margin of the figure (only used when drawn here).
        limit: Maximum number of most-frequent values to keep.
        need_trace: When true, return the bar trace instead of drawing.

    Returns:
        The ``go.Bar`` trace when ``need_trace`` is true, otherwise None
        after displaying the figure through ``iplot``.
    """
    # Take the first `limit` rows strictly by position.  Plain `[:limit]`
    # on a Series whose index is float-valued (e.g. counts of a numeric
    # column containing NaN) can be interpreted as a label slice, which
    # does not select the top `limit` entries.
    cnt_srs = df[col].value_counts().iloc[:limit]
    # Reverse so that the most frequent value is drawn at the top.
    trace = go.Bar(y=cnt_srs.index[::-1], x=cnt_srs.values[::-1],
                   orientation='h', marker=dict(color=color))
    if need_trace:
        return trace
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
def _generate_bar_plot_ver(df, col, title, color, w, h, lm=0, limit=100,
                           need_trace=False):
    """Build a vertical bar chart of the most frequent values of ``df[col]``.

    Args:
        df: Table holding the column to summarise.
        col: Column whose value frequencies are plotted.
        title: Figure title (only used when the figure is drawn here).
        color: Bar colour (or list of colours) passed to the plotly marker.
        w: Figure width (only used when the figure is drawn here).
        h: Figure height (only used when the figure is drawn here).
        lm: Left margin of the figure (only used when drawn here).
        limit: Maximum number of most-frequent values to keep.
        need_trace: When true, return the bar trace instead of drawing.

    Returns:
        The ``go.Bar`` trace when ``need_trace`` is true, otherwise None
        after displaying the figure through ``iplot``.
    """
    # Take the first `limit` rows strictly by position.  Plain `[:limit]`
    # on a Series whose index is float-valued (e.g. counts of a numeric
    # column containing NaN) can be interpreted as a label slice, which
    # does not select the top `limit` entries.
    cnt_srs = df[col].value_counts().iloc[:limit]
    trace = go.Bar(y=list(cnt_srs.values), x=list(cnt_srs.index),
                   marker=dict(color=color))
    if need_trace:
        return trace
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
# Share of ads on each side of the 0.5 deal-probability threshold.
_generate_bar_plot_ver(
    df=train_df, col='deal_class',
    title="Value of Deal Probabilities(<0.5 or >=0.5)",
    color=['#f32465', '#83f216'], w=700, h=400, lm=200)

# Columns explored below; later plots address them by position.
cols = [
    'parent_category_name_en', 'category_name_en', 'region_en', 'city',
    'param_1', 'param_2', 'param_3',
    'weekday', 'day',
    'title_len', 'description_len',
    'image_top_1', 'user_id',
]

# Frequency of each categorical / geographic label.
_generate_bar_plot_hor(
    df=train_df, col=cols[0], title="Distribution of Parent Category",
    color="#ff8888", w=600, h=400, lm=200)
_generate_bar_plot_hor(
    df=train_df, col=cols[1], title="Distribution of Category",
    color="#acff88", w=600, h=500, lm=200)
_generate_bar_plot_hor(
    df=train_df, col=cols[2], title="Distribution of Region",
    color="#7484f3", w=600, h=600, lm=200, limit=30)
_generate_bar_plot_hor(
    df=train_df, col=cols[3], title="Distribution of City",
    color="#c4f7c3", w=600, h=600, lm=200, limit=30)
# Top-20 values of param_1 / param_2 / param_3 (cols[4:7]), one trace each.
trace1, trace2, trace3 = (
    _generate_bar_plot_hor(train_df, cols[col_idx],
                           "Param %d Values" % param_no, "#77f4e9",
                           700, 400, 200, limit=20, need_trace=True)
    for param_no, col_idx in ((1, 4), (2, 5), (3, 6))
)

# Stack the three traces vertically in a single figure.
fig = tools.make_subplots(
    rows=3, cols=1, print_grid=False,
    subplot_titles=['Param 1 Values', 'Param 2 Values', 'Param 3 Values'])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace3, 3, 1)
fig['layout'].update(
    height=1000,
    title='Top of Values in Param 1,2,3 columns',
    showlegend=False)
iplot(fig)
# Ad counts per weekday (cols[7]) and per day of month (cols[8]).
trace1, trace2 = (
    _generate_bar_plot_ver(train_df, cols[col_idx], trace_title, "#e994ff",
                           700, 400, 200, limit=30, need_trace=True)
    for trace_title, col_idx in (("Week Days", 7), ("Month Days", 8))
)

# Show both traces side by side.
fig = tools.make_subplots(
    rows=1, cols=2, print_grid=False,
    subplot_titles=['Week Days', 'Month Days'])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(
    height=300,
    title='Ads Posted in different Week/Month Days',
    showlegend=False)
iplot(fig)
# Most common word counts of titles (cols[9]) and descriptions (cols[10]).
trace1, trace2 = (
    _generate_bar_plot_ver(train_df, cols[col_idx], trace_title, "#ffb341",
                           700, 400, 200, limit=30, need_trace=True)
    for trace_title, col_idx in (("Title Word Count", 9),
                                 ("Description Count", 10))
)

# Show both traces side by side.
fig = tools.make_subplots(
    rows=1, cols=2, print_grid=False,
    subplot_titles=['Title Word Count', 'Description Word Count'])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=400, title='', showlegend=False)
iplot(fig)
# Most frequent image_top_1 codes (cols[11]) and user ids (cols[12]).
trace1, trace2 = (
    _generate_bar_plot_ver(train_df, cols[col_idx], trace_title, '#e2dfd9',
                           700, 400, 200, limit=30, need_trace=True)
    for trace_title, col_idx in (("Image Top 1", 11), ("User Id", 12))
)

# Show both traces side by side.
fig = tools.make_subplots(
    rows=1, cols=2, print_grid=False,
    subplot_titles=['Image Top 1', 'User Id'])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(
    height=400,
    title='Ads having different ImageTop1 and User Id',
    showlegend=False)
iplot(fig)
# Length of each activity period in whole days.
#
# Computed directly from the parsed date columns with the vectorized
# `.dt.days` accessor instead of formatting every Timedelta as text and
# parsing the first token back to int.  That per-row round trip is slow on a
# large table and raises on a missing period (str(NaT) is not a number).
# Recomputing from the source columns also makes this cell safe to re-run.
pr_train['total_period'] = (
    pr_train['date_to'] - pr_train['date_from']).dt.days

# Share of periods by duration; value_counts() skips missing durations.
t = pr_train['total_period'].value_counts()
labels = list(t.index)
values = list(t.values)
layout = go.Layout(title='For How Much Days Ads are Run', width=600,
                   height=400, margin=dict(l=100))
trace = go.Pie(labels=labels, values=values,
               marker=dict(colors=['#f9c968', '#75e575', '#d693b4']))
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
def _create_pie_chart(df, col):
    """Return a plotly pie trace of the value frequencies of ``df[col]``.

    Args:
        df: Table holding the column to summarise.
        col: Column whose distinct values become the pie slices.

    Returns:
        A ``go.Pie`` trace; the caller is responsible for drawing it.
    """
    counts = df[col].value_counts()
    slice_colors = ['#f9c968', '#75e575', '#d693b3']
    return go.Pie(
        labels=list(counts.index),
        values=list(counts.values),
        marker={'colors': slice_colors},
    )
# Pie chart of how ads split across the values of `user_type`.
layout = go.Layout(
    title='Distribution of User Type',
    width=600,
    height=400,
    margin={'l': 100},
)
trace1 = _create_pie_chart(train_df, 'user_type')
data = [trace1]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Pairwise correlation of the numeric columns.  The numeric/bool columns are
# selected explicitly because train_df also holds text columns, and
# DataFrame.corr() no longer drops those silently in recent pandas releases.
corr = train_df.select_dtypes(include=[np.number, 'bool']).corr()

# Hide the upper triangle (including the diagonal): it mirrors the lower one.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.set(style='white')
f, ax = plt.subplots(figsize=(10, 8))
cmap = sns.diverging_palette(20, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.1, cbar_kws={"shrink": .5})
<matplotlib.axes._subplots.AxesSubplot at 0x13f0c404d68>