# 6話で紹介したコードと共通
import pandas as pd
import datetime
# Uses the "Online Retail" dataset:
# http://archive.ics.uci.edu/ml/datasets/Online+Retail#
# InvoiceDate is declared with an explicit unit because newer pandas releases
# reject the unit-less "datetime64" spelling.
data_dtype = {
    'InvoiceNo': 'str',
    'StockCode': 'str',
    'Description': 'str',
    'Quantity': 'int',
    'InvoiceDate': 'datetime64[ns]',
    'UnitPrice': 'float',
    'CustomerID': 'str',
    'Country': 'str',
}
# keep_default_na=False is kept so that a blank CustomerID can be detected as
# "" by the filter below.
df = pd.read_excel("./Online Retail.xlsx", dtype=data_dtype, keep_default_na=False)
# Restrict the analysis to members (rows that carry a CustomerID)
df = df.loc[df["CustomerID"] != "", :]
# NOTE: to keep the data preparation simple, rows whose UnitPrice is not
# positive are dropped
df = df.loc[df["UnitPrice"] > 0, :]
# NOTE: to keep the data preparation simple, rows whose Quantity is not
# positive are dropped
df = df.loc[df["Quantity"] > 0, :]
# Purchases up to the end of October 2011 form the aggregation (training) period
train_df = df.loc[df["InvoiceDate"] < "2011-11-01",]
def transform(data_df, reference_date=datetime.datetime(2011, 11, 1)):
    """Build per-customer RFM (recency, frequency, monetary) features.

    Args:
        data_df: Purchase rows with "CustomerID", "InvoiceNo", "InvoiceDate",
            "Quantity" and "UnitPrice" columns. The frame is not modified.
        reference_date: Point in time that recency is measured back from.
            Defaults to 2011-11-01, the value that was previously hard-coded.

    Returns:
        A DataFrame indexed by CustomerID with three columns:
        "recency" (whole days between the customer's latest InvoiceDate and
        reference_date), "frequency" (number of distinct InvoiceNo values
        divided by the number of days spanned by data_df) and "monetary"
        (sum of Quantity * UnitPrice).
    """
    customer_ids = data_df["CustomerID"]
    # No-op for datetime columns; also lets object/string dates work with .dt.
    invoice_dates = pd.to_datetime(data_df["InvoiceDate"])

    # Length of the observation window in whole days. It is clamped to one
    # day so that data covering a single day does not divide by zero and
    # produce infinite frequencies.
    span_days = max((invoice_dates.max() - invoice_dates.min()).days, 1)

    # Recency
    last_purchase = invoice_dates.groupby(customer_ids).max()
    recency = (pd.Timestamp(reference_date) - last_purchase).dt.days
    recency = recency.rename("recency")

    # Frequency
    invoice_counts = data_df.groupby("CustomerID")["InvoiceNo"].nunique()
    frequency = (invoice_counts / span_days).rename("frequency")

    # Monetary: computed on a temporary Series, so no copy of data_df is needed.
    line_totals = data_df["Quantity"] * data_df["UnitPrice"]
    monetary = line_totals.groupby(customer_ids).sum().rename("monetary")

    # All three Series are grouped by the same key, so they share one index.
    return pd.concat([recency, frequency, monetary], axis=1)
def is_visit(data_df, visitors):
    """Flag, per customer, whether the customer appears in ``visitors``.

    Args:
        data_df: Rows with a "CustomerID" column. The frame is not modified.
        visitors: List-like collection of CustomerID values to test against.

    Returns:
        A boolean Series named "is_visit", indexed by the distinct CustomerID
        values of data_df; True where the CustomerID is found in visitors.
    """
    customer_ids = data_df["CustomerID"]
    # Vectorised membership test instead of a per-row Python `in` check.
    flags = customer_ids.isin(visitors).rename("is_visit")
    # The flag is constant within a customer, so max() collapses it to one row.
    return flags.groupby(customer_ids).max()
# RFM features computed from the purchases up to the end of October
df_rfm = transform(train_df)

# Build the label: does the customer have a purchase in November 2011?
# Step 1: collect the CustomerIDs with at least one November purchase.
november_mask = (df["InvoiceDate"] >= "2011-11-01") & (df["InvoiceDate"] < "2011-12-01")
nov_customers = df.loc[november_mask, "CustomerID"].unique()
# Step 2: flag every customer seen through October by whether they are in
# that November set.
visit_df = is_visit(train_df, nov_customers)

# Attach the label to the RFM features and split into inputs / target
all_df = df_rfm.merge(visit_df, left_index=True, right_index=True).reset_index()
X = all_df[["recency", "frequency", "monetary"]]
y = all_df["is_visit"]
# Check the accuracy with cross-validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()
# Alternative grid that could be searched instead:
# parameters = {"penalty": ["l1", "l2"], "solver":["liblinear"]}
parameters = {}  # empty grid: use the estimator's default parameters
clf = GridSearchCV(model, parameters, cv=3, scoring="accuracy")
clf.fit(X, y)
clf.cv_results_
# Output recorded from a previous run (kept as a comment so the file parses):
# {'mean_fit_time': array([0.01178439]), 'std_fit_time': array([0.00044802]),
#  'mean_score_time': array([0.00176684]), 'std_score_time': array([0.00019057]),
#  'params': [{}], 'split0_test_score': array([0.71849057]),
#  'split1_test_score': array([0.70415094]), 'split2_test_score': array([0.70015106]),
#  'mean_test_score': array([0.70759752]), 'std_test_score': array([0.00787374]),
#  'rank_test_score': array([1], dtype=int32)}
clf.predict
# Output recorded from a previous run:
# <function sklearn.model_selection._search.BaseSearchCV.predict(self, X)>