import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from scipy.stats import randint as sp_randint
from sklearn import tree
data_mat = pd.read_csv("student-mat.csv", delimiter=';')
data_por = pd.read_csv("student-por.csv", delimiter=';')
categorical_features_name = [
"school",
"sex",
"address",
"famsize",
"Pstatus",
"Mjob",
"Fjob",
"reason",
"guardian",
"schoolsup",
"famsup",
"paid",
"activities",
"nursery",
"higher",
"internet",
"romantic"
]
target_features = [
"Dalc",
"Walc"
]
# Data preprocessing: encode the categorical features as one-hot vectors with OneHotEncoder.
def preprocessing(data, columns=None):
    # Default to the caller's own columns so engineered features are not dropped.
    if columns is None:
        columns = data.columns
    X = data.loc[:, data.columns.isin(columns)]
    # Make sure the targets do not leak into the feature matrix.
    for target in target_features:
        if target in X.columns:
            X = X.drop(columns=target)
    cat_mask = X.columns.isin(categorical_features_name)
    # Map category labels to integers, then one-hot encode them.
    X.loc[:, cat_mask] = X.loc[:, cat_mask].apply(LabelEncoder().fit_transform)
    enc = OneHotEncoder()
    X_cat = enc.fit_transform(X.loc[:, cat_mask]).toarray()
    X = np.concatenate([X_cat, X.loc[:, ~cat_mask].values], axis=1)
    return X, data.loc[:, "Walc"], data.loc[:, "Dalc"]
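For reference, the same encoding can be written more compactly with a ColumnTransformer, assuming scikit-learn >= 0.20, where OneHotEncoder accepts string categories directly; this is a sketch, not a drop-in replacement for the function above:
from sklearn.compose import ColumnTransformer

def preprocessing_ct(data, columns=None):
    # One-hot encode the categorical columns, pass the rest through unchanged.
    X = data.loc[:, columns if columns is not None else data.columns]
    X = X.drop(columns=[t for t in target_features if t in X.columns])
    cat_cols = [c for c in X.columns if c in categorical_features_name]
    ct = ColumnTransformer([("onehot", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
                           remainder="passthrough")
    return ct.fit_transform(X), data["Walc"], data["Dalc"]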
chosen_features = ["paid", "schoolsup", "activities", "higher", "freetime", "goout", "male"]
We treat this as a classification problem with 5 classes; the target variables are "Dalc" and "Walc". To solve it we apply an ensemble of decision trees (a random forest), using the features selected above.
X, y_w, y_d = preprocessing(data_mat, chosen_features)
rf_clf = RandomForestClassifier(class_weight="balanced")
rf_param_dist = {"n_estimators": np.arange(10, 100, 10),
                 "max_depth": sp_randint(1, 31),
                 "min_samples_leaf": sp_randint(1, 11),
                 "max_features": ["auto", "sqrt", "log2"],
                 "criterion": ["gini", "entropy"]}
n_iter_search = 250
rf_random_search = RandomizedSearchCV(rf_clf, param_distributions=rf_param_dist,
                                      n_iter=n_iter_search, random_state=42)
print("Random forest for Walc")
x_train, x_test, y_train, y_test = train_test_split(X, y_w, test_size=0.3, random_state=42, stratify=y_w)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))
print("Random forest for Dalc")
x_train, x_test, y_train, y_test = train_test_split(X, y_d, test_size=0.3, random_state=42, stratify=y_d)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))
Random forest for Walc
Best randomized search score - 0.37
Validation score - 0.33
Random forest for Dalc
Best randomized search score - 0.53
Validation score - 0.58
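It can also be instructive to check which hyperparameters the search settled on; RandomizedSearchCV exposes them as best_params_ (an optional check, the output will vary between runs):
# Inspect the hyperparameter combination chosen by the randomized search.
print(rf_random_search.best_params_)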
Let's try to engineer new features from the existing ones and then search for optimal hyperparameters again.
students = pd.concat([data_mat, data_por], ignore_index=True)
students_transformed = students.copy()
students_transformed['rural'] = students['address'] == "R"
students_transformed['big_family'] = students['famsize'] == 'GT3'
students_transformed['parents_together'] = students['Pstatus'] == "T"
students_transformed['studies_less'] = students['studytime'] < 3
students_transformed['more_failures'] = students['failures'] >= 2
students_transformed['bad_relationships'] = students['famrel'] <= 2
students_transformed['more_free_time'] = students['freetime'] > 3
students_transformed['goes_out_more'] = students['goout'] > 4
students_transformed['bad_health'] = students['health'] <= 2
students_transformed['high_absences'] = students['absences'] > (students['absences'].std() * 2)
students_transformed['mothers_low_edu'] = students['Medu'] <= 3
students_transformed['fathers_low_edu'] = students['Fedu'] <= 3
students_transformed['more_than_18'] = students['age'] > 18
students_transformed['long_road'] = students['traveltime'] >= 3
mean_grade = (students['G1'] + students['G2'] + students['G3']) / 3
students_transformed['low_grade'] = mean_grade <= (mean_grade.mean() + mean_grade.std())
students_transformed = students_transformed.drop(['sex',
'address',
'famsize',
'Pstatus',
'studytime',
'failures',
'famrel',
'freetime',
'goout',
'health',
'absences',
'G1',
'G2',
'G3',
'Medu',
'Fedu',
'age',
], axis=1)
X, y_w, y_d = preprocessing(students_transformed)
print("Random forest for Walc")
x_train, x_test, y_train, y_test = train_test_split(X, y_w, test_size=0.3, random_state=42, stratify=y_w)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))
print("Random forest for Dalc")
x_train, x_test, y_train, y_test = train_test_split(X, y_d, test_size=0.3, random_state=42, stratify=y_d)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))
Random forest for Walc
Best randomized search score - 0.5
Validation score - 0.59
Random forest for Dalc
Best randomized search score - 0.74
Validation score - 0.74
The results for predicting "Walc" leave much to be desired.
Note that the vast majority of target values are 1.
It may be better to first solve a simpler problem:
does the student drink more than a minimal amount of alcohol, i.e., is "Dalc" or "Walc" different from 1?
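To back this up, one can inspect the class frequencies directly (an optional sanity check; the exact proportions are not reported in the source):
# Check how skewed the target distributions actually are.
print(y_w.value_counts(normalize=True).round(2))
print(y_d.value_counts(normalize=True).round(2))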
# Label a student 1 if either target exceeds the minimal level of 1, and -1 otherwise.
y = np.where((y_d != 1) | (y_w != 1), 1, -1)
classifier = RandomForestClassifier(max_depth=4, bootstrap=False)
sc = cross_val_score(classifier, X, y, cv=5)
print("Random forest: ", round(sc.mean(), 2))
Random forest: 0.66
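For context, accuracy on an imbalanced binary task is easier to judge against a majority-class baseline; a quick check with scikit-learn's DummyClassifier (an optional comparison, not part of the original analysis):
from sklearn.dummy import DummyClassifier

# Majority-class baseline to put the random forest score in context.
baseline = DummyClassifier(strategy="most_frequent")
print("Majority-class baseline: ", round(cross_val_score(baseline, X, y, cv=5).mean(), 2))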
Let's try the same for a different level of alcohol consumption.
# Label a student 1 if the combined weekday and weekend consumption exceeds 3, -1 otherwise.
y = np.where(y_d + y_w > 3, 1, -1)
classifier = RandomForestClassifier(max_depth=4, bootstrap=False)
sc = cross_val_score(classifier, X, y, cv=5)
print("Random forest: ", round(sc.mean(), 2))
Random forest: 0.59
Treating this problem as a classification task has not produced good results at this stage.
It is probably worth approaching it from a different angle and framing it as a regression task.
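A minimal sketch of that regression framing, reusing the RandomForestRegressor already imported above (the hyperparameters are placeholders, not tuned values):
# Predict "Walc" as a numeric value; cross_val_score uses R^2 by default for regressors.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
reg_scores = cross_val_score(regressor, X, y_w, cv=5)
print("Random forest regressor: ", round(reg_scores.mean(), 2))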