#!/usr/bin/env python
# coding: utf-8
#
#
# ## Открытый курс по машинному обучению. Сессия № 2
# Автор материала: программист-исследователь Mail.ru Group, старший преподаватель Факультета Компьютерных Наук ВШЭ Юрий Кашницкий. Материал распространяется на условиях лицензии [Creative Commons CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Можно использовать в любых целях (редактировать, поправлять и брать за основу), кроме коммерческих, но с обязательным упоминанием автора материала.
# # Тема 10. Бустинг
# ## Часть 7. Xgboost и Hyperopt в соревновании Kaggle Forest Cover Type Prediction
# [Соревнование](https://www.kaggle.com/c/forest-cover-type-prediction).
# Задача учебная. Предлагается предсказывать тип лесного покрытия на участках 30х30 метров Национального заповедника Рузвельта в Колорадо.
#
# Признаки (подробней на [странице](https://www.kaggle.com/c/forest-cover-type-prediction/data) соревнования):
# - Elevation (высота) - Elevation in meters
# - Aspect - Aspect in degrees azimuth
# - Slope (наклон) - Slope in degrees
# - Horizontal_Distance_To_Hydrology (горизонтальное расстояние до воды) - Horz Dist to nearest surface water features
# - Vertical_Distance_To_Hydrology (вертикальное расстояние до воды) - Vert Dist to nearest surface water features
# - Horizontal_Distance_To_Roadways (горизонтальное расстояние до дорог) - Horz Dist to nearest roadway
# - Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice
# - Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice
# - Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice
# - Horizontal_Distance_To_Fire_Points (горизонтальное расстояние до центров воспламенения) - Horz Dist to nearest wildfire ignition points
# - Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation
# - Soil_Type (тип почвы) - (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation
# Cover_Type (7 types, integers 1 to 7) - Forest Cover Type designation
# **Подключаем библиотеки и загружаем данные. Используем [log_progress](https://github.com/alexanderkuk/log-progress) для отслеживания итераций в циклах.**
# In[ ]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
# **Файл forest_test.csv можно скачать [отсюда](https://drive.google.com/file/d/1Ktn5JjFlAjABp6kGDldAYI2aVG-OMgvW/view?usp=sharing)**
# In[ ]:
train_df = pd.read_csv("../../data/forest_train.csv")
test_df = pd.read_csv("../../data/forest_test.csv")
# In[ ]:
def write_to_submission_file(
predicted_labels, out_file, target="Cover_Type", index_label="Id", init_index=15121
):
# turn predictions into data frame and save as csv file
predicted_df = pd.DataFrame(
predicted_labels,
index=np.arange(init_index, predicted_labels.shape[0] + init_index),
columns=[target],
)
predicted_df.to_csv(out_file, index_label=index_label)
# **Создаем признаки.**
# In[ ]:
train_df["Under_water"] = train_df.Vertical_Distance_To_Hydrology < 0
test_df["Under_water"] = test_df.Vertical_Distance_To_Hydrology < 0
# In[ ]:
train_df["EVDtH"] = train_df.Elevation - train_df.Vertical_Distance_To_Hydrology
test_df["EVDtH"] = test_df.Elevation - test_df.Vertical_Distance_To_Hydrology
train_df["EHDtH"] = train_df.Elevation - train_df.Horizontal_Distance_To_Hydrology * 0.2
test_df["EHDtH"] = test_df.Elevation - test_df.Horizontal_Distance_To_Hydrology * 0.2
# In[ ]:
train_df["Distanse_to_Hydrolody"] = (
train_df["Horizontal_Distance_To_Hydrology"] ** 2
+ train_df["Vertical_Distance_To_Hydrology"] ** 2
) ** 0.5
test_df["Distanse_to_Hydrolody"] = (
test_df["Horizontal_Distance_To_Hydrology"] ** 2
+ test_df["Vertical_Distance_To_Hydrology"] ** 2
) ** 0.5
train_df["Hydro_Fire_1"] = (
train_df["Horizontal_Distance_To_Hydrology"]
+ train_df["Horizontal_Distance_To_Fire_Points"]
)
test_df["Hydro_Fire_1"] = (
test_df["Horizontal_Distance_To_Hydrology"]
+ test_df["Horizontal_Distance_To_Fire_Points"]
)
train_df["Hydro_Fire_2"] = abs(
train_df["Horizontal_Distance_To_Hydrology"]
- train_df["Horizontal_Distance_To_Fire_Points"]
)
test_df["Hydro_Fire_2"] = abs(
test_df["Horizontal_Distance_To_Hydrology"]
- test_df["Horizontal_Distance_To_Fire_Points"]
)
train_df["Hydro_Road_1"] = abs(
train_df["Horizontal_Distance_To_Hydrology"]
+ train_df["Horizontal_Distance_To_Roadways"]
)
test_df["Hydro_Road_1"] = abs(
test_df["Horizontal_Distance_To_Hydrology"]
+ test_df["Horizontal_Distance_To_Roadways"]
)
train_df["Hydro_Road_2"] = abs(
train_df["Horizontal_Distance_To_Hydrology"]
- train_df["Horizontal_Distance_To_Roadways"]
)
test_df["Hydro_Road_2"] = abs(
test_df["Horizontal_Distance_To_Hydrology"]
- test_df["Horizontal_Distance_To_Roadways"]
)
train_df["Fire_Road_1"] = abs(
train_df["Horizontal_Distance_To_Fire_Points"]
+ train_df["Horizontal_Distance_To_Roadways"]
)
test_df["Fire_Road_1"] = abs(
test_df["Horizontal_Distance_To_Fire_Points"]
+ test_df["Horizontal_Distance_To_Roadways"]
)
train_df["Fire_Road_2"] = abs(
train_df["Horizontal_Distance_To_Fire_Points"]
- train_df["Horizontal_Distance_To_Roadways"]
)
test_df["Fire_Road_2"] = abs(
test_df["Horizontal_Distance_To_Fire_Points"]
- test_df["Horizontal_Distance_To_Roadways"]
)
# In[ ]:
y = train_df["Cover_Type"]
train_df = train_df.drop(["Cover_Type", "Id"], axis=1)
test_df = test_df.drop(["Id"], axis=1)
# In[ ]:
y = y - 1 # Чтоб классы нумеровались от о до 6
# In[ ]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
# In[ ]:
def score(params):
from sklearn.metrics import log_loss
print("Training with params:")
print(params)
params["max_depth"] = int(params["max_depth"])
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
model = xgb.train(params, dtrain, params["num_round"])
predictions = model.predict(dvalid).reshape((X_test.shape[0], 7))
score = log_loss(y_test, predictions)
print("\tScore {0}\n\n".format(score))
return {"loss": score, "status": STATUS_OK}
# In[ ]:
def optimize(trials):
space = {
"num_round": 100,
"learning_rate": hp.quniform("eta", 0.005, 0.05, 0.005),
"max_depth": hp.quniform("max_depth", 3, 14, 1),
"min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
"subsample": hp.quniform("subsample", 0.5, 1, 0.05),
"gamma": hp.quniform("gamma", 0.5, 1, 0.01),
"colsample_bytree": hp.quniform("colsample_bytree", 0.4, 1, 0.05),
"num_class": 7,
"eval_metric": "merror",
"objective": "multi:softprob",
"nthread": 4,
"silent": 1,
}
best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=10)
return best
# In[ ]:
X_train, X_test, y_train, y_test = train_test_split(
train_df, y, test_size=0.3, random_state=17
)
# In[ ]:
trials = Trials()
best_params = optimize(trials)
best_params
# In[ ]:
best_params["max_depth"] = int(best_params["max_depth"])
best_params["num_class"] = 7
best_params["eval_metric"] = "merror"
best_params["objective"] = "multi:softprob"
best_params["nthread"] = 4
best_params["silent"] = 1
# In[ ]:
dtrain = xgb.DMatrix(train_df, y)
# In[ ]:
get_ipython().run_cell_magic('time', '', 'xgbCvResult = xgb.cv(\n best_params, dtrain, num_boost_round=500, nfold=3, early_stopping_rounds=50\n)\n')
# In[ ]:
plt.plot(range(xgbCvResult.shape[0]), xgbCvResult["test-merror-mean"])
plt.plot(range(xgbCvResult.shape[0]), xgbCvResult["train-merror-mean"]);
# In[ ]:
best_num_round = np.argmin(xgbCvResult["test-merror-mean"])
best_num_round
# In[ ]:
get_ipython().run_line_magic('pinfo', 'xgb.train')
# **Сделаем прогноз для всей тестовой выборки.**
# In[ ]:
bestXgb = xgb.train(best_params, dtrain, num_boost_round=best_num_round)
# In[ ]:
dtest = xgb.DMatrix(test_df)
# In[ ]:
xgboost_predict_proba = bestXgb.predict(dtest)
# In[ ]:
xgboost_prediction = np.argmax(xgboost_predict_proba, axis=1)
# **Мы вычитали из целевых меток 1, теперь добавляем.**
# In[ ]:
xgboost_prediction += 1
# In[ ]:
write_to_submission_file(xgboost_prediction, "forest_cover_type_xgboost.csv")
# **У такой посылки на Kaggle результат - 0.771.**