Соревнование. Задача учебная. Предлагается предсказывать тип лесного покрытия на участках 30х30 метров Национального заповедника Рузвельта в Колорадо.
Признаки (подробней на странице соревнования):
Целевая переменная — Cover_Type (7 types, integers 1 to 7), Forest Cover Type designation.
Подключаем библиотеки и загружаем данные.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
Файлы forest_train.csv и forest_test.csv можно скачать на странице соревнования на Kaggle.
# Train/test splits of the Kaggle "Forest Cover Type Prediction" data.
# Paths are relative to the notebook location — adjust if the layout differs.
train_df = pd.read_csv("../../data/forest_train.csv")
test_df = pd.read_csv("../../data/forest_test.csv")
def write_to_submission_file(
    predicted_labels, out_file, target="Cover_Type", index_label="Id", init_index=15121
):
    """Save predicted labels as a Kaggle submission CSV.

    The id column starts at ``init_index`` (the Id of the first test-set
    row) and increases by one per prediction.
    """
    n_rows = predicted_labels.shape[0]
    submission = pd.DataFrame(
        {target: predicted_labels},
        index=np.arange(init_index, init_index + n_rows),
    )
    submission.to_csv(out_file, index_label=index_label)
Создаем признаки.
def _add_distance_features(df):
    """Add engineered hydrology/road/fire distance features to ``df`` in place.

    Applied identically to the train and test frames so both get the same
    feature set.
    """
    # Negative vertical distance to water means the point lies below water level.
    df["Under_water"] = df.Vertical_Distance_To_Hydrology < 0
    # Elevation adjusted by the vertical / (scaled) horizontal water distances.
    df["EVDtH"] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df["EHDtH"] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    # Euclidean distance to water.  The misspelled column name
    # ("Distanse_to_Hydrolody") is kept for backward compatibility with
    # anything already consuming these frames.
    df["Distanse_to_Hydrolody"] = (
        df["Horizontal_Distance_To_Hydrology"] ** 2
        + df["Vertical_Distance_To_Hydrology"] ** 2
    ) ** 0.5
    # Pairwise sums and absolute differences of the horizontal distances.
    # abs() around the sums is a no-op for non-negative distances but is
    # kept so the columns reproduce the original values exactly.
    df["Hydro_Fire_1"] = (
        df["Horizontal_Distance_To_Hydrology"]
        + df["Horizontal_Distance_To_Fire_Points"]
    )
    df["Hydro_Fire_2"] = abs(
        df["Horizontal_Distance_To_Hydrology"]
        - df["Horizontal_Distance_To_Fire_Points"]
    )
    df["Hydro_Road_1"] = abs(
        df["Horizontal_Distance_To_Hydrology"]
        + df["Horizontal_Distance_To_Roadways"]
    )
    df["Hydro_Road_2"] = abs(
        df["Horizontal_Distance_To_Hydrology"]
        - df["Horizontal_Distance_To_Roadways"]
    )
    df["Fire_Road_1"] = abs(
        df["Horizontal_Distance_To_Fire_Points"]
        + df["Horizontal_Distance_To_Roadways"]
    )
    df["Fire_Road_2"] = abs(
        df["Horizontal_Distance_To_Fire_Points"]
        - df["Horizontal_Distance_To_Roadways"]
    )


_add_distance_features(train_df)
_add_distance_features(test_df)

# Split off the target and drop the non-predictive Id column.
y = train_df["Cover_Type"]
train_df = train_df.drop(["Cover_Type", "Id"], axis=1)
test_df = test_df.drop(["Id"], axis=1)
y = y - 1  # shift labels from 1..7 to 0..6 (xgboost expects classes from 0)
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
def score(params):
    """Hyperopt objective: train XGBoost with ``params``, return holdout log-loss."""
    from sklearn.metrics import log_loss

    print("Training with params:")
    print(params)
    # hyperopt's quniform samples floats; xgboost needs an integer depth.
    params["max_depth"] = int(params["max_depth"])
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    valid_matrix = xgb.DMatrix(X_test, label=y_test)
    booster = xgb.train(params, train_matrix, params["num_round"])
    # multi:softprob yields one probability per class — reshape to (rows, 7).
    proba = booster.predict(valid_matrix).reshape((X_test.shape[0], 7))
    loss = log_loss(y_test, proba)
    print("\tScore {0}\n\n".format(loss))
    return {"loss": loss, "status": STATUS_OK}
def optimize(trials):
    """Run 10 evaluations of TPE search over the XGBoost hyperparameter space."""
    # Fixed task settings (7-class softprob, merror metric) plus the tunable
    # search ranges.  NOTE(review): the hyperopt label for learning_rate is
    # "eta", so the dict fmin returns will carry the key "eta" — xgboost
    # accepts that alias, but verify downstream consumers expect it.
    space = dict(
        num_round=100,
        learning_rate=hp.quniform("eta", 0.005, 0.05, 0.005),
        max_depth=hp.quniform("max_depth", 3, 14, 1),
        min_child_weight=hp.quniform("min_child_weight", 1, 10, 1),
        subsample=hp.quniform("subsample", 0.5, 1, 0.05),
        gamma=hp.quniform("gamma", 0.5, 1, 0.01),
        colsample_bytree=hp.quniform("colsample_bytree", 0.4, 1, 0.05),
        num_class=7,
        eval_metric="merror",
        objective="multi:softprob",
        nthread=4,
        silent=1,
    )
    return fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=10)
# Hold out 30% of the training data for validation during tuning.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, y, test_size=0.3, random_state=17
)
# Trials records the full history of evaluated hyperparameter points.
trials = Trials()
best_params = optimize(trials)
best_params
# quniform samples floats — cast depth back to int before the final fit.
# NOTE(review): fmin returns parameters under their hyperopt labels, so the
# learning rate arrives here under the key "eta" — confirm xgb.train picks
# it up (xgboost treats "eta" as an alias of learning_rate).
best_params["max_depth"] = int(best_params["max_depth"])
# Re-attach the fixed (non-searched) parameters before training.
best_params["num_class"] = 7
best_params["eval_metric"] = "merror"
best_params["objective"] = "multi:softprob"
best_params["nthread"] = 4
best_params["silent"] = 1
# Full training set as a DMatrix for CV and the final model.
dtrain = xgb.DMatrix(train_df, y)
%%time
# 3-fold CV to pick the number of boosting rounds; stops early once the
# validation error has not improved for 50 rounds.
xgbCvResult = xgb.cv(
    best_params, dtrain, num_boost_round=500, nfold=3, early_stopping_rounds=50
)
plt.plot(range(xgbCvResult.shape[0]), xgbCvResult["test-merror-mean"])
plt.plot(range(xgbCvResult.shape[0]), xgbCvResult["train-merror-mean"]);
# argmin gives the 0-based row index of the best round; row i holds the
# error after i + 1 boosting rounds, so add 1 to get the round count.
best_num_round = np.argmin(xgbCvResult["test-merror-mean"]) + 1
best_num_round
xgb.train?
Сделаем прогноз для всей тестовой выборки.
# Fit the final model on the full training set with the CV-selected number
# of rounds, then score the whole test set.
bestXgb = xgb.train(best_params, dtrain, num_boost_round=best_num_round)
dtest = xgb.DMatrix(test_df)
xgboost_predict_proba = bestXgb.predict(dtest)
# Pick the class with the highest softprob probability for each row.
xgboost_prediction = xgboost_predict_proba.argmax(axis=1)
Мы вычитали из целевых меток 1, теперь добавляем.
# Undo the earlier label shift: classes go back from 0..6 to the 1..7
# encoding Kaggle expects.
xgboost_prediction = xgboost_prediction + 1
write_to_submission_file(xgboost_prediction, "forest_cover_type_xgboost.csv")
У такой посылки на Kaggle результат - 0.771.