Information about the problem is at http://ti.arc.nasa.gov/tech/dash/pcoe/prognostic-data-repository/publications/#turbofan and original data is at http://ti.arc.nasa.gov/tech/dash/pcoe/prognostic-data-repository/#turbofan
The data was originally generated using the Commercial Modular Aero-Propulsion System Simulation (C-MAPSS) system.
The approach used in the turbofan engine degradation dataset was then used in the PHM08 challenge. Information about other research on the C-MAPSS data is available at https://www.phmsociety.org/sites/phmsociety.org/files/phm_submission/2014/phmc_14_063.pdf
import h2o
import imp
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.utils.shared_utils import _locate
# The pandas / numpy / seaborn / pykalman stack is optional.  It is needed for
# the preprocessing, plotting and final scoring cells; when any of the four
# packages is missing the notebook falls back to an H2O-only mode.
try:
    # Importing directly raises ImportError for a missing package, so the
    # separate imp.find_module() probes are not needed.
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import pykalman as pyk
    h2o_only = False
except ImportError:
    # Only a missing package selects the fallback; unrelated errors (and
    # KeyboardInterrupt) are no longer silently swallowed by a bare except.
    h2o_only = True
%matplotlib inline
if not h2o_only:
sns.set()
doGridSearch = False
# The raw input files have no header row, so every column name is built here.
dependent_vars = ['RemainingUsefulLife']
index_columns_names = ["UnitNumber", "Cycle"]
operational_settings_columns_names = ["OpSet%d" % i for i in range(1, 4)]
sensor_measure_columns_names = ["SensorMeasure%d" % i for i in range(1, 22)]
input_file_column_names = (index_columns_names
                           + operational_settings_columns_names
                           + sensor_measure_columns_names)
# Names of the columns added later on: one Kalman-smoothed mean per sensor.
kalman_smoothed_mean_columns_names = ["SensorMeasureKalmanMean%d" % i for i in range(1, 22)]
if not h2o_only:
    # The data files separate fields with runs of whitespace.  The separator
    # has to be r"\s+": the former r"\s*" can match the empty string, and
    # re.split() splits on empty matches from Python 3.7 onwards, which would
    # chop every value into single characters.
    train = pd.read_csv(_locate("train_FD001.txt"), sep=r"\s+", header=None,
                        names=input_file_column_names, engine='python')
    test = pd.read_csv(_locate("test_FD001.txt"), sep=r"\s+", header=None,
                       names=input_file_column_names, engine='python')
    # One true remaining-useful-life value per row of the RUL file.
    test_rul = pd.read_csv(_locate("RUL_FD001.txt"), header=None, names=['RemainingUsefulLife'])
    test_rul.index += 1  # set the index to be the unit number in the test data set
    test_rul.index.name = "UnitNumber"
This puts all data on the same basis for supervised training
# Label every sample with its remaining useful life in "T-minus" form: the
# last recorded cycle of a unit gets 0 and earlier cycles get negative values.
if not h2o_only:
    # Training set: look up each unit's final cycle, then subtract it.
    grouped_train = train.groupby('UnitNumber', as_index=False)
    useful_life_train = grouped_train.agg({'Cycle': np.max})
    useful_life_train = useful_life_train.rename(columns={'Cycle': 'UsefulLife'})
    train_wfeatures = train.merge(useful_life_train, on="UnitNumber")
    train_wfeatures["RemainingUsefulLife"] = train_wfeatures["Cycle"] - train_wfeatures["UsefulLife"]
    train_wfeatures = train_wfeatures.drop('UsefulLife', axis=1)

    # Test set: identical treatment, relative to each unit's last test cycle.
    grouped_test = test.groupby('UnitNumber', as_index=False)
    useful_life_test = grouped_test.agg({'Cycle': np.max})
    useful_life_test = useful_life_test.rename(columns={'Cycle': 'UsefulLife'})
    test_wfeatures = test.merge(useful_life_test, on="UnitNumber")
    test_wfeatures["RemainingUsefulLife"] = test_wfeatures["Cycle"] - test_wfeatures["UsefulLife"]
    test_wfeatures = test_wfeatures.drop('UsefulLife', axis=1)
Look at how the sensor measures evolve over time (first column) as well as how they relate to each other for a subset of the units.
These features were the top 3 and bottom 2 most important sensor features as discovered by H2O's GBM, later in the notebook.
if not h2o_only:
    # Large font scale because the pair plot is drawn at a very large size.
    sns.set_context("notebook", font_scale=5)
    # Pairwise scatter plots for units 1-9, one colour per unit.
    p = sns.pairplot(
        data=train_wfeatures.query('UnitNumber < 10'),
        vars=[
            "RemainingUsefulLife",
            "SensorMeasure4",
            "SensorMeasure3",
            "SensorMeasure9",
            "SensorMeasure8",
            "SensorMeasure13",
        ],
        size=10,
        hue="UnitNumber",
        palette=sns.color_palette("husl", 9),
    )
The Kalman filter parameters are estimated with the EM algorithm, and those parameters are then used to smooth the sensor signal data.
This is applied repeatedly to each Unit, in both the training and test set.
# One output column per sensor measure (1..21) for the Kalman-smoothed means.
kalman_smoothed_mean_columns_names = ["SensorMeasureKalmanMean%d" % i for i in range(1, 22)]
def calcSmooth(measures):
    """Smooth one unit's sensor readings with an EM-fitted Kalman filter.

    measures: 2-D array of observations; its first row seeds the initial state
        mean and ``measures.shape[1]`` is used as the observation dimension.

    Returns the smoothed state means, or None in H2O-only mode (pykalman is
    not available there).
    """
    if h2o_only:
        return None
    kalman = pyk.KalmanFilter(initial_state_mean=measures[0],
                              n_dim_obs=measures.shape[1])
    # Fit the filter parameters with EM, then smooth the same observations.
    fitted = kalman.em(measures)
    smoothed_means, _smoothed_covariances = fitted.smooth(measures)
    return smoothed_means
def filterEachUnit(df):
    """Return a copy of ``df`` with Kalman-smoothed sensor columns added.

    Each unit (distinct ``UnitNumber``) is smoothed independently with
    ``calcSmooth`` and the result is written into the columns named in
    ``kalman_smoothed_mean_columns_names``.  ``df`` itself is not modified.

    Returns None in H2O-only mode, where numpy/pykalman are unavailable.
    """
    if h2o_only:
        return None
    dfout = df.copy()
    # Create the output columns up front so they can be filled unit by unit.
    for newcol in kalman_smoothed_mean_columns_names:
        dfout[newcol] = np.nan
    for unit in dfout.UnitNumber.unique():
        # print() with one argument produces the same output on Python 2 and 3.
        print('Processing Unit: ' + str(unit))
        # Row mask for this unit, computed once and used for read and write.
        in_unit = dfout.UnitNumber == unit
        unitmeasures = dfout.loc[in_unit, sensor_measure_columns_names]
        smoothed_state_means = calcSmooth(np.asarray(unitmeasures))
        dfout.loc[in_unit, kalman_smoothed_mean_columns_names] = smoothed_state_means
        print('Finished Unit: ' + str(unit))
    return dfout
Helps so preprocessing only has to be done once.
if not h2o_only:
    # Smooth every unit in the training set first, then in the test set.
    train_wkalman = filterEachUnit(train_wfeatures)
    test_wkalman = filterEachUnit(test_wfeatures)

    # Fix the output column order: ids, settings, raw sensors, smoothed
    # sensors, and (training set only) the dependent variable at the end.
    test_output_cols = (index_columns_names
                        + operational_settings_columns_names
                        + sensor_measure_columns_names
                        + kalman_smoothed_mean_columns_names)
    train_output_cols = test_output_cols + dependent_vars

    train_output = train_wkalman[train_output_cols]
    test_output = test_wkalman[test_output_cols]
Processing Unit: 1 Finished Unit: 1 Processing Unit: 2 Finished Unit: 2 Processing Unit: 3 Finished Unit: 3 Processing Unit: 4 Finished Unit: 4 Processing Unit: 5 Finished Unit: 5 Processing Unit: 6 Finished Unit: 6 Processing Unit: 7 Finished Unit: 7 Processing Unit: 8 Finished Unit: 8 Processing Unit: 9 Finished Unit: 9 Processing Unit: 10 Finished Unit: 10 Processing Unit: 11 Finished Unit: 11 Processing Unit: 12 Finished Unit: 12 Processing Unit: 13 Finished Unit: 13 Processing Unit: 14 Finished Unit: 14 Processing Unit: 15 Finished Unit: 15 Processing Unit: 16 Finished Unit: 16 Processing Unit: 17 Finished Unit: 17 Processing Unit: 18 Finished Unit: 18 Processing Unit: 19 Finished Unit: 19 Processing Unit: 20 Finished Unit: 20 Processing Unit: 21 Finished Unit: 21 Processing Unit: 22 Finished Unit: 22 Processing Unit: 23 Finished Unit: 23 Processing Unit: 24 Finished Unit: 24 Processing Unit: 25 Finished Unit: 25 Processing Unit: 26 Finished Unit: 26 Processing Unit: 27 Finished Unit: 27 Processing Unit: 28 Finished Unit: 28 Processing Unit: 29 Finished Unit: 29 Processing Unit: 30 Finished Unit: 30 Processing Unit: 31 Finished Unit: 31 Processing Unit: 32 Finished Unit: 32 Processing Unit: 33 Finished Unit: 33 Processing Unit: 34 Finished Unit: 34 Processing Unit: 35 Finished Unit: 35 Processing Unit: 36 Finished Unit: 36 Processing Unit: 37 Finished Unit: 37 Processing Unit: 38 Finished Unit: 38 Processing Unit: 39 Finished Unit: 39 Processing Unit: 40 Finished Unit: 40 Processing Unit: 41 Finished Unit: 41 Processing Unit: 42 Finished Unit: 42 Processing Unit: 43 Finished Unit: 43 Processing Unit: 44 Finished Unit: 44 Processing Unit: 45 Finished Unit: 45 Processing Unit: 46 Finished Unit: 46 Processing Unit: 47 Finished Unit: 47 Processing Unit: 48 Finished Unit: 48 Processing Unit: 49 Finished Unit: 49 Processing Unit: 50 Finished Unit: 50 Processing Unit: 51 Finished Unit: 51 Processing Unit: 52 Finished Unit: 52 Processing Unit: 53 Finished Unit: 53 
Processing Unit: 54 Finished Unit: 54 Processing Unit: 55 Finished Unit: 55 Processing Unit: 56 Finished Unit: 56 Processing Unit: 57 Finished Unit: 57 Processing Unit: 58 Finished Unit: 58 Processing Unit: 59 Finished Unit: 59 Processing Unit: 60 Finished Unit: 60 Processing Unit: 61 Finished Unit: 61 Processing Unit: 62 Finished Unit: 62 Processing Unit: 63 Finished Unit: 63 Processing Unit: 64 Finished Unit: 64 Processing Unit: 65 Finished Unit: 65 Processing Unit: 66 Finished Unit: 66 Processing Unit: 67 Finished Unit: 67 Processing Unit: 68 Finished Unit: 68 Processing Unit: 69 Finished Unit: 69 Processing Unit: 70 Finished Unit: 70 Processing Unit: 71 Finished Unit: 71 Processing Unit: 72 Finished Unit: 72 Processing Unit: 73 Finished Unit: 73 Processing Unit: 74 Finished Unit: 74 Processing Unit: 75 Finished Unit: 75 Processing Unit: 76 Finished Unit: 76 Processing Unit: 77 Finished Unit: 77 Processing Unit: 78 Finished Unit: 78 Processing Unit: 79 Finished Unit: 79 Processing Unit: 80 Finished Unit: 80 Processing Unit: 81 Finished Unit: 81 Processing Unit: 82 Finished Unit: 82 Processing Unit: 83 Finished Unit: 83 Processing Unit: 84 Finished Unit: 84 Processing Unit: 85 Finished Unit: 85 Processing Unit: 86 Finished Unit: 86 Processing Unit: 87 Finished Unit: 87 Processing Unit: 88 Finished Unit: 88 Processing Unit: 89 Finished Unit: 89 Processing Unit: 90 Finished Unit: 90 Processing Unit: 91 Finished Unit: 91 Processing Unit: 92 Finished Unit: 92 Processing Unit: 93 Finished Unit: 93 Processing Unit: 94 Finished Unit: 94 Processing Unit: 95 Finished Unit: 95 Processing Unit: 96 Finished Unit: 96 Processing Unit: 97 Finished Unit: 97 Processing Unit: 98 Finished Unit: 98 Processing Unit: 99 Finished Unit: 99 Processing Unit: 100 Finished Unit: 100 Processing Unit: 1 Finished Unit: 1 Processing Unit: 2 Finished Unit: 2 Processing Unit: 3 Finished Unit: 3 Processing Unit: 4 Finished Unit: 4 Processing Unit: 5 Finished Unit: 5 Processing Unit: 6 Finished 
Unit: 6 Processing Unit: 7 Finished Unit: 7 Processing Unit: 8 Finished Unit: 8 Processing Unit: 9 Finished Unit: 9 Processing Unit: 10 Finished Unit: 10 Processing Unit: 11 Finished Unit: 11 Processing Unit: 12 Finished Unit: 12 Processing Unit: 13 Finished Unit: 13 Processing Unit: 14 Finished Unit: 14 Processing Unit: 15 Finished Unit: 15 Processing Unit: 16 Finished Unit: 16 Processing Unit: 17 Finished Unit: 17 Processing Unit: 18 Finished Unit: 18 Processing Unit: 19 Finished Unit: 19 Processing Unit: 20 Finished Unit: 20 Processing Unit: 21 Finished Unit: 21 Processing Unit: 22 Finished Unit: 22 Processing Unit: 23 Finished Unit: 23 Processing Unit: 24 Finished Unit: 24 Processing Unit: 25 Finished Unit: 25 Processing Unit: 26 Finished Unit: 26 Processing Unit: 27 Finished Unit: 27 Processing Unit: 28 Finished Unit: 28 Processing Unit: 29 Finished Unit: 29 Processing Unit: 30 Finished Unit: 30 Processing Unit: 31 Finished Unit: 31 Processing Unit: 32 Finished Unit: 32 Processing Unit: 33 Finished Unit: 33 Processing Unit: 34 Finished Unit: 34 Processing Unit: 35 Finished Unit: 35 Processing Unit: 36 Finished Unit: 36 Processing Unit: 37 Finished Unit: 37 Processing Unit: 38 Finished Unit: 38 Processing Unit: 39 Finished Unit: 39 Processing Unit: 40 Finished Unit: 40 Processing Unit: 41 Finished Unit: 41 Processing Unit: 42 Finished Unit: 42 Processing Unit: 43 Finished Unit: 43 Processing Unit: 44 Finished Unit: 44 Processing Unit: 45 Finished Unit: 45 Processing Unit: 46 Finished Unit: 46 Processing Unit: 47 Finished Unit: 47 Processing Unit: 48 Finished Unit: 48 Processing Unit: 49 Finished Unit: 49 Processing Unit: 50 Finished Unit: 50 Processing Unit: 51 Finished Unit: 51 Processing Unit: 52 Finished Unit: 52 Processing Unit: 53 Finished Unit: 53 Processing Unit: 54 Finished Unit: 54 Processing Unit: 55 Finished Unit: 55 Processing Unit: 56 Finished Unit: 56 Processing Unit: 57 Finished Unit: 57 Processing Unit: 58 Finished Unit: 58 Processing Unit: 59 
Finished Unit: 59 Processing Unit: 60 Finished Unit: 60 Processing Unit: 61 Finished Unit: 61 Processing Unit: 62 Finished Unit: 62 Processing Unit: 63 Finished Unit: 63 Processing Unit: 64 Finished Unit: 64 Processing Unit: 65 Finished Unit: 65 Processing Unit: 66 Finished Unit: 66 Processing Unit: 67 Finished Unit: 67 Processing Unit: 68 Finished Unit: 68 Processing Unit: 69 Finished Unit: 69 Processing Unit: 70 Finished Unit: 70 Processing Unit: 71 Finished Unit: 71 Processing Unit: 72 Finished Unit: 72 Processing Unit: 73 Finished Unit: 73 Processing Unit: 74 Finished Unit: 74 Processing Unit: 75 Finished Unit: 75 Processing Unit: 76 Finished Unit: 76 Processing Unit: 77 Finished Unit: 77 Processing Unit: 78 Finished Unit: 78 Processing Unit: 79 Finished Unit: 79 Processing Unit: 80 Finished Unit: 80 Processing Unit: 81 Finished Unit: 81 Processing Unit: 82 Finished Unit: 82 Processing Unit: 83 Finished Unit: 83 Processing Unit: 84 Finished Unit: 84 Processing Unit: 85 Finished Unit: 85 Processing Unit: 86 Finished Unit: 86 Processing Unit: 87 Finished Unit: 87 Processing Unit: 88 Finished Unit: 88 Processing Unit: 89 Finished Unit: 89 Processing Unit: 90 Finished Unit: 90 Processing Unit: 91 Finished Unit: 91 Processing Unit: 92 Finished Unit: 92 Processing Unit: 93 Finished Unit: 93 Processing Unit: 94 Finished Unit: 94 Processing Unit: 95 Finished Unit: 95 Processing Unit: 96 Finished Unit: 96 Processing Unit: 97 Finished Unit: 97 Processing Unit: 98 Finished Unit: 98 Processing Unit: 99 Finished Unit: 99 Processing Unit: 100 Finished Unit: 100
if not h2o_only:
    # Persist the preprocessed frames so the slow smoothing step can be
    # skipped on later runs.
    train_output.to_csv(path_or_buf="train_FD001_preprocessed.csv", index=False)
    test_output.to_csv(path_or_buf="test_FD001_preprocessed.csv", index=False)
    # The RUL frame keeps its index (the unit number) as a column in the file.
    test_rul.to_csv(path_or_buf="rul_FD001_preprocessed.csv", index=True)
# Initialise the H2O connection; the cell output reports the cluster status.
h2o.init()
H2O cluster uptime: | 6 hours 15 minutes 53 seconds 924 milliseconds |
H2O cluster version: | 3.5.0.99999 |
H2O cluster name: | Kevin |
H2O cluster total nodes: | 1 |
H2O cluster total memory: | 3.54 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
# Load the preprocessed CSV files (written by the preprocessing step) into H2O.
train_hex = h2o.import_file(path=_locate("train_FD001_preprocessed.csv"))
test_hex = h2o.import_file(path=_locate("test_FD001_preprocessed.csv"))
Parse Progress: [##################################################] 100% Imported C:\Users\Kevin\Documents\GitHub\h2o-3\h2o-py\demos\train_FD001_preprocessed.csv. Parsed 20,631 rows and 48 cols Parse Progress: [##################################################] 100% Imported C:\Users\Kevin\Documents\GitHub\h2o-3\h2o-py\demos\test_FD001_preprocessed.csv. Parsed 13,096 rows and 47 cols
Use the operational settings and Kalman smoothed mean states as the independent features
Set up a fold column to create cross-validation models, each trained on 90 units and validated on the remaining 10 units. This gives a 10-fold cross validation. The cross-validation models are then used to create an ensemble model for predictions.
# Predictors: the operational settings plus the Kalman-smoothed sensor means.
xCols= operational_settings_columns_names + kalman_smoothed_mean_columns_names
# Response: the remaining useful life column.
yCol = dependent_vars
# Fold id is UnitNumber modulo 10, so all rows of a unit share one fold.
foldCol = "UnitNumberMod10"
train_hex[foldCol] = train_hex["UnitNumber"] % 10
def trainGLM(x, y, fold_column, training_frame, alpha=0.5, penalty=1e-5):
    """Train one gaussian GLM, cross-validated on ``fold_column``.

    x, y: predictor and response column names.
    fold_column: name of the column holding the cross-validation fold ids.
    training_frame: frame to train on.
    alpha: single alpha value, passed to the estimator as a one-element list.
    penalty: single lambda value, passed as a one-element list.

    Returns the trained H2OGeneralizedLinearEstimator.
    """
    settings = {
        "family": "gaussian",
        "alpha": [alpha],
        "Lambda": [penalty],
    }
    glm = H2OGeneralizedLinearEstimator(**settings)
    glm.train(x=x, y=y, training_frame=training_frame, fold_column=fold_column)
    return glm
def gridSearchGLM(x, y, fold_column, training_frame, alphas=(0, 0.5, 1), penalties=None):
    """Train one GLM for every (alpha, penalty) combination.

    x, y, fold_column, training_frame: forwarded unchanged to ``trainGLM``.
    alphas: iterable of alpha values to try.
    penalties: iterable of lambda values to try; None selects the default
        grid 0.001, 0.01, 0.1, 1.0.

    Returns the list of trained models, ordered alpha-major (every penalty
    for the first alpha, then every penalty for the second, and so on).
    """
    if penalties is None:
        # Same values as np.logspace(-3, 0, num=4), written out so that
        # defining this function does not need numpy.  The former default was
        # evaluated at definition time and raised NameError in H2O-only mode,
        # where numpy is never imported.
        penalties = (0.001, 0.01, 0.1, 1.0)
    return [trainGLM(x, y, fold_column, training_frame, alpha, penalty)
            for alpha in alphas
            for penalty in penalties]
if not doGridSearch:
    # Speeds up the demonstration: build only the single model configuration
    # that was found previously.
    glmModels = [trainGLM(xCols, yCol, foldCol, train_hex, alpha=1, penalty=0.01)]
else:
    glmModels = gridSearchGLM(xCols, yCol, foldCol, train_hex)
glm Model Build Progress: [##################################################] 100%
Uses model with lowest MSE on the cross validation data.
This is a reasonable substitute for using the final scoring method.
def extractBestModel(models):
    """Return the model with the lowest cross-validation MSE.

    models: non-empty sequence of objects exposing ``mse(xval=True)``.

    When several models tie for the lowest MSE the earliest one is returned.
    """
    winner = models[0]
    lowest = winner.mse(xval=True)
    for candidate in models[1:]:
        score = candidate.mse(xval=True)
        # Strict comparison keeps the first of any equally good models.
        if score < lowest:
            winner, lowest = candidate, score
    return winner
# Keep the GLM with the lowest cross-validation MSE.
bestModel = extractBestModel(glmModels)
# Bare expression: displays the model details as the cell output.
bestModel
Model Details ============= H2OGeneralizedLinearEstimator : Generalized Linear Model Model Key: GLM_model_python_1445631620006_380 GLM Model: summary
family | link | regularization | number_of_predictors_total | number_of_active_predictors | number_of_iterations | training_frame | |
gaussian | identity | Lasso (lambda = 0.01 ) | 17 | 18 | 1 | train_FD001_preprocessed.hex |
ModelMetricsRegressionGLM: glm ** Reported on train data. ** MSE: 1907.00855358 R^2: 0.598047319684 Mean Residual Deviance: 1907.00855358 Null degrees of freedom: 20630 Residual degrees of freedom: 20613 Null deviance: 97880908.3648 Residual deviance: 39343493.469 AIC: 214418.190254 ModelMetricsRegressionGLM: glm ** Reported on cross-validation data. ** MSE: 1980.32797342 R^2: 0.582593305455 Mean Residual Deviance: 1980.32797342 Null degrees of freedom: 20630 Residual degrees of freedom: 20614 Null deviance: 98171005.0908 Residual deviance: 40856146.4196 AIC: 215194.529022 Scoring History:
timestamp | duration | iteration | log_likelihood | objective | |
2015-10-23 19:46:45 | 0.000 sec | 0 | 48955702.8 | 2372.9 |
Extract the 'best' model using the same approach as with GLM.
def trainGBM(x, y, fold_column, training_frame, learning_rate=0.1, ntrees=50, max_depth=5):
    """Train one gaussian GBM, cross-validated on ``fold_column``.

    x, y: predictor and response column names.
    fold_column: name of the column holding the cross-validation fold ids.
    training_frame: frame to train on.
    learning_rate: passed to the estimator as ``learn_rate``.
    ntrees, max_depth: passed to the estimator unchanged.

    Returns the trained H2OGradientBoostingEstimator.
    """
    settings = {
        "distribution": "gaussian",
        "learn_rate": learning_rate,
        "ntrees": ntrees,
        "max_depth": max_depth,
    }
    gbm = H2OGradientBoostingEstimator(**settings)
    gbm.train(x=x, y=y, training_frame=training_frame, fold_column=fold_column)
    return gbm
def gridSearchGBM(x, y, fold_column, training_frame, learning_rates=(0.1, 0.03, 0.01), ntrees=(10, 30, 100, 300), max_depth=(1, 3, 5)):
    """Train one GBM for every (learning rate, ntrees, max depth) combination.

    x, y, fold_column, training_frame: forwarded unchanged to ``trainGBM``.
    learning_rates, ntrees, max_depth: iterables of candidate values.

    Prints one progress line per combination and returns the trained models
    in iteration order (learning rate outermost, max depth innermost).
    """
    results = []
    for learning_rate in learning_rates:
        for ntree in ntrees:
            for depth in max_depth:
                # print() with one argument behaves the same on Python 2 and 3.
                print("GBM: {learning rate: " + str(learning_rate) + "},{ntrees: " + str(ntree) + "},{max_depth: " + str(depth) + "}")
                results.append(trainGBM(x, y, fold_column, training_frame,
                                        learning_rate=learning_rate, ntrees=ntree, max_depth=depth))
    return results
if not doGridSearch:
    # Quick path: train only one configuration (300 trees of depth 5, with
    # trainGBM's default learning rate).
    gbmModels = [trainGBM(xCols, yCol, foldCol, train_hex, ntrees=300, max_depth=5)]
else:
    gbmModels = gridSearchGBM(xCols, yCol, foldCol, train_hex,
                              learning_rates=[0.03, 0.01, 0.003],
                              ntrees=[100, 300, 1000, 3000],
                              max_depth=[1, 2, 3, 5])
gbm Model Build Progress: [##################################################] 100%
# Keep the GBM with the lowest cross-validation MSE.
bestGbmModel = extractBestModel(gbmModels)
Best model had depth 5, learning rate 0.01, and 300 trees
# Bare expression: displays the selected model's parameters as cell output.
bestGbmModel.params
{u'balance_classes': {'actual': False, 'default': False}, u'build_tree_one_node': {'actual': False, 'default': False}, u'checkpoint': {'actual': None, 'default': None}, u'class_sampling_factors': {'actual': None, 'default': None}, u'col_sample_rate': {'actual': 1.0, 'default': 1.0}, u'distribution': {'actual': u'gaussian', 'default': u'AUTO'}, u'fold_assignment': {'actual': u'AUTO', 'default': u'AUTO'}, u'fold_column': {'actual': {u'__meta': {u'schema_name': u'ColSpecifierV3', u'schema_type': u'VecSpecifier', u'schema_version': 3}, u'column_name': u'UnitNumberMod10', u'is_member_of_frames': None}, 'default': None}, u'ignore_const_cols': {'actual': True, 'default': True}, u'ignored_columns': {'actual': [u'SensorMeasure21', u'SensorMeasure20', u'SensorMeasure8', u'SensorMeasure9', u'SensorMeasure4', u'SensorMeasure5', u'SensorMeasure6', u'SensorMeasure7', u'SensorMeasure1', u'SensorMeasure2', u'SensorMeasure3', u'SensorMeasure16', u'SensorMeasure17', u'SensorMeasure14', u'SensorMeasure15', u'SensorMeasure12', u'SensorMeasure13', u'SensorMeasure10', u'SensorMeasure11', u'SensorMeasure18', u'SensorMeasure19', u'UnitNumber', u'Cycle'], 'default': None}, u'keep_cross_validation_predictions': {'actual': False, 'default': False}, u'learn_rate': {'actual': 0.1, 'default': 0.1}, u'max_after_balance_size': {'actual': 5.0, 'default': 5.0}, u'max_confusion_matrix_size': {'actual': 20, 'default': 20}, u'max_depth': {'actual': 5, 'default': 5}, u'min_rows': {'actual': 10.0, 'default': 10.0}, u'model_id': {'actual': None, 'default': None}, u'nbins': {'actual': 20, 'default': 20}, u'nbins_cats': {'actual': 1024, 'default': 1024}, u'nbins_top_level': {'actual': 1024, 'default': 1024}, u'nfolds': {'actual': 0, 'default': 0}, u'ntrees': {'actual': 300, 'default': 50}, u'offset_column': {'actual': None, 'default': None}, u'r2_stopping': {'actual': 0.999999, 'default': 0.999999}, u'response_column': {'actual': {u'__meta': {u'schema_name': u'ColSpecifierV3', u'schema_type': 
u'VecSpecifier', u'schema_version': 3}, u'column_name': u'RemainingUsefulLife', u'is_member_of_frames': None}, 'default': None}, u'sample_rate': {'actual': 1.0, 'default': 1.0}, u'score_each_iteration': {'actual': False, 'default': False}, u'seed': {'actual': -3910841585865929898L, 'default': 4566035195997512939L}, u'training_frame': {'actual': {u'URL': u'/3/Frames/train_FD001_preprocessed.hex', u'__meta': {u'schema_name': u'FrameKeyV3', u'schema_type': u'Key<Frame>', u'schema_version': 3}, u'name': u'train_FD001_preprocessed.hex', u'type': u'Key<Frame>'}, 'default': None}, u'tweedie_power': {'actual': 1.5, 'default': 1.5}, u'validation_frame': {'actual': None, 'default': None}, u'weights_column': {'actual': None, 'default': None}}
Best GBM Model reported MSE on cross validation data as 1687, an improvement from GLM of 1954.
# Bare expression: displays the model summary and metrics as cell output.
bestGbmModel
Model Details ============= H2OGradientBoostingEstimator : Gradient Boosting Machine Model Key: GBM_model_python_1445631620006_382 Model Summary:
number_of_trees | model_size_in_bytes | min_depth | max_depth | mean_depth | min_leaves | max_leaves | mean_leaves | |
300.0 | 105468.0 | 5.0 | 5.0 | 5.0 | 6.0 | 32.0 | 24.756666 |
ModelMetricsRegression: gbm ** Reported on train data. ** MSE: 648.185355447 R^2: 0.863377728184 Mean Residual Deviance: 648.185355447 ModelMetricsRegression: gbm ** Reported on cross-validation data. ** MSE: 1798.90669443 R^2: 0.620832656411 Mean Residual Deviance: 1798.90669443 Scoring History:
timestamp | duration | number_of_trees | training_MSE | training_deviance | |
2015-10-23 20:02:34 | 1 min 11.082 sec | 1.0 | 4150.6 | 4150.6 | |
2015-10-23 20:02:34 | 1 min 11.109 sec | 2.0 | 3668.9 | 3668.9 | |
2015-10-23 20:02:34 | 1 min 11.138 sec | 3.0 | 3269.6 | 3269.6 | |
2015-10-23 20:02:34 | 1 min 11.168 sec | 4.0 | 2945.4 | 2945.4 | |
2015-10-23 20:02:34 | 1 min 11.197 sec | 5.0 | 2680.0 | 2680.0 | |
--- | --- | --- | --- | --- | --- |
2015-10-23 20:02:38 | 1 min 14.994 sec | 197.0 | 789.1 | 789.1 | |
2015-10-23 20:02:38 | 1 min 15.012 sec | 198.0 | 788.3 | 788.3 | |
2015-10-23 20:02:38 | 1 min 15.031 sec | 199.0 | 787.6 | 787.6 | |
2015-10-23 20:02:38 | 1 min 15.049 sec | 200.0 | 786.1 | 786.1 | |
2015-10-23 20:02:40 | 1 min 16.757 sec | 300.0 | 648.2 | 648.2 |
Variable Importances:
variable | relative_importance | scaled_importance | percentage |
SensorMeasureKalmanMean4 | 227962512.0 | 1.0 | 0.5 |
SensorMeasureKalmanMean3 | 54569408.0 | 0.2 | 0.1 |
SensorMeasureKalmanMean9 | 45673524.0 | 0.2 | 0.1 |
SensorMeasureKalmanMean14 | 20590862.0 | 0.1 | 0.0 |
SensorMeasureKalmanMean6 | 16295963.0 | 0.1 | 0.0 |
SensorMeasureKalmanMean11 | 14129182.0 | 0.1 | 0.0 |
SensorMeasureKalmanMean17 | 12120585.0 | 0.1 | 0.0 |
SensorMeasureKalmanMean21 | 9411226.0 | 0.0 | 0.0 |
SensorMeasureKalmanMean7 | 9371074.0 | 0.0 | 0.0 |
SensorMeasureKalmanMean2 | 9129063.0 | 0.0 | 0.0 |
SensorMeasureKalmanMean12 | 7419663.0 | 0.0 | 0.0 |
SensorMeasureKalmanMean20 | 6796496.5 | 0.0 | 0.0 |
SensorMeasureKalmanMean8 | 4020280.0 | 0.0 | 0.0 |
SensorMeasureKalmanMean13 | 3561268.5 | 0.0 | 0.0 |
SensorMeasureKalmanMean15 | 2799715.2 | 0.0 | 0.0 |
OpSet1 | 760000.5 | 0.0 | 0.0 |
OpSet2 | 496507.8 | 0.0 | 0.0 |
See how well the models do predicting on the training set. Should be pretty good, but often worth a check.
Predictions are an ensemble of the 10-fold cross validation models.
# Add a constant column named "weights" to the training frame.
# NOTE(review): nothing visible here reads this column; presumably the
# cross-validation models need it at predict time -- confirm before removing.
train_hex["weights"] = 1
# The models built for the individual cross-validation folds.
allModels = bestGbmModel.xvals
# Ensemble prediction: the average of the per-fold model predictions.
pred = sum([model.predict(train_hex) for model in allModels]) / len(allModels)
# Carry the actual value and the unit number alongside each prediction.
pred["actual"] = train_hex["RemainingUsefulLife"]
pred["unit"] = train_hex["UnitNumber"]
Ideally all points would be on the diagonal, indicating that the prediction matched the actual value exactly.
Also, it is important that the prediction gets more accurate the closer it gets to no useful life remaining.
Looking at a sample of the first 12 units.
Moved predictions from H2O to Python Pandas for plotting using Seaborn.
if not h2o_only:
    # Bring the H2O predictions into pandas for plotting.
    scored_df = pred.as_data_frame(use_pandas=True)
    # One scatter panel per unit for units 1-12, three panels per row,
    # without a fitted regression line.
    g = sns.lmplot(x="actual", y="predict", hue="unit", col="unit",
                   data=scored_df[scored_df["unit"] < 13],
                   col_wrap=3, fit_reg=False)
    g = g.set_axis_labels("Remaining Useful Life", "Predicted Useful Life")
    # Same limits and ticks on both axes so the diagonal is comparable.
    g = g.set(xlim=(-400, 200), ylim=(-400, 200),
              xticks=[-200, 0, 200], yticks=[-200, 0, 200])
# Ensemble prediction on the test frame: average of the per-fold models.
testPreds = sum([model.predict(test_hex) for model in allModels]) / len(allModels)
Append the original index information (Cycle and UnitNumber) to the predicted values so we have them later.
# Copy the identifying columns from the test frame next to the predictions.
testPreds["Cycle"] = test_hex["Cycle"]
testPreds["UnitNumber"] = test_hex["UnitNumber"]
Move the predictions over to Python Pandas for final analysis and scoring
if not h2o_only:
    # Convert the H2O prediction frame into a pandas DataFrame.
    testPreds_df = testPreds.as_data_frame(use_pandas=True)
Load up the actual Remaining Useful Life information.
if not h2o_only:
    # Read back the RUL file saved earlier (it was written with its index).
    actual_RUL = pd.read_csv(_locate("rul_FD001_preprocessed.csv"))
The final scoring used in the competition is based on a single value per unit. We extract the last three predictions and use the mean of those (simple aggregation) and put the prediction back from remaining useful life in T-minus format to cycles remaining (positive).
def aggfunc(x):
    """Return the mean of the three largest values in ``x``.

    x: pandas Series holding one unit's predictions.

    Returns None in H2O-only mode, where numpy/pandas are unavailable.

    NOTE(review): the values are sorted by magnitude, so this picks the three
    largest predictions rather than the three most recent cycles; the two
    coincide only while the predictions rise over a unit's life -- confirm
    that this is the intended aggregation.
    """
    if h2o_only:
        return None
    # Series.order() has been removed from pandas; sort_values() replaces it.
    # order() is used only on pandas versions that predate sort_values().
    ordered = x.sort_values() if hasattr(x, "sort_values") else x.order()
    return np.mean(ordered.tail(3))
if not h2o_only:
    # Reduce the per-cycle predictions to a single value per unit.
    grouped_by_unit_preds = testPreds_df.groupby("UnitNumber", as_index=False)
    predictedRUL = grouped_by_unit_preds.agg({'predict': aggfunc})
    # Flip the sign: T-minus (negative) becomes cycles remaining (positive).
    predictedRUL["predict"] = -predictedRUL["predict"]
Add the prediction to the actual data frame, and use the scoring from the PHM08 competition (more penalty for predicting more useful life than there actually is).
if not h2o_only:
    # Put the actual values and the per-unit predictions side by side.
    final = pd.concat([actual_RUL, predictedRUL["predict"]], axis=1)
def rowScore(row):
    """Return the PHM08-style asymmetric score for one unit.

    row: object exposing ``predict`` and ``RemainingUsefulLife`` attributes
        (for example a DataFrame row).

    With d = predict - RemainingUsefulLife the score is
        exp(-d / 13) - 1   when d < 0   (useful life under-estimated)
        exp( d / 10) - 1   otherwise    (useful life over-estimated)
    so over-estimating is penalised more heavily than under-estimating.
    The two divisors were previously swapped, which penalised
    under-estimation more and inverted that asymmetry.

    Returns None in H2O-only mode, where numpy is unavailable.
    """
    if h2o_only:
        return None
    d = row.predict - row.RemainingUsefulLife
    # Float divisors avoid integer floor division on Python 2.
    if d < 0:
        return np.exp(-d / 13.0) - 1
    return np.exp(d / 10.0) - 1
# Score every unit (one row of `final` each).
rowScores = final.apply(rowScore, axis=1)
This is the final score using PHM08 method of scoring.
# Bare expression: the total score over all units, shown as cell output.
sum(rowScores)
1400.9008386359285
Some things that ideally would be true:
if not h2o_only:
    # Scatter of actual versus predicted remaining useful life, one point per
    # unit, without a fitted regression line.
    sns.regplot(x="RemainingUsefulLife", y="predict", data=final, fit_reg=False)