#!/usr/bin/env python
# coding: utf-8

# # An Example of using sklearn Pipeline with matminer
#
# This goes over the steps to build a model using sklearn Pipeline and matminer. Look at the intro_predicting_bulk_modulus notebook for more details about matminer and the featurizers used here.
#
# This notebook was last updated 06/07/21 for version 0.7.0 of matminer.
#
# **Note that in order to get the in-line plotting to work, you might need to start Jupyter notebook with a higher data rate, e.g., ``jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10``. We recommend you do this before starting.**

# ## Why use Pipeline?
#
# Pre-processing and featurizing materials data can be viewed as a series of transformations on the data, going from the initially loaded state to training-ready. Pipelines are a tool for encapsulating this process in a way that enables easy replication/repeatability, presents a simple model of data transformation, and helps to avoid errant changes to the data. Pipelines chain together transformations into a single transformation. They can also be used to build end-to-end methods for preprocessing/training/validating a model, by optionally putting an estimator at the end of the pipeline. See the [scikit-learn Pipeline documentation](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) for details.

# In[1]:

# Load sklearn modules
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (RepeatedKFold, cross_val_score, cross_val_predict,
                                     train_test_split, GridSearchCV, RandomizedSearchCV)

import numpy as np
from pandas import DataFrame
from scipy.stats import randint as sp_randint

# Load featurizers and conversion functions
from matminer.featurizers.composition import ElementProperty, OxidationStates
from matminer.featurizers.structure import DensityFeatures
from matminer.featurizers.conversions import CompositionToOxidComposition, StrToComposition

# ## Loading the Dataset
#
# Matminer comes pre-loaded with several example data sets you can use. Below, we'll load a data set of computed elastic properties of materials, which is sourced from the paper: "Charting the complete elastic properties of inorganic crystalline compounds", M. de Jong *et al.*, Sci. Data 2 (2015) 150009.

# In[2]:

from matminer.datasets.convenience_loaders import load_elastic_tensor

df = load_elastic_tensor()  # loads the dataset into a pandas DataFrame
unwanted_columns = ["volume", "nsites", "compliance_tensor", "elastic_tensor",
                    "elastic_tensor_original", "K_Voigt", "G_Voigt", "K_Reuss", "G_Reuss"]
df = df.drop(unwanted_columns, axis=1)

# In[3]:

# separate out the target values to be estimated (the bulk modulus, K_VRH)
y = df['K_VRH'].values

# ## Data Preprocessing
#
# The conversion functions in matminer need to be run before the pipeline as a data preprocessing step.

# In[4]:

df = StrToComposition().featurize_dataframe(df, "formula")
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")

# ## Helper Functions
#
# The matminer library uses pandas DataFrames, whereas sklearn.pipeline mainly works with numpy arrays, so helper methods are needed to separate out columns from the DataFrame for the pipeline. To be used in a pipeline they need to be transformers, meaning they implement a ``transform`` method (a ``fit`` method that does nothing is also needed).

# In[5]:

from matminer.utils.pipeline import DropExcluded, ItemSelector
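# For reference, a transformer of this kind only takes a few lines to write. The cell below is a
# simplified sketch of what such a column-selector transformer looks like (the class name is made up
# for illustration); it is not the matminer implementation, which you should use via the
# ``ItemSelector`` import above.

# In[ ]:

# Simplified sketch of a DataFrame column selector (illustration only; use matminer's ItemSelector instead)
class SimpleItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # nothing to learn from the data: fit is a no-op required by the sklearn transformer API
        return self

    def transform(self, X):
        # pull the requested column out of the DataFrame
        return X[self.key]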
# ## Making a Feature Union Pipeline for Featurizers
#
# This creates a pipeline that transforms the preprocessed data into featurized data usable in sklearn. It can be used to transform data on its own or as part of another pipeline. It is also possible to cache values in a pipeline so that featurization is only done once.
#
# This FeatureUnion has four parts: ``drop``, which drops unwanted columns; ``density``, which adds density features; ``element``, which adds element-property (Magpie) features; and ``oxidation``, which adds oxidation state features. These are combined by ``FeatureUnion`` to create the final dataset. The ``drop`` transform acts as an identity+filter, passing through the original data minus the unwanted columns.

# In[6]:

# columns to remove before regression
excluded = ["G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
            "poisson_ratio", "structure", "composition", "composition_oxid"]

# featurization transformations
featurizer = FeatureUnion(
    transformer_list=[
        ('drop', DropExcluded(excluded)),
        ('density', Pipeline([
            ('select', ItemSelector("structure")),
            ('density_feat', DensityFeatures())
        ])),
        ('element', Pipeline([
            ('select', ItemSelector("composition")),
            ('element_feat', ElementProperty.from_preset(preset_name="magpie"))
        ])),
        ('oxidation', Pipeline([
            ('select', ItemSelector("composition_oxid")),
            ('oxidation_feat', OxidationStates())
        ])),
    ]
)

# ## Making a Regression Pipeline
#
# This is a simple pipeline combining the featurizer transformer pipeline with a linear regression estimator.

# In[7]:

# make the pipeline
pipeline = Pipeline([
    ('featurize', featurizer),
    ('regress', LinearRegression()),
])
pipeline.fit(df, y)

# get fit statistics
print('training R2 = ' + str(round(pipeline.score(df, y), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y, y_pred=pipeline.predict(df))))

# ## Making a Random Forest Pipeline
#
# This is the same, but with a random forest regressor instead. The only line changed is the one defining ``regress`` in the pipeline.

# In[8]:

# make the pipeline
pipeline = Pipeline([
    ('featurize', featurizer),
    ('regress', RandomForestRegressor(n_estimators=50, random_state=1)),
])
pipeline.fit(df, y)

# get fit statistics
print('training R2 = ' + str(round(pipeline.score(df, y), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y, y_pred=pipeline.predict(df))))
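# As noted above, featurization is the slow part of these pipelines and can be cached so that it only
# runs once. One way to do this (a sketch; the cache directory name here is arbitrary) is scikit-learn's
# ``memory`` option, which caches the fitted transformer steps of a ``Pipeline`` on disk via joblib.

# In[ ]:

# Sketch: cache the featurization step so repeated fits on the same data reuse the cached result
cached_pipeline = Pipeline([
    ('featurize', featurizer),
    ('regress', LinearRegression()),
], memory='./featurizer_cache')  # arbitrary cache directory

cached_pipeline.fit(df, y)  # the first fit populates the cache; later fits reuse it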
# ## Cross Validation
#
# To run cross validation, the featurizer transformation can't be in a pipeline with the regressor, as the initial form of the data cannot be used directly with KFold: the transformer adds and removes columns, so it is more than a simple function of the data. Instead, the final featurized data can be computed beforehand, here as ``X``.

# In[9]:

X = featurizer.transform(df)

# Define a KFold for cross validation. Using RepeatedKFold can reduce variance in the cross-validation score without increasing the number of folds; this is similar to bootstrapping, as the data is randomly subsampled multiple times by the KFold and the scores are then averaged. Using repeated folds is a good way to reduce variance if there is sufficient compute time to do so. For very computationally expensive models, such as DNNs, it is common to use a single train/validation split instead (not counting the excluded test data).

# In[10]:

crossvalidation = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# This is the same linear regression as before, now with RepeatedKFold cross validation. This gives a better estimate of how well the model will generalize than looking at the training error. Cross validation usually gives a pessimistic estimate, and in practice the best-performing model will be retrained on the full set of train/val data before testing.

# In[11]:

# compute cross validation scores for the linear regression model
lr = LinearRegression()
scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
r2_scores = cross_val_score(lr, X, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))

# This is the same with the random forest regressor.

# In[12]:

# compute cross validation scores for the random forest model
rf = RandomForestRegressor(n_estimators=50, random_state=1)
r2_scores = cross_val_score(rf, X, y, scoring='r2', cv=crossvalidation, n_jobs=1)
scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]

print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))
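# ``cross_val_predict`` (imported above) is a useful companion to ``cross_val_score``: instead of one
# score per fold it returns one out-of-fold prediction per sample, which is handy for parity plots or
# error analysis. A minimal sketch is below; note that ``cross_val_predict`` needs a plain partition of
# the data (e.g. ``cv=5``) rather than a ``RepeatedKFold``.

# In[ ]:

# Sketch: out-of-fold predictions for the random forest model (cv=5 chosen for illustration)
y_pred_oof = cross_val_predict(rf, X, y, cv=5, n_jobs=1)
print('out-of-fold RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred_oof)))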
# ## Model Selection with Grid Search
#
# A pipeline can be used with grid search or random search for model selection and hyper-parameter optimization. This can include normalization, scaling, whitening, PCA / dimensionality reduction, basis expansion, or any other preprocessing or data transformation steps.
#
# Setting up a pipeline is a design pattern that gives a straightforward and repeatable method of processing the data and training a model. This can make it easy to try many different models and perform model selection with a hyper-parameter optimization scheme like grid search.
#
# Before doing model selection, the data should be split into a training set and a held-out test set. This tries to measure the generality of the model, predicting how it may perform on real data. Without a test set there is no way to measure whether the model has likely overfit the training data.
#
# Note: the best model is chosen by cross-validation score, and only the final model (after being retrained on all train/val data) is evaluated on the test set. Evaluating multiple models on the test set and choosing the best of them is an almost sure way to overfit or overestimate the true performance/generality of the model, exactly what we are trying to avoid by creating a held-out test set.

# In[13]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# In[14]:

# grid search over the number of trees in the random forest
rf = RandomForestRegressor(n_estimators=50, random_state=1)
param_grid = [
    {'n_estimators': [10, 15, 20, 25, 30, 50, 100]},
]
gs = GridSearchCV(rf, param_grid, n_jobs=4, cv=5)
gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
print(gs.score(X_test, y_test))

# ## Random Search
#
# Random search is another option for hyper-parameter selection, and usually outperforms grid search both in theory and in practice (see "Random Search for Hyper-Parameter Optimization" by Bergstra & Bengio). This is especially true in higher-dimensional hyper-parameter spaces. This example also shows a scaling step in the pipeline, which can improve performance for some types of models.

# In[15]:

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('regress', RandomForestRegressor(random_state=1)),
])
param_dist = {'regress__n_estimators': sp_randint(10, 150)}
gs = RandomizedSearchCV(pipe, param_dist, cv=crossvalidation, n_jobs=-1)
gs.fit(X_train, y_train)

print('best crossval score ' + str(round(gs.best_score_, 3)))
print('best params ' + str(gs.best_params_))

# get fit statistics
print('training R2 = ' + str(round(gs.score(X_train, y_train), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_train, y_pred=gs.predict(X_train))))
print('test R2 = ' + str(round(gs.score(X_test, y_test), 3)))
print('test RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_test, y_pred=gs.predict(X_test))))
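# Since random search is most useful when several hyper-parameters are varied at once, the search above
# can be broadened. The cell below is an illustrative sketch of a wider search space over the random
# forest step of the pipeline; the parameter ranges are arbitrary examples, not tuned recommendations.

# In[ ]:

# Sketch: sample several Random Forest hyper-parameters jointly (ranges are illustrative only)
param_dist_wide = {
    'regress__n_estimators': sp_randint(10, 150),
    'regress__max_depth': sp_randint(3, 30),
    'regress__max_features': ['sqrt', 'log2', None],
}
rs = RandomizedSearchCV(pipe, param_dist_wide, n_iter=20, cv=crossvalidation, n_jobs=-1, random_state=1)
rs.fit(X_train, y_train)

print('best crossval score ' + str(round(rs.best_score_, 3)))
print('best params ' + str(rs.best_params_))
print('test R2 = ' + str(round(rs.score(X_test, y_test), 3)))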