Recreate the Voronoi-tessellation-based machine learning approach of Ward et al. This notebook builds a model to predict the formation enthalpy of a material from its crystal structure, using the FLLA dataset.
Note: Requires approximately 2 CPU-hours to run.
Last Tested Version of matminer: 0.4.5
%matplotlib inline
from matplotlib import pyplot as plt
from matminer.datasets import load_dataset
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import ElementProperty, Stoichiometry, ValenceOrbital, IonProperty
from matminer.featurizers.structure import (SiteStatsFingerprint, StructuralHeterogeneity,
ChemicalOrdering, StructureComposition, MaximumPackingEfficiency)
from matminer.featurizers.conversions import DictToObject
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy import stats
from tqdm import tqdm_notebook as tqdm
import numpy as np
Ward et al. use a variety of different featurizers, which we combine here into a single MultipleFeaturizer.
featurizer = MultipleFeaturizer([
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),  # Voronoi-based coordination numbers
    StructuralHeterogeneity(),  # variation in bond lengths and atomic volumes
    ChemicalOrdering(),  # Warren-Cowley-like chemical ordering parameters
    MaximumPackingEfficiency(),  # packing efficiency of the structure
    SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),  # elemental property differences between neighboring sites
    StructureComposition(Stoichiometry()),  # composition-based: stoichiometric attributes
    StructureComposition(ElementProperty.from_preset("magpie")),  # composition-based: Magpie elemental property statistics
    StructureComposition(ValenceOrbital(props=['frac'])),  # composition-based: valence orbital occupation fractions
    StructureComposition(IonProperty(fast=True))  # composition-based: ionic property attributes
])
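As a quick check (added here for reference, not part of the original workflow), the names and number of the generated features can be listed with the featurizer's feature_labels() method:
# List the names of all features the combined featurizer produces
labels = featurizer.feature_labels()
print('Number of feature labels:', len(labels))
print('First few labels:', labels[:5])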
Get the dataset from Faber et al. (2015)
%%time
# Note: If this is your first time loading the flla dataset, it will be downloaded from an online dataset repository
data = load_dataset("flla")
print('Loaded {} entries'.format(len(data)))
Loaded 3938 entries
CPU times: user 3.99 s, sys: 160 ms, total: 4.15 s
Wall time: 6.93 s
dto = DictToObject(target_col_id='structure', overwrite_data=True)
data = dto.featurize_dataframe(data, 'structure')
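A quick sanity check (added for illustration) that the conversion replaced the structure dictionaries with pymatgen Structure objects:
# The 'structure' column should now hold pymatgen Structure objects rather than dicts
print(type(data['structure'][0]))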
%%time
print('Total number of features:', len(featurizer.featurize(data['structure'][0])))
print('Number of sites in structure:', len(data['structure'][0]))
Total number of features: 273
Number of sites in structure: 2
CPU times: user 200 ms, sys: 10.9 ms, total: 211 ms
Wall time: 387 ms
Ward et al. report an average featurization time of around 100 ms per structure. At least for this structure, we achieve a similar runtime.
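For a less anecdotal comparison, one could time featurization over a small random sample of structures (a rough sketch added here; the sample size of 20 is arbitrary):
import time
# Estimate the average featurization cost over a random sample of structures
sample = data['structure'].sample(20, random_state=0)
start = time.time()
for s in sample:
    featurizer.featurize(s)
print('Average time: {:.3f} s/structure'.format((time.time() - start) / len(sample)))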
Running the calculations in parallel
%%time
X = featurizer.featurize_many(data['structure'], ignore_errors=True)
/Users/daniel/.conda/envs/matminer/lib/python3.6/site-packages/pymatgen/core/periodic_table.py:409: UserWarning: No electronegativity for Ne. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.
/Users/daniel/.conda/envs/matminer/lib/python3.6/site-packages/pymatgen/core/periodic_table.py:409: UserWarning: No electronegativity for He. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.
CPU times: user 4.52 s, sys: 1.09 s, total: 5.61 s
Wall time: 53min 54s
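featurize_many runs the structures in parallel worker processes by default, which is why the parent-process CPU time above is much smaller than the wall time. If needed, the number of worker processes can be set explicitly before featurizing (an optional tweak; set_n_jobs is part of matminer's featurizer API):
# Optional: limit the number of parallel worker processes used by featurize_many
featurizer.set_n_jobs(4)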
Convert X to a full array
X = np.array(X)
print('Input data shape:', X.shape)
Input data shape: (3938, 273)
Check how many tessellations failed
import pandas as pd
failed = np.any(pd.isnull(X), axis=1)
print('Number failed: {}/{}'.format(np.sum(failed), len(failed)))
Number failed: 14/3938
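For illustration, the failed entries can be inspected directly (this assumes the FLLA dataframe includes a formula column, which is not otherwise used in this notebook):
# Show a few of the entries whose Voronoi tessellation failed
print(data.loc[failed, 'formula'].head())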
model = Pipeline([
('imputer', SimpleImputer()), # For the failed structures
('model', RandomForestRegressor(n_estimators=150, n_jobs=-1))
])
Train model on whole dataset
%%time
model.fit(X, data['formation_energy_per_atom'])
CPU times: user 1min 26s, sys: 2.2 s, total: 1min 28s
Wall time: 37.4 s
Pipeline(memory=None, steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0)), ('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impuri...mators=150, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False))])
Evaluate the MAE
maes = []
for train_ids, test_ids in tqdm(ShuffleSplit(train_size=3000, n_splits=20).split(X)):
# Split off the datasets
train_X = X[train_ids, :]
train_y = data['formation_energy_per_atom'].iloc[train_ids]
test_X = X[test_ids, :]
test_y = data['formation_energy_per_atom'].iloc[test_ids]
# Train the model
model.fit(train_X, train_y)
# Run the model, compute MAE
predict_y = model.predict(test_X)
maes.append(np.abs(test_y - predict_y).mean())
/Users/daniel/.conda/envs/matminer/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1678: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
print('MAE: {:.3f}+/-{:.3f} eV/atom'.format(np.mean(maes), stats.sem(maes)))
MAE: 0.170+/-0.002 eV/atom
Finding: 0.17 eV/atom is in close agreement with what Ward et al. report in their reproduction of this test using OQMD data and Magpie to compute features.
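As a final visual check (added here; not part of Ward et al.'s analysis), the matplotlib import from above can be used to make a parity plot of predicted vs. DFT formation energies for the last test split:
# Parity plot for the last ShuffleSplit fold: predicted vs. DFT formation energy
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(test_y, predict_y, s=5, alpha=0.5)
lims = [min(test_y.min(), predict_y.min()), max(test_y.max(), predict_y.max())]
ax.plot(lims, lims, 'k--', lw=1)  # y = x reference line
ax.set_xlabel('DFT formation energy (eV/atom)')
ax.set_ylabel('Predicted formation energy (eV/atom)')
plt.show()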