#!/usr/bin/env python
# coding: utf-8

# # Using the Crystal Structure Representation of Ward et al.
# Recreate the Voronoi-tessellation-based machine learning approach of
# [Ward et al.](https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104).
# Builds a model to predict the formation enthalpy based on the crystal
# structure of a material, using the
# [FLLA dataset](https://onlinelibrary.wiley.com/doi/abs/10.1002/qua.24917).
#
# Note: Requires approximately 2 CPU-hours to run.
#
# *Last Tested Version of matminer*: 0.4.5
#
# NOTE: this script is a notebook export. `get_ipython()` is never defined or
# imported here, so it must be run under IPython/Jupyter.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt
from matminer.datasets import load_dataset
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import ElementProperty, Stoichiometry, ValenceOrbital, IonProperty
from matminer.featurizers.structure import (SiteStatsFingerprint, StructuralHeterogeneity,
                                            ChemicalOrdering, StructureComposition,
                                            MaximumPackingEfficiency)
from matminer.featurizers.conversions import DictToObject
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy import stats
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd

# Parameters of the cross-validation test run at the end of the script.
TRAIN_SIZE = 3000  # Number of entries used for training in each split
N_SPLITS = 20  # Number of random train/test splits

# ## Create the featurizer
# Ward et al. use a variety of different featurizers

# In[2]:

featurizer = MultipleFeaturizer([
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    MaximumPackingEfficiency(),
    SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
    StructureComposition(Stoichiometry()),
    StructureComposition(ElementProperty.from_preset("magpie")),
    StructureComposition(ValenceOrbital(props=['frac'])),
    StructureComposition(IonProperty(fast=True)),
])

# ## Load in a test dataset
# Get the dataset from Faber 2015

# In[3]:

# The cell body is executed by the `%%time` magic; it defines `data`.
get_ipython().run_cell_magic(
    'time', '',
    '# Note: If this is your first time loading the flla dataset, it will be '
    'downloaded from an online dataset repository\n'
    'data = load_dataset("flla")\n'
    'print(\'Loaded {} entries\'.format(len(data)))\n'
)

# In[4]:

# Convert the 'structure' column in place (overwrite_data=True) using DictToObject.
dto = DictToObject(target_col_id='structure', overwrite_data=True)
data = dto.featurize_dataframe(data, 'structure')

# In[5]:

# Time the featurization of a single structure.
get_ipython().run_cell_magic(
    'time', '',
    "print('Total number of features:', len(featurizer.featurize(data['structure'][0])))\n"
    "print('Number of sites in structure:', len(data['structure'][0]))\n"
)

# Ward et al. report 100ms for their average featurization time.
# At least for this structure, we have a similar runtime.

# ## Featurize the entire test set
# Running the calculations in parallel

# In[6]:

# The cell body is executed by the `%%time` magic; it defines `X`.
get_ipython().run_cell_magic(
    'time', '',
    "X = featurizer.featurize_many(data['structure'], ignore_errors=True)\n"
)

# Convert `X` to a full array

# In[7]:

X = np.array(X)
print('Input data shape:', X.shape)

# Check how many tessellations failed

# In[8]:

# A row counts as failed if any of its features is null.
failed = np.any(pd.isnull(X), axis=1)
print('Number failed: {}/{}'.format(np.sum(failed), len(failed)))

# # Train an ML Model

# In[9]:

model = Pipeline([
    ('imputer', SimpleImputer()),  # For the failed structures
    ('model', RandomForestRegressor(n_estimators=150, n_jobs=-1)),
])

# Train model on whole dataset

# In[10]:

get_ipython().run_cell_magic(
    'time', '',
    "model.fit(X, data['formation_energy_per_atom'])\n"
)

# Evaluate the MAE

# In[11]:

target = data['formation_energy_per_atom']
splitter = ShuffleSplit(train_size=TRAIN_SIZE, n_splits=N_SPLITS)

maes = []
# `split` returns a generator without a length, so give tqdm the total explicitly.
for train_ids, test_ids in tqdm(splitter.split(X), total=N_SPLITS):
    # Split off the datasets
    train_X = X[train_ids, :]
    train_y = target.iloc[train_ids]
    test_X = X[test_ids, :]
    test_y = target.iloc[test_ids]

    # Train the model (refits the same pipeline object on this split only)
    model.fit(train_X, train_y)

    # Run the model, compute MAE
    predict_y = model.predict(test_X)
    maes.append(np.abs(test_y - predict_y).mean())

# In[12]:

# Report the mean MAE over the splits and its standard error.
print('MAE: {:.3f}+/-{:.3f} eV/atom'.format(np.mean(maes), stats.sem(maes)))

# *Finding*: 0.17 eV/atom is in close agreement with what Ward et al. report in
# their reproduction of this test using OQMD data and Magpie to compute features.