#!/usr/bin/env python
# coding: utf-8

# # Assign NERC labels to plants using 860 data and RandomForest
#
# ## Instructions
# Make sure the `file_date` parameter below is set to whatever value you would like appended to file names.
#
# Change the `most_recent_860_year` parameter below to match the most up-to-date EIA-860 annual data file. As of March 2018 this is 2016.
#
# EIA-860 (annual) excel files will need to be [downloaded](https://www.eia.gov/electricity/data/eia860/) and unzipped to the `EIA downloads` folder. Make sure that all years from 2012 through the most recent data year are available. Also download the most recent [EIA-860m](https://www.eia.gov/electricity/data/eia860m/) to `EIA downloads`.
#
# The most recent annual 860 file available (as of March 2018) represents 2016 data. When newer EIA-860 annual files are added, the dictionary of pandas `read_excel` parameters will need to be updated. Note that EIA puts out an Early Release version of 860 with extra header rows and columns, so be sure to adjust the `skiprows` and `usecols` parameters appropriately if using an Early Release file.
#
# The entire notebook can be run at once using *Run All Cells*.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import os
from os.path import join
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter
from copy import deepcopy

cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')

# ### Date string for filenames
# This will be inserted into all filenames (reading and writing).

# In[2]:

file_date = '2018-03-06'
most_recent_860_year = 2016

# ## Load data
# This loads facility data that has been assembled from the EIA bulk data file and from EIA-860 excel files. The EIA-860 excel files need to be downloaded manually.

# ### Load EIA facility data
# Only keep the plant id, year (as a check that plants don't move between years), lat/lon, and state.

# In[204]:

path = os.path.join(data_path, 'Derived data',
                    'Facility gen fuels and CO2 {}.csv'.format(file_date))
facility_df = pd.read_csv(path)
facility_df['state'] = facility_df['geography'].str[-2:]

# In[205]:

plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon', 'state']]
plants.drop_duplicates(inplace=True)

# Because the 2017 facility dataframe only includes annually reporting facilities, I'm going to duplicate the plant id, lat/lon, and state information from 2016.

# In[206]:

# Make a copy of 2016 (or the most recent annual data year) and change the year to 2017
plants_2017 = plants.loc[plants['year'] == most_recent_860_year, :].copy()
plants_2017.loc[:, 'year'] += 1

plants = pd.concat([plants.loc[plants['year'] <= most_recent_860_year, :],
                    plants_2017])

# In[207]:

# Plants that show up in 2016 but not in 2017
(set(plants.loc[plants.year == 2016, 'plant id'])
 - set(plants.loc[plants.year == 2017, 'plant id']))

# ### Load known NERC labels from EIA-860
# Current NERC labels go back to 2012. Use all annual 860 files from 2012 through the most recent available. Extend the dictionary of dictionaries below with any files available after 2016. `io`, `skiprows`, and `usecols` are all input parameters for the pandas `read_excel` function.
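# For example, a 2017 Final Release entry would presumably follow the same pattern as 2013-2016. This is a sketch only; the folder name, file name, and column letters below are assumptions to verify against the actual download:
#
#     2017: {'io': join(eia_base_path, 'eia8602017', '2___Plant_Y2017.xlsx'),
#            'skiprows': 0,
#            'usecols': 'C,L'},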
# In[5]:

eia_base_path = join(data_path, 'EIA downloads')

file_860_info = {
    # 2011: {'io': join(eia_base_path, 'eia8602011', 'Plant.xlsx'),
    #        'skiprows': 0,
    #        'parse_cols': 'B,J'},
    2012: {'io': join(eia_base_path, 'eia8602012', 'PlantY2012.xlsx'),
           'skiprows': 0,
           'usecols': 'B,J'},
    2013: {'io': join(eia_base_path, 'eia8602013', '2___Plant_Y2013.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'},
    2014: {'io': join(eia_base_path, 'eia8602014', '2___Plant_Y2014.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'},
    2015: {'io': join(eia_base_path, 'eia8602015', '2___Plant_Y2015.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'},
    2016: {'io': join(eia_base_path, 'eia8602016', '2___Plant_Y2016.xlsx'),
           'skiprows': 0,
           'usecols': 'C,L'}
}

# In[6]:

eia_nercs = {}
for key, args in file_860_info.items():
    eia_nercs[key] = pd.read_excel(**args)
    eia_nercs[key].columns = ['plant id', 'nerc']
    eia_nercs[key]['year'] = key

# I want to assign NERC regions for every year. We have data for 2012 onward from the EIA-860 files. For the purpose of this analysis I'll assume that all years from 2001-2011 have the same NERC labels as 2012.
#
# Also assume that values in 2017 are the same as in 2016. I'll fill in NERC values for plants that were built in 2017 below.

# In[7]:

for year in range(2001, 2012):
    # Use deepcopy so the 2012 dataframe is never modified through a shared
    # reference; the pandas .copy() method is deep by default, but this makes
    # the intent explicit.
    df = deepcopy(eia_nercs[2012])
    df['year'] = year
    eia_nercs[year] = df

df = deepcopy(eia_nercs[2016])
df['year'] = 2017
eia_nercs[2017] = df

# In[8]:

eia_nercs.keys()

# In[9]:

eia_nercs[2001].head()

# In[10]:

nercs = pd.concat(eia_nercs.values())
nercs.sort_values('year', inplace=True)

# In[11]:

nercs.head()

# In[13]:

# MRO plants that appear in 2016 but not in 2017
(set(nercs.loc[(nercs.nerc == 'MRO') & (nercs.year == 2016), 'plant id'])
 - set(nercs.loc[(nercs.nerc == 'MRO') & (nercs.year == 2017), 'plant id']))

# In[12]:

nercs.year.unique()

# ### Look for plants listed with different NERC labels
#
# **This may not matter anymore if I use NERC labels for each year.**
#
# There are 30 duplicated plants. Five of them don't have a NERC label in one of the years. The largest move is from MRO to other regions (12), with most of those to SPP (7). After that, moves from RFC (5) to MRO (3) and SERC (2). There are also some moves from WECC and FRCC to HICC/ASCC - these might be diesel generators that get moved.
#
# The plants that have duplicate NERC region labels represent a small fraction of national generation, but one that is growing over time. By 2016 they account for 0.15% of national generation.

# In[13]:

for df_ in list(eia_nercs.values()) + [nercs]:
    print('{} total records'.format(len(df_)))
    print('{} unique plants'.format(len(df_['plant id'].unique())))

# In[14]:

dup_plants = nercs.loc[nercs['plant id'].duplicated(keep=False), 'plant id'].unique()
dup_plants

# In[15]:

region_list = []
for plant in dup_plants:
    regions = nercs.loc[nercs['plant id'] == plant, 'nerc'].unique()
    region_list.append(regions)

Counter(tuple(x) for x in region_list)

# In[16]:

# Fraction of national generation coming from plants with duplicate NERC labels
(facility_df.loc[facility_df['plant id'].isin(dup_plants), :]
            .groupby('year')['generation (MWh)'].sum()
 / facility_df.groupby('year')['generation (MWh)'].sum())

# ### Some plants in EIA-860 don't have NERC labels. Drop them now.
# This is my training data. All of these plants should still be in my `plants` dataframe.
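# A quick check of that claim (my sketch, not a cell from the original notebook):
# count NERC-labeled plants that never appear in the `plants` dataframe.

labeled_ids = set(nercs.loc[nercs.nerc.notnull(), 'plant id'])
missing_from_plants = labeled_ids - set(plants['plant id'])
print('{} NERC-labeled plants are missing from the plants dataframe'
      .format(len(missing_from_plants)))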
# In[14]:

nan_plants = {}
all_nan = []

years = nercs.year.unique()
for year in years:
    nan_plants[year] = nercs.loc[(nercs.year == year)
                                 & (nercs.isnull().any(axis=1)), 'plant id'].tolist()
    all_nan.extend(nan_plants[year])

# Number of plants that don't have a NERC label in at least one year
len(all_nan)

# In[15]:

# Drop all the rows without a nerc value
nercs.dropna(inplace=True)

# In[16]:

nan_plants[2017]

# ## Load EIA-860m for some info on recent facilities
# The EIA-860m (monthly) data file has an up-to-date list of all operating power plants and their associated balancing authority. It does not list the NERC region, so it can't be used to assign NERC labels for all plants. But in some cases knowing the state and balancing authority is enough to make a good guess about which NERC region a plant is in.
#
# Assigning NERC region labels has the lowest accuracy for plants in SPP and TRE. To compensate, I'm going to assume that anything in TX or OK with the SWPP balancing authority is in SPP. On the flip side, anything in TX with the ERCOT balancing authority gets assigned to TRE.
#
# Only do this for plants that came online after the most recent annual 860 data year.
#
# **NOTE:** Because I'm looking at units that came online in 2017, some of the plant ids will already exist.

# In[127]:

path = join(data_path, 'EIA downloads', 'december_generator2017.xlsx')

# Check the excel file columns if there is a read error. They should match
# the plant id, plant state, operating year, and balancing authority code.
_m860 = pd.read_excel(path, sheet_name='Operating', skipfooter=1,
                      usecols='C,F,P,AE', skiprows=0)
_m860.columns = _m860.columns.str.lower()

# In[128]:

# most_recent_860_year is defined at the top of this notebook.
# The goal here is to only look at plants that started operating after
# the most recent annual data, so only include units starting after
# the last annual data year.
m860 = _m860.loc[(_m860['operating year'] > most_recent_860_year)].copy()
# An optional extra filter would also exclude plant ids already in nercs:
# & (~_m860['plant id'].isin(nercs['plant id'].unique()))].copy()
m860.tail()

# In[129]:

m860.loc[(m860['plant state'].isin(['TX', 'OK']))
         & (m860['balancing authority code'] == 'SWPP'), 'nerc'] = 'SPP'
m860.loc[(m860['plant state'].isin(['TX']))
         & (m860['balancing authority code'] == 'ERCO'), 'nerc'] = 'TRE'

# In[130]:

# Drop all rows except the ones I've labeled as TRE or SPP
m860.dropna(inplace=True)
m860.head()

# Make lists of plant codes for SPP and TRE facilities.

# In[131]:

nercs.head()
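# A compact restatement of the heuristic above (my sketch): the complete set of
# (plant state, balancing authority) -> NERC assumptions used in this notebook,
# plus a tally of how many 2017 units each combination captured.

ba_state_to_nerc = {
    ('TX', 'SWPP'): 'SPP',
    ('OK', 'SWPP'): 'SPP',
    ('TX', 'ERCO'): 'TRE',
}

# Count of labeled 2017 units by state/balancing authority combination
m860.groupby(['plant state', 'balancing authority code'])['nerc'].count()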
# In[132]:

# Create additional dataframes with 2017 SPP and TRE plants.
# Use these to fill in values for 2017 plants.
m860_spp_plants = (m860.loc[m860['nerc'] == 'SPP', 'plant id']
                       .drop_duplicates()
                       .reset_index(drop=True))

additional_spp = pd.DataFrame(m860_spp_plants.copy())
# additional_spp['plant id'] = m860_spp_plants
additional_spp['nerc'] = 'SPP'
additional_spp['year'] = 2017

m860_tre_plants = (m860.loc[m860['nerc'] == 'TRE', 'plant id']
                       .drop_duplicates()
                       .reset_index(drop=True))

additional_tre = pd.DataFrame(m860_tre_plants)
# additional_tre['plant id'] = m860_tre_plants
additional_tre['nerc'] = 'TRE'
additional_tre['year'] = 2017

# In[133]:

additional_spp

# In[134]:

additional_tre

# ### Append my 2017 SPP and TRE guesses to the full nerc dataframe

# In[115]:

nercs = pd.concat([nercs, additional_spp, additional_tre])

# ## Clean and prep data for the classifier

# In[116]:

plants.head()

# In[117]:

nercs.tail()

# Checked to make sure the type of merge doesn't matter once rows without nerc values are dropped.

# In[209]:

df = pd.merge(plants, nercs, on=['plant id', 'year'], how='left')

# In[210]:

omitted = set(df['plant id'].unique()) - set(nercs['plant id'].unique())

# In[213]:

df.head()

# In[214]:

df.tail()

# In[215]:

df.columns

# Drop plants that don't have lat/lon data (using just lon as the check), then drop duplicates. If any plants have kept the same plant id but moved over time (maybe a diesel generator?) or switched NERC regions, they will show up twice.

# In[216]:

cols = ['plant id', 'lat', 'lon', 'nerc', 'state', 'year']
df_slim = (df.loc[:, cols]
             .dropna(subset=['lon'])
             .drop_duplicates(subset=['plant id', 'year', 'nerc']))

# In[217]:

df_slim.tail()

# Separate out the list of plants where we don't have NERC labels from EIA-860.

# In[218]:

unknown = df_slim.loc[df_slim.nerc.isnull()].copy()

# In[219]:

print("{} plants/years don't have NERC labels\n".format(len(unknown)))
print(unknown.head())

# In[220]:

unknown.tail()

# ### Create X and y matrices
# X is lat/lon and year.
#
# y is the NERC label.
#
# For both, I'm only using plants where we have all data (no `NaN`s). Not doing any transformation of the lat/lon at this time.

# In[221]:

X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon', 'year']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']

# In[222]:

len(X)

# In[223]:

# Make sure that unknown and X together include all records from df_slim
len(X) + len(unknown) - len(df_slim)

# In[224]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# ## GridSearch to find the best parameters in a RandomForest Classifier
#
# I previously used k-nearest neighbors with just lat/lon as input features. The problem is that some facilities don't have lat/lon data. They do usually have a state label, though. Categorical labels don't work well in KNN, but the combination of lat/lon and a state label works well in a tree model. RandomForest is usually quite an effective tree model, and my results here are more accurate than they were with KNN.
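# One caveat worth noting (my addition, not in the original notebook):
# RandomForestClassifier is stochastic, so the grid-search scores below will
# vary slightly between runs unless the estimator's seed is fixed, e.g.:
#
#     rf = RandomForestClassifier(random_state=42)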
# In[225]:

from sklearn.ensemble import RandomForestClassifier

# In[226]:

rf = RandomForestClassifier()
params = dict(
    n_estimators=[5, 10, 25, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
)

clf_rf = GridSearchCV(rf, params, n_jobs=-1, iid=False, verbose=1)
clf_rf.fit(X_train, y_train)

# In[227]:

clf_rf.best_estimator_, clf_rf.best_score_

# In[228]:

clf_rf.score(X_test, y_test)

# In[229]:

nerc_labels = nercs.nerc.dropna().unique()

# Accuracy score by region

# In[230]:

for region in nerc_labels:
    mask = y_test == region
    X_masked = X_test[mask]
    y_hat_masked = clf_rf.predict(X_masked)
    y_test_masked = y_test[mask]

    accuracy = metrics.accuracy_score(y_test_masked, y_hat_masked)
    print('{} : {}'.format(region, accuracy))

# F1 score by region

# In[231]:

y_hat = clf_rf.predict(X_test)

for region in nerc_labels:
    f1 = metrics.f1_score(y_test, y_hat, labels=[region], average='macro')
    print('{} : {}'.format(region, f1))

# ## Plants without lat/lon
# Use just the state for plants that don't have lat/lon info. This is less accurate, especially where NERC regions cross state lines, but better than nothing.
#
# Need to keep the `lon` column here so I can filter down to the unknown facilities that also don't have lon data.

# In[232]:

cols = ['plant id', 'nerc', 'state', 'year', 'lon']
df_state_slim = df.loc[:, cols].dropna(subset=['state']).copy()

# In[233]:

df_state_slim.head()

# In[234]:

len(df_state_slim)

# ### Encode state names as numbers for use in sklearn

# In[235]:

le = LabelEncoder()
df_state_slim.loc[:, 'enc state'] = le.fit_transform(df_state_slim.loc[:, 'state'].tolist())

# In[236]:

len(df_state_slim)

# In[237]:

unknown_state = df_state_slim.loc[(df_state_slim.nerc.isnull())
                                  & (df_state_slim.lon.isnull())].copy()

# In[238]:

len(unknown_state), len(unknown)

# In[239]:

X_state = df_state_slim.loc[df_state_slim.notnull().all(axis=1),
                            ['enc state', 'year']].copy()
y_state = df_state_slim.loc[df_state_slim.notnull().all(axis=1), 'nerc'].copy()

# In[240]:

X_state_train, X_state_test, y_state_train, y_state_test = train_test_split(
    X_state, y_state, test_size=0.33, random_state=42)

# In[241]:

rf = RandomForestClassifier()
params = dict(
    n_estimators=[5, 10, 25, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
)

clf_rf_state = GridSearchCV(rf, params, n_jobs=-1, iid=False, verbose=1)
clf_rf_state.fit(X_state_train, y_state_train)

# In[242]:

clf_rf_state.best_estimator_, clf_rf_state.best_score_

# In[243]:

clf_rf_state.score(X_state_test, y_state_test)

# Accuracy score by region

# In[244]:

nerc_labels = nercs.nerc.dropna().unique()

for region in nerc_labels:
    mask = y_state_test == region
    X_state_masked = X_state_test[mask]
    y_state_hat_masked = clf_rf_state.predict(X_state_masked)
    y_state_test_masked = y_state_test[mask]

    accuracy = metrics.accuracy_score(y_state_test_masked, y_state_hat_masked)
    print('{} : {}'.format(region, accuracy))

# F1 score by region

# In[245]:

y_state_hat = clf_rf_state.predict(X_state_test)

for region in nerc_labels:
    f1 = metrics.f1_score(y_state_test, y_state_hat, labels=[region], average='macro')
    print('{} : {}'.format(region, f1))

# ## Use best RandomForest parameters to predict NERC for unknown plants

# In[246]:

unknown.loc[:, 'nerc'] = clf_rf.predict(unknown.loc[:, ['lat', 'lon', 'year']])
unknown_state.loc[:, 'nerc'] = clf_rf_state.predict(unknown_state.loc[:, ['enc state', 'year']])
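# A possible extension (my sketch, not in the original notebook): RandomForest
# also exposes class probabilities through predict_proba, which can flag
# low-confidence assignments for manual review without changing any results.

probs = clf_rf.predict_proba(unknown.loc[:, ['lat', 'lon', 'year']])
max_prob = probs.max(axis=1)  # highest class probability for each plant
print('{} predictions below 0.5 confidence'.format((max_prob < 0.5).sum()))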
# Ensure that no plants in Alaska or Hawaii are assigned to continental NERC regions, or the other way around.

# In[247]:

print(unknown.loc[unknown.state.isin(['AK', 'HI']), 'nerc'].unique())
print(unknown.loc[unknown.nerc.isin(['HICC', 'ASCC']), 'state'].unique())

# In[248]:

Counter(unknown['nerc'])

# In[249]:

unknown.head()

# In[250]:

unknown_state.head()

# ## Export plants with lat/lon, state, and nerc

# In[251]:

nercs.tail()

# In[252]:

unknown.head()

# In[253]:

unknown_state.tail()

# In[267]:

len(unknown_state['plant id'].unique())

# In[254]:

df_slim.head()

# In[255]:

labeled = pd.concat([df_slim.loc[df_slim.notnull().all(axis=1)],
                     unknown,
                     unknown_state.loc[:, ['plant id', 'nerc', 'state', 'year']]])

# In[256]:

labeled.tail()

# In[257]:

labeled.loc[labeled.nerc.isnull()]

# There are 7 facilities that don't show up in my labeled data.

# In[258]:

facility_df.loc[~facility_df['plant id'].isin(labeled['plant id']), 'plant id'].unique()

# In[259]:

len(labeled), len(nercs)

# In[260]:

nerc_labels

# In[261]:

mro_2016 = set(labeled.loc[(labeled.nerc == 'MRO') & (labeled.year == 2016), 'plant id'])
mro_2017 = set(labeled.loc[(labeled.nerc == 'MRO') & (labeled.year == 2017), 'plant id'])

# In[262]:

(set(nercs.loc[(nercs.nerc == 'MRO') & (nercs.year == 2017), 'plant id'])
 - mro_2017)

# In[263]:

for nerc in nerc_labels:
    n_dropped = len(set(labeled.loc[(labeled.nerc == nerc) & (labeled.year == 2016), 'plant id'])
                    - set(labeled.loc[(labeled.nerc == nerc) & (labeled.year == 2017), 'plant id']))
    print('{} plants dropped in {}'.format(n_dropped, nerc))

# In[264]:

(set(labeled.loc[(labeled.nerc == 'MRO') & (labeled.year == 2016), 'plant id'])
 - set(labeled.loc[(labeled.nerc == 'MRO') & (labeled.year == 2017), 'plant id']))

# In[265]:

path = join(data_path, 'Facility labels', 'Facility locations_RF.csv')
labeled.to_csv(path, index=False)
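# A final sanity check one could run after exporting (my sketch): read the file
# back and count plant/year combinations that carry more than one NERC label.

check = pd.read_csv(path)
n_dup = check.duplicated(subset=['plant id', 'year']).sum()
print('{} plant/year rows with multiple NERC labels'.format(n_dup))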