#!/usr/bin/env python # coding: utf-8 # # # # #
# National generation capacity: Processing notebook # #
This Notebook is part of the National Generation Capacity Datapackage of Open Power System Data. #
# # Table of Contents # * [1. Introductory notes](#1.-Introductory-notes) # * [2. Script setup](#2.-Script-setup) # * [3. Data download and processing](#3.-Data-download-and-processing) # * [3.1 Manually compiled dataset](#3.1-Manually-compiled-dataset) # * [3.2 EUROSTAT data](#3.2-EUROSTAT-data) # * [3.3 ENTSO-E data](#3.3-ENTSO-E-data) # * [3.3.1 ENTSO-E statistical data](#3.3.1-ENTSO-E-statistical-data) # * [3.3.2 ENTSO-E SO&AF data](#3.3.2-ENTSO-E-SO&AF-data) # * [3.3.3 ENTSO-E Transparency Plaftform](#3.3.1-ENTSO-E-Transparency-Platform) # * [3.3.4 ENTSO-E Power Statistics](#3.3.4-ENTSO-E-Power-Statistics) # * [3.4 Merge data sources](#3.4-Merge-data-sources) # * [4. Convert stacked data to crosstable format](#4.-Convert-stacked-data-to-crosstable-format) # * [5. Output](#5.-Output) # * [5.1 Write results to file](#5.1-Write-results-to-file) # * [5.2 Formatting of Excel tables](#5.2-Formatting-of-Excel-tables) # * [5.3 Write checksums](#5.3-Write-checksums) # * [6. Documentation of the data package](#6.-Documentation-of-the-data-package) # # 1. Introductory notes # The script processes the compiled nationally aggregated generation capacity for European countries. Due to varying formats and data specifications of references for national generation capacity, the script firstly focuses on rearranging the manually compiled data. Thus, the script itself does not collect, select, download or manage data from original sources. Secondly, international data sources, such as EUROSTAT and ENTSO-E, are directly downloaded from original web sources and complement the initial data set. # # 2. Script setup # In[1]: # some functions and classes that are defined in seperate files import functions.helper_functions as func import functions.soaf as soaf # core packages import os import pandas as pd import numpy as np # packages to copy files, write sqllite data bases and manipulate excel files import shutil import sqlite3 import openpyxl from openpyxl.styles import PatternFill, colors, Font, Alignment from openpyxl.utils import get_column_letter import yaml import json # # 3. Data download and processing # We compile data from different national and international sources. Firstly, national data sources are manually compiled due to varying data formats and specifications. Secondly, international sources are compiled directly and appended to the compiled data set. The international data sources comprise: # - [EUROSTAT](http://ec.europa.eu/eurostat/product?code=nrg_113a&mode=view) # - [ENTSO-E Statistical data](https://www.entsoe.eu/data/data-portal/miscellaneous/Pages/default.aspx) # - [ENTSO-E System Outlook and Adequacy Forecast](https://www.entsoe.eu/outlooks/maf/Pages/default.aspx) # - [ENTSO-E Transparency Platform](https://transparency.entsoe.eu/) # - [ENTSO-E Power Statistics](https://www.entsoe.eu/data/power-stats/) # # In the following section, the data sets are downloaded and uploaded to Python. # ## 3.1 Manually compiled dataset # The manually compiled dataset is imported and rearranged to a DataFrame for further processing. The dataset comprises for each European country and specified generation technology different data entries, which are based on different sources. As these sources differ by country and year, information on the corresponding reference are directly given with the data entry. # In[2]: data_file = 'National_Generation_Capacities.xlsx' filepath = os.path.join('input', data_file) # Read data into pandas data_raw = pd.read_excel(filepath, sheet_name='Summary', header=None, na_values=['-'], skiprows=0) # Deal with merged cells from Excel: fill first three rows with information data_raw.iloc[0:2] = data_raw.iloc[0:2].fillna(method='ffill', axis=1) # Set index for rows data_raw = data_raw.set_index([0]) data_raw.index.name = 'technology' # Extract energylevels from raw data for later use energylevels_raw = data_raw.iloc[:, 0:5] energylevels_raw.head() # In[3]: # Delete definition of energy levels from raw data data_raw.drop(data_raw.columns[[0, 1, 2, 3, 4, 5]], axis=1, inplace=True) level_names = ['country', 'type', 'year', 'source', 'source_type', 'weblink', 'capacity_definition'] # Set multiindex column names data_raw.columns = pd.MultiIndex.from_arrays(data_raw[:7].values, names=level_names) # Remove 3 rows which are already used as column names data_raw = data_raw[pd.notnull(data_raw.index)] # Extract the ordering of technologies technology_order = data_raw.index.str.replace('- ', '').values.tolist() data_raw.head() # In[4]: data_raw.columns[data_raw.columns.duplicated()] # In[5]: data_raw[data_raw.index.duplicated()] # In[6]: # Reshape dataframe to list data_opsd = pd.DataFrame(data_raw.stack(level=level_names)) # Reset index for dataframe data_opsd.reset_index(inplace=True) data_opsd['technology'] = data_opsd['technology'].str.replace('- ', '') data_opsd.rename(columns={0: 'capacity'}, inplace=True) data_opsd['capacity'] = pd.to_numeric(data_opsd['capacity'], errors='coerce') # For some source, permission to publish data banlist = ['ELIA', 'BMWi', 'Mavir'] davail = 'data available, but cannot be provided' data_opsd.loc[data_opsd['source'].isin(banlist), 'comment'] = davail data_opsd.head() # ## 3.2 EUROSTAT data # EUROSTAT publishes annual structural data on national electricity generation capacities for European countries. The dataset is available in the EUROSTAT database within the category 'Environment and Energy' ([nrg_113a](http://ec.europa.eu/eurostat/product?code=nrg_113a&mode=view)). # In[7]: url_eurostat = 'Link unavailable' filepath_eurostat = os.path.join('input', 'Eurostat', 'Eurostat.tsv.gz') data_eurostat = pd.read_csv(filepath_eurostat, compression='gzip', sep='\t|,', engine='python' ) data_eurostat.head() # In[8]: id_vars = ['unit', 'product','indic_nrg', 'geo\\time'] data_eurostat = pd.melt(data_eurostat, id_vars=id_vars, var_name='year', value_name='value') data_eurostat.head() # In[9]: data_definition = pd.read_csv(os.path.join('input', 'definition_EUROSTAT_indic.txt'), header=None, names=['indic', 'description', 'energy source'], sep='\t') data_eurostat = data_eurostat.merge(data_definition, how='left', left_on='indic_nrg', right_on='indic') # The classification of generation capacities in the EUROSTAT dataset is specified in [Regulation (EC) No 1099/2008](http://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32008R1099&from=EN) (Annex B, 3.3). The available EUROSTAT dataset [nrg_113a](http://ec.europa.eu/eurostat/product?code=nrg_113a&mode=view) covers the following indicators: # # |indic_nrg | Description | Technology in OPSD | # |---|---|---| # |**12_1176011**| **Electrical capacity, main activity producers - Combustible Fuels**| Fossil fuels & bioenergy| # |**12_1176012**| **Electrical capacity, autoproducers - Combustible Fuels**| Fossil fuels & bioenergy| # |*12_1176061*| *Electrical capacity, main activity producers - Mixed plants*| | # |*12_1176101*| *Electrical capacity, main activity producers - Other Sources*| | # |*12_1176102*| *Electrical capacity, autoproducers - Other Sources*| | # |*12_1176111*| *Electrical capacity, main activity producers - Steam*|| # |*12_1176112*| *Electrical capacity, autoproducers - Steam*|| # |*12_1176121*| *Electrical capacity, main activity producers - Gas Turbine*|| # |*12_1176122*| *Electrical capacity, autoproducers - Gas Turbine*|| # |*12_1176131*| *Electrical capacity, main activity producers - Combined Cycle*|| # |*12_1176132*| *Electrical capacity, autoproducers - Combined Cycle*|| # |*12_1176141*| *Electrical capacity, main activity producers - Internal Combustion*|| # |*12_1176142*| *Electrical capacity, autoproducers - Internal Combustion*|| # |*12_1176401*| *Electrical capacity, main activity producers - Other Type of Generation*| | # |*12_1176402*| *Electrical capacity, autoproducers - Other Type of Generation*| | # |12_1176253| Net maximum capacity - Municipal Wastes| Non-renewable waste| # |12_1176263| Net maximum capacity - Wood/Wood Wastes/Other Solid Wastes| Other bioenergy and renewable waste| # |12_1176273| Net maximum capacity - Biogases| Biomass and biogas| # |12_1176283| Net maximum capacity - Industrial Wastes (non-renewable)| Non-renewable waste| # |12_1176343| Net maximum capacity - Liquid Biofuels| Biomass and biogas| # |**12_1176031**| **Electrical capacity, main activity producers - Nuclear**| Nuclear| # |**12_1176032**| **Electrical capacity, autoproducers - Nuclear**| Nuclear| # |**12_1176051**| **Electrical capacity, main activity producers - Hydro**| Hydro| # |**12_1176052**| **Electrical capacity, autoproducers - Hydro**| Hydro| # |12_1176071| Net electrical capacity, main activity producers - Pure Pumped Hydro| Pumped storage| # |12_1176072| Net electrical capacity, autoproducers - Pure Pumped Hydro| Pumped storage| # |*12_117615*| *Net maximum capacity - Hydro <1 MW*| | # |*12_117616*| *Net maximum capacity - Hydro >= 1 MW and <= 10 MW*| | # |*12_117617*| *Net maximum capacity - Hydro 10 MW and over*| | # |**12_1176301**| **Electrical capacity, main activity producers - Tide, wave and ocean**| Marine| # |**12_1176302**| **Electrical capacity, autoproducers - Tide, wave and ocean**| Marine| # |*12_1176303*| *Net maximum capacity - Tide, Wave, Ocean*|| # |**12_1176081**| **Electrical capacity, main activity producers - Geothermal**| Geothermal| # |**12_1176082**| **Electrical capacity, autoproducers - Geothermal**| Geothermal| # |*12_1176083*| *Net maximum capacity - Geothermal*| | # |**12_1176091**| **Electrical capacity, main activity producers - Wind**| Wind| # |**12_1176092**| **Electrical capacity, autoproducers - Wind**| Wind| # |**12_1176233**| **Net maximum capacity - Solar Photovoltaic**| Photovoltaics| # |**12_1176243**| **Net maximum capacity - Solar Thermal Electric**| Concentrated solar power| # # **Bold** rows indicate top level classes within the EUROSTAT classification, whereas normal and *italic* rows cover different kinds of subclassifications. Especially within the top level 'Combustible fuels' different kinds of subcategorizations based on fuel or technology are available. Simarily, 'Hydro' is differentiated by type (e.g. pumped-hydro storage) or capacity classes. *Italic* rows are not further considered within the OPSD dataset due to the mismatch with existing technology classes. # In[10]: data_eurostat = data_eurostat[data_eurostat['energy source'].isnull() == False] values_as_string = data_eurostat['value'].astype(str) string_values = values_as_string.str.split(' ', 1).str[0] string_values.replace(':', np.nan, inplace=True) subset_nan = string_values.isnull() data_eurostat['value'] = string_values data_eurostat['year'] = data_eurostat['year'].astype(int) data_eurostat['value'] = data_eurostat['value'].astype(float) data_eurostat.head() # In[11]: data_eurostat = data_eurostat.drop(['unit', 'product', 'indic_nrg', 'indic', 'description'], axis=1) data_eurostat = data_eurostat.rename(columns={'geo\\time': 'country', 'energy source': 'technology', 'value': 'capacity'}) # In[12]: data_eurostat['country'].replace({'UK': 'GB', 'EL': 'GR'}, inplace=True) drop_list = data_eurostat[data_eurostat['country'].isin(['EU28','EA19'])].index data_eurostat.drop(drop_list, inplace=True) by_columns = ['technology', 'year', 'country'] data_eurostat = pd.DataFrame(data_eurostat.groupby(by_columns)['capacity'].sum()) data_eurostat_isnull = data_eurostat['capacity'].isnull() == True data_eurostat.reset_index(inplace=True) data_eurostat.head() # In[13]: eurostat_pivot = data_eurostat.pivot_table(values='capacity', index=['country','year'], columns='technology') eurostat_pivot.head() # In[14]: eurostat_pivot['Differently categorized solar'] = 0 eurostat_pivot['Solar'] = eurostat_pivot[['Photovoltaics', 'Concentrated solar power']].sum(axis=1) eurostat_pivot['Differently categorized wind'] = eurostat_pivot['Wind'] bio_arr = ['Biomass and biogas', 'Other bioenergy and renewable waste'] eurostat_pivot['Bioenergy and renewable waste'] = eurostat_pivot[bio_arr].sum(axis=1) res_arr = ['Hydro', 'Wind', 'Solar', 'Geothermal', 'Marine', 'Bioenergy and renewable waste'] eurostat_pivot['Renewable energy sources'] = eurostat_pivot[res_arr].sum(axis=1) eurostat_pivot['Fossil fuels'] = eurostat_pivot['Fossil fuels'] - eurostat_pivot['Bioenergy and renewable waste'] eurostat_pivot['Differently categorized fossil fuels'] = eurostat_pivot['Fossil fuels']\ - eurostat_pivot['Non-renewable waste'] total_arr = ['Fossil fuels','Nuclear','Renewable energy sources'] eurostat_pivot['Total'] = eurostat_pivot[total_arr].sum(axis=1) eurostat_pivot.head() # In[15]: data_eurostat = eurostat_pivot.stack().reset_index().rename(columns={0: 'capacity'}) data_eurostat['source'] = 'EUROSTAT' data_eurostat['source_type'] = 'Statistical Office' data_eurostat['capacity_definition'] = 'Unknown' data_eurostat['type'] = 'Installed capacity in MW' data_eurostat['weblink'] = url_eurostat data_eurostat.head() # ## 3.3 ENTSO-E data # The ENTSO-E publishes annual data on national generation capacites in different specifications and formats. We use two relevant data sources from the ENTSOE-E, which comprises firstly statistical data within the [Data Portal (up to 2015)](https://www.entsoe.eu/data/data-portal/Pages/default.aspx) or [ENTSO-E Transparency Platform](https://transparency.entsoe.eu/), and secondly datasets compiled within the [ENTSO-E System Outlook & Adequacy Forecast (SO&AF)](https://www.entsoe.eu/outlooks/maf/Pages/default.aspx). The ENTSO-E Transparency Platform is currently not implemented as a data source for national generation capacities. # # The advantage of the ENTSO-E SO&AF is the higher granularity of the data with respect to the main fuel or technology. However, as the SO&AF provides a forecast on future system conditions in particular peak hours, the dataset also accounts for expected capacity changes throughout the years. Therefore, we only consider years which are closest to the publication year of the respective SO&AF. # ### 3.3.1 ENTSO-E Statistical Data # In the following, we use the statistical data available in the [Data Portal (up to 2015)](https://www.entsoe.eu/data/data-portal/Pages/default.aspx). # In[16]: url_entsoe = 'https://docstore.entsoe.eu/Documents/Publications/Statistics/NGC_2010-2015.xlsx' filepath_entsoe = func.downloadandcache(url_entsoe, 'Statistics.xls', os.path.join('ENTSO-E','Data Portal 2010-2015') ) data_entsoe_raw = pd.read_excel(filepath_entsoe) data_entsoe_raw.head() # In[17]: dict_energy_source = {'hydro': 'Hydro', 'of which storage': 'Reservoir', 'of which run of river': 'Run-of-river', 'of which pumped storage': 'Pumped storage', 'nuclear': 'Nuclear', 'of which wind': 'Wind', 'of which solar': 'Solar', 'of which biomass': 'Biomass and biogas', 'fossil_fuels': 'Fossil fuels', 'other': 'Other or unspecified energy sources', "Country": "country", 'fossil_fueals': 'Fossil fuels'} data_entsoe_raw.rename(columns=dict_energy_source, inplace=True) data_entsoe_raw.drop(columns='representativity', inplace=True) data_entsoe_raw.head() # In[18]: data_entsoe_raw['Differently categorized solar'] = data_entsoe_raw['Solar'] data_entsoe_raw['Differently categorized wind'] = data_entsoe_raw['Wind'] data_entsoe_raw['Bioenergy and renewable waste'] = data_entsoe_raw['Biomass and biogas'] data_entsoe_raw['Differently categorized fossil fuels'] = data_entsoe_raw['Fossil fuels'] data_entsoe_raw['Differently categorized hydro'] = ( data_entsoe_raw['Hydro'] - data_entsoe_raw['Run-of-river'] - data_entsoe_raw['Reservoir'] - data_entsoe_raw['Pumped storage']) data_entsoe_raw['Differently categorized renewable energy sources'] = ( data_entsoe_raw['renewable'] - data_entsoe_raw['Wind'] - data_entsoe_raw['Solar'] - data_entsoe_raw['Biomass and biogas']) data_entsoe_raw.drop(columns='renewable', inplace=True) data_entsoe_raw['Renewable energy sources'] = ( data_entsoe_raw['Hydro'] + data_entsoe_raw['Wind'] + data_entsoe_raw['Solar'] + data_entsoe_raw['Bioenergy and renewable waste'] + data_entsoe_raw['Differently categorized renewable energy sources']) data_entsoe_raw['Total'] = ( data_entsoe_raw['Renewable energy sources'] + data_entsoe_raw['Nuclear'] + data_entsoe_raw['Fossil fuels'] + data_entsoe_raw['Other or unspecified energy sources']) data_entsoe = pd.melt(data_entsoe_raw, id_vars=['country', 'year'], var_name='technology', value_name='capacity') data_entsoe.head() # In[19]: data_entsoe['country'].replace('NI', 'GB', inplace=True) # set negative capacities to zero data_entsoe.loc[data_entsoe['capacity'] < 0, 'capacity'] = 0 data_entsoe['source'] = 'ENTSO-E Data Portal' data_entsoe['source_type'] = 'Other association' data_entsoe['capacity_definition'] = 'Net capacity' data_entsoe['type'] = 'Installed capacity in MW' data_entsoe.head() # ### 3.3.2 ENTSO-E SO&AF data # In[20]: soafs = [soaf.SoafDataRaw('https://www.entsoe.eu/fileadmin/user_upload/_library/SDC/SOAF/SO_AF_2011_-_2025_.zip', 'SO_AF_2011_-_2025_.zip', 'SO&AF 2011 - 2025 Scenario B.xls', 2011), soaf.SoafDataRaw('https://www.entsoe.eu/fileadmin/user_upload/_library/SDC/SOAF/120705_SOAF_2012_Dataset.zip', '120705_SOAF_2012_Dataset.zip', 'SOAF 2012 Scenario B.xls', 2012), soaf.SoafDataRaw('https://www.entsoe.eu/fileadmin/user_upload/_library/publications/entsoe/So_AF_2013-2030/130403_SOAF_2013-2030_dataset.zip', '130403_SOAF_2013-2030_dataset.zip', 'ScB.xls', 2013), soaf.SoafDataRaw('https://www.entsoe.eu/Documents/SDC%20documents/SOAF/140602_SOAF%202014_dataset.zip', '140602_SOAF%202014_dataset.zip', 'ScB.xlsx', 2014), soaf.SoafDataRaw('https://www.entsoe.eu/Documents/Publications/SDC/data/SO_AF_2015_dataset.zip', 'SO_AF_2015_dataset.zip', os.path.join('SO&AF 2015 dataset', 'ScB_publication.xlsx'), 2016)] data_soaf = pd.concat([s.transformed_df for s in soafs]) # Correct that in the Soaf2015 datatset the year column is 2016 instead of 2015 data_soaf['year'].replace({2016 : 2015}, inplace=True) data_soaf.head() # In[21]: soaf_unstacked = func.unstackData(data_soaf) soaf_unstacked['Differently categorized solar'] = soaf_unstacked['Solar'] soaf_unstacked['Differently categorized wind'] = soaf_unstacked['Wind']\ - soaf_unstacked['Offshore']\ - soaf_unstacked['Onshore'] soaf_unstacked['Differently categorized hydro'] = soaf_unstacked['Hydro']\ - soaf_unstacked['Run-of-river']\ - soaf_unstacked['Reservoir including pumped storage'] soaf_unstacked['Bioenergy and renewable waste'] = soaf_unstacked['Biomass and biogas'] soaf_unstacked['Differently categorized renewable energy sources'] = ( soaf_unstacked['renewable'] - soaf_unstacked['Wind'] - soaf_unstacked['Solar'] - soaf_unstacked['Biomass and biogas']) soaf_unstacked.drop(columns='renewable', inplace=True) subtract_fossils_arr = ['Lignite','Hard coal','Oil','Natural gas','Mixed fossil fuels'] soaf_unstacked['Differently categorized fossil fuels'] = soaf_unstacked['Fossil fuels']\ - soaf_unstacked[subtract_fossils_arr].sum(axis=1) res_arr = ['Solar','Wind','Bioenergy and renewable waste','Hydro', 'Differently categorized renewable energy sources'] soaf_unstacked['Renewable energy sources'] = soaf_unstacked[res_arr].sum(axis=1) total_arr = ['Renewable energy sources','Fossil fuels','Nuclear', 'Other or unspecified energy sources'] soaf_unstacked['Total'] = soaf_unstacked[total_arr].sum(axis=1) soaf_unstacked.head() # In[22]: data_soaf = func.restackData(soaf_unstacked) data_soaf.loc[data_soaf['capacity'] < 0, 'capacity'] = 0 data_soaf['source'] = 'ENTSO-E SOAF' data_soaf['type'] = 'Installed capacity in MW' data_soaf['capacity_definition'] = 'Net capacity' data_soaf['source_type'] = 'Other association' data_soaf['weblink'] = url_entsoe data_soaf.head() # ### 3.3.3 ENTSO-E Transparency Platform # In[23]: # file pattern for the single years filenamepattern = '_1_InstalledGenerationCapacityAggregated.csv' list_of_data_tables = [] # list to append # iterate over the years from 2015 to 2020 for i in range(2015,2021): filepath = os.path.join('input', 'ENTSO-E', 'Transparency', 'InstalledGenerationCapacityAggregated', str(i) + filenamepattern) list_of_data_tables.append(pd.read_csv(filepath, delimiter="\t", encoding = "UTF-16")) # merge the datasets of the single of files into one pandas dataframe data_transparency = pd.concat(list_of_data_tables, ignore_index=True) data_transparency.head() # In[24]: # rename columns according to the opsd standards data_transparency.rename(columns={'ProductionType': 'technology', 'AggregatedInstalledCapacity': 'capacity', 'MapCode': 'country', 'Year': 'year'}, inplace=True) # drop non relevant columns data_transparency = data_transparency.filter(items=['technology','capacity','country','year'], axis=1) # drop countries that are not part of opsd data_transparency = data_transparency[data_transparency['country'].isin(data_opsd.country.unique())] data_transparency.head() # In[25]: # adapt energy source notation dict_energy_source = {'Biomass': 'Biomass and biogas', 'Fossil Brown coal/Lignite': 'Lignite', 'Fossil Coal-derived gas': 'Mixed fossil fuels', 'Fossil Gas': 'Natural gas', 'Fossil Hard coal': 'Hard coal', 'Fossil Oil': 'Oil', 'Fossil Oil shale': 'Oil', 'Fossil Peat': 'Other fossil fuels', 'Hydro Pumped Storage': 'Pumped storage', 'Hydro Run-of-river and poundage': 'Run-of-river', 'Hydro Water Reservoir': 'Reservoir', 'Other': 'Other or unspecified energy sources', 'Other renewable': 'Differently categorized renewable energy sources', 'Waste': 'Other bioenergy and renewable waste', 'Wind Offshore': 'Offshore', 'Wind Onshore': 'Onshore', ' ': np.nan} data_transparency['technology'].replace(dict_energy_source, inplace=True) data_transparency.head() # In[26]: # add missing categories transparency_pivot = data_transparency.pivot_table(values='capacity', index=['country','year'], columns='technology') # technology level transparency_pivot['Differently categorized solar'] = transparency_pivot['Solar'] transparency_pivot['Differently categorized natural gas'] = transparency_pivot['Natural gas'] transparency_pivot['Non-renewable waste'] = 0 transparency_pivot['Differently categorized fossil fuels'] = 0 # level 3 hydro_arr = ['Pumped storage', 'Reservoir', 'Run-of-river'] transparency_pivot['Hydro'] = transparency_pivot[hydro_arr].sum(axis=1) wind_arr = ['Onshore', 'Offshore'] transparency_pivot['Wind'] = transparency_pivot[wind_arr].sum(axis=1) # level 2 bio_arr = ['Biomass and biogas', 'Other bioenergy and renewable waste'] transparency_pivot['Bioenergy and renewable waste'] = transparency_pivot[bio_arr].sum(axis=1) #level 1 res_arr = ['Hydro', 'Wind', 'Solar', 'Geothermal', 'Marine', 'Bioenergy and renewable waste', 'Differently categorized renewable energy sources'] transparency_pivot['Renewable energy sources'] = transparency_pivot[res_arr].sum(axis=1) fossil_arr = ['Lignite', 'Hard coal', 'Oil', 'Natural gas', 'Mixed fossil fuels', 'Other fossil fuels'] transparency_pivot['Fossil fuels'] = transparency_pivot[fossil_arr].sum(axis=1) # level 0 total_arr = ['Fossil fuels','Nuclear','Renewable energy sources', 'Other or unspecified energy sources'] transparency_pivot['Total'] = transparency_pivot[total_arr].sum(axis=1) transparency_pivot.reset_index(inplace=True) transparency_pivot.head() # In[27]: data_transparency = pd.melt(transparency_pivot, id_vars=['country', 'year'], var_name='technology', value_name='capacity') data_transparency = data_transparency.loc[data_transparency["year"] < 2020, :] data_transparency['source'] = 'ENTSO-E Transparency Platform' data_transparency['source_type'] = 'Other association' data_transparency['capacity_definition'] = 'Net capacity' data_transparency['type'] = 'Installed capacity in MW' data_transparency['weblink'] = ('https://transparency.entsoe.eu/generation' '/r2/installedGenerationCapacityAggregation/show') data_transparency.head() # ### 3.3.4 ENTSO-E Power Statistics # In[28]: row_of_year = {2014: 9, 2015: 53, 2016: 97, 2017: 141, 2018: 185} dataframes = [] for year, row in row_of_year.items(): # read the dataframe for each year power_statistics_raw = pd.read_excel(os.path.join('input', 'ENTSO-E', 'Power Statistics', 'NGC.xlsx'), header=[0,1], sheet_name='NGC', skiprows=row, nrows=42) # drop non relevant columns power_statistics_raw.drop(columns='Coverage ratio in %', level=1, inplace=True) power_statistics_raw.drop(columns=['Unnamed: 1_level_1','Unnamed: 2_level_1','Unnamed: 3_level_1','Unnamed: 4_level_1'], level=1, inplace=True) # get rid of multi index df = power_statistics_raw.set_index(year).stack().reset_index().drop('level_1', axis=1) # remove leftovers of multi index in the index column df["technology"] = df[year].apply(lambda x: x[0]) df.drop(columns=year, inplace=True) # stack df to the opsd standard format stacked_df = df.melt(id_vars='technology', var_name='country', value_name='capacity') # add information about the year stacked_df['year'] = year # append to the main list of dataframes dataframes.append(stacked_df) power_statistics = pd.concat(dataframes) power_statistics.head() # In[29]: # drop countries that are not covered in opsd opsd_countries = data_opsd.country.unique() drop_list_country = power_statistics.loc[~power_statistics['country'].isin(opsd_countries)].index.to_list() power_statistics.drop(drop_list_country, inplace=True) # technology classes to be dropped tech_to_drop = ['Non-Renewable', 'Fossil fuels', 'Renewable','Non-renwable hydro', 'Total Waste', 'Bio', 'Renewable Hydro', 'Comments', 'Total NGC'] drop_list_tech = power_statistics.loc[power_statistics['technology'].isin(tech_to_drop)].index.to_list() power_statistics.drop(drop_list_tech, inplace=True) # replace string with values that can be used in math operations power_statistics['capacity'].replace(to_replace='Not Expected', value=0, inplace=True) power_statistics['capacity'].replace(to_replace='Not Available', value=np.nan, inplace=True) power_statistics.head() # In[30]: # Not included because already categorized in OPSD standard: # Nuclear, Solar, Geothermal, Wind dict_energy_source = {'Of which hydro pure pumped storage':'Pumped storage', 'Of which Hydro mixed pumped storage (non renewable part)':'Pumped storage', 'Of which Fossil Brown coal/Lignite':'Lignite', 'Of which Fossil Coal-derived gas':'Differently categorized fossil fuels', 'Of which Fossil Gas':'Natural gas', 'Of which Fossil Hard coal':'Hard coal', 'Of which Fossil Oil':'Oil', 'Of which Fossil Oil shale':'Oil', 'Of which Fossil Peat':'Differently categorized fossil fuels', 'Of which Mixed fuels':'Mixed fossil fuels', 'Of which Other fossil fuels':'Other fossil fuels', 'Non-renewable Waste':'Non-renewable waste', 'Other non-renewable':'Differently categorized fossil fuels', 'Of which Wind offshore':'Offshore', 'Of which Wind onshore':'Onshore', 'Of which Solar PV':'Photovoltaics', 'Of which Solar Thermal':'Differently categorized solar', 'Of which Biomass':'Biomass and biogas', 'Of which Biogas':'Biomass and biogas', 'Renewable Waste':'Other bioenergy and renewable waste', 'Of which Hydro Pure storage':'Reservoir', 'Of which Hydro Run-of-river and pondage':'Run-of-river', 'Of which Hydro mixed pumped storage (renewable part)':'Pumped storage', 'Of which Hydro Marine (tidal/wave)':'Marine', 'Other renewable (not listed)':'Differently categorized renewable energy sources', 'Non identified (other not listed)':'Other or unspecified energy sources', 'Total Hydro':'Hydro'} power_statistics["technology"].replace(dict_energy_source, inplace=True) power_statistics.head() # In[31]: powerstats_pivot = power_statistics.pivot_table(values='capacity', index=['country','year'], columns='technology').reset_index() powerstats_pivot.head() # In[32]: # technology level powerstats_pivot['Differently categorized natural gas'] = powerstats_pivot['Natural gas'] # level 2 powerstats_pivot['Bioenergy and renewable waste'] = ( powerstats_pivot['Biomass and biogas'] + powerstats_pivot['Other bioenergy and renewable waste']) #level 1 fossil_techs = ['Lignite', 'Hard coal', 'Oil', 'Natural gas', 'Non-renewable waste', 'Mixed fossil fuels', 'Other fossil fuels', 'Differently categorized fossil fuels'] powerstats_pivot['Fossil fuels'] = powerstats_pivot[fossil_techs].sum(axis=1) res_tech = ['Hydro', 'Wind', 'Solar', 'Geothermal', 'Marine', 'Bioenergy and renewable waste', 'Differently categorized renewable energy sources'] powerstats_pivot['Renewable energy sources'] = powerstats_pivot[res_tech].sum(axis=1) total_arr = ['Fossil fuels','Nuclear','Renewable energy sources'] powerstats_pivot['Total'] = powerstats_pivot[total_arr].sum(axis=1) powerstats_pivot.head() # In[33]: data_power_statistics = powerstats_pivot.melt(id_vars=['country', 'year'], var_name='technology', value_name='capacity') data_power_statistics['source'] = 'ENTSO-E Power Statistics' data_power_statistics['source_type'] = 'Other association' data_power_statistics['capacity_definition'] = 'Net capacity' data_power_statistics['type'] = 'Installed capacity in MW' data_power_statistics['weblink'] = ('https://www.entsoe.eu/data/' 'power-stats/net-gen-capacity/') data_power_statistics.head() # ## 3.4 Merge data sources # In[34]: dataframes = [data_opsd, data_eurostat, data_soaf, data_entsoe, data_transparency, data_power_statistics] data = pd.concat(dataframes, sort=False) data['comment'] = data['comment'].fillna('').astype(str) col_order = ['technology', 'source', 'source_type', 'weblink', 'year', 'type', 'country', 'capacity_definition', 'capacity', 'comment'] data = data[col_order] energy_source_mapping = pd.read_csv(os.path.join('input','energy_source_mapping.csv'), index_col ='name') energy_source_mapping.replace({0: False, 1: True}, inplace=True) data = data.merge(energy_source_mapping, left_on='technology', right_index=True, how='left') new_level_names = {"Level 0": "energy_source_level_0", "Level 1": "energy_source_level_1", "Level 2": "energy_source_level_2", "Level 3": "energy_source_level_3", "Technology level": "technology_level"} data.rename(columns=new_level_names, inplace=True) data.head() # # 4. Convert stacked data to crosstable format # In[35]: cols = ['technology', 'source', 'source_type', 'weblink','year', 'type', 'country', 'capacity_definition', 'capacity'] data_crosstable = pd.pivot_table(data[cols], index=['technology'], columns=['country', 'type', 'year', 'source', 'source_type', 'weblink', 'capacity_definition'], values='capacity') # Apply initial ordering of technologies data_crosstable = data_crosstable.reindex(technology_order) # Delete index naming data_crosstable.index.name = None data_crosstable.columns.names = ('Country (ISO code)', 'Type of data', 'Year', 'Source', 'Type of source', 'Weblink', 'Capacity definition (net, gross, unknown)') data_crosstable.head() # In[36]: energylevels_table = energylevels_raw[6:] energylevels_table.columns = pd.MultiIndex.from_arrays(energylevels_raw[:6].values, names=['country', 'type', 'year', 'source', 'source_type', 'capacity_definition' ]) energylevels_table = energylevels_table.reset_index() energylevels_table['technology'] = energylevels_table['technology'].str.replace('- ', '') energylevels_table = energylevels_table.set_index('technology') # Delete index naming energylevels_table.index.name = None energylevels_table.columns.names = ('Country (ISO code)', 'Description', None, None, None, 'Level') energylevels_table.head() # # 5. Output # Delete downloaded zip files # In[37]: for root, dirs, files in os.walk("download"): for file in files: item = os.path.join(root, file) if item.endswith(".zip"): os.remove(item) print("Deleted: " + item) # Copy input files # In[38]: orig_data_path = os.path.join('output', 'original_data') shutil.rmtree(orig_data_path, ignore_errors=True) func.copydir(os.path.join('input'), orig_data_path) func.copydir(os.path.join('download'), orig_data_path) # ## 5.1 Write results to file # Write stacked data to formats: csv, xls and sql. # In[39]: # Write the result to file data.to_csv(os.path.join('output', 'national_generation_capacity_stacked.csv'), encoding='utf-8', index_label='ID') # Write the results to excel file data.to_excel(os.path.join('output', 'national_generation_capacity_stacked.xlsx'), sheet_name='output', index_label='ID') # Write the results to sql database data.to_sql('national_generation_capacity_stacked', sqlite3.connect(os.path.join('output', 'national_generation_capacity.sqlite')), if_exists="replace", index_label='ID') # Write data in human readable form to excel. # In[40]: # Write crosstable data to excel file writer = pd.ExcelWriter(os.path.join('output', 'national_generation_capacity.xlsx')) data_crosstable.to_excel(writer, sheet_name='output') energylevels_table.to_excel(writer, sheet_name='technology levels') writer.save() # ## 5.2 Formatting of Excel tables # In[41]: outputxls = openpyxl.load_workbook(os.path.join('output', 'national_generation_capacity.xlsx')) ws1 = outputxls['output'] ws2 = outputxls['technology levels'] ws1_rows, ws1_cols = data_crosstable.shape amount_cols = ws1_cols + 1 # correct 0 index ws1.column_dimensions['A'].width = 50 ws2.column_dimensions['A'].width = 50 # In[42]: blackfont = Font(color=colors.BLACK, italic=False, bold=False) blackfontitalic = Font(color=colors.BLACK, italic=True, bold=False) blackfontbold = Font(color=colors.BLACK, italic=False, bold=True) align0 = Alignment(horizontal='left', indent=0) align1 = Alignment(horizontal='left', indent=1) align2 = Alignment(horizontal='left', indent=2) # darkest grey colour = "{0:02X}{1:02X}{2:02X}".format(166, 166, 166) grey166 = PatternFill(fgColor=colour, bgColor=colour, patternType="solid") # darker grey colour = "{0:02X}{1:02X}{2:02X}".format(191, 191, 191) grey191 = PatternFill(fgColor=colour, bgColor=colour, patternType="solid") # lighter grey colour = "{0:02X}{1:02X}{2:02X}".format(217, 217, 217) grey217 = PatternFill(fgColor=colour, bgColor=colour, patternType="solid") # lightest grey colour = "{0:02X}{1:02X}{2:02X}".format(242, 242, 242) grey242 = PatternFill(fgColor=colour, bgColor=colour, patternType="solid") # In[43]: for col in range(2, amount_cols+1): colname = openpyxl.utils.cell.get_column_letter(col) ws1.column_dimensions[colname].width = 16 for col in range(1, amount_cols+1): # format column name block for row in range(2,8): ws1.cell(row=row, column=col).font = blackfont # format cells that contain the values for row in range(9, 48): ws1.cell(row=row, column=col).fill = grey242 ws1.cell(row=row, column=1).font = blackfontitalic ws1.cell(row=row, column=1).alignment = align2 # format row 'Total' with dark grey ws1.cell(row=47, column=col).fill = grey166 ws1.cell(row=47, column=col).font = blackfontbold # format level 1 for row in [9, 22, 23, 46]: ws1.cell(row=row, column=col).fill = grey191 ws1.cell(row=row, column=col).font = blackfontbold ws1.cell(row=row, column=1).alignment = align0 # format level 2 for row in [10, 11, 12, 13, 18, 19, 20, 21, 24, 31, 35, 39, 40, 41, 45]: ws1.cell(row=row, column=col).fill = grey217 ws1.cell(row=row, column=1).alignment = align1 ws1.cell(row=47, column=1).alignment = align0 ws1.freeze_panes = ws1['B8'] #freeze first column and header rows # In[44]: # do the same for the second worksheet 'technology levels' for col in range(1, 7): colname = get_column_letter(col + 1) ws2.column_dimensions[colname].width = 25 for row in range(2, 8): ws2.cell(row=row, column=col).font = blackfont for row in range(9, 48): ws2.cell(row=row, column=col).fill = grey242 ws2.cell(row=row, column=1).font = blackfontitalic ws2.cell(row=row, column=1).alignment = align2 # format row 'Total' with dark grey ws2.cell(row=46, column=col).fill = grey166 # format level 1 for row in [8, 21, 22, 45]: ws2.cell(row=row, column=col).fill = grey191 ws2.cell(row=row, column=col).font = blackfontbold ws2.cell(row=row, column=1).font = blackfontbold ws2.cell(row=row, column=1).alignment = align0 # format level 2 for row in [9, 10, 11, 12, 17, 18, 19, 20, 23, 30, 34, 38, 39, 40, 44]: ws2.cell(row=row, column=col).fill = grey217 ws2.cell(row=row, column=1).alignment = align1 ws2.cell(row=46, column=1).alignment = align0 # In[45]: additional_notes = openpyxl.load_workbook(os.path.join('input', 'National_Generation_Capacities.xlsx'))['Additional notes'] # copy additional notes to output file for col in range(1, 3): for row in range(1, 10): add_notes_value = additional_notes.cell(row=row, column=col).value ws1.cell(row=row + 50, column=col).value = add_notes_value ws1.cell(row=51, column=1).font = blackfontbold ws1.cell(row=row + 51, column=1).font = blackfontitalic # In[46]: outputxls.save(os.path.join('output', 'national_generation_capacity.xlsx')) # ## 5.3 Write checksums # In[47]: files = ['national_generation_capacity.xlsx', 'national_generation_capacity_stacked.csv', 'national_generation_capacity_stacked.xlsx', 'national_generation_capacity.sqlite'] hash_dict = {} filesize_dict = {} with open('checksums.txt', 'w') as f: for file_name in files: path = os.path.join('output', file_name) file_hash = func.get_sha_hash(path) hash_dict[file_name] = file_hash filesize_dict[file_name] = os.path.getsize(path) f.write('{},{}\n'.format(file_name, file_hash)) # # 6. Documentation of the data package # We document the data packages meta data in the specific format JSON as proposed by the Open Knowledge Foundation. See the Frictionless Data project by OKFN (http://data.okfn.org/) and the Data Package specifications (http://dataprotocols.org/data-packages/) for more details. # # In order to keep the notebook more readable, we first formulate the metadata in the human-readable YAML format using a multi-line string. We then parse the string into a Python dictionary and save that to disk as a JSON file. # In[48]: with open(os.path.join('input', 'metadata.yml'), 'r') as f: metadata = yaml.load(f.read(), Loader=yaml.BaseLoader) metadata['resources'][0]['hash'] = hash_dict['national_generation_capacity.xlsx'] metadata['resources'][1]['hash'] = hash_dict['national_generation_capacity_stacked.csv'] metadata['resources'][0]['bytes'] = filesize_dict['national_generation_capacity.xlsx'] metadata['resources'][1]['bytes'] = filesize_dict['national_generation_capacity_stacked.csv'] # In[49]: datapackage_json = json.dumps(metadata, indent=4, separators=(',', ': ')) # Write the information of the metadata with open(os.path.join('output', 'datapackage.json'), 'w') as f: f.write(datapackage_json) # End of script.