#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Pull in shared imports (pandas as pd, numpy as np, requests, ...) and the
# ISO-code helpers (CODE_GENERATOR_ISO3, COUNTRY_NAME_GENERATOR_FROM_ISO3).
get_ipython().run_line_magic('run', 'Requirements.ipynb')
get_ipython().run_line_magic('run', 'Country_ISO_Codes.ipynb')

# # Sources used in Greenhouse Data

# All of these inventories have been homogenized (as much as possible/recommended)
# by making sure they all include the following characteristics:
# - Country ISO codes (ISO-3). This is useful to easily create emission figures
#   for all countries included in an inventory.
# - Emission units in CO2 or CO2-equivalent (CO2eq). Some inventories give native
#   units of gases (for example kt of CH4), so columns were added for easy
#   conversion using GWPs.
#
# Furthermore, thank you to the following people:
# - Mauricio Foronda, for all the coding help
# - Nicolás Guarín-Zapata (nicoguaro in github) for the neon style in matplotlib,
#   and for inspiring my NEONIZE() function.
# - My thesis supervisors: Sonia Seneviratne, Anthony Patt, Jonas Schwabb
# - Open Climate Data, from which I took the UNFCCC submission inventories.

# # BP
#
# INFO:
# - BP. (2021). Methodology for calculating CO2 emissions from energy use.
#   https://www.ipcc-nggip.iges.or.jp/public/2006gl/vol2.html
# - Downloaded from "https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/xlsx/energy-economics/statistical-review/bp-stats-review-2021-all-data.xlsx"

# In[4]:

print( ">>>>>>> BP <<<<<<<<")

# download file (kept commented for reproducibility; the xlsx is already in raw_data)
# link_bp = "https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/xlsx/energy-economics/statistical-review/bp-stats-review-2021-all-data.xlsx"
# resp = requests.get(link_bp)
# output = open('raw_data/BP/bp-stats-review-2021-all-data.xlsx', "wb")
# output.write(resp.content)
# output.close()

# read file (sheet 4 of the statistical review; data header is on row 2)
df_bp = pd.read_excel(io="../raw_data/BP/bp-stats-review-2021-all-data.xlsx", sheet_name=4, header=2)

# clean: drop footnote/blank rows (they have no 2020 value)
df_bp = df_bp[df_bp[2020].notnull()]
df_bp = df_bp.rename(columns={"Million tonnes of carbon dioxide": "name"})
df_bp = df_bp.drop(["2020.1", "2009-19", "2020.2"], axis=1)  # deleting last three columns (growth/share columns)
df_bp.loc[:, 1965:] = df_bp.loc[:, 1965:].astype(float)

# add unit
df_bp.insert(1, "unit", "MtCO2")

# reindex
df_bp = df_bp.reset_index().drop("index", axis=1)

# CODE_GENERATOR_ISO3
df_bp.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_bp, show_missing=True, show_duplicates_nan=True)

# -----------------------------------------------------------------------------------------------------
# global?
df_bp_global = df_bp[df_bp.name == "Total World"].iloc[0, :]
df_bp_global.to_csv("../clean_data/global_subsets/BP_global.csv")

# ------------------------------------------------------------------------------------------------------------------
# BUGFIX: the full country table was previously written to "../clean_data/BP_global.csv"
# while the very next statement read "../clean_data/BP.csv", which nothing had produced.
# Every other section writes and re-reads the same file; do the same here.
df_bp.to_csv("../clean_data/BP.csv", index=None)
df_bp = pd.read_csv("../clean_data/BP.csv")
df_bp

# # CDIAC
#
# INFO:
# - Carbon Dioxide Information Analysis Center
# - CDIAC. (2017). Global, Regional, and National Fossil-Fuel CO2 Emissions.
#   https://cdiac.ess-dive.lbl.gov/trends/emis/overview_2013.html
# - DOI: 10.3334/CDIAC/00001_V2017
# - Authors: Tom Boden and Bob Andres (Oak Ridge National Laboratory);
#   Gregg Marland (Appalachian State University)
#
# NOTES:
# - Units: given in carbon units, not carbon dioxide (converted by multiplying values by 44/12)

# In[13]:

print( ">>>>>>> CDIAC <<<<<<<<")

# download file (context manager guarantees the handle is closed even on error)
link_cdiac = "http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv"
resp = requests.get(link_cdiac)
with open('../raw_data/CDIAC/nation.1751_2014.csv', "wb") as output:
    output.write(resp.content)

# read file
df_cdiac = pd.read_csv("../raw_data/CDIAC/nation.1751_2014.csv")

# clean: first three rows are continuation of the header text
df_cdiac = df_cdiac.drop([0, 1, 2])
df_cdiac = df_cdiac.rename(columns={"Nation": "name"})
df_cdiac = df_cdiac.replace(".", np.nan)  # empty cells are marked with "." -> NaN
df_cdiac.iloc[:, 2:] = df_cdiac.iloc[:, 2:].astype(float, errors="raise")

# CONVERT Carbon units to Carbon Dioxide units; the /1000 turns
# "thousand metric tons" (kt) into Mt at the same time.
convert_C_to_CO2 = 44 / 12
df_cdiac.iloc[:, 2:] = df_cdiac.iloc[:, 2:] * convert_C_to_CO2 / 1000

# BUGFIX: the per-capita column is in plain metric tons of C (not thousands),
# so the blanket /1000 above was wrong by a factor of 1000 for it and
# contradicted the "(tCO2)" label given to this column later. Undo the /1000
# for that single column (the 44/12 C->CO2 factor still applies).
_percap_col = "Per capita CO2 emissions (metric tons of carbon)"
df_cdiac[_percap_col] = df_cdiac[_percap_col] * 1000

# convert years to integers?
df_cdiac.Year = df_cdiac.Year.astype(int)

# add unit (and change column names: the kt->Mt division above is reflected here)
df_cdiac.insert(2, "unit", "MtCO2")
df_cdiac = df_cdiac.rename(columns={
    "Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)":
        "Total CO2 emissions from fossil-fuels and cement production (MtCO2)",
    "Per capita CO2 emissions (metric tons of carbon)":
        "Per capita CO2 emissions (tCO2)",
})

# reindex
df_cdiac = df_cdiac.reset_index().drop("index", axis=1)

# CODE_GENERATOR_ISO3
df_cdiac.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_cdiac, show_missing=True, show_duplicates_nan=True)

# global (CDIAC ships no "World" row, so sum over all countries per year)
df_cdiac_global = df_cdiac.groupby(["Year", "unit"]).sum().reset_index()
df_cdiac_global.to_csv("../clean_data/global_subsets/CDIAC_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_cdiac.to_csv("../clean_data/CDIAC.csv", index=None)
df_cdiac = pd.read_csv("../clean_data/CDIAC.csv")
df_cdiac_global

# # EIA
#
# INFO:
# - Energy Information Administration of the United States of America
# - Data downloaded from website of eia.gov > international > other statistics >
#   emissions by fuel type, and manually cleaned in excel, saved as EIA_raw_processed_all.csv
# - https://www.eia.gov/international/data/world/other-statistics/emissions-by-fuel

# In[16]:

print( ">>>>>>> EIA <<<<<<<<")

# read file
df_eia = pd.read_csv("../raw_data/EIA/EIA_raw_processed_all.csv")

# clean
df_eia = df_eia.drop("Unnamed: 0", axis=1)
df_eia = df_eia.rename({'Unnamed: 1': 'name', 'Unnamed: 2': 'fuel'}, axis=1)

# new method: "--" marks missing values in the raw export
df_eia = df_eia.replace("--", np.nan)
df_eia.loc[:, "1949":] = df_eia.loc[:, "1949":].astype(float)
# BUGFIX: removed `df_eia.loc[:,"1949":].columns = ...astype(int)` — assigning
# `.columns` on a temporary slice never touches df_eia (silent no-op), and the
# string year labels are what the "1949": slices above rely on anyway.
df_eia.fuel = df_eia.fuel.replace({
    " CO2 emissions (MMtonnes CO2)": "Total CO2 emissions",
    " Coal and coke (MMtonnes CO2)": "Coal and coke",
    " Consumed natural gas (MMtonnes CO2)": "Consumed natural gas",
    " Petroleum and other liquids (MMtonnes CO2)": "Petroleum and other liquids",
})

# delete all empty unnecessary rows: the raw sheet repeats two header/label rows
# at the start of every 6-row country group (231 groups).
for x in range(231):
    df_eia = df_eia.drop([6 * x])
    df_eia = df_eia.drop([6 * x + 1])

# add unit
df_eia.insert(2, "unit", "MtCO2")

# reindex
df_eia = df_eia.reset_index().drop("index", axis=1)

# CODE_GENERATOR_ISO3
df_eia.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_eia, show_missing=True, show_duplicates_nan=True)

# global?
year = 2018  # NOTE(review): unused below; kept in case later notebook cells reference it
df_eia_global = df_eia[df_eia.name == "World"]
df_eia_global.to_csv("../clean_data/global_subsets/EIA_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_eia.to_csv("../clean_data/EIA.csv", index=None)
df_eia = pd.read_csv("../clean_data/EIA.csv")
df_eia_global

# # IEA
#
# INFO:
# - International Energy Agency's GHG emissions from energy *highlights*
# - downloaded from https://www.iea.org/data-and-statistics/data-product/greenhouse-gas-emissions-from-energy-highlights
#
# NOTES:
# - this dataset includes all greenhouse gases, but only shows quantities in carbon dioxide equivalents
# - should I delete bunkers in the main df? (currently they are included)
# - Sectoral data by country is available in raw_data/IEA/cleaning_products folder

# In[17]:

print( ">>>>>>> IEA <<<<<<<<")

# read excel file: sheet_name=None loads every sheet into a dict of DataFrames
excel_df_dict = pd.read_excel("../raw_data/IEA/GHGHighlights.XLS", sheet_name=None, header=3)

# Cleaning 1: of the whole excel (wow), making individual dataframes.
# (A redundant `sheet_list = excel_df_dict.keys()` that was immediately
# overwritten has been removed.)
sheet_list = ['CONTENTS', 'GHG Energy', 'GHG Fugi', 'GHG FC', 'GHG FC - Coal',
              'GHG FC - Oil', 'GHG FC - Gas', 'CO2 MARBUNK', 'CO2 AVBUNK',
              'SECTOR', 'SECTOREH', 'CO2-TES', 'CO2-GDP', 'CO2-GDP PPP',
              'CO2-POP', 'SECTPOP', 'KAYA', 'TIMEEXTENDED', 'GEO COVERAGE']
# NOTE(review): country_list is unused below; kept for reference/debugging.
country_list = excel_df_dict["GHG FC"][20:189]["million tonnes of CO2 eq"].array

# the two sheet families differ in which junk rows must be dropped
sheet_list_24line = ['GHG FC', 'GHG FC - Coal', 'GHG FC - Oil', 'GHG FC - Gas']
sheet_list_22line = ['GHG Energy', 'GHG Fugi', 'CO2 MARBUNK', 'CO2 AVBUNK', 'SECTOR', 'SECTOREH']
df_list24 = ["total_fuel_combustion", "coal", "oil", "gas"]
df_list22 = ["energy", "fugitive", "marine_bunkers", "int_aviation_bunkers", "sectors_2019", "sectors_alt_2019"]

for i, j in enumerate(sheet_list_24line):
    # print(j)
    df = excel_df_dict[j]
    df = df.drop([0, 2, 10, 12, 17, 18, 19])  # region/sub-header rows
    df = df.iloc[:-3, :]                      # trailing footnote rows
    df.insert(1, "type", "{}".format(df_list24[i]), allow_duplicates=True)
    df.insert(2, "unit", "MtCO2eq", allow_duplicates=True)
    df = df.replace("..", np.nan)
    df = df.replace("x", np.nan)
    df.iloc[:, 3:] = df.iloc[:, 3:].astype(float)
    df = df.rename(columns={"million tonnes of CO2 eq": "name"})
    df = df[df.name.notna()]
    df = df.reset_index().drop("index", axis=1)
    df.to_csv("../raw_data/IEA/Cleaning_products/IEA_{}.csv".format(df_list24[i]))
    # display(df)

for i, j in enumerate(sheet_list_22line):
    # print(j)
    df = excel_df_dict[j]
    df = df.drop([0, 2, 10, 12, 15, 16, 17, ])
    df = df.iloc[:-3, :]
    df.insert(1, "type", "{}".format(df_list22[i]), allow_duplicates=True)
    df.insert(2, "unit", "MtCO2eq", allow_duplicates=True)
    df = df.replace("..", np.nan)
    df = df.replace("x", np.nan)
    df.iloc[:, 3:] = df.iloc[:, 3:].astype(float)
    # only one of these renames matches per sheet; the others are no-ops
    df = df.rename(columns={"kilo tonnes of CO2 eq": "name"})  # for fugitive!
    df = df.rename(columns={"million tonnes of CO2": "name"})  # for bunkers
    df = df.rename(columns={"million tonnes of CO2 eq": "name"})
    df = df[df.name.notna()]
    df = df.reset_index().drop("index", axis=1)
    df.to_csv("../raw_data/IEA/Cleaning_products/IEA_{}.csv".format(df_list22[i]))
    # display(df)

# Cleaning 2: Stitch everything together!
df_iea = pd.concat([
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_energy.csv"),
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_total_fuel_combustion.csv"),
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_coal.csv"),
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_oil.csv"),
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_gas.csv"),
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_marine_bunkers.csv"),        # should i keep these last three?
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_int_aviation_bunkers.csv"),  # should i keep these last three?
    pd.read_csv("../raw_data/IEA/Cleaning_products/IEA_fugitive.csv"),              # should i keep these last three?
])

# Cleaning 3: reset index and fix fugitive (kilo tonnes -> Mt, /1000)
df_iea = df_iea.drop("Unnamed: 0", axis=1).reset_index().drop("index", axis=1)
df_iea.loc[df_iea.type == "fugitive", "1971":] = df_iea.loc[df_iea.type == "fugitive", "1971":] / 1000

# CODE_GENERATOR_ISO3
df_iea.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_iea, show_missing=True, show_duplicates_nan=True)

# global
df_iea_global = df_iea[df_iea.name == 'World']
df_iea_global.to_csv("../clean_data/global_subsets/IEA_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_iea.to_csv("../clean_data/IEA.csv", index=None)
df_iea = pd.read_csv("../clean_data/IEA.csv")
df_iea_global

# # GCP
#
# INFO:
# - Global Carbon Project's main dataset: production/territorial-based CO2 emissions
# - Reference of the full global carbon budget 2021: Pierre Friedlingstein,
#   Matthew W. Jones, Michael O'Sullivan, Robbie M. Andrew, Dorothee C. E. Bakker,
#   Judith Hauck, Corinne Le Quéré, Glen P. Peters, Wouter Peters, Julia Pongratz,
#   Stephen Sitch, Josep G. Canadell, Philippe Ciais, Rob B. Jackson, Simone R. Alin,
#   Peter Anthoni, Nicholas R. Bates, Meike Becker, Nicolas Bellouin, Laurent Bopp,
#   Thi Tuyet Trang Chau, Frédéric Chevallier, Louise P. Chini, Margot Cronin,
#   Kim I. Currie, Bertrand Decharme, Laique M. Djeutchouang, Xinyu Dou, Wiley Evans,
#   Richard A. Feely, Liang Feng, Thomas Gasser, Dennis Gilfillan, Thanos Gkritzalis,
#   Giacomo Grassi, Luke Gregor, Nicolas Gruber, Özgür Gürses, Ian Harris,
#   Richard A. Houghton, George C. Hurtt, Yosuke Iida, Tatiana Ilyina, Ingrid T. Luijkx,
#   Atul Jain, Steve D. Jones, Etsushi Kato, Daniel Kennedy, Kees Klein Goldewijk,
#   Jürgen Knauer, Jan Ivar Korsbakken, Arne Körtzinger, Peter Landschützer,
#   Siv K. Lauvset, Nathalie Lefèvre, Sebastian Lienert, Junjie Liu, Gregg Marland,
#   Patrick C. McGuire, Joe R. Melton, David R. Munro, Julia E.M.S. Nabel,
#   Shin-Ichiro Nakaoka, Yosuke Niwa, Tsuneo Ono, Denis Pierrot, Benjamin Poulter,
#   Gregor Rehder, Laure Resplandy, Eddy Robertson, Christian Rödenbeck,
#   Thais M. Rosan, Jörg Schwinger, Clemens Schwingshackl, Roland Séférian,
#   Adrienne J. Sutton, Colm Sweeney, Toste Tanhua, Pieter P. Tans, Hanqin Tian,
#   Bronte Tilbrook, Francesco Tubiello, Guido van der Werf, Nicolas Vuichard,
#   Chisato Wada, Rik Wanninkhof, Andrew J. Watson, David Willis, Andrew J. Wiltshire,
#   Wenping Yuan, Chao Yue, Xu Yue, Sönke Zaehle, Jiye Zeng.
#   Global Carbon Budget 2021, Earth Syst. Sci. Data, 2021. https://doi.org/XXXXXXX
#
# NOTES:
# - Given in carbon units, converted to carbon dioxide

# In[20]:

print( ">>>>>>> GCP <<<<<<<<")

# read data (sheet 1 = territorial emissions; countries are columns, so transpose)
df_gcp = pd.read_excel("../raw_data/GCP/National_Carbon_Emissions_2021v0.4.xlsx", 1, header=11, index_col=0)
df_gcp = df_gcp.T

# clean
df_gcp = df_gcp.reset_index().rename({"index": "name"}, axis=1)

# delete "statistical difference" row
df_gcp = df_gcp.drop(df_gcp.loc[df_gcp.name == "Statistical Difference"].index[0])

# convert from MtC to MtCO2
df_gcp.loc[:, 1959:] = df_gcp.loc[:, 1959:] * 44 / 12

# add unit
df_gcp.insert(1, "unit", "MtCO2")

# CODE_GENERATOR_ISO3
df_gcp.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_gcp, show_missing=True, show_duplicates_nan=True)

# global
df_gcp_global = df_gcp[df_gcp.name == "World"]
df_gcp_global.to_csv("../clean_data/global_subsets/GCP_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_gcp.to_csv("../clean_data/GCP.csv", index=None)
df_gcp = pd.read_csv("../clean_data/GCP.csv")
df_gcp_global

# # GCP (consumption-based)
#
# INFO:
# - Consumption-based CO2 emissions
# - Updated from Peters, GP, Minx, JC, Weber, CL and Edenhofer, O 2011.
#   Growth in emission transfers via international trade from 1990 to 2008.
#   Proceedings of the National Academy of Sciences 108, 8903-8908.
#   http://www.pnas.org/content/108/21/8903.abstract
# - "Growth in emission transfers via international trade from 1990 to 2008"
#   (Peters & Hertwich, 2008).
# # NOTES:
# - Given in carbon units, converted to carbon dioxide

# In[44]:

print( ">>>>>>> GCP (consumption) <<<<<<<<")

# read data (sheet 2 = consumption emissions; countries are columns, so transpose)
df_gcp_cons = pd.read_excel("../raw_data/GCP/National_Carbon_Emissions_2021v0.4.xlsx", 2, header=8, index_col=0)
df_gcp_cons = df_gcp_cons.T

# clean
df_gcp_cons = df_gcp_cons.reset_index().rename({"index": "name"}, axis=1)

# delete "statistical difference" row
df_gcp_cons = df_gcp_cons.drop(df_gcp_cons.loc[df_gcp_cons.name == "Statistical Difference"].index[0])

# convert from MtC to MtCO2
df_gcp_cons.loc[:, 1959:] = df_gcp_cons.loc[:, 1959:] * 44 / 12

# add unit
df_gcp_cons.insert(1, "unit", "MtCO2")

# CODE_GENERATOR_ISO3
df_gcp_cons.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_gcp_cons, show_missing=True, show_duplicates_nan=True)

# -----------------------------------------------------------------------------------------------------------------
# global
df_gcp_cons_global = df_gcp_cons[df_gcp_cons.name == "World"]
df_gcp_cons_global.to_csv("../clean_data/global_subsets/GCP_consumption_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_gcp_cons.to_csv("../clean_data/GCP_consumption.csv", index=None)
df_gcp_cons = pd.read_csv("../clean_data/GCP_consumption.csv")
df_gcp_cons

# # EPA
#
# INFO:
# - Environmental Protection Agency of the United States of America
# - *Global Non-CO2 Greenhouse Gas Emission Projections & Mitigation Potential: 2015-2050*
# - from https://www.epa.gov/global-mitigation-non-co2-greenhouse-gases/global-non-co2-greenhouse-gas-emission-projections
#   (zip, proj-data-annex_Sept2019.xlsx file)

# In[25]:

print( ">>>>>>> EPA <<<<<<")

# read file
df_epa = pd.read_excel("../raw_data/EPA/proj-data-annex_Sept2019.xlsx", 0)

# clean: replace the native unit column with a homogeneous one
df_epa = df_epa.rename(columns={"country": "name"})
df_epa = df_epa.drop("unit", axis=1)
df_epa.insert(1, "unit", "MtCO2eq")
# df_epa.value.sum()  # removed: bare expression was a no-op in a script (kept for reference)

# CODE_GENERATOR_ISO3
df_epa.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_epa, show_missing=True, show_duplicates_nan=True)

# global - should I add it to the total dataframe?
# EPA has no "World" rows, so aggregate over every country per (year, sector, gas, ...) key.
df_epa_global = (
    df_epa
    .groupby(["year", "unit", "sector", "source", "subsource", "gas"], dropna=False)
    .sum()
    .reset_index()
)
df_epa_global.insert(0, "name", "World")
df_epa_global.insert(0, "code", np.nan)
df_epa_global.to_csv("../clean_data/global_subsets/EPA_global.csv")
print(f"\nGlobal non-CO2 emissions (2015): {df_epa_global.groupby('year').sum().loc[2015, 'value']} MtCO2eq")

# ------------------------------------------------------------------------------------------------------------------
df_epa.to_csv("../clean_data/EPA.csv", index=None)
df_epa = pd.read_csv("../clean_data/EPA.csv")
df_epa

# # FAO / FAOSTAT
#
# INFO:
# - Food and Agriculture Organization of the United Nations
# - from FAOSTAT website > Emission totals > Bulk download > All data
#   (zip file, using NOFLAG file here): https://www.fao.org/faostat/en/#data/GT

# In[5]:

print( ">>>>>>> FAOSTAT <<<<<<<<")

df_fao = pd.read_csv("../raw_data/FAO/Emissions_Totals_E_All_Data_NOFLAG.csv", encoding='latin-1')

# clean: year columns come as "Y1990", "Y1991", ... -> strip the leading Y
df_fao.columns = df_fao.columns.str.replace("Y", "")
df_fao = df_fao.rename(columns={"Area": "name"})

# some info
# print("Items ", df.groupby("Item").count().index)
# print("Elements ", df.groupby("Element").count().index)
# print("Source ", df.groupby("Source").count().index)
# print("Area ", df.groupby("Area").count().index)

# CODE_GENERATOR_ISO3
df_fao.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_fao, show_missing=True, show_duplicates_nan=True)

# global (FAOSTAT ships its own World aggregate)
df_fao_global = df_fao[df_fao.name == "World"]
df_fao_global.to_csv("../clean_data/global_subsets/FAO_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_fao.to_csv("../clean_data/FAO.csv", index=None)
df_fao = pd.read_csv("../clean_data/FAO.csv", low_memory=False)
df_fao_global

# # CAIT
#
# INFO:
# - Climate Analysis Indicators Tool from the World Resources Institute (WRI)
# - Climate Watch Historical GHG Emissions. 2021. Washington, DC: World Resources
#   Institute. Available online at: https://www.climatewatchdata.org/ghg-emissions
# - Downloaded from their website's data explorer > "Download bulk data" > GHG emissions
#   (zip file, only using CAIT csv file)
#   https://www.climatewatchdata.org/data-explorer/historical-emissions?historical-emissions-data-sources=cait&historical-emissions-end_year=2018&historical-emissions-gases=all-ghg&historical-emissions-regions=All%20Selected&historical-emissions-sectors=total-including-lucf&historical-emissions-start_year=1990&page=1

# In[28]:

print( ">>>>>>> CAIT <<<<<<<<")

df_cait = pd.read_csv("../raw_data/CAIT/CW_HistoricalEmissions_CAIT.csv")

# clean: CAIT's "Country" column already holds ISO-3 codes
df_cait = df_cait.rename(columns={"Country": "code"})

# COUNTRY_DICT NOT NECESSARY HERE, INVERSE (creating names out of ISO codes)
df_cait.insert(1, "name", np.nan)
COUNTRY_NAME_GENERATOR_FROM_ISO3(df_cait, show_missing=True, show_duplicates_nan=True)

# Global (included in dataset!)
df_cait_global = df_cait[df_cait.code == "WORLD"]
df_cait_global.to_csv("../clean_data/global_subsets/CAIT_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_cait.to_csv("../clean_data/CAIT.csv", index=None)
df_cait = pd.read_csv("../clean_data/CAIT.csv")
df_cait

# # EDGAR
#
# INFO:
# - European Commission
# - Crippa, Monica; Guizzardi, Diego; Muntean, Marilena; Schaaf, Edwin;
#   Lo Vullo, Eleonora; Solazzo, Efisio; Monforti-Ferrario, Fabio; Olivier, Jos;
#   Vignati, Elisabetta (2021): EDGAR v6.0 Greenhouse Gas Emissions.
#   European Commission, Joint Research Centre (JRC) [Dataset]
#   PID: http://data.europa.eu/89h/97a67d67-c62e-4826-b873-9d972c4f670b
#
# NOTES:
# - given in native units (must convert to CO2 equivalents)

# In[38]:

print( ">>>>>>> EDGAR <<<<<<<<")

"""
IMPORTANT: EDGAR website separates fossil CO2 emissions and biomass (organic) emissions.
In their main methodology paper they exclude biomass emissions (and other LULUCF emission),
so we will do the same here too.
To reiterate - biomass (organic) emissions are OMITTED here, but has its own dataset ("df_co2_org").
"""

# separate co2-organic (biomass) dataset (excluded from main dataset)
df_co2_org = pd.read_excel("../raw_data/" + "EDGAR/v60_CO2_org_short-cycle_C_1970_2018.xls", 0, header=9)
df_co2_org.insert(7, "gas", "CO2 (bio)")

# read MAIN data: one file per gas, tagged with a "gas" column before concatenation
df_co2_fossil = pd.read_excel("../raw_data/" + "EDGAR/v60_CO2_excl_short-cycle_org_C_1970_2018.xls", 0, header=9)
df_co2_fossil.insert(7, "gas", "CO2 (fossil)")

df_ch4 = pd.read_excel("../raw_data/" + "EDGAR/v60_CH4_1970_2018.xls", 0, header=9)
df_ch4.insert(7, "gas", "CH4")

df_n2o = pd.read_excel("../raw_data/" + "EDGAR/v60_N2O_1970_2018.xls", 0, header=9)
df_n2o.insert(7, "gas", "N2O")

# convert into one dataframe
df_edgar = pd.concat([df_ch4, df_co2_fossil, df_n2o]).reset_index().drop("index", axis=1)

# clean: harmonized column names, year columns "Y_1970" -> "1970"
df_edgar = df_edgar.rename(columns={"Name": "name"})
df_edgar = df_edgar.rename(columns={"Country_code_A3": "code"})
df_edgar.columns = df_edgar.columns.str.replace("Y_", "")
# df_edgar.ipcc_code_2006_for_standard_report = df_edgar.ipcc_code_2006_for_standard_report.str.replace(".","")

# IMPORTANT: CONVERT TO CO2equivalents!
# 100-year global warming potentials, IPCC AR6 values
GWP_100_ipcc_ar6 = {
    "CO2 (fossil)": 1,
    "CO2 (bio)": 1,
    "CH4": 27.9,
    "N2O": 273,
}

# PERF FIX: the original iterated `for i in range(len(df_edgar))` and multiplied
# one row at a time. Mapping each row's gas to its GWP and multiplying all year
# columns in one vectorized step produces the identical result, much faster.
_gwp_factors = df_edgar["gas"].map(GWP_100_ipcc_ar6)
df_edgar.loc[:, "1970":] = df_edgar.loc[:, "1970":].mul(_gwp_factors, axis=0)

# divide by 1000 (converting gigagrams to Mt)
df_edgar.loc[:, "1970":] = df_edgar.loc[:, "1970":] / 1000

# add unit column
# NOTE(review): after the GWP step the CH4/N2O rows are CO2-*equivalents*, so the
# label arguably should be "MtCO2eq"; kept as "MtCO2" to avoid changing the CSV
# consumed downstream — confirm before relabeling.
df_edgar.insert(8, "unit", "MtCO2")

# FIX: removed an exact duplicate re-read of v60_CO2_org_short-cycle here —
# df_co2_org was already loaded (identically) together with the main EDGAR files.

# no need to make codes nor names!
# woooo! (except if I want to make codes for country groups such as "IPCC_annex" or "C_group_IM24_sh")

# global! (not included in main)
df_edgar_global = df_edgar.groupby([
    "IPCC_annex", "ipcc_code_2006_for_standard_report",
    "ipcc_code_2006_for_standard_report_name", "fossil_bio", "gas", "unit",
]).sum().reset_index()
df_edgar_global.to_csv("../clean_data/global_subsets/EDGAR_global.csv")
print("NO DUPLICATES HERE!")

# ------------------------------------------------------------------------------------------------------------------
df_edgar.to_csv("../clean_data/EDGAR.csv", index=None)
df_edgar = pd.read_csv("../clean_data/EDGAR.csv")
df_edgar_global

# # PRIMAP-hist (Version 2.31)
#
# INFO:
# - Potsdam-Institut für Klimafolgenforschung
# - Gütschow, J.; Jeffery, L.; Gieseke, R.; Gebel, R.; Stevens, D.; Krapp, M.;
#   Rocha, M. (2016): The PRIMAP-hist national historical emissions time series,
#   Earth Syst. Sci. Data, 8, 571-603, https://doi.org/10.5194/essd-8-571-2016
# - Dataset downloaded from https://zenodo.org/record/5494497
#   (Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv)

# In[51]:

print( ">>>>>>> PRIMAP-hist <<<<<<<<")

"""
Warning!
CH4, N2O, NF3 and SF6 have not been converted to CO2eqs yet!
"""

df_primap = pd.read_csv("../raw_data/PRIMAP-hist/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv")

# clean: the "area (ISO3)" column already holds ISO-3 codes
df_primap = df_primap.rename(columns={"area (ISO3)": "code"})

# COUNTRY_DICT NOT NECESSARY HERE, INVERSE (creating names out of ISO codes)
df_primap.insert(3, "name", np.nan)
COUNTRY_NAME_GENERATOR_FROM_ISO3(df_primap, show_missing=True, show_duplicates_nan=True)

# global (PRIMAP uses the "EARTH" pseudo-code for the world aggregate)
df_primap_global = df_primap[df_primap.code == "EARTH"]
df_primap_global.to_csv("../clean_data/global_subsets/PRIMAP-hist_global.csv")
# [df_primap_global["category (IPCC2006_PRIMAP)"]=="M.0.EL"].groupby(["entity","unit","scenario (PRIMAP-hist)"]).sum()["2019"]/1000
print("NO DUPLICATES HERE!")

# ------------------------------------------------------------------------------------------------------------------
df_primap.to_csv("../clean_data/PRIMAP-hist.csv", index=None)
df_primap = pd.read_csv("../clean_data/PRIMAP-hist.csv")
df_primap_global

# # Minx et al 2021
#
# INFO:
# - A comprehensive and synthetic dataset for global, regional and national
#   greenhouse gas emissions by sector 1970-2018 with an extension to 2019
# - VERSION 5 from Zenodo https://zenodo.org/record/5844489
# - Using *essd_ghg_data.xlsx* file. Gives two convenient columns for ar4 and ar5 GWPs
#
# NOTES:
# - 2020 non-CO2 emissions seem to be incomplete

# In[57]:

print( ">>>>>>> Minx et al 2021 <<<<<<<<")

"""
IPCC categorization cannot be integrated into the main dataset (yet).
This is because there are various subsectors (and some sectors such as
Industry and Energy) that overlap.
"""

# read file (excel with multiple interesting sheets)
# minx_info = pd.read_excel("raw_data/Minx/essd_ghg_data.xlsx",0) # only info
# minx_metadata = pd.read_excel("raw_data/Minx/essd_ghg_data.xlsx",1) # not very useful
minx_data = pd.read_excel("../raw_data/Minx/essd_ghg_data.xlsx", 2)  # MAIN DATA
# minx_sector_classification = pd.read_excel("raw_data/Minx/essd_ghg_data.xlsx",3) # includes description and IPCC categories!
# minx_region_classification = pd.read_excel("raw_data/Minx/essd_ghg_data.xlsx",4) # not useful
# minx_100_yr_gwps = pd.read_excel("raw_data/Minx/essd_ghg_data.xlsx",5) # both ar5 and ar6 GWPS for all included gases
# minx_CH4_gwps = pd.read_excel("raw_data/Minx/essd_ghg_data.xlsx",6) # CH4 classification and chapters

# choose dataset
df_minx = minx_data

# clean a little
df_minx = df_minx.rename(columns={"country": "name", "ISO": "code", "value": "value_native"})

# add columns for both ar5 and ar6 gwps (native value x GWP columns shipped with the data)
df_minx = df_minx.assign(value_ar5=df_minx.value_native * df_minx.gwp100_ar5)
df_minx = df_minx.assign(value_ar6=df_minx.value_native * df_minx.gwp100_ar6)

# codes and names are amazing
print("NO DUPLICATES HERE!")

# global
df_minx_global = df_minx.groupby([
    "year", "region_ar6_6", "region_ar6_10", "region_ar6_22", "region_ar6_dev",
    "sector_title", "subsector_title", "gas", "gwp100_ar5", "gwp100_ar6",
]).sum()
df_minx_global = df_minx_global.reset_index()
df_minx_global.to_csv("../clean_data/global_subsets/Minx2021_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_minx.to_csv("../clean_data/Minx2021.csv", index=None)
df_minx = pd.read_csv("../clean_data/Minx2021.csv")
df_minx

# In[58]:

df_minx_global

# # UNFCCC Annex I Submissions
# - from Open Climate Data: https://github.com/openclimatedata
# - all values in current state are converted to CO2equivalents using AR6 GWPs,
#   to get native units DIVIDE BY GWPs columns

# In[56]:

print( ">>>>>>> UNFCCC Annex I <<<<<<<<")

df_unfccc_ai = pd.read_csv("../raw_data/UNFCCC/detailed-data-by-country-annex-one.csv")

# clean: normalize subscript characters in gas names
df_unfccc_ai = df_unfccc_ai.rename(columns={"Party": "name"})
df_unfccc_ai = df_unfccc_ai.replace({"CO₂": "CO2", "N₂O": "N2O", "CH₄": "CH4", "SF₆": "SF6", "NF₃": "NF3"})

# add GWPs (gases already reported in CO2eq get a factor of 1)
GWP_100_ipcc_ar6 = {
    "Aggregate GHGs": 1,
    "CO2": 1,
    "CH4": 27.9,
    "N2O": 273,
    "SF6": 25200,
    "NF3": 17400,
    "HFCs": 1,  # unit is already in CO2eq
    "PFCs": 1,  # unit is already in CO2eq
    "Aggregate F-gases": 1,  # unit is already in CO2eq
    "Unspecified mix of HFCs and PFCs": 1,  # unit is already in CO2eq
}
# lookup table: index = gas name, column "index" = GWP factor
GWP_100_ipcc_ar6_df = pd.DataFrame(GWP_100_ipcc_ar6.keys(), GWP_100_ipcc_ar6.values()).reset_index().set_index(0)
GWP_100_ipcc_ar6_df.index.name = "gas"

GWP_100_ipcc_ar4 = {
    "Aggregate GHGs": 1,
    "CO2": 1,
    "CH4": 25,
    "N2O": 298,
    "SF6": 22800,
    "NF3": 17200,
    "HFCs": 1,  # unit is already in CO2eq
    "PFCs": 1,  # unit is already in CO2eq
    "Aggregate F-gases": 1,  # unit is already in CO2eq
    "Unspecified mix of HFCs and PFCs": 1,  # unit is already in CO2eq
}
GWP_100_ipcc_ar4_df = pd.DataFrame(GWP_100_ipcc_ar4.keys(), GWP_100_ipcc_ar4.values()).reset_index().set_index(0)
GWP_100_ipcc_ar4_df.index.name = "gas"

df_unfccc_ai.insert(5, "GWP_AR4_conditional",
                    df_unfccc_ai.merge(GWP_100_ipcc_ar4_df, left_on="Gas", right_on="gas", how="left")["index"])
# BUGFIX: this column previously merged against GWP_100_ipcc_ar4_df, so the
# "AR6" factors — and every unit conversion below, which multiplies by
# GWP_AR6_conditional — silently applied AR4 GWPs. Merge against the AR6 table.
df_unfccc_ai.insert(6, "GWP_AR6_conditional",
                    df_unfccc_ai.merge(GWP_100_ipcc_ar6_df, left_on="Gas", right_on="gas", how="left")["index"])

# optional stuff
# df_unfccc_ai.insert(1, "IPCC_cat", df_unfccc_ai["Category"].str[0])

# CODE_GENERATOR_ISO3
df_unfccc_ai.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_unfccc_ai, show_missing=True, show_duplicates_nan=True)

# fix units and convert: kt -> Mt (/1000), t -> Mt (/1e6), then apply the AR6 GWP factor.
# NOTE(review): rows already in "kt CO₂ equivalent" are also multiplied by the gas
# GWP; that is only correct if such rows are restricted to gases with factor 1
# (F-gases/aggregates) — confirm against the raw file.
df_unfccc_ai.loc[df_unfccc_ai.Unit == "kt", "Base year":] = df_unfccc_ai.loc[df_unfccc_ai.Unit == "kt", "Base year":].div(1000).mul(df_unfccc_ai.loc[df_unfccc_ai.Unit == "kt", "Unit":]["GWP_AR6_conditional"], axis=0)
df_unfccc_ai.loc[df_unfccc_ai.Unit == "kt CO₂ equivalent", "Base year":] = df_unfccc_ai.loc[df_unfccc_ai.Unit == "kt CO₂ equivalent", "Base year":].div(1000).mul(df_unfccc_ai.loc[df_unfccc_ai.Unit == "kt CO₂ equivalent", "Unit":]["GWP_AR6_conditional"], axis=0)
df_unfccc_ai.loc[df_unfccc_ai.Unit == "t CO₂ equivalent", "Base year":] = df_unfccc_ai.loc[df_unfccc_ai.Unit == "t CO₂ equivalent", "Base year":].div(1000000).mul(df_unfccc_ai.loc[df_unfccc_ai.Unit == "t CO₂ equivalent", "Unit":]["GWP_AR6_conditional"], axis=0)
# Last UNFCCC-AI unit case: plain tonnes -> Mt (/1e6), times the AR6 factor.
df_unfccc_ai.loc[df_unfccc_ai.Unit == "t", "Base year":] = (
    df_unfccc_ai.loc[df_unfccc_ai.Unit == "t", "Base year":]
    .div(1000000)
    .mul(df_unfccc_ai.loc[df_unfccc_ai.Unit == "t", "Unit":]["GWP_AR6_conditional"], axis=0)
)
# everything is homogeneous now
df_unfccc_ai.loc[:, "Unit"] = "MtCO2eq"

# ------------------------------------------------------------------------------------------------------------------
df_unfccc_ai.to_csv("../clean_data/UNFCCC_AI.csv", index=None)
df_unfccc_ai = pd.read_csv("../clean_data/UNFCCC_AI.csv")
df_unfccc_ai

# # UNFCCC Non-Annex I
# - from Open Climate Data: https://github.com/openclimatedata

# In[ ]:

# In[23]:

print( ">>>>>>> UNFCCC Non-Annex I <<<<<<<<")

df_unfccc_nai = pd.read_csv("../raw_data/UNFCCC//detailed-data-by-country-non-annex-one.csv")

# clean: normalize subscript characters in gas names
df_unfccc_nai = df_unfccc_nai.rename(columns={"Party": "name"})
df_unfccc_nai = df_unfccc_nai.replace({"CO₂": "CO2", "N₂O": "N2O", "CH₄": "CH4", "SF₆": "SF6", "NF₃": "NF3"})

# IMPOSSIBLE to convert to common units? (kept for reference)
# GWP_100_ipcc_ar6 = {
#     "Aggregate GHGs" : 1,
#     "CO2" : 1,
#     "CH4" : 27.9,
#     "N2O" : 273,
#     "SF6" : 1, #unit is already in CO2eq
#     "NF3" : 1, #unit is already in CO2eq
#     "HFCs": 1, #unit is already in CO2eq
#     "PFCs": 1, #unit is already in CO2eq
#     "Aggregate F-gases": 1,#unit is already in CO2eq
#     "Unspecified mix of HFCs and PFCs": 1 #unit is already in CO2eq
# }
# GWP_100_ipcc_ar6_df = pd.DataFrame(GWP_100_ipcc_ar6.keys(), GWP_100_ipcc_ar6.values()).reset_index().set_index(0)
# GWP_100_ipcc_ar6_df.index.name = "gas"
# df_unfccc_nai.insert(5,"GWP_AR6_conditional",df_unfccc_nai.merge(GWP_100_ipcc_ar6_df, left_on="Gas", right_on="gas", how="left")["index"])
# df_unfccc_nai.loc[:, "1990":] = df_unfccc_nai.loc[:, "1990":].mul(df_unfccc_nai["GWP_AR6_conditional"], axis=0)
# df_unfccc_nai.loc[:,"Unit"]= "MtCO2eq"

# Convert from Gg to Mt
df_unfccc_nai.loc[:, "1990":] = df_unfccc_nai.loc[:, "1990":].div(1000)
df_unfccc_nai.loc[:, "Unit"] = df_unfccc_nai.loc[:, "Unit"].replace(
    {"Gg CO₂ equivalent": "MtCO2eq", "Gg": "Mt (substance)"}
)

# CODE_GENERATOR_ISO3
df_unfccc_nai.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_unfccc_nai, show_missing=True, show_duplicates_nan=True)

# ------------------------------------------------------------------------------------------------------------------
df_unfccc_nai.to_csv("../clean_data/UNFCCC_NAI.csv", index=None)
df_unfccc_nai = pd.read_csv("../clean_data/UNFCCC_NAI.csv")
df_unfccc_nai

# In[ ]:

# In[ ]:

# In[ ]:

# # Other Inventories
# - Carbon Majors
# - EDGAR FOOD?
# - LUCF from GCP
# - LUCF from MINX
# - CEDS (which is not mainly for GHG but for pollutants. Still quite nice)
# - carbon monitor

# # Carbon Monitor
# - Downloaded from https://carbonmonitor.org/
# - Liu, Z., Ciais, P., Deng, Z. et al. Near-real-time monitoring of global CO2
#   emissions reveals the effects of the COVID-19 pandemic.
#   Nat Commun 11, 5172 (2020). https://doi.org/10.1038/s41467-020-18922-7

# In[2]:

print( ">>>>>>> Carbon Monitor <<<<<<")

# download the file straight from the API; dates arrive as dd/mm/YYYY strings
df_carbon_mon = pd.read_csv("https://datas.carbonmonitor.org/API/downloadFullDataset.php?source=carbon_global")
df_carbon_mon.date = pd.to_datetime(df_carbon_mon.date, format="%d/%m/%Y")

# clean
df_carbon_mon = df_carbon_mon.rename(columns={"country": "name"})
df_carbon_mon.insert(3, "unit", "MtCO2")

# CODE_GENERATOR_ISO3
df_carbon_mon.insert(0, "code", np.nan)
CODE_GENERATOR_ISO3(df_carbon_mon, show_missing=True, show_duplicates_nan=True)

# global (dataset ships a "WORLD" aggregate)
df_carbon_mon_global = df_carbon_mon[df_carbon_mon.name == "WORLD"]
df_carbon_mon_global.to_csv("../clean_data/global_subsets/Carbon-Monitor_global.csv")

# ------------------------------------------------------------------------------------------------------------------
df_carbon_mon.to_csv("../clean_data/Carbon-Monitor.csv")
df_carbon_mon = pd.read_csv("../clean_data/Carbon-Monitor.csv", index_col=0)
df_carbon_mon

# # Carbon Majors
#
# Climate Accountability Institute
# Principal Investigator: Richard Heede
#
# Some links:
# - https://climateaccountability.org/carbonmajors_dataset2020.html
# - https://climateaccountability.org/pdf/MRR%209.1%20Apr14R.pdf

# In[5]:

# already cleaned; just load and index by producing entity
df_carbon_majors = pd.read_csv("../clean_data/Carbon-Majors-2018.csv")
df_carbon_majors = df_carbon_majors.set_index("Entity")
df_carbon_majors