#!/usr/bin/env python
# coding: utf-8

# # Example workflow to analyse the type and amount of errors for different preprocessing levels

# If you are only interested in the final plots and a summary for preprocessing level 5 go to:
# - [Plot and summary of findings about relative failing glacier area](#id-total-error-area-summary)
# - [Comparison of errors between the different RGI regions](#id-rgi-diff)

# In[31]:


from oggm import cfg, workflow, utils, shop
import pandas as pd
import os, glob
import numpy as np
import xarray as xr
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")

cfg.initialize()

import matplotlib
matplotlib.rcParams['figure.figsize'] = (14, 8)

# makes sure that the pandas output columns are not cropped!
pd.set_option("max_colwidth", None)


# In[32]:


def error_analysis_w_plot(dfserr={}, pd_rel_error_area = None,
                          level='L2', exp='elev_bands', pcp ='', clim='', 
                          qc = '', match = '', border = 160,
                          rgi_reg = 'all', 
                          path = '/home/www/oggm/gdirs/oggm_v1.4/', subplot = False, xlim=None, plot=True, rename_match =True):
    """ Estimates for a specific preprocessed gdirs the relative glacier area with errors
    and the relative amount of glaciers with errors (both in %). 
    
    Plots the error counts for the different error types.
    
    Works only if the glacier statistics were estimated!
    
    Parameters
    ----------
    dfserr : dictionary
        dictionary where already errors of largest glaciers from other preprocessed gdirs are saved, default is an empty dictionary
    pd_rel_error_area : pd.DataFrame
        pandas DataFrame with statistics about relative area and amount of errors on glaciers. 
        Default is None, which creates a new pandas DataFrame
    level : str
        the preprocessing level where the errors should be estimated and plotted.
        So far, it works only for 'L2' and 'L5'. Default is 'L2'.
    exp : str
        which glacier flowlines are used. Either 'elev_bands' (default) or 'centerlines'
    pcp : str
        precipitation factor applied (depends on clim). Has to be set!
    clim : str
        which baseline is applied. Either 'ERA5' or 'CRU'. Has to be set!
    qc : str
        whether a glacier climate quality check and correction was applied. If yes, set it to 'qc3', 
        otherwise to 'qc0'. Has to be set!
    match: str
        which calibration & other methods are chosen: Has to be set if above level 2!
        - "no_match" : only direct glaciological WGMS data used
        - "match_geod" : same as no_match, but regionally the geodetic estimates are matched by changing epsilon
        - "match_geod_pergla" : only per-glacier-individual geodetic estimates of Hugonnet et al. (2021) matched
        - "match_geod_pergla_massredis" : same as match_geod_pergla, but instead of the Shallow-Ice Approximation, 
          a mass-redistribution is applied (see: https://docs.oggm.org/en/latest/mass-redistribution.html)
    border : str
        which size of the local glacier map was used (prepro_border). For qc0, e.g., 
        most preprocessing directories are available for border=80. Default is 160
    rgi_reg : str
        default is all RGI regions. But you can also choose a single RGI region (e.g. rgi_reg = '11') or all RGI 
        regions except 19 (i.e. not available for CRU!, rgi_reg='all_without_19')
    path : str
        path to the general gdirs folder
    subplot : bool
        Default is False. If you want to use several plots, set it to True
    xlim : int
        Default is None. Maximum of single type errors in the plot. Useful for subplots
    plot : bool
        If an error type plot should be returned or not!
    
    Returns
    -------
    dfserr : dictionary
        dictionary where for each preprocessed gdir, a pd.DataFrame exists which lists the 
        type of errors and area of the 15 largest glaciers with errors
    pd_rel_error_area : pd.DataFrame
        pandas DataFrame with statistics about relative area and amount of errors on glaciers. 
        If repeated, for several preprocessed gdirs, each row represents the statistics of 
        one preprocessed gdirs.
        
    """
    if clim == 'CRU' and rgi_reg == 'all':
        raise InvalidParamsError("CRU is NOT available for RGI 19, use rgi_reg='all_without_19' instead")
    if pd_rel_error_area is None:
        pd_rel_error_area = pd.DataFrame(columns=['rel_error_area_in_percent', 'level', 'exp', 'pcp_clim', 'match', 'qc'])

    if level == 'L2':
        fd = f'{path}/L1-L2_files/{exp}/RGI62/b_{border:03d}/L2/summary/'
        err_msg = "In preprocessing level 2, climate, precipitation factor and qc should not be defined"
        assert pcp == '' and clim == '' and qc == '', err_msg
        title = f'{level}: {exp}, border: {border}'
        dict_key = f'{level}_{exp}_b{border}_rgi_{rgi_reg}'
    elif level == 'L5':
        if (pcp == 'via_winterprcp') or (pcp == '2.6'):
            fd = f'{path}/L3-L5_files/{clim}/{exp}/{qc}/{match}/RGI62/b_{border:03d}/L5/summary/'
        else:
            fd = f'{path}/L3-L5_files/{clim}/{exp}/{qc}/{pcp}/{match}/RGI62/b_{border:03d}/L5/summary/'
        err_msg = "In preprocessing level 2, need to prescribe climate (clim), match, precipitation factor (pcp) and quality check type (qc)"
        assert pcp != '' and clim != '' and qc != '' and match !='' , err_msg

        title = f'{level}: {exp}, {pcp}, {clim}, {match}, {qc}, border: {border}, rgi_{rgi_reg}'
        dict_key = f'{level}_{exp}_{pcp}_{clim}_{match}_{qc}_b{border}_rgi_{rgi_reg}'
    else:
        raise InvalidParamsError('Only L2 and L5 can be used, but the function can be adapted easily in order that it works for other levels!')
        
    # get the statistics file which lists errors and their type
    fs = glob.glob(fd+'glacier_statistics*.csv')
    
    if rgi_reg == 'all_without_19':
        for f in fs:
            if 'statistics_19' in f:
                fs.remove(f)
        assert len(fs) == 18
    elif rgi_reg == 'all':
        print(len(fs))
        assert len(fs) == 19
    else:
        for f in fs:
            if f'statistics_{rgi_reg}' in f:
                fs = [f]
                break
        assert len(fs) == 1
    
    df = []
    for f in fs:
        df.append(pd.read_csv(f, index_col=0, low_memory=False))
    df = pd.concat(df).sort_index()
    
    rel_error_area = df.loc[~df['error_task'].isnull()].rgi_area_km2.sum() / df.rgi_area_km2.sum() * 100
    
    # plot the amount of errors for each error type
    if plot:
        title = title + '\n relative glacier area with errors: {:.2f}%'.format(rel_error_area)
        if not subplot:
            plt.figure()
        plt.title(title, fontsize=12)
        sns.countplot(y="error_task", data=df.sort_values(by='error_task'));
        if xlim is not None:
            plt.xlim([0,xlim])

    # save the glaciers that result in errors sorted from largest to smallest glacier area
    dfserr[dict_key] = df.loc[~df['error_task'].isnull()].sort_values(by='rgi_area_km2', ascending=False)[['rgi_area_km2', 'error_task', 'error_msg']]
    # .iloc[:15]
    # save statistics about relative area and amount of errors on glaciers
    pd_rel_error_area.loc[dict_key] = np.NaN
    pd_rel_error_area.loc[dict_key]['rel_error_area_in_percent'] = rel_error_area
    pd_rel_error_area.loc[dict_key]['rgi_reg'] = rgi_reg
    pd_rel_error_area.loc[dict_key]['level'] = level
    pd_rel_error_area.loc[dict_key]['border'] = border
    pd_rel_error_area.loc[dict_key]['exp'] = exp
    if rename_match:
        if match == 'match_geod_pergla_2.6':
            pd_rel_error_area.loc[dict_key]['match'] = match[:-4]
        elif match == 'match_geod_pergla_winterprcp':
            pd_rel_error_area.loc[dict_key]['match'] = 'match_geod_pergla'

        else:
            pd_rel_error_area.loc[dict_key]['match'] = match
    else:
        pd_rel_error_area.loc[dict_key]['match'] = match

    pd_rel_error_area.loc[dict_key]['clim_pcp'] = f'{clim}_{pcp}'
    pd_rel_error_area.loc[dict_key]['qc'] = qc
    
    return dfserr, pd_rel_error_area


# In[33]:


# could be changed, but not all pre-processed directories are available for all prepro_border options!
border = 160 # 80 if qc0
# these are just containers to save and later compare the error statistics! 
dfserr = {}
pd_rel_error_area = pd.DataFrame(columns=['rel_error_area_in_percent', 'level', 'exp', 'clim_pcp', 'match', 'qc', 'border', 'rgi_reg'])


# ## Analysis for Level 2 pre-processing directories!
# 
# In pre-processing level 2, we only distinguish between elevation bands and centerlines (see the [Flowlines documentation](https://docs.oggm.org/en/stable/flowlines.html#glacier-flowlines) for an explanation about the differences). 

# In[34]:


plt.figure(figsize=(14,3))
plt.subplot(121)
dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area, level='L2', subplot=True,
                                                  exp='elev_bands', border=border, xlim = 800)
plt.subplot(122)
dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area, level='L2',  subplot=True,
                                                  exp='centerlines', border=border, xlim = 800)
plt.tight_layout()


# - failing glacier area on pre-processing level 2 is ~0.1% or less
# - less glaciers will fail when using elev_bands than centerlines

# **Here you can analyse the failing glaciers with the largest area and the type of error that has occurred!**

# In[35]:


dfserr['L2_elev_bands_b160_rgi_all'].head()


# In[36]:


dfserr['L2_centerlines_b160_rgi_all'].head()


# Let's just look at the level 2 errors for only RGI region 12 

# In[37]:


rgi_reg = '12'
plt.figure(figsize=(14,3))
plt.subplot(121)
dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area, level='L2', subplot=True,
                                                      exp='elev_bands', rgi_reg=rgi_reg,
                                                      border=border, xlim = 400)
plt.subplot(122)
dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area, level='L2',  subplot=True,
                                                      exp='centerlines', rgi_reg=rgi_reg,
                                                      border=border, xlim = 400)
plt.tight_layout()


# In[38]:


dfserr['L2_elev_bands_b160_rgi_12']['error_msg']


# In RGI region 12, ~12% of the glacier area can not run even in pre-processing level 2. The reasons are glacier masks errors because many of the glaciers in RGI region 12 are "nominal" glaciers. Nominal means here that the glaciers don't have a glacier outline but only a location and area and hence can not be run with OGGM. 

# ## Analysis for Level 5 pre-processing directories! 
# 
# In pre-processing level 5, we distinguish between:
# - different flowline types (exp = 'elev_bands' or 'centerlines', see the [Flowlines documentation](https://docs.oggm.org/en/stable/flowlines.html#glacier-flowlines))
# - different climate and precipitation factors (here: clim='CRU' with pcp='pcp2.5' or 'ERA5' with 'pcp1.6', or 'GSWP3_W5E5' with 'pcp2.5' or with varying prcp. fac depending on winter prcp.)
# - different ways to calibrate the mass balance (see [documentation on available mass-balance calibration methods](https://docs.oggm.org/en/latest/input-data.html#d-option-mass-balance-calibration-method)):
#     - match = "no_match" : only direct glaciological WGMS data used
#     - match = "match_geod" : same as no_match, but regionally the geodetic estimates are matched by changing epsilon
#     - match = "match_geod_pergla" : only per-glacier-individual geodetic estimates of Hugonnet et al. (2021) matched
# - applying the climate quality check and correction (qc='qc3') or not (qc='qc0'), see [historical_climate_qc](https://docs.oggm.org/en/latest/generated/oggm.tasks.historical_climate_qc.html#oggm.tasks.historical_climate_qc).
#     - at the moment we compare match_geod_pergla "qc0" and "qc3" although technically, after the newest OGGM release, there should be no difference in between. When we have preprocessed gdirs with match_geod_pergla with qc0, we will remove the match_geod_pergla qc3 gdirs as they will be outdated and should not be used. We will update the notebook afterwards!
# - using either the default shallow-ice approximation or a simple mass-redistribution (see: https://docs.oggm.org/en/latest/mass-redistribution.html). For the mass-redistribution, we only have one preprocessed glacier directory at the moment. You can check it out under:
#     - match = 'match_geod_pergla_massredis' (same as match_geod_pergla but with mass redistribution instead of SIA)

# In[39]:


level = 'L5'


# ### Let's start by looking only on elev_bands flowline pre-processed directories with 'qc3':

# In[40]:


exp = 'elev_bands'
qc = 'qc3'
pcps = ['pcp2.5', 'pcp1.6']
clims = ['CRU', 'ERA5']
# when we compare between ERA5 and CRU, we have to omit region 19 because CRU has no climate date for RGI region 19!
rgi_reg = 'all_without_19'
missing = []
# different match options only available for elev_bands

plt.figure(figsize=(20,18))
n = 1
for match in ['no_match', 'match_geod', 'match_geod_pergla', 'match_geod_pergla_massredis']:  # 
    for pcp, clim in zip(pcps, clims):
        plt.subplot(4,2,n)
        n += 1
        try:
            dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area,
                                                              level=level,exp=exp, pcp =pcp, clim=clim, 
                                                              qc=qc,border=border, match=match,
                                                              rgi_reg = rgi_reg,
                                                              subplot=True, xlim=3800)
        except:
            missing.append(f'{level}_{exp}_{pcp}_{clim}_{match}_{qc}')
            plt.title(f'missing: {level}_{exp}_{pcp}_{clim}_{match}_{qc}')
plt.tight_layout()


# - the preprocessed directory: 'L5_elev_bands_pcp2.5_CRU_match_geod_pergla_massredis_qc3' does not exist at the moment, but could be produced if necessary!

# Let's repeat the same for the centerlines:
# - so far, we can only look at no_match 
# - match_geod would also work with centerlines but there are no preprocessed directories for that
# - match_geod_pergla does NOT work with centerlines!!! 

# In[41]:


exp = 'centerlines'
match = 'no_match'
rgi_reg = 'all_without_19'
plt.figure(figsize=(14,4))
n = 1
for pcp, clim in zip(pcps, clims):
    plt.subplot(1,2,n)
    n += 1
    try:
        dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area,
                                                          level=level,exp=exp, pcp =pcp, clim=clim, 
                                                          qc=qc,border=border, match=match,
                                                          rgi_reg=rgi_reg, subplot=True)
    except:
        missing.append(f'{level}_{exp}_{pcp}_{clim}_{match}_{qc}')
        plt.title(f'missing: {level}_{exp}_{pcp}_{clim}_{match}_{qc}')
plt.tight_layout()


# Let's have a quick look at the largest failing glaciers and their errors!

# In[42]:


dfserr[f'L5_elev_bands_pcp2.5_CRU_no_match_qc3_b160_rgi_all_without_19'].head()


# In[43]:


dfserr[f'L5_elev_bands_pcp1.6_ERA5_no_match_qc3_b160_rgi_all_without_19'].head()


# - for a better comparison we will look how only the relative glacier area with errors differs between the experiments

# In[44]:


pd_rel_error_area


# In[45]:


# get only the errors from preprocessing level L5
pd_rel_error_area_L5 = pd_rel_error_area[(pd_rel_error_area.level == 'L5') & (pd_rel_error_area.rgi_reg =='all_without_19')]
pd_rel_error_area_L5


# Let's plot this in a nicer way!

# In[46]:


sns.catplot(y="match", x="rel_error_area_in_percent", data=pd_rel_error_area_L5,
            hue='clim_pcp', orient='h', col='exp',
           kind='bar', palette='Greys') #['Black', 'Grey'])
plt.xlim([0,3.5])
plt.suptitle("Relative glacier area with missing glaciers for preprocessing level L5, qc3, border=160, rgi_reg = 'all_without_19'")
plt.tight_layout()


# - there is a short description of all findings [right here](#id-total-error-area-summary)!

# In[ ]:


# ### Let's now repeat the same for preprocesed directories without climate quality check and correction (qc0)
# - at the moment: qc0 was mostly computed for CRU and for elev_bands. ERA5 with qc0 only exists for match_geod_pergla, if needed the missing preprocessed directories could be computed!

# In[47]:


qc = 'qc0'
level = 'L5'
pcp = 'pcp2.5' 
clim = 'CRU' #, 'ERA5']
exp = 'elev_bands' #, 'centerlines']:
rgi_reg = 'all_without_19'

missing = []
# different match options only available for elev_bands

plt.figure(figsize=(20,18))
n = 1
for match in ['no_match', 'match_geod', 'match_geod_pergla', 'match_geod_pergla_massredis']:  # 
    for border in [80, 160, 240]:
        plt.subplot(4,3,n)
        n += 1
        try:
            if 'match_geod_pergla' in match:
                path = '/home/www/oggm/gdirs/oggm_v1.6/'
            else:
                path = '/home/www/oggm/gdirs/oggm_v1.4/'
            dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area,
                                                              level=level,exp=exp, pcp =pcp, clim=clim, 
                                                              qc=qc,border=border, match=match,path=path,
                                                              rgi_reg=rgi_reg, subplot=True, xlim=8000)

        except:
            missing.append(f'{level}_{exp}_{pcp}_{clim}_{match}_{qc}_b{border}')
            plt.title(f'missing: {level}_{exp}_{pcp}_{clim}_{match}_{qc}_b{border}')
plt.tight_layout()


plt.figure(figsize=(20,4))
plt.subplot(1,2,1)
dfserr_median_prcp_fac, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area = pd_rel_error_area,
                          level=level, exp='elev_bands', pcp = '2.6',
                       match ='match_geod_pergla_2.6', clim='GSWP3_W5E5', 
                          qc = 'qc0', border = 160,xlim=3800,
                          rgi_reg = rgi_reg, 
                          path = '/home/www/fmaussion/runs/new_gdirs/oggm_v1.6/', subplot = True, plot=True)
plt.subplot(1,2,2)
dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area = pd_rel_error_area,
                          level=level, exp='elev_bands', pcp = 'via_winterprcp',
                       match ='match_geod_pergla_winterprcp', clim='GSWP3_W5E5', 
                          qc = 'qc0', border = 160,xlim=3800,
                          rgi_reg = rgi_reg, 
                          path = '/home/www/fmaussion/runs/new_gdirs/oggm_v1.6/', subplot = True, plot=True)
plt.tight_layout()


# - also get the estimates from ERA5 with qc0

# In[48]:


qc = 'qc0'
level = 'L5'
pcp = 'pcp1.6' 
clim = 'ERA5' #, 'ERA5']
exp = 'elev_bands' #, 'centerlines']:
rgi_reg = 'all_without_19'

missing = []
# different match options only available for elev_bands

plt.figure(figsize=(20,10))
n = 1
for match in ['match_geod_pergla', 'match_geod_pergla_massredis']:  # 
    for border in [80, 160]:
        plt.subplot(2,2,n)
        n += 1
        try:
            dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr, pd_rel_error_area=pd_rel_error_area,
                                                              level=level,exp=exp, pcp =pcp, clim=clim, 
                                                              qc=qc,border=border, match=match,
                                                              rgi_reg=rgi_reg, subplot=True, xlim=8000,
                                                             path = '/home/www/oggm/gdirs/oggm_v1.6/')

        except:
            missing.append(f'{level}_{exp}_{pcp}_{clim}_{match}_{qc}_b{border}')
            plt.title(f'missing: {level}_{exp}_{pcp}_{clim}_{match}_{qc}_b{border}')
plt.tight_layout()


# - a larger domain (border = 240 instead of border=80), does not always result in smaller amount of errors. 
#     - a https error coming from the DEM being too large could occur if the border is set too large! (see largest errors for b240 below). This is an error that could be solved, but only occurs rarely and is not important on global scales where the border is set to 80 or 160 anyways!

# In[49]:


# Let's get all level 5 statistics (for both, qc0 and qc3!)
pd_rel_error_area_L5 = pd_rel_error_area[(pd_rel_error_area.level == 'L5') & (pd_rel_error_area.rgi_reg =='all_without_19')]
condi1 = ((pd_rel_error_area_L5.border == 160) & (pd_rel_error_area_L5.qc=='qc0'))
condi2 = ((pd_rel_error_area_L5.border == 160) & (pd_rel_error_area_L5.qc=='qc3'))
pd_rel_error_area_L5 = pd_rel_error_area_L5[condi1 | condi2]


# <a id='id-total-error-area-summary'></a>

# In[50]:


sns.set(font_scale = 1.4)
sns.set_style('whitegrid')
pd_rel_error_area_L5['clim_pcp_qc_border'] = np.NaN
_hue = pd_rel_error_area_L5['clim_pcp'].values+'_'+pd_rel_error_area_L5['qc'].values+'_b'+ pd_rel_error_area_L5['border'].values.astype(str)
pd_rel_error_area_L5['clim_pcp_qc_border'] = _hue
pd_rel_error_area_L5 = pd_rel_error_area_L5.sort_values('clim_pcp_qc_border')

sns.catplot(y="match", x="rel_error_area_in_percent", data=pd_rel_error_area_L5,
            hue='clim_pcp_qc_border', orient='h', col='exp',
           kind='bar', palette='Greys',
           height=6, aspect=1.3)
plt.suptitle("Relative glacier area with failing glaciers for preprocessing level L5 (all RGI regions except RGI 19)")
ax = plt.gca()
plt.xlim([0,3.5])
plt.tight_layout();
plt.savefig('relative_failing_glacier_area_160.png')


# In[51]:


pd_rel_error_area_L5


# In[52]:


# Let's get all level 5 statistics (for both, qc0 and qc3!)
pd_rel_error_area_L5 = pd_rel_error_area[(pd_rel_error_area.level == 'L5') & (pd_rel_error_area.rgi_reg =='all_without_19')]
# we only use border = 80 for qc0 as this is the option where most preprocessed directories are available
condi1 = ((pd_rel_error_area_L5.border == 80) & (pd_rel_error_area_L5.qc=='qc0'))
# we only use border = 160 for qc3 as this is the option where most preprocessed directories are available
condi2 = ((pd_rel_error_area_L5.border == 160) & (pd_rel_error_area_L5.qc=='qc3'))
# 
condi3 = (('GSWP3_W5E5_2.6' == pd_rel_error_area['clim_pcp']) | ('GSWP3_W5E5_via_winterprcp' == pd_rel_error_area['clim_pcp']))

pd_rel_error_area_L5 = pd_rel_error_area_L5[(condi1 |condi2|condi3)]


# In[53]:


sns.set(font_scale = 1.4)
sns.set_style('whitegrid')
pd_rel_error_area_L5['clim_pcp_qc_border'] = np.NaN
_hue = pd_rel_error_area_L5['clim_pcp'].values+'_'+pd_rel_error_area_L5['qc'].values+'_b'+ pd_rel_error_area_L5['border'].values.astype(str)
pd_rel_error_area_L5['clim_pcp_qc_border'] = _hue
pd_rel_error_area_L5 = pd_rel_error_area_L5.sort_values('clim_pcp_qc_border')

sns.catplot(y="match", x="rel_error_area_in_percent", data=pd_rel_error_area_L5,
            hue='clim_pcp_qc_border', orient='h', col='exp',
           kind='bar', palette='Greys',
           height=6, aspect=1.3)
plt.suptitle("Relative glacier area with failing glaciers for preprocessing level L5 (all RGI regions except RGI 19)")
ax = plt.gca()
plt.xlim([0,3.5])
plt.tight_layout();
plt.savefig('relative_failing_glacier_area.png')


# By plotting the different error types and the relative failing glacier area, we can conclude:
# - with CRU much more glaciers and glacier area fail than with ERA5
#     - for CRU, the main errors are coming from local_t_star or mu_star_calibration_from_geodetic_mb, hence from the MB calibration
#     - for ERA5, more errors come from (simple_)glacier_masks or elevation_band_flowline (which are those errors that occur already in pre-processing Level 2)
# - using centerlines results in more errors than using elevation band flowlines
# - from the different match options to calibrate the mass-balance, 'no_match' and 'match_geod' result in more errors than 'match_geod_pergla', specifically when using CRU
# - 'match_geod_pergla_massredis' has around the same amount of errors / relative failing glacier area as 'match_geod_pergla', except for RGI region 19 where using a mass redistribution instead of the SIA results in a 4\*larger failing glacier area (see more [here](#id-rgi-19))
# - not applying any quality checks and climate corrections results in slightly less (for no_match or match_geod):
#     - For all RGI regions except RGI region 05 (and RGI 19 because this is not available for CRU), qc0 results, however, in equal or more errors.
#     - The large failing glacier area in RGI region 05 (i.e. the largest RGI glacier region) when using qc3 results in the global estimate that qc0 has in total less failing glacier area than qc3.
#     - note that for match_geod_pergla, an internal quality check and climate correction is always done if the mu_star is not in the specified bound (20, 600). Therefore, no qc3 exists for them.
#     
# - **NEW**: W5E5 with constant prcp. fac results in much more errors than with variable prcp. fac! Using W5E5 with a variable prcp. fac results in slightly more errors than using ERA5, but in much less failing glacier area than using CRU!

# ### Method to get those glaciers that work for all preprocessed glacier directories? This is important when later trying to compare volume estimates between each other!

# In[54]:


pd_geodetic = utils.get_geodetic_mb_dataframe()[utils.get_geodetic_mb_dataframe().period=='2000-01-01_2020-01-01']
# we do not model RGI region 19 because we want to compare it to CRU!!! 
pd_geodetic = pd_geodetic[pd_geodetic.reg != 19]

total_area = pd_geodetic.area.sum()
total_counts = len(pd_geodetic)
# pd_geodetic.area
pd_working = pd.DataFrame(index = pd_geodetic.index,
                        columns=dfserr.keys())
# we will set those that are not running afterwards to np.NaN value
pd_working.loc[pd_geodetic.index] = True
pd_working['area'] = pd_geodetic.area
pd_working['all_running_rgis'] = np.NaN
pd_working['rgi_reg'] = pd_geodetic.reg


# In[55]:


for key in dfserr.keys():
    if 'rgi_all_without_19' in key: 
        failing_rgis = dfserr[key].index.values


        # pd_error[cmip6_output_type] = False
        # set those glaciers that did not work to np.NaN
        pd_working.loc[failing_rgis,key] = np.NaN
        
        # check if it worked
        assert len(failing_rgis) == len(pd_working.loc[pd_working[key].isna()])
    else:
        pd_working = pd_working.drop(columns=key)
        pass


# In[56]:


all_running_rgis = pd_working[pd_working.columns[:-3]].dropna().index


# In[57]:


pd_working.loc[all_running_rgis, 'all_running_rgis'] = True


# In[58]:


all_running_rel = len(all_running_rgis)*100/len(pd_geodetic)
all_running_rel_area = pd_working.loc[all_running_rgis].area.sum()*100/pd_working.area.sum()

print(f'Amount of glaciers (without Antarctica RGI region 19) that run without glaciers for all compared preprocessed gdirs: {len(all_running_rgis)}')
print(f'Relative percentage of glacier amount where all analysed L5 preprocessed gdirs do not fail: {all_running_rel:0.2f}%')
print(f'Relative percentage of glacier area where all analysed L5 preprocessed gdirs do not fail: {all_running_rel_area:0.2f}%')


# In[59]:


pd_rel_error_area_L5.to_csv('rel_error_area_statistics_for_prepro_level5_gdirs.csv')
pd_working.to_csv('working_rgis_for_prepro_level5_gdirs.csv')


# We will use it in [this notebook which analyses the volume, mass change and specific mass balance differences between the preprocessed gdir options](https://nbviewer.org/urls/cluster.klima.uni-bremen.de/~lschuster/error_analysis/working_glacier_gdirs_comparison.ipynb). Feel free to check it out afterwards!

# ## Regional RGI error check for the different options:

# In[60]:


level = 'L5'
exp = 'elev_bands' #, 'centerlines']:

dfserr_reg = {}
pd_rel_error_area_reg = pd.DataFrame(columns=['rel_error_area_in_percent', 'level', 'exp',
                                              'clim_pcp', 'match', 'qc', 'border', 'rgi_reg'])

missing = []
# different match options only available for elev_bands
for pcp, clim in zip(pcps, clims):
    for match in ['no_match', 'match_geod', 'match_geod_pergla', 'match_geod_pergla_massredis']:  # 
        for qc in ['qc0', 'qc3']:
            for rgi_reg in np.arange(0,21,1):
                if 0<rgi_reg <10:
                    rgi_reg = f'0{rgi_reg}'
                elif rgi_reg == 0:
                    rgi_reg = 'all_without_19'
                elif rgi_reg == 20:
                    rgi_reg = 'all'
                else:
                    rgi_reg = str(rgi_reg)
                if qc == 'qc0':
                    border = 80
                else:
                    border = 160
                try:
                    # todo remove this, when all preprocessed gdirs are inside of main oggm 
                    if 'match_geod_pergla' in match:
                        path = '/home/www/oggm/gdirs/oggm_v1.6/'
                    else:
                        path = '/home/www/oggm/gdirs/oggm_v1.4/'
                    dfserr_reg, pd_rel_error_area_reg = error_analysis_w_plot(dfserr=dfserr_reg, pd_rel_error_area=pd_rel_error_area_reg,
                                                                      level=level,exp=exp, pcp =pcp, clim=clim, 
                                                                      qc=qc,border=border, match=match, path=path,
                                                                      rgi_reg=rgi_reg, plot = False)

                except:
                    missing.append(f'{level}_{exp}_{pcp}_{clim}_{match}_{qc}_b{border}_rgi_{rgi_reg}')


# In[61]:


pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg[pd_rel_error_area_reg.qc == 'qc0']
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.loc[(pd_rel_error_area_reg_qc0.rgi_reg != '19') & (pd_rel_error_area_reg_qc0.rgi_reg != 'all')]
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.loc[pd_rel_error_area_reg_qc0.border == 80]
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.loc[pd_rel_error_area_reg_qc0.clim_pcp == 'CRU_pcp2.5']
pd_rel_error_area_reg_qc0.index = pd_rel_error_area_reg_qc0['rgi_reg']
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0[['rel_error_area_in_percent','match']]
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.pivot(columns=['match'])
pd_rel_error_area_reg_qc0.columns = pd_rel_error_area_reg_qc0.columns.droplevel(0)
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.astype("float32")
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0[['no_match', 'match_geod', 'match_geod_pergla', 'match_geod_pergla_massredis']]
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.rename({'all_without_19':'all'})
pd_rel_error_area_reg_qc0= pd_rel_error_area_reg_qc0.rename(columns={'match_geod_pergla_massredis': 'match_geod_pergla\nmassredis'})

pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg[pd_rel_error_area_reg.qc == 'qc3']
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3.loc[pd_rel_error_area_reg_qc3.border == 160]
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3.loc[(pd_rel_error_area_reg_qc3.rgi_reg != '19') & (pd_rel_error_area_reg_qc3.rgi_reg != 'all')]
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3.loc[pd_rel_error_area_reg_qc3.clim_pcp == 'CRU_pcp2.5']
pd_rel_error_area_reg_qc3.index = pd_rel_error_area_reg_qc3['rgi_reg']
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3[['rel_error_area_in_percent','match']]
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3.pivot(columns=['match'])
pd_rel_error_area_reg_qc3.columns = pd_rel_error_area_reg_qc3.columns.droplevel(0)
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3.astype("float32")
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3[['no_match', 'match_geod']] #, 'match_geod_pergla']]
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3.rename({'all_without_19':'all'})


# <a id='id-rgi-diff'></a>

# In[62]:


sns.set(font_scale = 1.3)
plt.figure(figsize=(28,10))
plt.suptitle(f'Relative failing glacier area per RGI region (using {clim})', fontsize=24)
plt.subplot(121)
plt.title('using no climate check and correction (qc0)', fontsize=22)
sns.heatmap(pd_rel_error_area_reg_qc0,
            linewidths=.5, cmap="PuRd", cbar_kws={'label':'relative failing glacier area (in %)'},
           annot=True, fmt=".2f")
ax = plt.gca()
ax.figure.axes[-1].yaxis.label.set_size(18)
ax.set_ylabel('RGI region (without RGI region 19)', fontsize=20);
ax.set_xlabel('MB calibration type or other methods', fontsize=20)
plt.subplot(122)
plt.title('difference between qc0 and qc3', fontsize=22)

sns.heatmap(pd_rel_error_area_reg_qc0.drop(columns=['match_geod_pergla','match_geod_pergla\nmassredis']) - pd_rel_error_area_reg_qc3,
            linewidths=.5, cmap='coolwarm', 
            cbar_kws={'label':'relative failing glacier area difference (in %)\n (qc0-qc3)'},
           annot=True,center=0, fmt=".2f")
ax = plt.gca()
ax.figure.axes[-1].yaxis.label.set_size(18)
ax.set_ylabel('');
ax.set_xlabel('MB calibration type', fontsize=20)

plt.tight_layout()


# - `no_match` and `match_geod` have almost the same amount of relative failing glacier area
# - with `qc0`, most relative glacier area fails in:
#     - RGI region 5, 16, 17, 10 (descending order) with more MB calibration errors if using the no_match or match_geod options!
#     - RGI region 12 mostly from missing outlines of nominal glaciers
#     - RGI region 13 if using the match_geod_pergla method (no mu* could be found even when correcting the climate, which happens in that case for both qc0 and qc3!)
# - differences between qc0 and qc3:
#     - only in RGI region 5 (Greenland) more glaciers fail when using qc3, but RGI region 5 is the RGI region with the largest glacier area!!!
#     - for all other RGI regions, more glaciers fail with qc0 (specifically for RGI region 16 & 10, where there are more MB calibration errors for qc0 than for qc3)

# ### Let's repeat this with ERA5

# In[63]:


clim = 'ERA5'
pcp = '1.6'
pd_rel_error_area_reg_clim = pd_rel_error_area_reg.loc[pd_rel_error_area_reg.clim_pcp == f'{clim}_pcp{pcp}']

pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_clim[pd_rel_error_area_reg_clim.qc == 'qc0']

pd_rel_error_area_reg_qc0.index = pd_rel_error_area_reg_qc0['rgi_reg']
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0[['rel_error_area_in_percent','match']]
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.pivot(columns=['match'])
pd_rel_error_area_reg_qc0.columns = pd_rel_error_area_reg_qc0.columns.droplevel(0)
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.astype("float32")
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0[['match_geod_pergla', 'match_geod_pergla_massredis']]
pd_rel_error_area_reg_qc0= pd_rel_error_area_reg_qc0.rename(columns={'match_geod_pergla_massredis':
                                                                     'match_geod_pergla\nmassredis'})
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_clim[pd_rel_error_area_reg_clim.qc == 'qc3']
pd_rel_error_area_reg_qc3.index = pd_rel_error_area_reg_qc3['rgi_reg']
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3[['rel_error_area_in_percent','match']]
pd_rel_error_area_reg_qc3 = pd_rel_error_area_reg_qc3.pivot(columns=['match'])
pd_rel_error_area_reg_qc3.columns = pd_rel_error_area_reg_qc3.columns.droplevel(0)

pd_rel_error_area_reg_best = pd.concat([pd_rel_error_area_reg_qc3, pd_rel_error_area_reg_qc0], axis=1)
pd_rel_error_area_reg_best = pd_rel_error_area_reg_best.rename(columns= {'no_match':'no_match\n(qc3)', 'match_geod' : 'match_geod\n(qc3)',
                                                               'match_geod_pergla': 'match_geod_pergla\n(qc0)',
                                                               'match_geod_pergla\nmassredis': 'match_geod_pergla\nmassredis (qc0)'})
pd_rel_error_area_reg_best = pd_rel_error_area_reg_best.astype("float32")

plt.figure(figsize=(14,10))
plt.suptitle(f'Relative failing glacier area per RGI region (using {clim})', fontsize=24)
#plt.subplot(121)
#plt.title('using no climate check and correction (qc0)', fontsize=22)
sns.heatmap(pd_rel_error_area_reg_best,
            linewidths=.5, cmap="PuRd", cbar_kws={'label':'relative failing glacier area (in %)'},
           annot=True, fmt=".2f")
ax = plt.gca()
ax.figure.axes[-1].yaxis.label.set_size(18)
ax.set_ylabel('RGI region (without RGI region 19)', fontsize=20);
ax.set_xlabel('MB calibration type or other methods', fontsize=20)


# - ok, with ERA5, there are also some regions where we have more errors with match_geod_pergla and match_geod_pergla_massredis than with no_match (e.g. RGI 13,14), but it is never more than 0.03% of more failing glacier area. However, match_geod and no_match are qc3, while match_geod_pergla is qc0. 
#     - in RGI region 17, there is only for match_geod_pergla more failing glacier area and not for match_geod_pergla massredis (it is a flowline_model_run_historical error from level 5 and we would need a larger domain, e.g. border = 160 for that)!!!
# - so maybe, at least for ERA5, it is not a big issue that the precipitation factor is too low to allow positive MB!!!

# In[ ]:


# **Let's also estimate the regional missing area for the W5E5 preprocessed gdirs:**

# In[64]:


for match, pcp in zip(['match_geod_pergla_winterprcp', 'match_geod_pergla_2.6'],['via_winterprcp', '2.6']):
    for rgi_reg in np.arange(0,21,1):
        if 0<rgi_reg <10:
            rgi_reg = f'0{rgi_reg}'
        elif rgi_reg == 0:
            rgi_reg = 'all_without_19'
        elif rgi_reg == 20:
            rgi_reg = 'all'
        else:
            rgi_reg = str(rgi_reg)
        dfserr_reg, pd_rel_error_area_reg = error_analysis_w_plot(dfserr=dfserr_reg, pd_rel_error_area=pd_rel_error_area_reg, rename_match=True,
                  level='L5', exp='elev_bands', pcp = pcp,
                      match =match, clim='GSWP3_W5E5', 
                      qc = 'qc0', border = 160,
                      rgi_reg = rgi_reg, 
                      path = '/home/www/fmaussion/runs/new_gdirs/oggm_v1.6/', subplot = False, xlim=None, plot=False)


# In[65]:


pd_rel_error_area_reg


# In[83]:


# In[79]:


_g_stats_W5E5_cte['m_star_glacierwide'].dropna()


# In[104]:


### get the prcp. factor distributions for every RGI region 

pd_glac_stats_all_d = {}
pd_clim_stats_all_d = {}
for clim_p in ['CRU_pcp2.5', 'ERA5_pcp1.6', 'W5E5_pcp2.6', 'W5E5_pcp_via\nwinterprcp']:
    _g_stats_l = []
    _c_stats_l = []
    if clim_p == 'CRU_pcp2.5':
        rgi_regs = np.arange(1,19,1)
    else:
        rgi_regs = np.arange(1,20,1)
    for rgi_reg in rgi_regs:
        if 0<rgi_reg <10:
            rgi_reg = f'0{rgi_reg}'
        else:
            rgi_reg = str(rgi_reg)
        if clim_p == 'W5E5_pcp_via\nwinterprcp':
            _g_stats = pd.read_csv(f'/home/www/fmaussion/runs/new_gdirs/oggm_v1.6/L3-L5_files/GSWP3_W5E5/elev_bands/qc0/match_geod_pergla_winterprcp/RGI62/b_160/L5/summary/glacier_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]
            _c_stats = pd.read_csv(f'/home/www/fmaussion/runs/new_gdirs/oggm_v1.6/L3-L5_files/GSWP3_W5E5/elev_bands/qc0/match_geod_pergla_winterprcp/RGI62/b_160/L5/summary/climate_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]

        elif clim_p == 'W5E5_pcp2.6':
            _g_stats = pd.read_csv(f'/home/www/fmaussion/runs/new_gdirs/oggm_v1.6/L3-L5_files/GSWP3_W5E5/elev_bands/qc0/match_geod_pergla_2.6/RGI62/b_160/L5/summary/glacier_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]
            _c_stats = pd.read_csv(f'/home/www/fmaussion/runs/new_gdirs/oggm_v1.6/L3-L5_files/GSWP3_W5E5/elev_bands/qc0/match_geod_pergla_2.6/RGI62/b_160/L5/summary/climate_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]

        elif clim_p == 'CRU_pcp2.5':
            _g_stats = pd.read_csv(f'/home/www/oggm/gdirs/oggm_v1.6/L3-L5_files/CRU/elev_bands/qc0/pcp2.5/match_geod_pergla/RGI62/b_080/L5/summary/glacier_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]
            _c_stats = pd.read_csv(f'/home/www/oggm/gdirs/oggm_v1.6/L3-L5_files/CRU/elev_bands/qc0/pcp2.5/match_geod_pergla/RGI62/b_080/L5/summary/climate_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]

        elif clim_p == 'ERA5_pcp1.6':
            _g_stats = pd.read_csv(f'/home/www/oggm/gdirs/oggm_v1.6/L3-L5_files/ERA5/elev_bands/qc0/pcp1.6/match_geod_pergla/RGI62/b_080/L5/summary/glacier_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]
            _c_stats = pd.read_csv(f'/home/www/oggm/gdirs/oggm_v1.6/L3-L5_files/ERA5/elev_bands/qc0/pcp1.6/match_geod_pergla/RGI62/b_080/L5/summary/climate_statistics_{rgi_reg}.csv')#[['glacier_prcp_scaling_factor']]


        _g_stats_l.append(_g_stats)
        _c_stats_l.append(_c_stats)


    pd_glac_stats_all = pd.concat(_g_stats_l)
    pd_clim_stats_all = pd.concat(_c_stats_l)

    pd_glac_stats_all.index = pd_glac_stats_all.rgi_id
    pd_clim_stats_all.index = pd_clim_stats_all.rgi_id

    pd_glac_stats_all_d[clim_p] = pd_glac_stats_all.loc[pd_glac_stats_all['mu_star_glacierwide'].dropna().index]
    pd_clim_stats_all_d[clim_p] = pd_clim_stats_all.loc[pd_glac_stats_all['mu_star_glacierwide'].dropna().index]


### get winter prcp. (back from prcp. fac. )
pd_glac_stats_all = pd_glac_stats_all_d['W5E5_pcp_via\nwinterprcp']
prcp_fac_dist = pd_glac_stats_all['glacier_prcp_scaling_factor']#####.values
winter_prcp_factor_ab = -1.0614, 3.9200
winter_prcp_factor_range = 0.1, 10
a, b = winter_prcp_factor_ab  # cfg.PARAMS['winter_prcp_factor_ab']
r0, r1 = winter_prcp_factor_range  # cfg.PARAMS['winter_prcp_factor_range']
assert np.all(prcp_fac_dist < r1)
assert np.all(prcp_fac_dist > r0)
w_prcp = np.exp((prcp_fac_dist - b)/a)
pd_glac_stats_all.loc[w_prcp.index, 'winter_prcp'] = w_prcp.values


pd_glac_stats_all['glacier_prcp_scaling_factor'].quantile([0,0.5, 1])


#pd_rel_error_area_reg_qc0


# In[67]:


pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg[pd_rel_error_area_reg.qc == 'qc0']
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.loc[(pd_rel_error_area_reg_qc0.border == 160) | (pd_rel_error_area_reg_qc0.clim_pcp == 'CRU_pcp2.5')|(pd_rel_error_area_reg_qc0.clim_pcp == 'ERA5_pcp1.6')] # only have border 80 for CRU ... 
pd_rel_error_area_reg_qc0.index = pd_rel_error_area_reg_qc0['rgi_reg']
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.loc[pd_rel_error_area_reg_qc0.match == 'match_geod_pergla']
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0[['rel_error_area_in_percent','clim_pcp']]
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.pivot(columns=['clim_pcp'])
pd_rel_error_area_reg_qc0.columns = pd_rel_error_area_reg_qc0.columns.droplevel(0)
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.astype("float32")
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.rename({'all_without_19':'all_no19'})


# #### Comparison of relative failing glacier area for different climate datasets & W5E5 variable prcp. factor distributions

# In[68]:


sns.set(font_scale = 1.3)
plt.figure(figsize=(16,10))
plt.subplot(111)
plt.title('Relative failing glacier area per RGI region\n(match_geod_pergla, qc0)', fontsize=20)
#plt.title(', fontsize=22)
pd_rel_error_area_reg_qc0 = pd_rel_error_area_reg_qc0.round(2)
sns.heatmap(pd_rel_error_area_reg_qc0,
            linewidths=.5, cmap="PuRd", cbar_kws={'label':'relative failing glacier area (in %)'},
           annot=True, fmt=".2f")
ax = plt.gca()
ax.figure.axes[-1].yaxis.label.set_size(18)
ax.set_ylabel('RGI region', fontsize=20);
ax.set_xticklabels(['CRU_pcp2.5\n(b80)', 'ERA5_pcp1.6\n(b80)', 'W5E5_pcp2.6\n(b160)', 'W5E5_pcp_via\nwinter_prcp (b160)'])
ax.set_xlabel('climate dataset & prcp. fac', fontsize=20)
plt.savefig('relative_failing_glacier_area_regional_match_geod_pergla_clim_pcp.png')

plt.figure(figsize=(24,10))
plt.subplot(122)
ax = plt.gca()
sns.boxplot(x = 'rgi_region', y='glacier_prcp_scaling_factor', data = pd_glac_stats_all, ax = ax)
plt.axhline(2.6, ls= '--', lw = 3, color = 'grey', label = 'applied constant prcp. fac\n(median from reference glaciers matching std.)')
plt.axhline(pd_glac_stats_all['glacier_prcp_scaling_factor'].quantile([0.5]).values,
            ls= ':', lw = 3, color = 'gray', label = 'median prcp. fac over all glaciers')
plt.legend()
plt.title('glacier-specific varying prcp. fac depending on the winter prcp. (using W5E5)', fontsize=20)

plt.subplot(121)
ax = plt.gca()
sns.boxplot(x = 'rgi_region', y='winter_prcp', data = pd_glac_stats_all, ax = ax)

plt.legend()
plt.ylabel(r'winter daily precipitation (kg m$^{-2}$, avg. 1979-2019)')
plt.title('(uncorrected) winter prcp. with W5E5', fontsize=20)
plt.tight_layout()
plt.savefig('W5E5_prcp_fac_via_winterpcp_boxplots.png')


# - When using a constant prcp. fac of 2.6 with W5E5, we have in most RGI regions, the same or more missing glacier area compared to CRU or ERA5
#     - a larger failing glacier area occurs specifically for RGI region 13 & 17, and to a smaller extent RGI region 16
#     - a slighlty smaller failing glacier area occurs for the largest RGI region 5 & for RGI region 19
# - When using W5E5 together with a glacier-specific winter prcp. dependent prcp. fac, we have for all RGI regions the same or much less failing glacier area compared to the median prcp. fac W5E5 gdirs.
#     - this effect is strongly visible for RGI regions 13 & 17: 
#         - With W5E5 using a variable prcp. fac there is for RGI region 13 less failing glacier area compared to CRU or constant prcp. fac W5E5, but still more than when using ERA5. 
#         - Glaciers in RGI13 have rather low amount of precipitation. When adding more prcp. to it by using a larger prcp. fac of ~5, less glaciers seem to fail. Probably in ERA5 more prcp. happens for RGI region 13? (similar as for CRU, mostly the glaciers that have positive observed MB can not be calibrated). 
# 
# -> **When comparing all datasets, the climate which results in the least amount of failing glacier area is still ERA5, but W5E5 that uses variable winter prcp. is similar (only slightly worse in RGI region 13 & 17).**

# In[ ]:


# ##### Comparison of ref_hgt_calib_diff
# 

# In[ ]:


plt.figure(figsize=(24,10))
plt.subplot(122)
ax = plt.gca()
sns.boxplot(x = 'rgi_region', y='ref_hgt_calib_diff', data = pd_glac_stats_all, ax = ax)
#plt.axhline(2.6, ls= '--', lw = 3, color = 'grey', label = '')
#plt.axhline(pd_glac_stats_all['glacier_prcp_scaling_factor'].quantile([0.5]).values,
#            ls= ':', lw = 3, color = 'gray', label = 'median prcp. fac over all glaciers')
#plt.legend()
#plt.title('glacier-specific varying prcp. fac depending on the winter prcp. (using W5E5)', fontsize=20)


# #### Comparison of precipitation between the datasets

# In[222]:


pd_glac_running = []
for clim_p in ['CRU_pcp2.5', 'ERA5_pcp1.6', 'W5E5_pcp2.6', 'W5E5_pcp_via\nwinterprcp']:
    pd_glac_running.append(pd_clim_stats_all_d[clim_p].index)
pd_glac_running_all= list(set(pd_glac_running[0]).intersection(pd_glac_running[1]).intersection(pd_glac_running[2]).intersection(pd_glac_running[3]))
_pd_prcp_l = []
for clim_p in ['CRU_pcp2.5', 'ERA5_pcp1.6', 'W5E5_pcp2.6', 'W5E5_pcp_via\nwinterprcp']:

    _pd_prcp = pd.DataFrame()
    _pd_prcp['rgi_id'] = pd_glac_running_all
    _pd_prcp['1980-2010_avg_prcp'] = pd_clim_stats_all_d[clim_p].loc[pd_glac_running_all]['1980-2010_avg_prcp'].values
    _pd_prcp['1980-2010_avg_prcpsol_mean_elev'] = pd_clim_stats_all_d[clim_p].loc[pd_glac_running_all]['1980-2010_avg_prcpsol_mean_elev'].values
    _pd_prcp['ref_hgt_calib_diff'] = pd_glac_stats_all_d[clim_p].loc[pd_glac_running_all]['ref_hgt_calib_diff'].values
    _pd_prcp['rgi_region'] =  pd_glac_stats_all_d[clim_p].loc[pd_glac_running_all]['rgi_region'].values
    # replace it by 0 for the statistics...
    _pd_prcp.loc[np.isnan(_pd_prcp['ref_hgt_calib_diff']), 'ref_hgt_calib_diff'] = 0
    _pd_prcp['clim_pcp'] = clim_p
    _pd_prcp['prcp_factor'] = 'corrected_prcp'

    _pd_prcp_l.append(_pd_prcp)
    
    if 'winterprcp' in clim_p:
        pcp = pd_glac_stats_all_d[clim_p].loc[pd_glac_running_all]['glacier_prcp_scaling_factor'].values
    else:
        pcp = float(clim_p[-3:])
    _pd_prcp_no_prcp_fac = pd.DataFrame()
    _pd_prcp_no_prcp_fac['rgi_id'] = pd_glac_running_all
    _pd_prcp_no_prcp_fac['1980-2010_avg_prcp'] = pd_clim_stats_all_d[clim_p].loc[pd_glac_running_all]['1980-2010_avg_prcp'].values/pcp
    _pd_prcp_no_prcp_fac['1980-2010_avg_prcpsol_mean_elev'] = pd_clim_stats_all_d[clim_p].loc[pd_glac_running_all]['1980-2010_avg_prcpsol_mean_elev'].values/pcp
    
    # just to have a nicer table, these things are the same
    _pd_prcp_no_prcp_fac['ref_hgt_calib_diff'] = pd_glac_stats_all_d[clim_p].loc[pd_glac_running_all]['ref_hgt_calib_diff'].values
    _pd_prcp_no_prcp_fac['rgi_region'] =  pd_glac_stats_all_d[clim_p].loc[pd_glac_running_all]['rgi_region'].values
    # replace it by 0 for the statistics...
    _pd_prcp_no_prcp_fac.loc[np.isnan(_pd_prcp_no_prcp_fac['ref_hgt_calib_diff']), 'ref_hgt_calib_diff'] = 0  
    _pd_prcp_no_prcp_fac['clim_pcp'] = clim_p

    _pd_prcp_no_prcp_fac['prcp_factor'] = 'uncorrected_prcp'

    _pd_prcp_l.append(_pd_prcp_no_prcp_fac)

pd_prcp = pd.concat(_pd_prcp_l)
pd_prcp = pd_prcp.reset_index()


# In[223]:


pd_prcp


# In[166]:


#sns.kdeplot(x='1980-2010_avg_prcp', hue='clim_pcp', data = pd_prcp, log_scale=True) #[['clim_pcp', '1980-2010_avg_prcp']])


# In[183]:


fig, axs = plt.subplots(1,2, figsize=(28,8))
sns.kdeplot(x='1980-2010_avg_prcp', hue='clim_pcp', data = pd_prcp, cut = True, lw = 2, ax = axs[0]) #[['clim_pcp', '1980-2010_avg_prcp']])
sns.kdeplot(x='1980-2010_avg_prcpsol_mean_elev', hue='clim_pcp', data = pd_prcp, cut = True, ax = axs[1], lw=2) #[['clim_pcp', '1980-2010_avg_prcp']])


# In[237]:


hue_order = ['uncorrected_prcp', 'corrected_prcp']
fig, axs = plt.subplots(2,2, figsize=(30,30))
sns.violinplot(x='1980-2010_avg_prcp', y='clim_pcp', data = pd_prcp, cut = 0, inner='quartile', lw = 2, split = True, hue='prcp_factor',hue_order =hue_order,
              ax = axs[0][0]) #[['clim_pcp', '1980-2010_avg_prcp']])
axs[0][0].set_xlim([-100,3300])
axs[0][0].set_title('zoom')
sns.violinplot(x='1980-2010_avg_prcp', y='clim_pcp', data = pd_prcp, cut = 0, inner='quartile', lw = 2, split = True, hue='prcp_factor',hue_order =hue_order,
              ax = axs[0][1]) #[['clim_pcp', '1980-2010_avg_prcp']])
axs[0][1].set_xlim([-100,29000])

sns.violinplot(x='1980-2010_avg_prcpsol_mean_elev', y='clim_pcp', data = pd_prcp, cut = 0, inner = 'quartile', split = True, hue='prcp_factor',hue_order =hue_order,
              ax = axs[1][0]) #[['clim_pcp', '1980-2010_avg_prcp']])
axs[1][0].set_xlim([-20,2300])
sns.violinplot(x='1980-2010_avg_prcpsol_mean_elev', y='clim_pcp', data = pd_prcp, cut = 0, inner = 'quartile', split = True, hue='prcp_factor',hue_order =hue_order,
              ax = axs[1][1]) #[['clim_pcp', '1980-2010_avg_prcp']])
axs[1][1].set_xlim([-20,17700])
plt.tight_layout()
plt.savefig('global_glacier_prcp_distributions.png')


# In[229]:


hue_order = ['uncorrected_prcp', 'corrected_prcp']
fig, axs = plt.subplots(2,1, figsize=(28,24))
sns.violinplot(x='1980-2010_avg_prcp', y='clim_pcp', data = pd_prcp, cut = True, inner='quartile', lw = 2, split = True, hue='prcp_factor',hue_order =hue_order,
              ax = axs[0]) #[['clim_pcp', '1980-2010_avg_prcp']])
axs[0].set_xlim([-100,4000])
sns.violinplot(x='1980-2010_avg_prcpsol_mean_elev', y='clim_pcp', data = pd_prcp, cut = True, inner = 'quartile', split = True, hue='prcp_factor',hue_order =hue_order,
              ax = axs[1]) #[['clim_pcp', '1980-2010_avg_prcp']])
axs[1].set_xlim([-100,3000])


# In[ ]:


sns.catplot(x='ref_hgt_calib_diff', y='clim_pcp', data = pd_prcp, #cut = True, lw = 2,
            kind='boxen',  aspect=2) #[['clim_pcp', '1980-2010_avg_prcp']])


# In[250]:


pd_prcp['applied temp. bias\nref_hgt_calib_diff * -0.0065'] = pd_prcp['ref_hgt_calib_diff']*-0.0065 


# In[258]:


pd_prcp.groupby('clim_pcp').quantile([0.025, 0.975])['applied temp. bias\nref_hgt_calib_diff * -0.0065']


# In[254]:


pd_prcp.groupby('clim_pcp').mean()['applied temp. bias\nref_hgt_calib_diff * -0.0065']


# In[275]:


xr.open_dataset('/home/users/lschuster/www_lschuster/isimip3b/isimip3b_tasAdjust_std_monthly/ukesm1-0-ll_r1i1p1f2_w5e5_ssp585_tasAdjust_std_global_monthly_2015_2100.nc')


# In[276]:


plt.subplots(1,1,figsize=(15,8))
ax = plt.gca()
sns.boxenplot(x='applied temp. bias\nref_hgt_calib_diff * -0.0065', y='clim_pcp',
               data = pd_prcp,
              saturation=0.6,
              ax=ax) 
sns.boxplot(x='applied temp. bias\nref_hgt_calib_diff * -0.0065', y='clim_pcp',
               data = pd_prcp,
            #kind='boxen',  
            whis = [5,95], 
                    linewidth=4,
                    saturation=0.8,
                    capprops={'color':'grey', 'alpha':0.8},
                    whiskerprops={'color':'grey', 'alpha':0.8},
            fliersize=0, color = 'grey',
              ax=ax)
plt.title('90% intervals in grey')


# In[ ]:


uncorrected:
ERA5 has the most  and W5E5 the least amount of prcp. over glacier regions
ERA5 has the most and W5E5 & CRU similar lower amount of solid prcp.
correcting with constant factor:
balances out most "global" differences for prcp. between CRU, ERA5, W5E5,
but differences remain in the corrected solid prcp. amount: even with larger prcp. correction factors, ERA5 has still considerably larger solid prcp. amounts.
correcting with variable prcp. fac (and mostly larger prcp. fac.) for W5E5:
results for most glaciers in even more total and solid prcp. than with ERA5_pcp1.6 (and CRU). But, for some glaciers ERA5 gives still more prcp.


# In[240]:


fig, axs= plt.subplot(1,1,figsize=(15,8))
sns.violinplot(x='ref_hgt_calib_diff', y='clim_pcp', data = pd_prcp, cut=0,#cut = True, lw = 2,
            kind='boxen',  axs) #[['clim_pcp', '1980-2010_avg_prcp']])
plt.gca().set_xlabel('ref_hgt_calib_diff\n reference - uncorrected height')
# if positive: the height is now higher, hence it is acting like reducing the temperature !!!


# In[ ]:


sns.catplot(x='ref_hgt_calib_diff', y='clim_pcp', data = pd_prcp, #cut = True, lw = 2,
            col='rgi_region', kind='boxen', col_wrap=4, aspect=2) #[['clim_pcp', '1980-2010_avg_prcp']])


# In[283]:


pd_prcp


# In[286]:


fig,axs=plt.subplots(5,4,figsize=(30,25))
#ax = plt.gca()
sns.catplot(x='applied temp. bias\nref_hgt_calib_diff * -0.0065', y='clim_pcp',
               data = pd_prcp, kind='boxen', col_wrap=4, aspect=2,col='rgi_region',
              saturation=0.6 #,ax=axs,
              ) 
sns.catplot(x='applied temp. bias\nref_hgt_calib_diff * -0.0065', y='clim_pcp',
               data = pd_prcp,kind='box', col_wrap=4, aspect=2,col='rgi_region',
            #kind='boxen',  
            whis = [5,95], 
                    linewidth=4,
                    saturation=0.8,
                    capprops={'color':'grey', 'alpha':0.8},
                    whiskerprops={'color':'grey', 'alpha':0.8},
            fliersize=0, color = 'grey',ax=axs,
              )
plt.suptitle('90% intervals in grey')


# In[280]:


sns.catplot(x='ref_hgt_calib_diff', y='clim_pcp', data = pd_prcp, #cut = True, lw = 2,
            col='rgi_region', kind='boxen', col_wrap=4, aspect=2) #[['clim_pcp', '1980-2010_avg_prcp']])


# In[203]:


sns.catplot(x='ref_hgt_calib_diff', y='clim_pcp', data = pd_prcp, #cut = True, lw = 2,
            col='rgi_region', kind='violin', col_wrap=4, aspect=2) #[['clim_pcp', '1980-2010_avg_prcp']])


# In[101]:


# In[ ]:


# In[ ]:


# In[ ]:


pd_clim_stats_all_d['CRU_pcp2.5'][]
pd_glac_stats_all_d['CRU_pcp2.5'].columns
pd_glac_stats_all['ref_hgt_calib_diff'].dropna()


# In[ ]:


# #### Some more regional checks, just to see the type of the error:

# **RGI region 13 check**
# 
# - What is so specific in RGI region 13 that **match_geod_pergla** works less good using CRU specifically (more than twice as much errors and 3\*times as much area, see below) than when using the direct reference glacier calibration data (no_match)?
#     - The problem is actually that precipitation is so low for these glaciers, that it can not reproduce the observed geodetic **positive** specific mass balance.

# In[183]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_pergla_qc0_b80_rgi_13'].head()


# In[284]:


plt.hist(pd_geodetic.loc[dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_pergla_qc0_b80_rgi_13'].index].dmdtda,
        label='CRU_pcp2.5')
plt.hist(pd_geodetic.loc[dfserr_reg['L5_elev_bands_2.6_GSWP3_W5E5_match_geod_pergla_2.6_qc0_b160_rgi_13'].index].dmdtda, alpha = 0.5, label = 'GSWP3_W5E5_constant_prcp_fac')
plt.hist(pd_geodetic.loc[dfserr_reg['L5_elev_bands_via_winterprcp_GSWP3_W5E5_match_geod_pergla_winterprcp_qc0_b160_rgi_13'].index].dmdtda,
         alpha = 0.5, label = 'GSWP3_W5E5 variable (larger) prcp. fac')
plt.legend()
plt.xlabel('dmdtda')


# - all failing glaciers have a positive mass balance. This is the reason that glaciers are failing. 
#     - maybe a larger precipitation factor for CRU would be necessary in this region specifically !!!
#     - If we use instead ERA5 (see below), much less glaciers fail in this region!
#     - Using a variable (and mostly) larger prcp. fac results in much less failing glaciers with W5E5. Then only those glaciers with larger positive dmdtda from the last 20 years are still failing

# In[285]:


print('with match_geod_pergla:')
print(f"amount of glaciers failing in RGI 13: {len(dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_pergla_qc0_b80_rgi_13'])}")
print(f"failing glacier area in RGI 13:  {dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_pergla_qc0_b80_rgi_13'].rgi_area_km2.sum():0.1f} km2")


# In[286]:


# what are the glaciers failing for no match???
dfserr_reg['L5_elev_bands_pcp2.5_CRU_no_match_qc0_b80_rgi_13'].head()


# In[287]:


print('with no_match:')
print(f"amount of glaciers failing in RGI 13: {len(dfserr_reg['L5_elev_bands_pcp2.5_CRU_no_match_qc0_b80_rgi_13'])}")
print(f"failing glacier area in RGI 13:  {dfserr_reg['L5_elev_bands_pcp2.5_CRU_no_match_qc0_b80_rgi_13'].rgi_area_km2.sum():0.1f} km2")


# Let's get a list of only those that fail for match_geod_pergla, but not for no_match!

# In[288]:


rgi_fail_no_match = dfserr_reg['L5_elev_bands_pcp2.5_CRU_no_match_qc0_b80_rgi_13'].index
rgi_fail_match_geod_pergla = dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_pergla_qc0_b80_rgi_13'].index.values
rgi_fail_both = rgi_fail_no_match & rgi_fail_match_geod_pergla
pd_failing_rgi13_pergla = dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_pergla_qc0_b80_rgi_13']


# In[289]:


pd_failing_rgi13_pergla.drop(rgi_fail_both)


# It is not just one glacier that does not work with the match_geod_pergla method, but rather a lot of different mid-size glaciers. 

# When using a varying prcp. fac, there are much less errors in region 13. We will have to check the prcp. fac distribution 

# In[ ]:


# **RGI region 5 check**

# In[290]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc0_b80_rgi_05'].head()


# In[291]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc3_b160_rgi_05'].head()


# In[292]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc3_b160_rgi_05']['error_msg'].iloc[0]


# In[293]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc0_b80_rgi_05'].rgi_area_km2.sum()


# In[294]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc3_b160_rgi_05'].rgi_area_km2.sum()


# - around 20% more glacier area fails with qc3 in RGI reg 5 (with match_geod and CRU), because of bigger glaciers with MassBalanceCalibrationErrors!!!.

# **RGI region 16 check**

# In[295]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc0_b80_rgi_16'].head()


# In[296]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc0_b80_rgi_16'].rgi_area_km2.sum()


# In[297]:


dfserr_reg['L5_elev_bands_pcp2.5_CRU_match_geod_qc3_b160_rgi_16'].rgi_area_km2.sum()


# - there is around 50% more failing glacier area for qc0 than for qc3 (with match_geod and CRU)

# In[ ]:


# #####  RGI region 17 check

# In[200]:


dfserr_reg['L5_elev_bands_2.6_GSWP3_W5E5_match_geod_pergla_2.6_qc0_b160_rgi_17'].head()


# ## A closer look to RGI region 19: we only look here at ERA5 & qc3

# In[38]:


sns.set_style('whitegrid')

exp = 'elev_bands'
qc = 'qc3'
pcp = 'pcp1.6'
clim = 'ERA5'
# when we compare between ERA5 and CRU, we have to omit region 19 because CRU has no climate date for RGI region 19!
rgi_reg = '19'
missing = []
# different match options only available for elev_bands
dfserr_rgi19 = {}
pd_rel_error_area_rgi19 = pd.DataFrame(columns=['rel_error_area_in_percent', 'level', 'exp',
                                              'clim_pcp', 'match', 'qc', 'border', 'rgi_reg'])

plt.figure(figsize=(16,18))
n = 1
for match in ['no_match', 'match_geod', 'match_geod_pergla', 'match_geod_pergla_massredis']:  # 
    for exp in ['elev_bands', 'centerlines']:
        plt.subplot(4,2,n)
        n += 1
        try:
            dfserr, pd_rel_error_area = error_analysis_w_plot(dfserr=dfserr_rgi19, pd_rel_error_area=pd_rel_error_area_rgi19,
                                                              level=level,exp=exp, pcp =pcp, clim=clim, 
                                                              qc=qc,border=border, match=match,
                                                              rgi_reg = rgi_reg,
                                                              subplot=True, xlim=300)
        except:
            missing.append(f'{level}_{exp}_{pcp}_{clim}_{match}_{qc}')
            plt.title(f'missing: {level}_{exp}_{pcp}_{clim}_{match}_{qc}')
plt.tight_layout()


# <a id='id-rgi-19'></a>

# in RGI 19 with ERA5 and qc3:
# - most errors are coming from preprocessing level 2 or lower (e.g. (simple_)glacier_masks or elevation_band_flowline).
# - similar as for the other regions, more errors occur for 'no_match' or 'match_geod' than for 'match_geod_pergla'. 
# - However, 'match_geod_pergla_massredis' (~1% relative failing glacier area), has twice as large failing glacier area than 'no_match'

# In[ ]: