#!/usr/bin/env python
# coding: utf-8

# In[48]:

import os

import pandas as pd
import geopandas as gpd
import xarray as xr
import rioxarray as rxr
from osgeo import gdal

os.chdir(r'C:/Users/jtrum/world_bank/data/')

# In[39]:

datadir = 'C:/Users/jtrum/world_bank/data/'

pluvial = ['P_1in5', 'P_1in10', 'P_1in20', 'P_1in50', 'P_1in75',
           'P_1in100', 'P_1in200', 'P_1in250', 'P_1in500', 'P_1in1000']
fluvial_undefined = ['FU_1in5', 'FU_1in10', 'FU_1in20', 'FU_1in50', 'FU_1in75',
                     'FU_1in100', 'FU_1in200', 'FU_1in250', 'FU_1in500', 'FU_1in1000']

# Load the 20 flood-depth rasters into a dictionary of xarray Datasets
data_dict = {}
for i in pluvial:
    data_dict[i] = xr.open_dataset(datadir + f'AngolaFathom/Angola/pluvial/{i}.tif')
for i in fluvial_undefined:
    data_dict[i] = xr.open_dataset(datadir + f'AngolaFathom/Angola/fluvial_undefended/{i}.tif')

# Crop the 20 rasters to the extent of the catchment basin
catchment = gpd.read_file(datadir + 'catchment.geojson')
clip = catchment.geometry

# Set the CRS of the rasters to equal the CRS of the catchment,
# then print each raster's CRS followed by the catchment's CRS to confirm they match
for i in data_dict:
    data_dict[i] = data_dict[i].rio.write_crs(catchment.crs)
    print(data_dict[i].rio.crs)
print(catchment.crs)

# Clip the rasters to the catchment extent
for i in data_dict:
    data_dict[i] = data_dict[i].rio.clip(clip, from_disk=True)
    print(i, 'clipped')

# In[50]:

def binarize_band(raster_path, threshold=0.15):
    """Overwrite band 1 of the raster in place with a 0/1 flood mask."""
    raster_dataset = gdal.Open(raster_path, gdal.GA_Update)
    if raster_dataset is None:
        print(f"Unable to open {raster_path}")
        return
    band = raster_dataset.GetRasterBand(1)
    band_data = band.ReadAsArray()
    # Cells at or above the depth threshold (in metres) count as flooded,
    # matching the >= 0.15 rule used in the dataframe cells below
    binary_data = (band_data >= threshold).astype(int)
    band.WriteArray(binary_data)
    raster_dataset.FlushCache()
    raster_dataset = None
    print(f"Binarized {raster_path}")

# Define the threshold value (flood depth in metres)
threshold = 0.15

# Save each clipped raster to disk, then binarize it in place
for i in data_dict:
    clipped_raster_path = datadir + i + '_clipped.tif'
    # Write a GeoTIFF (not NetCDF) so gdal.Open can update the file in place
    data_dict[i]['band_data'].rio.to_raster(clipped_raster_path)
    binarize_band(clipped_raster_path, threshold)

# In[53]:

# Re-open the clipped, binarized rasters into a dictionary
clipped_dict = {}
for i in pluvial + fluvial_undefined:
    clipped_dict[i] = xr.open_dataset(datadir + f'{i}_clipped.tif')

# In[69]:

from scipy import stats

# Group the scenarios by return-period band
groups = [
    ['P_1in5', 'P_1in10'],
    ['P_1in20', 'P_1in50', 'P_1in75'],
    ['P_1in100', 'P_1in200', 'P_1in250', 'P_1in500', 'P_1in1000'],
    ['FU_1in5', 'FU_1in10'],
    ['FU_1in20', 'FU_1in50', 'FU_1in75'],
    ['FU_1in100', 'FU_1in200', 'FU_1in250', 'FU_1in500', 'FU_1in1000']
]

def compute_mode_value(rasters):
    # Stack the rasters along a new 'raster' dimension
    stacked_data = xr.concat([r['band_data'] for r in rasters], dim='raster')
    # Compute the mode along the 'raster' axis
    # (keepdims=False drops the reduced axis; requires scipy >= 1.9)
    mode_data = stats.mode(stacked_data.values, axis=0, nan_policy='omit', keepdims=False)
    # Rebuild a DataArray on the coordinates of a single group member,
    # since the mode collapses the 'raster' dimension
    template = stacked_data.isel(raster=0, drop=True)
    return xr.DataArray(mode_data.mode, dims=template.dims, coords=template.coords)

# Compute mode values for each group
mode_values_dict = {}
for group_index, group_names in enumerate(groups):
    group_rasters = [clipped_dict[name] for name in group_names]
    mode_values_dict[f'Group_{group_index + 1}'] = compute_mode_value(group_rasters)

# The mode values for each group can be accessed as
# mode_values_dict['Group_1'], mode_values_dict['Group_2'], etc.
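# A quick illustration (not part of the original analysis) of how
# scipy.stats.mode treats NaNs under nan_policy='omit', which is what
# compute_mode_value relies on: NaN cells are skipped per pixel rather
# than poisoning the vote.

# In[ ]:

import numpy as np

toy = np.array([[0., 1., np.nan],
                [0., 1., 1.],
                [1., 1., np.nan]])
# Column-wise modes: [0, 0, 1] -> 0; [1, 1, 1] -> 1; [nan, 1, nan] -> 1 (NaNs omitted)
print(stats.mode(toy, axis=0, nan_policy='omit', keepdims=False).mode)  # [0. 1. 1.]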
# In[75]:

# Plot the six groups post-binarization
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
import matplotlib.gridspec as gridspec

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, (name, mode_values) in zip(axes.ravel(), mode_values_dict.items()):
    mode_values.squeeze().plot(ax=ax, add_colorbar=False)
    ax.set_title(name)
plt.tight_layout()
plt.show()

# In[17]:

# Convert each raster to a dataframe, and expose each dataframe as its own
# variable so we don't have to call the dictionary every time
for i in data_dict:
    data_dict[i] = data_dict[i].to_dataframe()
    globals()[i] = data_dict[i]
    print(i, 'converted to dataframe')

# In[20]:

# Use the x and y coordinates to turn each dataframe into a geodataframe of
# points, rebinding the globals so they track the new objects
for i in data_dict:
    data_dict[i] = data_dict[i].reset_index()
    data_dict[i] = gpd.GeoDataFrame(data_dict[i],
                                    geometry=gpd.points_from_xy(data_dict[i].x, data_dict[i].y))
    globals()[i] = data_dict[i]
    print(i, 'converted to geodataframe')

# In[30]:

# Make a dictionary of the dataframes, keyed by scenario name
df_dict = {name: globals()[name] for name in pluvial + fluvial_undefined}

# In[32]:

# For each dataframe, flag cells whose 'band_data' value is >= 0.15 with a 1
# in a new column called 'threshold', and all else with a 0
for name in pluvial + fluvial_undefined:
    df = globals()[name]
    df['threshold'] = (df['band_data'] >= 0.15).astype(int)

# In[34]:

# Make 6 groupings of the dataframes: each group dataframe takes the band, x,
# and y columns from its first member, plus the threshold values of every
# member in a column named 'threshold_' + the member's name
group_defs = {
    'G1': ['P_1in5', 'P_1in10'],
    'G2': ['P_1in20', 'P_1in50', 'P_1in75'],
    'G3': ['P_1in100', 'P_1in200', 'P_1in250', 'P_1in500', 'P_1in1000'],
    'G4': ['FU_1in5', 'FU_1in10'],
    'G5': ['FU_1in20', 'FU_1in50', 'FU_1in75'],
    'G6': ['FU_1in100', 'FU_1in200', 'FU_1in250', 'FU_1in500', 'FU_1in1000'],
}
for gname, members in group_defs.items():
    g = pd.DataFrame(globals()[members[0]], columns=['band', 'x', 'y'])
    for m in members:
        g[f'threshold_{m}'] = globals()[m]['threshold']
    globals()[gname] = g

# In[37]:

# Take the mode of the threshold columns in each group and store it as that
# group's 'threshold' column
for gname, members in group_defs.items():
    g = globals()[gname]
    cols = [f'threshold_{m}' for m in members]
    g['threshold'] = g[cols].mode(axis=1)[0]
    print(f"{gname} threshold column created")
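# An aside, not in the original notebook: for 0/1 columns, a row-wise mean
# compared against 0.5 gives the same majority vote as mode(axis=1)[0] and is
# usually much faster on wide frames. The one behavioural difference is ties
# in the even-sized groups (G1, G4): pandas sorts tied modes ascending, so a
# 1-1 split resolves to 0 above, while the mean rule below resolves it to 1.
# The 'threshold_mean_vote' column name is a hypothetical choice.

# In[ ]:

for gname, members in group_defs.items():
    g = globals()[gname]
    cols = [f'threshold_{m}' for m in members]
    g['threshold_mean_vote'] = (g[cols].mean(axis=1) >= 0.5).astype(int)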
# In[69]:

# Re-expose the dictionary entries as their own variables
for i in data_dict:
    globals()[i] = data_dict[i]
    print(i, 'converted to variable')

# In[80]:

# Drop NA values, rebinding the globals so they track the filtered frames
for i in data_dict:
    data_dict[i] = data_dict[i].dropna()
    globals()[i] = data_dict[i]
    print(i, 'dropped na values')

len(P_1in5)

# In[63]:

# Import the mode function
from scipy.stats import mode

# Create grouping categories per return period per flood type,
# named by annual exceedance probability band (<1%, 1-10%, >10%)
P_under1 = ['P_1in1000', 'P_1in500', 'P_1in250', 'P_1in200', 'P_1in100']
P_1to10 = ['P_1in75', 'P_1in50', 'P_1in20']
P_over10 = ['P_1in10', 'P_1in5']
FU_under1 = ['FU_1in1000', 'FU_1in500', 'FU_1in250', 'FU_1in200', 'FU_1in100']
FU_1to10 = ['FU_1in75', 'FU_1in50', 'FU_1in20']
FU_over10 = ['FU_1in10', 'FU_1in5']

# Calculate the mode for each scenario
# (note: this assumes data_dict still holds the xarray Datasets loaded in
# In[39]; if the dataframe-conversion cells above have run, reload them first)
mode_dict = {}
for i in P_under1 + P_1to10 + P_over10 + FU_under1 + FU_1to10 + FU_over10:
    # Get the data array from the xarray Dataset
    data_array = data_dict[i].to_array()
    # Calculate the mode along the first axis
    mode_values = mode(data_array, axis=0, nan_policy='omit')
    # Store the mode values in the dictionary
    mode_dict[i] = mode_values.mode[0]
    print(i, 'mode calculated:', mode_values.mode[0])
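# The per-scenario mode is almost always 0 in sparse flood rasters, so a more
# informative summary (an illustration, not in the original) is the share of
# catchment cells flagged as flooded in each scenario's 'threshold' column:

# In[ ]:

summary = pd.DataFrame({
    'scenario': pluvial + fluvial_undefined,
    'pct_flooded': [globals()[n]['threshold'].mean() * 100
                    for n in pluvial + fluvial_undefined],
})
print(summary.sort_values('pct_flooded', ascending=False))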
# In[47]:

# Reproject the rasters to the same CRS as the catchment
from rasterio.enums import Resampling

for i in data_dict:
    data_dict[i] = data_dict[i].rio.reproject(catchment.crs, resampling=Resampling.nearest)
    print(i, 'reprojected')

# In[48]:

import geopandas as gpd
from scipy.stats import mode

# Set the data directory
datadir = 'C:/Users/jtrum/world_bank/data/'

# Load the catchment boundary
catchment = gpd.read_file(datadir + 'catchment.geojson')

# Define the probability groups
groups = {
    'low_prob': ['P_1in1000', 'P_1in500', 'P_1in250', 'P_1in200', 'P_1in100'],
    'medium_prob': ['P_1in75', 'P_1in50', 'P_1in20'],
    'high_prob': ['P_1in10', 'P_1in5']
}

# Majority-vote binary classification per group: binarize each raster at the
# 0.15 m depth threshold, sum the votes, and flag cells where at least half
# of the group's rasters are flooded
binary_classifications = {}
for group_name, group_rasters in groups.items():
    votes = None
    for raster_name in group_rasters:
        data = xr.open_dataarray(datadir + f'AngolaFathom/Angola/pluvial/{raster_name}.tif')
        vote = (data >= 0.15).astype(int)
        votes = vote if votes is None else votes + vote
    # Set values to 1 where at least half of the rasters in the group agree
    binary_classifications[group_name] = (votes >= len(group_rasters) / 2).astype(float)
    print(group_name, 'done')

# Now there are three binary classifications in the 'binary_classifications' dictionary

# In[49]:

# Change the CRS of low_prob, medium_prob, and high_prob to match the catchment
for i in binary_classifications:
    binary_classifications[i] = binary_classifications[i].rio.reproject(
        catchment.crs, resampling=Resampling.nearest)
    print(i, 'reprojected')

# In[50]:

# Crop the low_prob, medium_prob, and high_prob rasters to the catchment basin
clip = catchment.geometry
for i in binary_classifications:
    binary_classifications[i] = binary_classifications[i].rio.clip(clip, from_disk=True)
    print(i, 'clipped')

# In[32]:

# View the data as a dataframe
df = binary_classifications['low_prob'].to_dataframe()
df.head()
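# A quick sanity check (illustration only, not in the original): how many
# catchment cells each probability group flags as flooded after clipping.

# In[ ]:

for name, da in binary_classifications.items():
    flooded = int((da == 1).sum())
    total = int(da.notnull().sum())
    print(f'{name}: {flooded} of {total} cells flooded ({100 * flooded / total:.1f}%)')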
# In[25]:

import numpy as np
from scipy.stats import mode

# Create groups for probability thresholds (AEP = 1 / return period)
thresholds = {
    'low_prob': (0, 0.01),       # <=1% annual probability (1-in-100 and rarer)
    'medium_prob': (0.01, 0.1),  # 1-10% annual probability
    'high_prob': (0.1, 1)        # >=10% annual probability (1-in-10 and more frequent)
}

# Dictionary to collect the binarized members of each group
grouped_data = {key: [] for key in thresholds}

# Load, crop, and binarize the rasters, assigning each scenario to its
# probability group by the return period encoded in its name
for i in pluvial + fluvial_undefined:
    subdir = 'pluvial' if i in pluvial else 'fluvial_undefended'
    data = xr.open_dataset(datadir + f'AngolaFathom/Angola/{subdir}/{i}.tif')
    # Crop to the catchment bounding box (robust to the raster's y-axis orientation)
    data = data.rio.clip_box(*catchment.total_bounds)
    # Annual exceedance probability from the scenario name, e.g. 'P_1in100' -> 0.01
    aep = 1 / int(i.split('1in')[1])
    # Binarize the flood depths at the 0.15 m threshold
    binary = (data['band_data'] >= 0.15).astype(float)
    if aep <= 0.01:
        grouped_data['low_prob'].append(binary)
    elif aep < 0.1:
        grouped_data['medium_prob'].append(binary)
    else:
        grouped_data['high_prob'].append(binary)

# Create a binary classification per group based on majority values
binary_classification = {}
for group, members in grouped_data.items():
    # Stack the group's members and take the per-cell mode along the stack
    stacked = xr.concat(members, dim='raster')
    mode_result = mode(stacked.values, axis=0, nan_policy='omit', keepdims=False)
    # Rebuild a DataArray on one member's coordinates, since the mode
    # collapses the 'raster' dimension
    template = stacked.isel(raster=0, drop=True)
    binary_classification[group] = xr.DataArray(mode_result.mode,
                                                dims=template.dims,
                                                coords=template.coords)

# Now the `binary_classification` dictionary holds one majority-vote binary raster per group.
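# A possible final step (the output file names are assumptions, not from the
# original): persist each majority-vote raster as a GeoTIFF next to the inputs.

# In[ ]:

for group, da in binary_classification.items():
    out_path = datadir + f'{group}_majority.tif'  # hypothetical naming scheme
    da.rio.write_crs(catchment.crs).rio.to_raster(out_path)
    print('wrote', out_path)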