#!/usr/bin/env python
# coding: utf-8

# In[48]:

import os

import pandas as pd
import geopandas as gpd
import xarray as xr
import rioxarray as rxr
from osgeo import gdal

os.chdir(r'C:/Users/jtrum/world_bank/data/')

# In[39]:

datadir = 'C:/Users/jtrum/world_bank/data/'

pluvial = ['P_1in5', 'P_1in10', 'P_1in20', 'P_1in50', 'P_1in75',
           'P_1in100', 'P_1in200', 'P_1in250', 'P_1in500', 'P_1in1000']
fluvial_undefined = ['FU_1in5', 'FU_1in10', 'FU_1in20', 'FU_1in50', 'FU_1in75',
                     'FU_1in100', 'FU_1in200', 'FU_1in250', 'FU_1in500', 'FU_1in1000']

# Load the 20 flood-depth rasters into a dictionary of xarray Datasets
data_dict = {}
for i in pluvial:
    data_dict[i] = xr.open_dataset(datadir + f'AngolaFathom/Angola/pluvial/{i}.tif')
for i in fluvial_undefined:
    data_dict[i] = xr.open_dataset(datadir + f'AngolaFathom/Angola/fluvial_undefended/{i}.tif')

# Crop the 20 rasters to the extent of the catchment basin
catchment = gpd.read_file(datadir + 'catchment.geojson')
clip = catchment.geometry

# Set the CRS of the rasters to equal the CRS of the catchment,
# then print each raster's CRS followed by the catchment's CRS to confirm they match
for i in data_dict:
    data_dict[i] = data_dict[i].rio.write_crs(catchment.crs)
    print(data_dict[i].rio.crs)
print(catchment.crs)

# Clip the rasters to the catchment extent
for i in data_dict:
    data_dict[i] = data_dict[i].rio.clip(clip, from_disk=True)
    print(i, 'clipped')

# In[50]:

def binarize_band(raster_path, threshold=0.15):
    """Overwrite band 1 of the raster in place with a 0/1 flood mask."""
    raster_dataset = gdal.Open(raster_path, gdal.GA_Update)
    if raster_dataset is None:
        print(f"Unable to open {raster_path}")
        return
    band = raster_dataset.GetRasterBand(1)
    band_data = band.ReadAsArray()
    # Cells at or above the depth threshold (in metres) count as flooded,
    # matching the >= 0.15 rule used in the dataframe cells below
    binary_data = (band_data >= threshold).astype(int)
    band.WriteArray(binary_data)
    raster_dataset.FlushCache()
    raster_dataset = None
    print(f"Binarized {raster_path}")

# Define the threshold value (flood depth in metres)
threshold = 0.15

# Save each clipped raster to disk, then binarize it in place
for i in data_dict:
    clipped_raster_path = datadir + i + '_clipped.tif'
    # Write a GeoTIFF (not NetCDF) so gdal.Open can update the file in place
    data_dict[i]['band_data'].rio.to_raster(clipped_raster_path)
    binarize_band(clipped_raster_path, threshold)

# In[53]:

# Re-open the clipped, binarized rasters into a dictionary
clipped_dict = {}
for i in pluvial + fluvial_undefined:
    clipped_dict[i] = xr.open_dataset(datadir + f'{i}_clipped.tif')

# In[69]:

from scipy import stats

# Group the scenarios by return-period band
groups = [
    ['P_1in5', 'P_1in10'],
    ['P_1in20', 'P_1in50', 'P_1in75'],
    ['P_1in100', 'P_1in200', 'P_1in250', 'P_1in500', 'P_1in1000'],
    ['FU_1in5', 'FU_1in10'],
    ['FU_1in20', 'FU_1in50', 'FU_1in75'],
    ['FU_1in100', 'FU_1in200', 'FU_1in250', 'FU_1in500', 'FU_1in1000']
]

def compute_mode_value(rasters):
    # Stack the rasters along a new 'raster' dimension
    stacked_data = xr.concat([r['band_data'] for r in rasters], dim='raster')
    # Compute the mode along the 'raster' axis
    # (keepdims=False drops the reduced axis; requires scipy >= 1.9)
    mode_data = stats.mode(stacked_data.values, axis=0, nan_policy='omit', keepdims=False)
    # Rebuild a DataArray on the coordinates of a single group member,
    # since the mode collapses the 'raster' dimension
    template = stacked_data.isel(raster=0, drop=True)
    return xr.DataArray(mode_data.mode, dims=template.dims, coords=template.coords)

# Compute mode values for each group
mode_values_dict = {}
for group_index, group_names in enumerate(groups):
    group_rasters = [clipped_dict[name] for name in group_names]
    mode_values_dict[f'Group_{group_index + 1}'] = compute_mode_value(group_rasters)

# The mode values for each group can be accessed as
# mode_values_dict['Group_1'], mode_values_dict['Group_2'], etc.
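# A quick illustration (not part of the original analysis) of how
# scipy.stats.mode treats NaNs under nan_policy='omit', which is what
# compute_mode_value relies on: NaN cells are skipped per pixel rather
# than poisoning the vote.

# In[ ]:

import numpy as np

toy = np.array([[0., 1., np.nan],
                [0., 1., 1.],
                [1., 1., np.nan]])
# Column-wise modes: [0, 0, 1] -> 0; [1, 1, 1] -> 1; [nan, 1, nan] -> 1 (NaNs omitted)
print(stats.mode(toy, axis=0, nan_policy='omit', keepdims=False).mode)  # [0. 1. 1.]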
# In[75]:

# Plot the six groups post-binarization
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
import matplotlib.gridspec as gridspec

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, (name, mode_values) in zip(axes.ravel(), mode_values_dict.items()):
    mode_values.squeeze().plot(ax=ax, add_colorbar=False)
    ax.set_title(name)
plt.tight_layout()
plt.show()

# In[17]:

# Convert each raster to a dataframe, and expose each dataframe as its own
# variable so we don't have to call the dictionary every time
for i in data_dict:
    data_dict[i] = data_dict[i].to_dataframe()
    globals()[i] = data_dict[i]
    print(i, 'converted to dataframe')

# In[20]:

# Use the x and y coordinates to turn each dataframe into a geodataframe of
# points, rebinding the globals so they track the new objects
for i in data_dict:
    data_dict[i] = data_dict[i].reset_index()
    data_dict[i] = gpd.GeoDataFrame(data_dict[i],
                                    geometry=gpd.points_from_xy(data_dict[i].x, data_dict[i].y))
    globals()[i] = data_dict[i]
    print(i, 'converted to geodataframe')

# In[30]:

# Make a dictionary of the dataframes, keyed by scenario name
df_dict = {name: globals()[name] for name in pluvial + fluvial_undefined}

# In[32]:

# For each dataframe, flag cells whose 'band_data' value is >= 0.15 with a 1
# in a new column called 'threshold', and all else with a 0
for name in pluvial + fluvial_undefined:
    df = globals()[name]
    df['threshold'] = (df['band_data'] >= 0.15).astype(int)

# In[34]:

# Make 6 groupings of the dataframes: each group dataframe takes the band, x,
# and y columns from its first member, plus the threshold values of every
# member in a column named 'threshold_' + the member's name
group_defs = {
    'G1': ['P_1in5', 'P_1in10'],
    'G2': ['P_1in20', 'P_1in50', 'P_1in75'],
    'G3': ['P_1in100', 'P_1in200', 'P_1in250', 'P_1in500', 'P_1in1000'],
    'G4': ['FU_1in5', 'FU_1in10'],
    'G5': ['FU_1in20', 'FU_1in50', 'FU_1in75'],
    'G6': ['FU_1in100', 'FU_1in200', 'FU_1in250', 'FU_1in500', 'FU_1in1000'],
}
for gname, members in group_defs.items():
    g = pd.DataFrame(globals()[members[0]], columns=['band', 'x', 'y'])
    for m in members:
        g[f'threshold_{m}'] = globals()[m]['threshold']
    globals()[gname] = g

# In[37]:

# Take the mode of the threshold columns in each group and store it as that
# group's 'threshold' column
for gname, members in group_defs.items():
    g = globals()[gname]
    cols = [f'threshold_{m}' for m in members]
    g['threshold'] = g[cols].mode(axis=1)[0]
    print(f"{gname} threshold column created")
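# An aside, not in the original notebook: for 0/1 columns, a row-wise mean
# compared against 0.5 gives the same majority vote as mode(axis=1)[0] and is
# usually much faster on wide frames. The one behavioural difference is ties
# in the even-sized groups (G1, G4): pandas sorts tied modes ascending, so a
# 1-1 split resolves to 0 above, while the mean rule below resolves it to 1.
# The 'threshold_mean_vote' column name is a hypothetical choice.

# In[ ]:

for gname, members in group_defs.items():
    g = globals()[gname]
    cols = [f'threshold_{m}' for m in members]
    g['threshold_mean_vote'] = (g[cols].mean(axis=1) >= 0.5).astype(int)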
# In[69]:

# Re-expose the dictionary entries as their own variables
for i in data_dict:
    globals()[i] = data_dict[i]
    print(i, 'converted to variable')

# In[80]:

# Drop NA values, rebinding the globals so they track the filtered frames
for i in data_dict:
    data_dict[i] = data_dict[i].dropna()
    globals()[i] = data_dict[i]
    print(i, 'dropped na values')

len(P_1in5)

# In[63]:

# Import the mode function
from scipy.stats import mode

# Create grouping categories per return period per flood type,
# named by annual exceedance probability band (<1%, 1-10%, >10%)
P_under1 = ['P_1in1000', 'P_1in500', 'P_1in250', 'P_1in200', 'P_1in100']
P_1to10 = ['P_1in75', 'P_1in50', 'P_1in20']
P_over10 = ['P_1in10', 'P_1in5']
FU_under1 = ['FU_1in1000', 'FU_1in500', 'FU_1in250', 'FU_1in200', 'FU_1in100']
FU_1to10 = ['FU_1in75', 'FU_1in50', 'FU_1in20']
FU_over10 = ['FU_1in10', 'FU_1in5']

# Calculate the mode for each scenario
# (note: this assumes data_dict still holds the xarray Datasets loaded in
# In[39]; if the dataframe-conversion cells above have run, reload them first)
mode_dict = {}
for i in P_under1 + P_1to10 + P_over10 + FU_under1 + FU_1to10 + FU_over10:
    # Get the data array from the xarray Dataset
    data_array = data_dict[i].to_array()
    # Calculate the mode along the first axis
    mode_values = mode(data_array, axis=0, nan_policy='omit')
    # Store the mode values in the dictionary
    mode_dict[i] = mode_values.mode[0]
    print(i, 'mode calculated:', mode_values.mode[0])
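# The per-scenario mode is almost always 0 in sparse flood rasters, so a more
# informative summary (an illustration, not in the original) is the share of
# catchment cells flagged as flooded in each scenario's 'threshold' column:

# In[ ]:

summary = pd.DataFrame({
    'scenario': pluvial + fluvial_undefined,
    'pct_flooded': [globals()[n]['threshold'].mean() * 100
                    for n in pluvial + fluvial_undefined],
})
print(summary.sort_values('pct_flooded', ascending=False))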
# In[47]:

# Reproject the rasters to the same CRS as the catchment
from rasterio.enums import Resampling

for i in data_dict:
    data_dict[i] = data_dict[i].rio.reproject(catchment.crs, resampling=Resampling.nearest)
    print(i, 'reprojected')

# In[48]:

import geopandas as gpd
from scipy.stats import mode

# Set the data directory
datadir = 'C:/Users/jtrum/world_bank/data/'

# Load the catchment boundary
catchment = gpd.read_file(datadir + 'catchment.geojson')

# Define the probability groups
groups = {
    'low_prob': ['P_1in1000', 'P_1in500', 'P_1in250', 'P_1in200', 'P_1in100'],
    'medium_prob': ['P_1in75', 'P_1in50', 'P_1in20'],
    'high_prob': ['P_1in10', 'P_1in5']
}

# Majority-vote binary classification per group: binarize each raster at the
# 0.15 m depth threshold, sum the votes, and flag cells where at least half
# of the group's rasters are flooded
binary_classifications = {}
for group_name, group_rasters in groups.items():
    votes = None
    for raster_name in group_rasters:
        data = xr.open_dataarray(datadir + f'AngolaFathom/Angola/pluvial/{raster_name}.tif')
        vote = (data >= 0.15).astype(int)
        votes = vote if votes is None else votes + vote
    # Set values to 1 where at least half of the rasters in the group agree
    binary_classifications[group_name] = (votes >= len(group_rasters) / 2).astype(float)
    print(group_name, 'done')

# Now there are three binary classifications in the 'binary_classifications' dictionary

# In[49]:

# Change the CRS of low_prob, medium_prob, and high_prob to match the catchment
for i in binary_classifications:
    binary_classifications[i] = binary_classifications[i].rio.reproject(
        catchment.crs, resampling=Resampling.nearest)
    print(i, 'reprojected')

# In[50]:

# Crop the low_prob, medium_prob, and high_prob rasters to the catchment basin
clip = catchment.geometry
for i in binary_classifications:
    binary_classifications[i] = binary_classifications[i].rio.clip(clip, from_disk=True)
    print(i, 'clipped')

# In[32]:

# View the data as a dataframe
df = binary_classifications['low_prob'].to_dataframe()
df.head()
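# A quick sanity check (illustration only, not in the original): how many
# catchment cells each probability group flags as flooded after clipping.

# In[ ]:

for name, da in binary_classifications.items():
    flooded = int((da == 1).sum())
    total = int(da.notnull().sum())
    print(f'{name}: {flooded} of {total} cells flooded ({100 * flooded / total:.1f}%)')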
# In[25]:

import numpy as np
from scipy.stats import mode

# Create groups for probability thresholds (AEP = 1 / return period)
thresholds = {
    'low_prob': (0, 0.01),       # <=1% annual probability (1-in-100 and rarer)
    'medium_prob': (0.01, 0.1),  # 1-10% annual probability
    'high_prob': (0.1, 1)        # >=10% annual probability (1-in-10 and more frequent)
}

# Dictionary to collect the binarized members of each group
grouped_data = {key: [] for key in thresholds}

# Load, crop, and binarize the rasters, assigning each scenario to its
# probability group by the return period encoded in its name
for i in pluvial + fluvial_undefined:
    subdir = 'pluvial' if i in pluvial else 'fluvial_undefended'
    data = xr.open_dataset(datadir + f'AngolaFathom/Angola/{subdir}/{i}.tif')
    # Crop to the catchment bounding box (robust to the raster's y-axis orientation)
    data = data.rio.clip_box(*catchment.total_bounds)
    # Annual exceedance probability from the scenario name, e.g. 'P_1in100' -> 0.01
    aep = 1 / int(i.split('1in')[1])
    # Binarize the flood depths at the 0.15 m threshold
    binary = (data['band_data'] >= 0.15).astype(float)
    if aep <= 0.01:
        grouped_data['low_prob'].append(binary)
    elif aep < 0.1:
        grouped_data['medium_prob'].append(binary)
    else:
        grouped_data['high_prob'].append(binary)

# Create a binary classification per group based on majority values
binary_classification = {}
for group, members in grouped_data.items():
    # Stack the group's members and take the per-cell mode along the stack
    stacked = xr.concat(members, dim='raster')
    mode_result = mode(stacked.values, axis=0, nan_policy='omit', keepdims=False)
    # Rebuild a DataArray on one member's coordinates, since the mode
    # collapses the 'raster' dimension
    template = stacked.isel(raster=0, drop=True)
    binary_classification[group] = xr.DataArray(mode_result.mode,
                                                dims=template.dims,
                                                coords=template.coords)

# Now the `binary_classification` dictionary holds one majority-vote binary raster per group.
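# A possible final step (the output file names are assumptions, not from the
# original): persist each majority-vote raster as a GeoTIFF next to the inputs.

# In[ ]:

for group, da in binary_classification.items():
    out_path = datadir + f'{group}_majority.tif'  # hypothetical naming scheme
    da.rio.write_crs(catchment.crs).rio.to_raster(out_path)
    print('wrote', out_path)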