#!/usr/bin/env python
# coding: utf-8

# # Getting started with ELA
#
# This notebook uses data covering a few kilometres around the township of [Bungendore](https://www.google.com.au/maps/@-35.251235,149.4173155,34159m/data=!3m1!1e3) in NSW, Australia.
#
# For convenience, compressed data for this notebook is available for download from [this link](https://cloudstor.aarnet.edu.au/plus/s/gTAVfJZ5Dve15ud). This data will be imported in the section 'Importing data' below. Once you adjust the data path to your liking, this notebook should run fine.
#
# Sample data was derived from the Australian Bureau of Meteorology [National Groundwater Information System](http://www.bom.gov.au/water/groundwater/ngis/index.shtml) and from [ELVIS - Elevation and Depth - Foundation Spatial Data](http://elevation.fsdf.org.au/). You can check the licensing for these data; the short version is that use for learning purposes is fine.
#
# ## Purpose
#
# This notebook is a realistic example of one type of case study the pyela package addresses. At the end of the workflow we obtain a 3D gridded model of primary lithologies. A visual outcome (normally interactive) is:
#
# ![3D Interpolated overlay primary lithology clay](img/3d_overlay_bungendore_clay_lithology.png)

# ## Importing python packages

# In[1]:

import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import rasterio
from rasterio.plot import show
import geopandas as gpd


# In[2]:

# Only set to True for co-development of ela from this use case:
ela_from_source = False
# ela_from_source = True


# In[3]:

if ela_from_source:
    if 'ELA_SRC' in os.environ:
        root_src_dir = os.environ['ELA_SRC']
    elif sys.platform == 'win32':
        root_src_dir = r'C:\src\github_jm\pyela'
    else:
        username = os.environ['USER']
        root_src_dir = os.path.join('/home', username, 'src/github_jm/pyela')
    pkg_src_dir = root_src_dir
    sys.path.append(pkg_src_dir)

from ela.textproc import *
from ela.utils import *
from ela.classification import *
from ela.visual import *
from ela.spatial import SliceOperation


# ## Importing data
#
# There are two main sets of information we need: the borehole lithology logs, and the spatial information, namely the surface elevation (DEM) and the geolocation of a subset of bores around Bungendore.

# In[4]:

data_path = None


# You probably want to explicitly set `data_path` to the location where you put the 'Bungendore' folder, e.g.:

# In[5]:

#data_path = '/home/myusername/data' # On Linux, if you have the folder /home/myusername/data/Bungendore
#data_path = r'C:\data\Lithology' # On Windows, if you have C:\data\Lithology\Bungendore


# Otherwise, a fallback for the pyela developer(s):

# In[6]:

if data_path is None:
    if 'ELA_DATA' in os.environ:
        data_path = os.environ['ELA_DATA']
    elif sys.platform == 'win32':
        data_path = r'C:\data\Lithology'
    else:
        username = os.environ['USER']
        data_path = os.path.join('/home', username, 'data/Lithology')


# In[7]:

bungendore_datadir = os.path.join(data_path, 'Bungendore')
bidgee_path = os.path.join(bungendore_datadir, 'gw_shp_murrumbidgee_river/shp_murrumbidgee_river')
bungendore_bore_locations_fn = os.path.join(bungendore_datadir, 'gw_shp_murrumbidgee_river/shp_bungendore.shp')


# Read the lithology logs for the whole Murrumbidgee catchment - we will subset later.

# In[8]:

lithology_logs = pd.read_csv(os.path.join(bidgee_path, 'NGIS_LithologyLog.csv'))


# The DEM raster and the bore location shapefile do not use the same projection (coordinate reference system), so we reproject one of them. We choose the raster's UTM coordinate system.
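# Why reproject the vector layer rather than the raster? Point geometries transform exactly under a CRS change, whereas warping a raster requires resampling and can alter cell values. Below is a minimal sketch of the generic pattern used in the next few cells; `gdf` and `raster` are placeholders for any geopandas GeoDataFrame and rasterio dataset.

# In[ ]:

def align_to_raster_crs(gdf, raster):
    # Reproject the vector layer onto the raster's CRS: exact for point
    # geometries, and avoids resampling the raster.
    return gdf.to_crs(raster.crs)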
# In[9]:

bungendore_raster = rasterio.open(os.path.join(bungendore_datadir, 'CLIP_66949', 'CLIP_reproj_UTM55.tif'))


# In[10]:

bore_locations_raw = gpd.read_file(bungendore_bore_locations_fn)


# In[11]:

bore_locations_raw.crs, bungendore_raster.crs


# In[12]:

bore_locations = bore_locations_raw.to_crs(bungendore_raster.crs)


# In[13]:

bore_locations.head()


# In[14]:

# backup, but likely obsolete:
# raster_projstring = "+proj=utm +zone=55 +south +datum=WGS84 +units=m +no_defs"
# bore_locations = bore_locations_raw.to_crs(raster_projstring)
# bore_locations.head()


# ### Geolocation of data

# In[15]:

fig, ax = plt.subplots(figsize=(12, 12))
show(bungendore_raster, title='East of Bungendore, AU', cmap='terrain', ax=ax)
bore_locations.plot(ax=ax, facecolor='black')


# In[16]:

lithology_logs.head(n=10)


# Note that around 2500 records do not have AHD (Australian Height Datum) elevations but do have depths. This will be remediated later, when we calculate our own AHD elevations from the DEM.

# ### Subset to the location of interest
#
# The lithology logs cover the whole Murrumbidgee catchment, which is much larger than the area of interest and for which we only have borehole geolocations around Bungendore. We subset to the location of interest.

# In[17]:

df = lithology_logs


# Let's subset the logs based on spatial location, keeping only those "nearby" the DEM we have near the locality of Bungendore in this case study. First, define a few constants:

# In[18]:

df.columns


# In[19]:

DEPTH_FROM_COL = 'FromDepth'
DEPTH_TO_COL = 'ToDepth'
TOP_ELEV_COL = 'TopElev'
BOTTOM_ELEV_COL = 'BottomElev'
LITHO_DESC_COL = 'Description'
HYDRO_CODE_COL = 'HydroCode'


# ### Merging the geolocation from the shapefile and lithology records
#
# The geopandas data frame has a 'geometry' column listing `POINT` objects. 'ela' includes `get_coords_from_gpd_shape` to extract the coordinates into a simpler structure. 'ela' also has predefined column names (e.g. `EASTING_COL`) for easting/northing information, which we can use to name our coordinate columns.

# In[20]:

geoloc = get_coords_from_gpd_shape(bore_locations, colname='geometry', out_colnames=[EASTING_COL, NORTHING_COL])
geoloc[HYDRO_CODE_COL] = bore_locations[HYDRO_CODE_COL]
geoloc.head()


# With this data frame we can perform two operations in one go: subset the lithology records to only the 640 bores of interest, and add the x/y geolocations to the resulting data frame.

# In[21]:

len(geoloc), len(df)


# In[22]:

df = pd.merge(df, geoloc, how='inner', on=HYDRO_CODE_COL, sort=False, copy=True, indicator=False, validate=None)


# In[23]:

len(df)


# In[24]:

df.head()


# ### Round up 'depth to' and 'depth from' columns
#
# We round the depth-related columns to whole metres and drop the entries whose rounded interval degenerates to zero thickness. `ela` has a class `DepthsRounding` to facilitate these operations on lithology records with varying column names.
#
# We first clean up the height/depth columns to make sure they are numeric.
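# As an aside, an equivalent one-liner with plain pandas would be `pd.to_numeric` with `errors='coerce'`, which maps anything unparseable (including the literal string 'None') to NaN. The cells below use an explicit converter function instead, which makes the intended handling obvious. The commented cell sketches the alternative:

# In[ ]:

# Alternative (sketch): coerce the height/depth columns with plain pandas.
# for col in [DEPTH_FROM_COL, DEPTH_TO_COL, TOP_ELEV_COL, BOTTOM_ELEV_COL]:
#     df[col] = pd.to_numeric(df[col], errors='coerce')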
# In[25]:

def as_numeric(x):
    # Convert a raw entry to a float; both None and the string 'None' become NaN.
    if isinstance(x, float):
        return x
    if x is None or x == 'None':
        return np.nan
    return float(x)


# In[26]:

df[DEPTH_FROM_COL] = df[DEPTH_FROM_COL].apply(as_numeric)
df[DEPTH_TO_COL] = df[DEPTH_TO_COL].apply(as_numeric)
df[TOP_ELEV_COL] = df[TOP_ELEV_COL].apply(as_numeric)
df[BOTTOM_ELEV_COL] = df[BOTTOM_ELEV_COL].apply(as_numeric)


# In[27]:

dr = DepthsRounding(DEPTH_FROM_COL, DEPTH_TO_COL)


# In[28]:

"Before rounding heights we have " + str(len(df)) + " records"


# In[29]:

df = dr.round_to_metre_depths(df, np.round, True)
"After removing thin sliced entries of less than a metre, we are left with " + str(len(df)) + " records"


# ## Exploring the descriptive lithology

# In[30]:

descs = df[LITHO_DESC_COL].reset_index(drop=True)
descs.head()


# The description column is read as a generic 'object' dtype: entries may be missing (NaN) rather than strings. We define a function to normalise them to strings.

# In[31]:

def clean_desc(x):
    # Missing descriptions are read as NaN (a float); normalise to empty strings.
    if x is None or isinstance(x, float):
        return u''
    return x


# In[32]:

y = [clean_desc(x) for x in descs]


# In[33]:

from striplog import Lexicon
lex = Lexicon.default()


# In[34]:

y = clean_lithology_descriptions(y, lex)


# We get a flat list of all the "tokens", removing stop words ('s', 'the' and the like).

# In[35]:

y = v_lower(y)
vt = v_word_tokenize(y)
flat = np.concatenate(vt)


# In[36]:

import nltk
from nltk.corpus import stopwords


# In[37]:

stoplist = stopwords.words('english')
exclude = stoplist + ['.', ',', ';', ':', '(', ')', '-']
flat = [word for word in flat if word not in exclude]


# In[38]:

len(set(flat))


# In[39]:

df_most_common = token_freq(flat, 50)


# In[40]:

plot_freq(df_most_common)


# There are terms such as 'sandy', 'clayey', 'silty' and so on. Let's look at the frequency of terms derived from lithology class roots. Given the likely skewness, we use a log scale on the y axis.

# In[41]:

plot_freq_for_root(flat, 'sand')


# In[42]:

plot_freq_for_root(flat, 'clay')


# In[43]:

# TODO: add a section that defines additional clean up e.g. 'sand/clay/soil' or dashed union composite terms
# split_composite_term('sand/clay/soil', '/').replace('/', ' / ')


# In[44]:

df_most_common


# ## Defining lithology classes and finding primary/secondary lithologies
#
# From the list of most common tokens, we may want to define lithology classes as follows:

# In[45]:

df[LITHO_DESC_COL] = y


# In[46]:

lithologies = ['clay', 'sand', 'gravel', 'granite', 'shale', 'silt', 'topsoil', 'loam', 'soil', 'slate', 'sandstone']


# To capture any of these, we devise a regular expression:

# In[47]:

any_litho_markers_re = r'sand|clay|ston|shale|silt|granit|soil|gravel|loam|slate'
regex = re.compile(any_litho_markers_re)


# In[48]:

my_lithologies_numclasses = create_numeric_classes(lithologies)


# In[49]:

lithologies_dict = dict([(x, x) for x in lithologies])
lithologies_dict['sands'] = 'sand'
lithologies_dict['clays'] = 'clay'
lithologies_dict['shales'] = 'shale'
lithologies_dict['claystone'] = 'clay'
lithologies_dict['siltstone'] = 'silt'
lithologies_dict['limesand'] = 'sand' # ??
lithologies_dict['calcarenite'] = 'limestone' # ??
lithologies_dict['calcitareous'] = 'limestone' # ??
lithologies_dict['mudstone'] = 'silt' # ??
lithologies_dict['capstone'] = 'limestone' # ??
lithologies_dict['ironstone'] = 'sandstone' # ??
#lithologies_dict['topsoil'] = 'soil' # ??
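# Several of these mappings are explicitly flagged as uncertain ('??'). One thing worth checking is that every value in `lithologies_dict` is itself one of the classes in `lithologies`: for instance 'limestone' is used as a target above but has no numeric class code. A quick sketch (not an ela facility) to flag such cases:

# In[ ]:

# Flag mapping targets that have no numeric class code and would therefore
# likely end up unclassified, e.g. 'limestone' above.
unmapped = set(lithologies_dict.values()) - set(lithologies)
if unmapped:
    print('Mapped to classes without a numeric code: %s' % sorted(unmapped))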
# In[50]:

lithologies_adjective_dict = {
    'sandy': 'sand',
    'clayey': 'clay',
    'clayish': 'clay',
    'shaley': 'shale',
    'silty': 'silt',
    'gravelly': 'gravel'
}


# In[51]:

v_tokens = v_word_tokenize(y)
litho_terms_detected = v_find_litho_markers(v_tokens, regex=regex)


# Let's see whether we detect these lithology markers in each bore log entry:

# In[52]:

zero_mark = [x for x in litho_terms_detected if len(x) == 0]
at_least_one_mark = [x for x in litho_terms_detected if len(x) >= 1]
at_least_two_mark = [x for x in litho_terms_detected if len(x) >= 2]
print('There are %s entries with no marker, %s entries with at least one, %s with at least two' % (len(zero_mark), len(at_least_one_mark), len(at_least_two_mark)))


# Note: ela probably needs pre-canned facilities to assess the detection rate in this kind of exploratory analysis. A word cloud may not be such a bad idea either.

# In[53]:

descs_zero_mark = [y[i] for i in range(len(litho_terms_detected)) if len(litho_terms_detected[i]) == 0]


# In[54]:

descs_zero_mark[1:20]


# In[55]:

primary_litho = v_find_primary_lithology(litho_terms_detected, lithologies_dict)


# In[56]:

secondary_litho = v_find_secondary_lithology(litho_terms_detected, primary_litho, lithologies_adjective_dict, lithologies_dict)


# In[57]:

df[PRIMARY_LITHO_COL] = primary_litho
df[SECONDARY_LITHO_COL] = secondary_litho


# In[58]:

df[PRIMARY_LITHO_NUM_COL] = v_to_litho_class_num(primary_litho, my_lithologies_numclasses)
df[SECONDARY_LITHO_NUM_COL] = v_to_litho_class_num(secondary_litho, my_lithologies_numclasses)


# In[59]:

df.head()


# ## Converting depth below ground to Australian Height Datum elevation
#
# While the bore entries have columns for AHD elevations, many of them are missing. Since we have a DEM of the region, we can derive the missing elevations ourselves.

# In[60]:

cd = HeightDatumConverter(bungendore_raster)


# In[61]:

df = cd.add_height(df,
    depth_from_col=DEPTH_FROM_COL, depth_to_col=DEPTH_TO_COL,
    depth_from_ahd_col=DEPTH_FROM_AHD_COL, depth_to_ahd_col=DEPTH_TO_AHD_COL,
    easting_col=EASTING_COL, northing_col=NORTHING_COL,
    drop_na=False)


# In[62]:

df.head()


# In[63]:

# to be reused in experimental notebooks:
classified_logs_filename = os.path.join(data_path, 'Bungendore', 'classified_logs.pkl')
df.to_pickle(classified_logs_filename)


# ## Interpolate over a regular grid

# In[64]:

# max/min bounds
shp_bbox = get_bbox(bore_locations)
shp_bbox


# In[65]:

raster_bbox = bungendore_raster.bounds
raster_bbox


# In[66]:

x_min = max(shp_bbox[0], raster_bbox[0])
x_max = min(shp_bbox[2], raster_bbox[2])
y_min = max(shp_bbox[1], raster_bbox[1])
y_max = min(shp_bbox[3], raster_bbox[3])


# In[67]:

grid_res = 150
m = create_meshgrid_cartesian(x_min, x_max, y_min, y_max, grid_res)


# In[68]:

[x.shape for x in m]


# In[69]:

dem_array = surface_array(bungendore_raster, x_min, y_min, x_max, y_max, grid_res)


# In[70]:

dem_array.shape


# In[71]:

plt.imshow(to_carto(dem_array), cmap='terrain')


# In[72]:

dem_array_data = {'bounds': (x_min, x_max, y_min, y_max), 'grid_res': grid_res, 'mesh_xy': m, 'dem_array': dem_array}


# In[73]:

import pickle
fp = os.path.join(bungendore_datadir, 'dem_array_data.pkl')
if not os.path.exists(fp):
    with open(fp, 'wb') as handle:
        pickle.dump(dem_array_data, handle, protocol=pickle.HIGHEST_PROTOCOL)


# In[74]:

df.head(10)


# We need to define the min and max heights of the Z axis over which we interpolate. We use the KNN algorithm with 10 neighbours, so we should choose a domain such that there are enough data points at each height.
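# For context, the nearest-neighbour interpolation assigns to each grid node, at a given height, the most frequent lithology class among the nearest bore records at that height. Below is a conceptual sketch of one height slice using scikit-learn (an illustrative assumption; ela's `GridInterpolation` used later encapsulates its own logic):

# In[ ]:

from sklearn.neighbors import KNeighborsClassifier

def knn_slice_sketch(df_slice, xg, yg, n_neighbours=10):
    # df_slice: records intersecting one AHD height, with valid (non-NaN)
    # easting/northing and numeric lithology class columns.
    X = df_slice[[EASTING_COL, NORTHING_COL]].values
    cls = df_slice[PRIMARY_LITHO_NUM_COL].values
    knn = KNeighborsClassifier(n_neighbors=n_neighbours).fit(X, cls)
    # Classify every node of the 2D meshgrid (xg, yg) at this height.
    grid_points = np.column_stack([xg.ravel(), yg.ravel()])
    return knn.predict(grid_points).reshape(xg.shape)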
# Let's visually find the heights with at least 10 records:

# In[75]:

n_bins = 100
p = df.hist(column=[DEPTH_FROM_AHD_COL, DEPTH_TO_AHD_COL], sharex=True, sharey=True, bins=n_bins, figsize=(15, 5))
for axes in p:
    axes[0].set_yscale("log", nonpositive='clip')  # keyword was 'nonposy' in matplotlib < 3.3


# In[76]:

n_neighbours = 10
ahd_min = 620
ahd_max = 830

z_ahd_coords = np.arange(ahd_min, ahd_max, 1)
dim_x, dim_y = m[0].shape
dim_z = len(z_ahd_coords)
dims = (dim_x, dim_y, dim_z)


# In[77]:

lithology_3d_array = np.empty(dims)


# In[78]:

gi = GridInterpolation(easting_col=EASTING_COL, northing_col=NORTHING_COL)


# In[79]:

gi.interpolate_volume(lithology_3d_array, df, PRIMARY_LITHO_NUM_COL, z_ahd_coords, n_neighbours, m)


# In[80]:

# Burn DEM into grid
z_index_for_ahd = z_index_for_ahd_functor(b=-ahd_min)
# check that we get the expected indexes for an AHD height
z_index_for_ahd(800), z_index_for_ahd(+630)


# In[81]:

dem_array.shape, m[0].shape, lithology_3d_array.shape


# In[82]:

burn_volume(lithology_3d_array, dem_array, z_index_for_ahd, below=False)


# ## 2D visualisations

# In[83]:

#lithologies = ['clay', 'sand', 'gravel', 'granite', 'shale', 'silt', 'topsoil', 'loam', 'soil', 'slate', 'sandstone']
lithology_color_names = ['olive', 'yellow', 'lightgrey', 'dimgray', 'teal', 'cornsilk', 'saddlebrown', 'rosybrown', 'chocolate', 'lightslategrey', 'gold']


# In[84]:

lithology_cmap = discrete_classes_colormap(lithology_color_names)
# Later for exporting to RGB geotiffs??
litho_legend_display_info = [(lithology_cmap[i], lithologies[i], lithology_color_names[i]) for i in range(len(lithologies))]


# In[85]:

litho_legend = legend_fig(litho_legend_display_info)


# In[86]:

cms = cartopy_color_settings(lithology_color_names)


# In[87]:

imgplot = plt.imshow(to_carto(lithology_3d_array[:, :, z_index_for_ahd(660)]), cmap=cms['cmap'])
title = plt.title('Primary litho at +660mAHD')


# In[88]:

imgplot = plt.imshow(to_carto(lithology_3d_array[:, :, z_index_for_ahd(700)]), cmap=cms['cmap'])
title = plt.title('Primary litho at +700mAHD')


# ## 3D visualisation

# In[89]:

from ela.visual3d import *


# In[90]:

xx, yy = dem_array_data['mesh_xy']


# In[91]:

from mayavi import mlab


# In[92]:

vis_litho = LithologiesClassesVisual3d(lithologies, lithology_color_names, 'black')


# In[93]:

# TODO: problematic with this data - investigate
# vis_litho.render_classes_planar(lithology_3d_array, 'Primary lithology')


# In[94]:

# vis_litho.render_class(lithology_3d_array, 0)


# ela has facilities to visualise overlaid information: DEM, classified bore logs, and volumes of interpolated lithologies. This is important to convey how the interpolated volume relates to the observed data.
#
# First, a bit of data filling for visual purposes, as NaN lithology class codes may cause issues.

# In[95]:

df_infilled = df.fillna({PRIMARY_LITHO_NUM_COL: -1.0})
# df_2 = df_1[(df_1[DEPTH_TO_AHD_COL] > (ahd_min-20))]
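# An optional sanity check (a sketch, not an ela facility): count how many records had no detected primary lithology and were therefore assigned the placeholder class code -1.0.

# In[ ]:

n_placeholder = int((df_infilled[PRIMARY_LITHO_NUM_COL] == -1.0).sum())
print('%d of %d records have no primary lithology class' % (n_placeholder, len(df_infilled)))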
# In[96]:

# A factor to apply to Z coordinates, otherwise the rendering is visually squashed along the heights.
# A display-only scaling factor would be preferable, but I could not find a way to do so.
Z_SCALING = 20.0


# In[97]:

z_coords = np.arange(ahd_min, ahd_max, 1)


# In[98]:

overlay_vis_litho = LithologiesClassesOverlayVisual3d(lithologies, lithology_color_names, 'black', dem_array_data, z_coords, Z_SCALING, df_infilled, PRIMARY_LITHO_NUM_COL)


# In[99]:

def view_class(value):
    f = overlay_vis_litho.view_overlay(value, lithology_3d_array)
    return f


# In[100]:

# f = view_class(0.0)


# ![3D Interpolated overlay primary lithology clay](img/3d_overlay_bungendore_clay_lithology.png)

# In[101]:

f = view_class(1.0)


# ## Slicing and exporting as GeoTIFF
#
# The interpolated volume naturally uses AHD coordinates for height. We may want to slice the data such that we get maps of lithologies at particular depths below ground level; `SliceOperation` is designed for this.

# In[102]:

sops = SliceOperation(dem_array, z_index_for_ahd)


# In[103]:

depths = sops.from_ahd_to_depth_below_ground_level(lithology_3d_array, from_depth=0, to_depth=10)


# In[104]:

depths.shape


# In[105]:

def plt_litho(depth_bgl):
    imgplot = plt.imshow(to_carto(depths[:, :, 10 - depth_bgl]), cmap=cms['cmap'])
    title = plt.title('Primary litho at %s m BGL' % depth_bgl)


# In[106]:

plt_litho(0)


# Although the DEM had no missing data, there are some areas with no interpolated lithology classification. These correspond to elevations where there were not enough records at a given AHD height to interpolate from.

# In[107]:

plt_litho(10)


# 'ela' includes facilities to export slices to the GeoTIFF format. These features are available through the `GeotiffExporter` class.

# In[108]:

from rasterio.transform import from_origin
transform = from_origin(x_min, y_max, grid_res, grid_res)


# In[109]:

from ela.io import GeotiffExporter
ge = GeotiffExporter(bungendore_raster.crs, transform)


# In[110]:

litho_depth = to_carto(depths[:, :, 0])
lithology_cmap = discrete_classes_colormap(lithology_color_names)


# In[111]:

# ge.export_rgb_geotiff(litho_depth, '/home/xxxyyy/tmp/test.tif', lithology_cmap)


# The resulting file (RGB channels) should be visible from most file explorers (Windows or Linux). Other functions export the underlying georeferenced values.
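# If you prefer not to depend on ela's exporter, the same slice can be written as a single-band GeoTIFF of raw class codes with rasterio directly. A minimal sketch; the output file name is a placeholder.

# In[ ]:

# Sketch: write the class-code slice as a single-band float32 GeoTIFF,
# reusing the grid transform and the raster's CRS defined above.
profile = {
    'driver': 'GTiff',
    'height': litho_depth.shape[0], 'width': litho_depth.shape[1],
    'count': 1, 'dtype': 'float32',
    'crs': bungendore_raster.crs, 'transform': transform,
}
# with rasterio.open('litho_depth_0m_bgl.tif', 'w', **profile) as dst:
#     dst.write(litho_depth.astype('float32'), 1)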