#!/usr/bin/env python
# coding: utf-8

# # Bathymetry data from the Caribbean 
# 
# This dataset is a compilation of several single-beam bathymetry surveys downloaded from the [NOAA NCEI](https://ngdc.noaa.gov/mgg/geodas/trackline.html). The data are originally in MGD77 format and include a header with metadata on each survey ([`MGD77_921744.h77t`](MGD77_921744.h77t)). The original data file was compressed with LZMA to save space and make it possible to upload it to this GitHub repository.
# 
# License: Public domain
# 
# Original source: https://ngdc.noaa.gov/mgg/geodas/trackline.html (region selected from the web interface; no direct download link)

# In[1]:


import os
import lzma
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyproj
import verde as vd
import pooch


# ## Load and clean the data
# 
# The dataset are LZMA compressed and are basically a tab delimited table. We can use pandas to read this directly.

# In[2]:


data_full = pd.read_csv(
    "MGD77_921744.m77t.xz", 
    sep="\t", 
    usecols=[0, 4, 5, 9],
    dtype=dict(SURVEY_ID="str", LON="float64", LAT="float64", CORR_DEPTH="float64"),
).dropna().reset_index(drop=True)


# Rename the columns to something easier to type, use full names, and include units.

# In[3]:


data_full = data_full.rename(columns=dict(SURVEY_ID="survey_id", LAT="latitude", LON="longitude", CORR_DEPTH="depth_m"))
data_full


# There is quite a bit of data so we want to slice it a bit to cover a smaller area where the bathymetry is more interesting.

# In[4]:


region_latlon = (-88, -59, 7, 24)
data = data_full[vd.inside((data_full.longitude, data_full.latitude), region=region_latlon)].reset_index(drop=True)
data


# Calculate projected coordinates so we can plot the data for inspection.

# In[5]:


proj_string = f"+proj=cea +ellips=WGS84 +lat_ts={np.mean(region_latlon[2:]):.1f} +lon_0={np.mean(region_latlon[:2]):.1f}"
projection = pyproj.Proj(proj_string)
coordinates = projection(data.longitude.values, data.latitude.values)
region = vd.get_region(coordinates)


# Plot the full data to see what we have.

# In[6]:


plt.figure(figsize=(16, 8))
plt.title("Bathymetry data for the Caribbean")
plt.scatter(*coordinates, c=data.depth_m, s=1)
plt.colorbar(label="bathymetry (m)", pad=0)
plt.axis("scaled")
plt.xlim(*region[:2])
plt.ylim(*region[2:])
plt.xlabel("easting (m)")
plt.ylabel("northing (m)")
plt.savefig("preview.jpg", dpi=200, bbox_inches="tight")


# ## Export

# Set the export precision for each column by converting them to strings prior to saving the CSV.

# In[7]:


export = data.copy()
# Meter level precision for coordinates and depth. 
# This is a very conservative estimate of actual precision.
export["longitude"] = export.longitude.map(lambda x: "{:.5f}".format(x))
export["latitude"] = export.latitude.map(lambda x: "{:.5f}".format(x))
export["depth_m"] = export.depth_m.map(lambda x: "{:.0f}".format(x))
export.head()


# Save the data to a file, calculate it's size and checksums.

# In[8]:


fname = "caribbean-bathymetry.csv.xz"
export.to_csv(fname, index=False)
print(f"size: {os.path.getsize(fname) / 1e6} Mb")
for alg in ["md5", "sha256"]:
    print(f"{alg}:{pooch.file_hash(fname, alg=alg)}")


# Try loading the data back to see if it works.

# In[9]:


pd.read_csv(fname)


# ## Try out some processing
# 
# Experiment with some processing functions to see if the results are interesting (for documentation) and can calculte in a reasonable amount of time (suitable for CI).

# Block reduce and plot to check how long that would take.

# In[10]:


get_ipython().run_cell_magic('time', '', 'spacing = 10e3\ncoordinates_mean, bathymetry_mean = vd.BlockReduce(np.median, spacing=spacing).filter(coordinates, data.depth_m)\n_, bathymetry_std = vd.BlockReduce(np.std, spacing=spacing).filter(coordinates, data.depth_m)\n')


# In[11]:


bathymetry_mean.size


# That's suitable for block reductions, trends, and nearest neighbor interpolations. For plain spline interpolation, we can always take a subsection of the data (around an island or the trench).

# In[12]:


plt.figure(figsize=(16, 16))
plt.subplot(2, 1, 1)
plt.scatter(*coordinates_mean, c=bathymetry_mean, s=3)
plt.colorbar(label="bathymetry (m)", pad=0)
plt.axis("scaled")
plt.xlim(*region[:2])
plt.ylim(*region[2:])
plt.subplot(2, 1, 2)
plt.scatter(*coordinates_mean, c=bathymetry_std, s=3, vmax=1000)
plt.colorbar(label="standard deviation (m)", pad=0)
plt.axis("scaled")
plt.xlim(*region[:2])
plt.ylim(*region[2:])