In this data preparation Jupyter notebook, we prepare the data that will later go into a Convolutional Neural Network model.
import glob
import hashlib
import io
import json
import os
import shutil
import sys
import requests
import tqdm
import yaml
import geopandas as gpd
import pygmt as gmt
import IPython.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyproj
import quilt
import rasterio
import rasterio.mask
import rasterio.plot
import shapely.geometry
import skimage.util.shape
import xarray as xr
print("Python :", sys.version.split("\n")[0])
print("Geopandas :", gpd.__version__)
print("GMT :", gmt.__version__)
print("Numpy :", np.__version__)
print("Rasterio :", rasterio.__version__)
print("Scikit-image :", skimage.__version__)
print("Xarray :", xr.__version__)
Python : 3.6.6 | packaged by conda-forge | (default, Oct 11 2018, 14:33:06)
Geopandas : 0.4.0+26.g9e584cc
GMT : 0.0.1a0+16.g7004aa0
Numpy : 1.14.5
Rasterio : 1.0.13
Scikit-image : 0.14.2
Xarray : 0.11.3
def download_to_path(path: str, url: str):
r"""
Download from a url to a path
>>> download_to_path(path="highres/Data_20171204_02.csv",
... url="https://data.cresis.ku.edu/data/rds/2017_Antarctica_Basler/csv_good/Data_20171204_02.csv")
<Response [200]>
>>> open("highres/Data_20171204_02.csv").readlines()
['LAT,LON,UTCTIMESOD,THICK,ELEVATION,FRAME,SURFACE,BOTTOM,QUALITY\n']
>>> os.remove(path="highres/Data_20171204_02.csv")
"""
# if not os.path.exists(path=path):
r = requests.get(url=url, stream=True)
with open(file=path, mode="wb") as fd:
for chunk in r.iter_content(chunk_size=1024):
fd.write(chunk)
return r
def check_sha256(path: str):
"""
Returns SHA256 checksum of a file
>>> download_to_path(path="highres/Data_20171204_02.csv",
... url="https://data.cresis.ku.edu/data/rds/2017_Antarctica_Basler/csv_good/Data_20171204_02.csv")
<Response [200]>
>>> check_sha256("highres/Data_20171204_02.csv")
'53cef7a0d28ff92b30367514f27e888efbc32b1bda929981b371d2e00d4c671b'
>>> os.remove(path="highres/Data_20171204_02.csv")
"""
with open(file=path, mode="rb") as afile:
sha = hashlib.sha256(afile.read())
return sha.hexdigest()
def parse_datalist(
yaml_file: str = "data_list.yml",
record_path: str = "files",
schema: list = [
"citekey",
"folder",
"location",
"resolution",
["doi", "dataset"],
["doi", "literature"],
],
) -> pd.DataFrame:
assert yaml_file.endswith((".yml", ".yaml"))
with open(file=yaml_file, mode="r") as yml:
y = yaml.load(stream=yml)
datalist = pd.io.json.json_normalize(
data=y, record_path=record_path, meta=schema, sep="_"
)
return datalist
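For reference, here is a minimal, hypothetical sketch of the kind of entry data_list.yml is assumed to contain, and of how parse_datalist flattens the nested files list into one row per file (the real file has many more records; all names and DOIs below are placeholders):
# Hypothetical data_list.yml entry (illustrative only; keys mirror the schema above).
# json_normalize(record_path="files", meta=...) turns the nested "files" list into one
# flat row per file, repeating the citekey/folder/location/resolution/doi metadata.
example_yaml = """
- citekey: EXAMPLE2019
  folder: lowres
  location: Antarctica
  resolution: 1000m
  doi:
    dataset: https://doi.org/10.0000/example-dataset     # placeholder DOI
    literature: https://doi.org/10.0000/example-paper    # placeholder DOI
  files:
    - filename: example_bed.tif
      url: http://example.com/example_bed.tif
      sha256: 0000000000000000000000000000000000000000000000000000000000000000
"""
example_df = pd.io.json.json_normalize(
    data=yaml.safe_load(example_yaml),
    record_path="files",
    meta=["citekey", "folder", "location", "resolution", ["doi", "dataset"], ["doi", "literature"]],
    sep="_",
)
print(example_df.columns.tolist())
# columns: filename, url, sha256 plus citekey/folder/location/resolution and doi_dataset/doi_literature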
# Pretty print table with nice column order and clickable url links
pprint_table = lambda df, folder: IPython.display.HTML(
df.query(expr="folder == @folder")
.reindex(columns=["folder", "filename", "url", "sha256"])
.style.format({"url": lambda url: f'<a target="_blank" href="{url}">{url}</a>'})
.render(uuid=f"{folder}")
)
dataframe = parse_datalist()
# Code to autogenerate README.md files in highres/lowres/misc folders from data_list.yml
columns = ["Filename", "Location", "Resolution", "Literature Citation", "Data Citation"]
for folder, md_header in [
("lowres", "Low Resolution"),
("highres", "High Resolution"),
("misc", "Miscellaneous"),
]:
assert folder in pd.unique(dataframe["folder"])
md_name = f"{folder}/README.md"
with open(file=md_name, mode="w") as md_file:
md_file.write(f"# {md_header} Antarctic datasets\n\n")
md_file.write("Note: This file was automatically generated from ")
md_file.write("[data_list.yml](/data_list.yml) using ")
md_file.write("[data_prep.ipynb](/data_prep.ipynb)\n\n")
md_table = pd.DataFrame(columns=columns)
md_table.loc[0] = ["---", "---", "---", "---", "---"]
keydf = dataframe.groupby("citekey").aggregate(lambda x: set(x).pop())
for row in keydf.query(expr="folder == @folder").itertuples():
filecount = len(dataframe[dataframe["citekey"] == row.Index])
extension = os.path.splitext(row.filename)[-1]
row_dict = {
"Filename": row.filename
if filecount == 1
else f"{filecount} *{extension} files",
"Location": row.location,
"Resolution": row.resolution,
"Literature Citation": f"[{row.Index}]({row.doi_literature})",
"Data Citation": f"[DOI]({row.doi_dataset})"
if row.doi_dataset != "nan"
else None,
}
md_table = md_table.append(other=row_dict, ignore_index=True)
md_table.to_csv(path_or_buf=md_name, mode="a", sep="|", index=False)
for dataset in dataframe.query(expr="folder == 'lowres'").itertuples():
path = f"{dataset.folder}/{dataset.filename}" # path to download the file to
if not os.path.exists(path=path):
download_to_path(path=path, url=dataset.url)
assert check_sha256(path=path) == dataset.sha256
pprint_table(dataframe, "lowres")
  | folder | filename | url | sha256 |
--- | --- | --- | --- | --- |
0 | lowres | bedmap2_bed.tif | http://data.pgc.umn.edu/elev/dem/bedmap2/bedmap2_bed.tif | 28e2ca7656d61b0bc7f8f8c1db41914023e0cab1634e0ee645f38a87d894b416 |
with rasterio.open("lowres/bedmap2_bed.tif") as raster_source:
rasterio.plot.show(source=raster_source, cmap="BrBG_r")
for dataset in dataframe.query(expr="folder == 'misc'").itertuples():
path = f"{dataset.folder}/{dataset.filename}" # path to download the file to
if not os.path.exists(path=path):
download_to_path(path=path, url=dataset.url)
assert check_sha256(path=path) == dataset.sha256
pprint_table(dataframe, "misc")
  | folder | filename | url | sha256 |
--- | --- | --- | --- | --- |
1 | misc | REMA_100m_dem.tif | http://data.pgc.umn.edu/elev/dem/setsm/REMA/mosaic/v1.1/100m/REMA_100m_dem.tif | 80c9fa41ccc69be1d2cd4a367d56168321d1079e7260a1996089810db25172f6 |
2 | misc | REMA_200m_dem_filled.tif | http://data.pgc.umn.edu/elev/dem/setsm/REMA/mosaic/v1.1/200m/REMA_200m_dem_filled.tif | f750893861a1a268c8ffe0ba7db36c933223bbf5fcbb786ecef3f052b20f9b8a |
3 | misc | MEaSUREs_IceFlowSpeed_450m.tif | http://data.pgc.umn.edu/gis/packages/quantarctica/Quantarctica3/Glaciology/MEaSUREs%20Ice%20Flow%20Velocity/MEaSUREs_IceFlowSpeed_450m.tif | 4a4efc3a84204c3d67887e8d7fa1186467b51e696451f2832ebbea3ca491c8a8 |
for dataset in dataframe.query(expr="folder == 'highres'").itertuples():
path = f"{dataset.folder}/{dataset.filename}" # path to download the file to
if not os.path.exists(path=path):
download_to_path(path=path, url=dataset.url)
assert check_sha256(path=path) == dataset.sha256
pprint_table(dataframe, "highres")
  | folder | filename | url | sha256 |
--- | --- | --- | --- | --- |
4 | highres | bed_WGS84_grid.txt | http://ramadda.nerc-bas.ac.uk/repository/entry/get/Polar%20Data%20Centre/DOI/Rutford%20Ice%20Stream%20bed%20elevation%20DEM%20from%20radar%20data/bed_WGS84_grid.txt?entryid=synth%3A54757cbe-0b13-4385-8b31-4dfaa1dab55e%3AL2JlZF9XR1M4NF9ncmlkLnR4dA%3D%3D | 7396e56cda5adb82cecb01f0b3e01294ed0aa6489a9629f3f7e8858ea6cb91cf |
5 | highres | 2007t1.txt | nan | 04bdbd3c8e814cbc8f0d324277e339a46cc90a8dc23434d11815a8966951e766 |
6 | highres | 2007tr.txt | nan | 3858a1e58e17b2816920e1b309534cee0391f72a6a0aa68d57777b030e70e9a3 |
7 | highres | 2010tr.txt | nan | 751ea56acc5271b3fb54893ed59e05ff485187a6fc5daaedf75946d730805b80 |
8 | highres | istar08.txt | nan | ed03c64332e8d406371c74a66f3cd21fb3f78ee498ae8408c355879bb89eb13d |
9 | highres | istar18.txt | nan | 3e69d86f28e26810d29b0b9309090684dcb295c0dd39007fe9ee0d1285c57804 |
10 | highres | istar15.txt | nan | 59c981e8c96f73f3a5bd98be6570e101848b4f67a12d98a577292e7bcf776b17 |
11 | highres | istar13.txt | nan | f5bcf80c7ea5095e2eabf72b69a264bf36ed56af5cb67976f9428f560e5702a2 |
12 | highres | istar17.txt | nan | f51a674dc27d6e0b99d199949a706ecf96ea807883c1901fea186efc799a36e8 |
13 | highres | istar07.txt | nan | c81ec04290433f598ce4368e4aae088adeeabb546913edc44c54a5a5d7593e93 |
14 | highres | 2009_Antarctica_DC8.csv | https://data.cresis.ku.edu/data/rds/2009_Antarctica_DC8/csv_good/2009_Antarctica_DC8.csv | 1b9fe0faf4ef217794c2a1de9ef8cfa45f5949efdc4e925930d31c0554cf0ca2 |
15 | highres | 2009_Antarctica_TO.csv | https://data.cresis.ku.edu/data/rds/2009_Antarctica_TO/csv_good/2009_Antarctica_TO.csv | 7a90c5955fa881b4fb88e45ff11629e60ff9ad045c07bf4c6e3aa1f7d1a9361d |
16 | highres | 2009_Antarctica_TO_Gambit.csv | https://data.cresis.ku.edu/data/rds/2009_Antarctica_TO_Gambit/csv_good/2009_Antarctica_TO_Gambit.csv | 93da613223733a4850283b700060afdb14f1002fe5613b8d78c6d3be83e34072 |
17 | highres | 2010_Antarctica_DC8.csv | https://data.cresis.ku.edu/data/rds/2010_Antarctica_DC8/csv_good/2010_Antarctica_DC8.csv | f725a8dbc21d31601b99ccaf9f5282ecd516f2ff966d268b4e735ea1af2014e6 |
18 | highres | 2011_Antarctica_DC8.csv | https://data.cresis.ku.edu/data/rds/2011_Antarctica_DC8/csv_good/2011_Antarctica_DC8.csv | 38aba2a39b0d58b72827f25cfcd667fc943f25c0024d3c52cb1b9e65e9e76163 |
19 | highres | 2011_Antarctica_TO.csv | https://data.cresis.ku.edu/data/rds/2011_Antarctica_TO/csv_good/2011_Antarctica_TO.csv | 4bf37750b9986ce582c9fd1f3a6ac622fc17f3b3ecb07b7a7132eb3797ee31d1 |
20 | highres | 2012_Antarctica_DC8.csv | https://data.cresis.ku.edu/data/rds/2012_Antarctica_DC8/csv_good/2012_Antarctica_DC8.csv | 5c6701b8c34bd57517b93e8e18f32e4579d6e2f56e4796bd7140b3e338544007 |
21 | highres | 2013_Antarctica_Basler.csv | https://data.cresis.ku.edu/data/rds/2013_Antarctica_Basler/csv_good/2013_Antarctica_Basler.csv | 56609027b4af04ba078ae093772916341bd1d6ab5f110de11b21294507733cc8 |
22 | highres | 2013_Antarctica_P3.csv | https://data.cresis.ku.edu/data/rds/2013_Antarctica_P3/csv_good/2013_Antarctica_P3.csv | 9de95030f49ce0bbf107eb72418db2845c39822872a6c9aa10f023148262f658 |
23 | highres | 2014_Antarctica_DC8.csv | https://data.cresis.ku.edu/data/rds/2014_Antarctica_DC8/csv_good/2014_Antarctica_DC8.csv | bd8c8674ba66508c64303725bfe45b3365467d01f69cfa8ec4258a3ced05e5bf |
24 | highres | 2016_Antarctica_DC8.csv | https://data.cresis.ku.edu/data/rds/2016_Antarctica_DC8/csv_good/2016_Antarctica_DC8.csv | ec3b514dfcae265f5b8643eeb3503be8a0a6531e563faf9f12cb67f2b618a741 |
25 | highres | 2017_Antarctica_P3.csv | https://data.cresis.ku.edu/data/rds/2017_Antarctica_P3/csv_good/2017_Antarctica_P3.csv | 9208a64fefe2f4a6e7f08d44c0af0c35400cd814590c32b8eb02f1545bfc8bec |
26 | highres | 2017_Antarctica_Basler.csv | https://data.cresis.ku.edu/data/rds/2017_Antarctica_Basler/csv_good/2017_Antarctica_Basler.csv | c97d0d92f3095ee8c3941d915028728423758594cc95e7b819889b51693f0712 |
Our data processing involves two stages:

1. Clean up the raw vector data, performing the necessary calculations and reprojections to the EPSG:3031 Antarctic Polar Stereographic coordinate system (a small reprojection sketch follows this list).
2. Convert the cleaned vector data table to a raster grid via an interpolation function.
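A minimal sketch of the stage 1 reprojection, using the same pyproj calls as the pipeline code below; the sample longitude/latitude point is made up for illustration:
# Reprojection sketch (hypothetical sample point), mirroring the
# filters.reprojection step handled inside ascii_to_xyz() below.
p_lonlat = pyproj.Proj(init="epsg:4326")  # WGS84 geographic longitude/latitude
p_stereo = pyproj.Proj(init="epsg:3031")  # Antarctic Polar Stereographic, in metres
x, y = pyproj.transform(p1=p_lonlat, p2=p_stereo, x=-70.0, y=-77.5)  # x=longitude, y=latitude
print(x, y)  # projected easting and northing in metres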
def ascii_to_xyz(pipeline_file: str) -> pd.DataFrame:
"""
Converts ascii txt/csv files to xyz pandas.DataFrame via
a JSON Pipeline file similar to the one used by PDAL.
>>> os.makedirs(name="/tmp/highres", exist_ok=True)
>>> download_to_path(path="/tmp/highres/2011_Antarctica_TO.csv",
... url="https://data.cresis.ku.edu/data/rds/2011_Antarctica_TO/csv_good/2011_Antarctica_TO.csv")
<Response [200]>
>>> _ = shutil.copy(src="highres/20xx_Antarctica_TO.json", dst="/tmp/highres")
>>> df = ascii_to_xyz(pipeline_file="/tmp/highres/20xx_Antarctica_TO.json")
>>> df.head(2)
x y z
0 345580.826265 -1.156471e+06 -377.2340
1 345593.322948 -1.156460e+06 -376.6332
>>> shutil.rmtree(path="/tmp/highres")
"""
assert os.path.exists(pipeline_file)
assert pipeline_file.endswith((".json"))
# Read json file first
j = json.loads(open(pipeline_file).read())
jdf = pd.io.json.json_normalize(j, record_path="pipeline")
jdf = jdf.set_index(keys="type")
reader = jdf.loc["readers.text"] # check how to read the file(s)
## Basic table read
skip = int(reader.skip) # number of header rows to skip
sep = reader.separator # delimiter to use
names = reader.header.split(sep=sep) # header/column names as list
usecols = reader.usecols.split(sep=sep) # column names to use
path_pattern = os.path.join(os.path.dirname(pipeline_file), reader.filename)
files = [file for file in glob.glob(path_pattern)]
assert len(files) > 0 # check that there are actually files being matched!
df = pd.concat(
pd.read_csv(f, sep=sep, header=skip, names=names, usecols=usecols)
for f in files
)
df.reset_index(drop=True, inplace=True) # reset index after concatenation
## Advanced table read with conversions
try:
# Perform math operations
newcol, expr = reader.converters.popitem()
df[newcol] = df.eval(expr=expr)
# Drop unneeded columns
dropcols = reader.dropcols.split(sep=sep)
df.drop(columns=dropcols, inplace=True)
except AttributeError:
pass
assert len(df.columns) == 3 # check that we have 3 columns i.e. x, y, z
df.sort_index(axis="columns", inplace=True) # sort cols alphabetically
df.set_axis(labels=["x", "y", "z"], axis="columns", inplace=True) # lower case
## Reproject x and y coordinates if necessary
try:
reproject = jdf.loc["filters.reprojection"]
p1 = pyproj.Proj(init=reproject.in_srs)
p2 = pyproj.Proj(init=reproject.out_srs)
reproj_func = lambda x, y: pyproj.transform(p1=p1, p2=p2, x=x, y=y)
x2, y2 = reproj_func(np.array(df["x"]), np.array(df["y"]))
df["x"] = pd.Series(x2)
df["y"] = pd.Series(y2)
except KeyError:
pass
return df
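To make the pipeline format concrete, here is a hypothetical sketch of one such JSON file; the field names follow what ascii_to_xyz() looks for above (it is only loosely modelled on PDAL pipelines, and the real pipeline files in highres/ may differ in detail):
# Hypothetical pipeline file (sketch only), e.g. something like highres/example_survey.json.
# The readers.text block drives a basic pandas.read_csv, the optional converters/dropcols
# entries drive the "advanced" step, and filters.reprojection drives the pyproj step.
example_pipeline = {
    "pipeline": [
        {
            "type": "readers.text",
            "filename": "example_survey*.csv",  # glob pattern, relative to the json file
            "skip": 1,                          # header rows to skip in the raw csv
            "separator": ",",
            # names assigned to the raw columns; x/y/z-ish names are chosen so that the
            # alphabetical column sort in ascii_to_xyz() ends up in x, y, z order
            "header": "y,x,surface,thickness",
            "usecols": "y,x,surface,thickness",
            "converters": {"z": "surface - thickness"},  # bed elevation via DataFrame.eval
            "dropcols": "surface,thickness",
        },
        {"type": "filters.reprojection", "in_srs": "epsg:4326", "out_srs": "epsg:3031"},
    ]
}
print(json.dumps(example_pipeline, indent=2))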
xyz_dict = {}
for pf in sorted(glob.glob("highres/*.json")):
print(f"Processing {pf} pipeline", end=" ... ")
name = os.path.splitext(os.path.basename(pf))[0]
xyz_dict[name] = ascii_to_xyz(pipeline_file=pf)
print(f"{len(xyz_dict[name])} datapoints")
Processing highres/2007tx.json pipeline ... 42995 datapoints
Processing highres/2010tr.json pipeline ... 84922 datapoints
Processing highres/201x_Antarctica_Basler.json pipeline ... 2325792 datapoints
Processing highres/20xx_Antarctica_DC8.json pipeline ... 12840213 datapoints
Processing highres/20xx_Antarctica_TO.json pipeline ... 2895926 datapoints
Processing highres/bed_WGS84_grid.json pipeline ... 244279 datapoints
Processing highres/istarxx.json pipeline ... 396369 datapoints
def get_region(xyz_data: pd.DataFrame) -> str:
"""
Gets the bounding box region of an xyz pandas.DataFrame in string
format xmin/xmax/ymin/ymax rounded to 5 decimal places.
Used for the -R 'region of interest' parameter in GMT.
>>> xyz_data = pd.DataFrame(np.random.RandomState(seed=42).rand(30).reshape(10, 3))
>>> get_region(xyz_data=xyz_data)
'0.05808/0.83244/0.02058/0.95071'
"""
xmin, ymin, _ = xyz_data.min(axis="rows")
xmax, ymax, _ = xyz_data.max(axis="rows")
return f"{xmin:.5f}/{xmax:.5f}/{ymin:.5f}/{ymax:.5f}"
def xyz_to_grid(
xyz_data: pd.DataFrame,
region: str,
spacing: int = 250,
tension: float = 0.35,
outfile: str = None,
mask_cell_radius: int = 3,
):
"""
Performs interpolation of x, y, z point data to a raster grid.
>>> xyz_data = 1000*pd.DataFrame(np.random.RandomState(seed=42).rand(60).reshape(20, 3))
>>> region = get_region(xyz_data=xyz_data)
>>> grid = xyz_to_grid(xyz_data=xyz_data, region=region, spacing=250)
>>> grid.to_array().shape
(1, 5, 5)
>>> grid.to_array().values
array([[[403.17618 , 544.92535 , 670.7824 , 980.75055 , 961.47723 ],
[379.0757 , 459.26407 , 314.38297 , 377.78555 , 546.0469 ],
[450.67664 , 343.26 , 88.391594, 260.10492 , 452.3337 ],
[586.09906 , 469.74008 , 216.8168 , 486.9802 , 642.2116 ],
[451.4794 , 652.7244 , 325.77896 , 879.8973 , 916.7921 ]]],
dtype=float32)
"""
## Preprocessing with blockmedian
with gmt.helpers.GMTTempFile(suffix=".txt") as tmpfile:
with gmt.clib.Session() as lib:
file_context = lib.virtualfile_from_matrix(matrix=xyz_data.values)
with file_context as infile:
kwargs = {"V": "", "R": region, "I": f"{spacing}+e"}
arg_str = " ".join(
[infile, gmt.helpers.build_arg_string(kwargs), "->" + tmpfile.name]
)
lib.call_module(module="blockmedian", args=arg_str)
x, y, z = np.loadtxt(fname=tmpfile.name, unpack=True)
## XYZ point data to NetCDF grid via GMT surface
grid = gmt.surface(
x=x,
y=y,
z=z,
region=region,
spacing=f"{spacing}+e",
T=tension,
V="",
M=f"{mask_cell_radius}c",
)
## Save grid to NetCDF with projection information
if outfile is not None:
grid.to_netcdf(path=outfile) ##TODO add CRS!!
return grid
grid_dict = {}
for name in xyz_dict.keys():
print(f"Gridding {name}", end=" ... ")
xyz_data = xyz_dict[name]
region = get_region(xyz_data)
grid_dict[name] = xyz_to_grid(
xyz_data=xyz_data, region=region, outfile=f"highres/{name}.nc"
)
print(f"done! {grid_dict[name].to_array().shape}")
Gridding 2007tx ... done! (1, 266, 74)
Gridding 2010tr ... done! (1, 92, 115)
Gridding 201x_Antarctica_Basler ... done! (1, 9062, 7437)
Gridding 20xx_Antarctica_DC8 ... done! (1, 12388, 15326)
Gridding 20xx_Antarctica_TO ... done! (1, 7671, 12287)
Gridding bed_WGS84_grid ... done! (1, 123, 163)
Gridding istarxx ... done! (1, 552, 377)
grids = sorted(glob.glob("highres/*.nc"))
fig, axarr = plt.subplots(
nrows=1 + ((len(grids) - 1) // 3), ncols=3, squeeze=False, figsize=(15, 15)
)
for i, grid in enumerate(grids):
with rasterio.open(grid) as raster_source:
rasterio.plot.show(
source=raster_source, cmap="BrBG_r", ax=axarr[i // 3, i % 3], title=grid
)
def get_window_bounds(
filepath: str, height: int = 32, width: int = 32, step: int = 4
) -> list:
"""
Reads in a raster and finds tiles for them according to a stepped moving window.
Returns a list of bounding box coordinates corresponding to a tile that looks like
[(minx, miny, maxx, maxy), (minx, miny, maxx, maxy), ...]
>>> xr.DataArray(
... data=np.zeros(shape=(36, 32)),
... coords={"x": np.arange(1, 37), "y": np.arange(1, 33)},
... dims=["x", "y"],
... ).to_netcdf(path="/tmp/tmp_wb.nc")
>>> get_window_bounds(filepath="/tmp/tmp_wb.nc")
Tiling: /tmp/tmp_wb.nc ... 2
[(0.5, 4.5, 32.5, 36.5), (0.5, 0.5, 32.5, 32.5)]
>>> os.remove("/tmp/tmp_wb.nc")
"""
assert height == width # make sure it's a square!
assert height % 2 == 0 # make sure we are passing in an even number
with xr.open_rasterio(filepath) as dataset:
print(f"Tiling: {filepath} ... ", end="")
# Vectorized 'loop' along the raster image from top to bottom, and left to right
# Get boolean true/false mask of where the data/nodata pixels lie
mask = dataset.to_masked_array(copy=False).mask
mask = mask[0, :, :] # change to shape (height, width)
# Sliding window view of the input geographical raster image
window_views = skimage.util.shape.view_as_windows(
arr_in=mask, window_shape=(height, width), step=step
)
filled_tiles = ~window_views.any(
axis=(-2, -1)
) # find tiles which are fully filled, i.e. no blank/NODATA pixels
tile_indexes = np.argwhere(filled_tiles) # get x and y index of filled tiles
# Convert x,y tile indexes to bounding box coordinates
windows = [
rasterio.windows.Window(
col_off=ulx * step, row_off=uly * step, width=width, height=height
)
for uly, ulx in tile_indexes
]
window_bounds = [
rasterio.windows.bounds(
window=window,
transform=rasterio.Affine(*dataset.transform),
width=width,
height=height,
)
for window in windows
]
print(len(window_bounds))
return window_bounds
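The core trick above is applying skimage.util.shape.view_as_windows to the nodata mask; a tiny self-contained sketch of that selection logic on a toy mask (not one of the real grids):
# Toy illustration of the tile selection logic used in get_window_bounds():
# slide a (4, 4) window over an (8, 8) nodata mask in steps of 2, and keep only
# the windows that contain zero masked (nodata) pixels.
toy_mask = np.zeros(shape=(8, 8), dtype=bool)
toy_mask[0:2, :] = True  # pretend the top two rows are nodata
views = skimage.util.shape.view_as_windows(arr_in=toy_mask, window_shape=(4, 4), step=2)
print(views.shape)                # (3, 3, 4, 4) -> a 3x3 grid of candidate tiles
print(~views.any(axis=(-2, -1)))  # True where a candidate tile is fully data-filled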
filepaths = sorted([g for g in glob.glob("highres/*.nc") if g != "highres/2007tx.nc"])
window_bounds = [get_window_bounds(filepath=grid) for grid in filepaths]
window_bounds_concat = np.concatenate([w for w in window_bounds]).tolist()
print(f"Total number of tiles: {len(window_bounds_concat)}")
Tiling: highres/2010tr.nc ... 164
Tiling: highres/201x_Antarctica_Basler.nc ... 961
Tiling: highres/20xx_Antarctica_DC8.nc ... 19
Tiling: highres/20xx_Antarctica_TO.nc ... 989
Tiling: highres/bed_WGS84_grid.nc ... 172
Tiling: highres/istarxx.nc ... 175
Total number of tiles: 2480
gdf = pd.concat(
objs=[
gpd.GeoDataFrame(
pd.Series(
data=len(window_bound) * [os.path.basename(filepath)], name="grid_name"
),
crs={"init": "epsg:3031"},
geometry=[shapely.geometry.box(*bound) for bound in window_bound],
)
for filepath, window_bound in zip(filepaths, window_bounds)
]
).reset_index(drop=True)
gdf.plot()
gdf.to_file(filename="model/train/tiles_3031.geojson", driver="GeoJSON")
gdf.to_crs(crs={"init": "epsg:4326"}).to_file(
filename="model/train/tiles_4326.geojson", driver="GeoJSON"
)
def selective_tile(
filepath: str,
window_bounds: list,
padding: int = 0,
out_shape: tuple = None,
gapfill_raster_filepath: str = None,
) -> np.ndarray:
"""
Reads in raster and tiles them selectively.
Tiles will go according to list of window_bounds.
Output shape can be set to e.g. (16,16) to resample input raster to
some desired shape/resolution.
>>> xr.DataArray(
... data=np.random.RandomState(seed=42).rand(64).reshape(8, 8),
... coords={"x": np.arange(8), "y": np.arange(8)},
... dims=["x", "y"],
... ).to_netcdf(path="/tmp/tmp_st.nc", mode="w")
>>> selective_tile(
... filepath="/tmp/tmp_st.nc",
... window_bounds=[(1.0, 4.0, 3.0, 6.0), (2.0, 5.0, 4.0, 7.0)],
... )
Tiling: /tmp/tmp_st.nc
array([[[[0.18485446, 0.96958464],
[0.4951769 , 0.03438852]]],
<BLANKLINE>
<BLANKLINE>
[[[0.04522729, 0.32533032],
[0.96958464, 0.77513283]]]], dtype=float32)
>>> os.remove("/tmp/tmp_st.nc")
"""
array_list = []
with rasterio.open(filepath) as dataset:
print(f"Tiling: {filepath}")
for window_bound in window_bounds:
if padding > 0:
window_bound = (
window_bound[0] - padding, # minx
window_bound[1] - padding, # miny
window_bound[2] + padding, # maxx
window_bound[3] + padding, # maxy
)
window = rasterio.windows.from_bounds(
*window_bound, transform=dataset.transform, precision=None
).round_offsets()
# Read the raster according to the crop window
array = dataset.read(
indexes=list(range(1, dataset.count + 1)),
masked=True,
window=window,
out_shape=out_shape,
)
assert array.ndim == 3 # check that we have shape like (1, height, width)
assert array.shape[0] == 1 # channel-first (assuming only 1 channel)
try:
assert not array.mask.any() # check that there are no NAN values
except AssertionError:
# Replace pixels from another raster if available, else raise error
if gapfill_raster_filepath is not None:
with rasterio.open(gapfill_raster_filepath) as dataset2:
window2 = rasterio.windows.from_bounds(
*window_bound, transform=dataset2.transform, precision=None
).round_offsets()
array2 = dataset2.read(
indexes=list(range(1, dataset2.count + 1)),
masked=True,
window=window2,
out_shape=array.shape[1:],
)
np.copyto(
dst=array, src=array2, where=array.mask
) # fill in gaps where mask is True
else:
plt.imshow(array.data[0, :, :])
plt.show()
raise ValueError(
f"Tile has missing data, try passing in gapfill_raster_filepath"
)
# assert array.shape[0] == array.shape[1] # check that height==width
array_list.append(array.data.astype(dtype=np.float32))
return np.stack(arrays=array_list)
geodataframe = gpd.read_file("model/train/tiles_3031.geojson")
filepaths = geodataframe.grid_name.unique()
window_bounds = [
[geom.bounds for geom in geodataframe.query("grid_name == @filepath").geometry]
for filepath in filepaths
]
window_bounds_concat = np.concatenate([w for w in window_bounds]).tolist()
hireses = [
selective_tile(filepath=f"highres/{f}", window_bounds=w)
for f, w in zip(filepaths, window_bounds)
]
hires = np.concatenate(hireses)
print(hires.shape, hires.dtype)
Tiling: highres/2010tr.nc
Tiling: highres/201x_Antarctica_Basler.nc
Tiling: highres/20xx_Antarctica_DC8.nc
Tiling: highres/20xx_Antarctica_TO.nc
Tiling: highres/bed_WGS84_grid.nc
Tiling: highres/istarxx.nc
(2480, 1, 32, 32) float32
lores = selective_tile(
filepath="lowres/bedmap2_bed.tif", window_bounds=window_bounds_concat, padding=1000
)
print(lores.shape, lores.dtype)
Tiling: lowres/bedmap2_bed.tif
(2480, 1, 10, 10) float32
rema = selective_tile(
filepath="misc/REMA_100m_dem.tif",
window_bounds=window_bounds_concat,
padding=1000,
gapfill_raster_filepath="misc/REMA_200m_dem_filled.tif",
)
print(rema.shape, rema.dtype)
Tiling: misc/REMA_100m_dem.tif
(2480, 1, 100, 100) float32
measuresiceflow = selective_tile(
filepath="misc/MEaSUREs_IceFlowSpeed_450m.tif",
window_bounds=window_bounds_concat,
padding=1000,
out_shape=(20, 20),
)
print(measuresiceflow.shape, measuresiceflow.dtype)
Tiling: misc/MEaSUREs_IceFlowSpeed_450m.tif
(2480, 1, 20, 20) float32
We'll save the numpy arrays to the filesystem first. Inputs are labelled X (low resolution bed DEMs) and W (miscellaneous datasets), while the groundtruth high resolution bed DEMs are labelled Y.
We'll then serve the data up on the web using Quilt.
os.makedirs(name="model/train", exist_ok=True)
np.save(file="model/train/W1_data.npy", arr=rema)
np.save(file="model/train/W2_data.npy", arr=measuresiceflow)
np.save(file="model/train/X_data.npy", arr=lores)
np.save(file="model/train/Y_data.npy", arr=hires)
Login -> Build -> Push
quilt.login()
Launching a web browser...
If that didn't work, please visit the following URL: https://pkg.quiltdata.com/login
Failed to launch the browser: Command '['xdg-open', 'https://pkg.quiltdata.com/login']' returned non-zero exit status 3.
# Tiled datasets for training neural network
quilt.build(package="weiji14/deepbedmap/model/train/W1_data", path=rema)
quilt.build(package="weiji14/deepbedmap/model/train/W2_data", path=measuresiceflow)
quilt.build(package="weiji14/deepbedmap/model/train/X_data", path=lores)
quilt.build(package="weiji14/deepbedmap/model/train/Y_data", path=hires)
# Original datasets for neural network predictions on bigger area
quilt.build(
package="weiji14/deepbedmap/lowres/bedmap2_bed", path="lowres/bedmap2_bed.tif"
)
quilt.build(
package="weiji14/deepbedmap/misc/REMA_100m_dem", path="misc/REMA_100m_dem.tif"
)
quilt.build(
package="weiji14/deepbedmap/misc/REMA_200m_dem_filled",
path="misc/REMA_200m_dem_filled.tif",
)
quilt.build(
package="weiji14/deepbedmap/misc/MEaSUREs_IceFlowSpeed_450m",
path="misc/MEaSUREs_IceFlowSpeed_450m.tif",
)
quilt.push(package="weiji14/deepbedmap", is_public=True)
Fetching upload URLs from the registry...
0%| | 0.00/6.48G [00:00<?, ?B/s]
Uploading 11 fragments (6480586427 bytes)...
26%|██▌ | 1.69G/6.48G [00:01<190:46:53, 6.97kB/s]
Fragment 28e2ca7656d61b0bc7f8f8c1db41914023e0cab1634e0ee645f38a87d894b416 already uploaded; skipping.
Fragment 1f66fe557ce079c063597f0b04d15862f67af2c9dd4f286801851e0c71f0e869 already uploaded; skipping.
Fragment ca9c41a8dd56097e40865d2e65c65d299c22fc17608ddb6c604c532a69936307 already uploaded; skipping.
Fragment 4a4efc3a84204c3d67887e8d7fa1186467b51e696451f2832ebbea3ca491c8a8 already uploaded; skipping.
Fragment f1f660d1287225c30b8b2cbf2a727283d807a1ee443153519cbf407a08937965 already uploaded; skipping.
Fragment f750893861a1a268c8ffe0ba7db36c933223bbf5fcbb786ecef3f052b20f9b8a already uploaded; skipping.
Fragment 80c9fa41ccc69be1d2cd4a367d56168321d1079e7260a1996089810db25172f6 already uploaded; skipping.
100%|██████████| 6.48G/6.48G [00:10<00:00, 635MB/s]
Uploading package metadata...
Updating the 'latest' tag...
Push complete. weiji14/deepbedmap is live: https://quiltdata.com/package/weiji14/deepbedmap
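Once pushed, the package can be pulled down on another machine; a rough sketch of the quilt 2.x install/import pattern (not the only way to consume the package):
# Rough sketch (quilt 2.x style API) of retrieving the pushed data package elsewhere.
quilt.install("weiji14/deepbedmap", force=True)  # download the 'latest' tagged version
from quilt.data.weiji14 import deepbedmap  # import the installed package as a Python module
print(deepbedmap.model.train.X_data().shape)  # leaf nodes return the stored arrays, e.g. (2480, 1, 10, 10)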