Video lesson #9 notebook¶

Part 1: Pandas `Series` and `DataFrame` objects¶

In [ ]:

# Import Pandas (and NumPy, because Pandas is built on NumPy)
import numpy as np
import pandas as pd

In [ ]:

# Build two Pandas Series objects that share the same index of years
# (the data values are given first, then the index labels, then a name for the Series)
s1 = pd.Series([4.1,5.2,6.3,7.4,8.5],
               index=[2016,2017,2018,2019,2020],
               name='Temperature')
s2 = pd.Series([35.5,35.0,34.5,34.0,33.5],
               index=[2016,2017,2018,2019,2020],
               name='Salinity')

# Just like lists and NumPy arrays, a Series has a length
print(len(s1))

In [ ]:

# Extract parts of the Series object
# (.values is used throughout this notebook; the Pandas docs now recommend .to_numpy() for the same purpose)
print(s1.index)         # get index as Index object (not very useful); newer Pandas versions print it as Index([...]) rather than Int64Index([...])
print(s1.index.values)  # get index converted into NumPy array
print(s1.values)        # get data converted into NumPy array

Int64Index([2016, 2017, 2018, 2019, 2020], dtype='int64')
[2016 2017 2018 2019 2020]
[4.1 5.2 6.3 7.4 8.5]

In [ ]:

# Select data from Series object using .iloc (Python/NumPy-style selection by position)
# (Each print() gets two arguments, so the default separator adds one space before the first line of each result)
print('\nOption 1:\n', s1.iloc[3])                             # use a single integer position (returns the value)
print('\nOption 2:\n', s1.iloc[[2,3,4]])                       # use a list or array of integer positions (returns a Series)
print('\nOption 3:\n', s1.iloc[2:5])                           # use a slice of integer positions; the end is exclusive (returns a Series)
print('\nOption 4:\n', s1.iloc[[False,False,True,True,True]])  # use a Boolean list/array with one entry per element (returns a Series)

# Select data from Series object using .loc (selection by label)
print('\nOption 5:\n', s1.loc[2019])              # use a single label of the index (NOT an integer position along the index)
print('\nOption 6:\n', s1.loc[[2018,2019,2020]])  # use a list or array of labels
print('\nOption 7:\n', s1.loc[2018:2020])         # use a slice of labels (UNLIKE standard Python/NumPy slices, the end value is inclusive)

# Remember that you have to use .values to convert a Series to a NumPy array:
print('\nReminder:\n', s1.loc[2018:2020].values)

Option 1:
 7.4

Option 2:
 2018    6.3
2019    7.4
2020    8.5
Name: Temperature, dtype: float64

Option 3:
 2018    6.3
2019    7.4
2020    8.5
Name: Temperature, dtype: float64

Option 4:
 2018    6.3
2019    7.4
2020    8.5
Name: Temperature, dtype: float64

Option 5:
 7.4

Option 6:
 2018    6.3
2019    7.4
2020    8.5
Name: Temperature, dtype: float64

Option 7:
 2018    6.3
2019    7.4
2020    8.5
Name: Temperature, dtype: float64

Reminder:
 [6.3 7.4 8.5]

In [ ]:

# Changing values of a Series using the indexing options above
s1.loc[2018] = 5.3          # assign a single value, selected by label
print(s1)
s1.iloc[3:5] = [6.4,7.5]    # assign two values, selected by position (positions 3 and 4, i.e. labels 2019 and 2020)
print(s1)
s1.loc[2018:2020] += 1      # add 1 to a slice selected by label (2020 is included); this restores the original values
print(s1)

2016    4.1
2017    5.2
2018    5.3
2019    7.4
2020    8.5
Name: Temperature, dtype: float64
2016    4.1
2017    5.2
2018    5.3
2019    6.4
2020    7.5
Name: Temperature, dtype: float64
2016    4.1
2017    5.2
2018    6.3
2019    7.4
2020    8.5
Name: Temperature, dtype: float64

In [ ]:

# Add a new value to a Series using a new index label
# (Assigning through .loc with a label that isn't in the index yet appends a new entry;
#  .iloc can't be used for this, because it can only refer to positions that already exist)
s1.loc[2021] = 9.6
print(s1)

2016    4.1
2017    5.2
2018    6.3
2019    7.4
2020    8.5
2021    9.6
Name: Temperature, dtype: float64

In [ ]:

# Two ways of creating a Pandas DataFrame object
# (Both options assign to df, so only the result of Option 2 is kept for the cells below)

# Option 1: join two or more Series objects
# (Each Series becomes a column named after the Series, and rows are matched up by index label.
#  NOTE: s1 was given a 2021 entry above that s2 doesn't have, so this version of df has a
#  2021 row with a missing value [NaN] for Salinity.)
df = pd.concat([s1,s2],axis=1)

# Option 2: provide a dictionary with the data lists or NumPy arrays
# (The dictionary keys become the column names)
df = pd.DataFrame(index=[2016,2017,2018,2019,2020],
                  data={'Temperature':[4.1,5.2,6.3,7.4,8.5],
                        'Salinity':[35.5,35.0,34.5,34.0,33.5]})

In [ ]:

# Get information about the DataFrame object
print(df.shape)    # get dimensions as (number of rows, number of columns)
print(df.size)     # get number of data values (rows x columns)
print(df)          # print() still works, but is not as nice looking as display()
display(df)        # display() (available in Jupyter/Colab notebooks) shows a more nicely formatted view of the object
df.describe()      # get useful summary statistics (shown as output because it's the last line of the cell)

(5, 2)
10
      Temperature  Salinity
2016          4.1      35.5
2017          5.2      35.0
2018          6.3      34.5
2019          7.4      34.0
2020          8.5      33.5

	Temperature	Salinity
2016	4.1	35.5
2017	5.2	35.0
2018	6.3	34.5
2019	7.4	34.0
2020	8.5	33.5

Out[ ]:

	Temperature	Salinity
count	5.000000	5.000000
mean	6.300000	34.500000
std	1.739253	0.790569
min	4.100000	33.500000
25%	5.200000	34.000000
50%	6.300000	34.500000
75%	7.400000	35.000000
max	8.500000	35.500000

In [ ]:

# Extract parts of the DataFrame object
print(df.index.values)       # get index (row labels) as a NumPy array
print(df.columns.values)     # get column names as a NumPy array
print(df.values)             # get data as a 2-D NumPy array (rows x columns)
print(df['Salinity'].values) # get one column as a NumPy array
                             #  (similar to dictionary indexing)

[2016 2017 2018 2019 2020]
['Temperature' 'Salinity']
[[ 4.1 35.5]
 [ 5.2 35. ]
 [ 6.3 34.5]
 [ 7.4 34. ]
 [ 8.5 33.5]]
[35.5 35.  34.5 34.  33.5]

In [ ]:

# Select data from DataFrame object using .iloc or .loc
print('\nExample 1:\n', df.iloc[3])                    # use a single integer position (returns that row as a Series)
print('\nExample 2:\n', df.loc[2019])                  # use a single label (returns that row as a Series)
print('\nExample 3:\n', df.iloc[2:5])                  # use a slice of integer positions (returns a DataFrame)
print('\nExample 4:\n', df.loc[2018:2020])             # use a slice of labels (returns a DataFrame)
print('\nExample 5:\n', df['Temperature'].loc[2019])   # select a column AND choose a single row (returns the value)
print('\nExample 6:\n', df[['Temperature','Salinity']].loc[2019])  # select multiple columns AND choose a single row (returns a Series)
print('\nExample 7:\n', df[df['Temperature'] > 6.0])   # use a Boolean condition applied to one column (returns a DataFrame of the rows where it is True)

# NOTE: changing values using .iloc and .loc selection works similarly to what was shown above for Series

Example 1:
 Temperature     7.4
Salinity       34.0
Name: 2019, dtype: float64

Example 2:
 Temperature     7.4
Salinity       34.0
Name: 2019, dtype: float64

Example 3:
       Temperature  Salinity
2018          6.3      34.5
2019          7.4      34.0
2020          8.5      33.5

Example 4:
       Temperature  Salinity
2018          6.3      34.5
2019          7.4      34.0
2020          8.5      33.5

Example 5:
 7.4

Example 6:
 Temperature     7.4
Salinity       34.0
Name: 2019, dtype: float64

Example 7:
       Temperature  Salinity
2018          6.3      34.5
2019          7.4      34.0
2020          8.5      33.5

In [ ]:

# Apply NumPy-style functions (available as methods) to Series and DataFrame objects
print('\nExample 1:\n', df.mean())              # take the mean along the index (axis 0), giving one value per column
print('\nExample 2:\n', df.mean(axis=0))        # same as above
print('\nExample 3:\n', df.mean(axis=1))        # take the mean along the columns (axis 1), giving one value per row
print('\nExample 4:\n', df.mean(skipna=True))   # ignore NaN values (if present) when taking the mean (this is the default behavior)

# Combine column extraction, selection by label, and applying a NumPy-style function
print('\nExample 5:\n', df['Salinity'].loc[2017:].mean())   # returns a single value (the mean from 2017 onward)

Example 1:
 Temperature     6.3
Salinity       34.5
dtype: float64

Example 2:
 Temperature     6.3
Salinity       34.5
dtype: float64

Example 3:
 2016    19.8
2017    20.1
2018    20.4
2019    20.7
2020    21.0
dtype: float64

Example 4:
 Temperature     6.3
Salinity       34.5
dtype: float64

Example 5:
 34.25

In [ ]:

# Save a Pandas DataFrame as a CSV file
# df.to_csv('filepath/including/filename.csv')

# Read a CSV file as a Pandas DataFrame (more powerful than np.genfromtxt()!)
# See available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
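# df = pd.read_csv('filepath/including/filename.csv',delimiter=',',delim_whitespace=False,header=0)
# (Note: delim_whitespace is deprecated as of Pandas 2.2; for whitespace-separated files, use sep='\s+' instead)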

# Read an Excel spreadsheet as a Pandas DataFrame
# See available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
# df = pd.read_excel('filepath/including/filename.xlsx',sheet_name='Sheet1')

Part 2: xarray `DataArray` and `Dataset` objects¶

In [ ]:

# Import xarray (and other libraries, because they are helpful when working with xarray files)
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

In [ ]:

# You'll need to install the netCDF4 library to work with netCDF files
# You should only need to run this line of code once per Colab notebook,
#   so comment it out or delete it afterwards
# !pip install netcdf4

In [ ]:

# Give Colab access to Google Drive
# (The google.colab module is only available when running on Colab;
#  this makes the Drive contents available under /content/drive)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

In [ ]:

# NOTE: edit this variable so that it points to your own copy of the data file
# (the two adjacent string literals inside the parentheses are joined into one path)
filepath = ("drive/My Drive/OCEAN 215 - Autumn '20/OCEAN 215 - Autumn '20 - Course documents/"
            "Video lesson slides and notebooks/2020-11-03 - lesson #9 data/bsose_monthly_velocities.nc")

# Load the netCDF file as an xarray Dataset
# (This method is safe on Colab for files
#  up to about 0.5 GB [500 MB] in size)
data = xr.open_dataset(filepath)

In [ ]:

# Examine structure of xarray Dataset using the interactive display() interface
display(data)    # Note there are 2 variables (U = eastward velocities, V = northward velocities),
                 #   4 dimensions, and 4 coordinates (time, lat, lon, depth), so each variable is a 4D array

# Note that you can click the buttons to view attributes (page icon) and actual data values (cylinder icon)

In [ ]:

# Extract data variables from the Dataset, similar to extracting columns from a Pandas DataFrame
# display(data['V'])   # (uncomment this line to look at the other variable as well)
display(data['U'])   # gives an xarray DataArray

# Analogy: 2+ xarray DataArrays = an xarray Dataset
#     vs.  2+ Pandas Series = a Pandas DataFrame

In [ ]:

# You can do mathematical calculations between xarray DataArrays, as long as their dimensions match
# (the arithmetic is applied element-by-element, just like with NumPy arrays)

# Example: calculate current speed using Pythagorean theorem:
#          speed = sqrt(U^2 + V^2)
speed = (data['U']**2 + data['V']**2)**0.5    # raising to the power 0.5 takes the square root
display(speed)

# Note that the coordinates and dimensions remained the same:

In [ ]:

# Note that each variable has its own attributes (metadata such as units and a long name),
# which we can view above, or access using .attrs
print(data['U'].attrs)

{'units': 'm/s', 'long_name': 'Zonal Component of Velocity (m/s)', 'standard_name': 'UVEL', 'mate': 'VVEL'}

In [ ]:

# And the attributes themselves are a dictionary, so we retrieve values using a key:
print(data['U'].attrs['units'])

# You can also change attributes, using the key to change its value inside the dictionary
# (nothing in this notebook writes the Dataset back to disk, so the netCDF file itself is left as it was):
data['U'].attrs['units'] = 'meters/second'

m/s

In [ ]:

# Once you've selected a variable using brackets, you can index into it using .isel() (selection by integer position)
#
# This retrieves the value at index 0 along the time coordinate, index 200 along latitude,
# index 500 along longitude, and index 0 along depth
# Analogous to writing u[0,200,500,0] for a NumPy array u whose axes are ordered (time, lat, lon, depth)
data['U'].isel(time=0,lat=200,lon=500,depth=0)              # returns a single value, still wrapped as a (0-dimensional) xarray DataArray

# Notice below which coordinate values we've indexed into:

In [ ]:

# You can convert a single-value DataArray result to a regular Python number using float() or .item():
print(data['U'].isel(time=0,lat=200,lon=500,depth=0).item())
print(float(data['U'].isel(time=0,lat=200,lon=500,depth=0)))

0.1258898824453354
0.1258898824453354

In [ ]:

# You can select multiple indices using .isel()
# (These two lines select the same 5 depths; only the last line's result is displayed below the cell.
#  The NumPy analogies assume an array u with axes ordered time, lat, lon, depth.)
data['U'].isel(time=0,lat=200,lon=500,depth=[0,1,2,3,4])     # analogous to NumPy: u[0,200,500,[0,1,2,3,4]]
data['U'].isel(time=0,lat=200,lon=500,depth=slice(0,5))      # analogous to NumPy: u[0,200,500,0:5]

# Notice below that the result has a dimension of 5 depths, and we see the depths range from 2.1 m to 146.5 m:

In [ ]:

# Multiple results can be converted from a DataArray to the underlying NumPy array using .values:
data['U'].isel(time=0,lat=200,lon=500,depth=slice(0,5)).values

Out[ ]:

array([0.12588988, 0.05039841, 0.05717332, 0.06155456, 0.057382  ],
      dtype=float32)

In [ ]:

# This also works when the underlying NumPy array has more than one dimension (e.g. is 2-D, 3-D, etc.):
display(data['U'].isel(time=0,lat=slice(200,204),lon=slice(500,504),depth=0))   # 4 latitudes x 4 longitudes at one time and depth
data['U'].isel(time=0,lat=slice(200,204),lon=slice(500,504),depth=0).values

# Calling .values on the result gave a 4x4 NumPy array:

In [ ]:

# And you can select a value or multiple values along coordinate(s) using .sel()
# (selection by coordinate value; without method='nearest', the values given must match the coordinates exactly):
data['U'].sel(time=datetime(2012,1,30,20),lat=-52.70605,lon=-13.0,depth=2.1)

In [ ]:

# Slicing works similarly between .isel() (slice by index) and .sel() (slice by value):
# (as with Pandas .loc, coordinate values equal to either end of a .sel() slice are included)
data['U'].sel(time=datetime(2012,1,30,20,0,0),lat=-52.70605,lon=-13.0,depth=slice(2,147))    # slicing values don't have to be exact

In [ ]:

# Sometimes you don't know the exact coordinate values, so you can ask xarray to find the 'nearest' values:
data['U'].sel(time=datetime(2012,1,30),lat=-53,lon=-13,depth=2,method='nearest')    # with method='nearest', the requested values don't have to be exact

In [ ]:

# Other examples of slicing to get a 2-D result
# (each result is still an xarray DataArray; add .values to get the 2D NumPy array)

# Here, the remaining dimensions are latitude and longitude
# (because we've selected a single time and single depth)
display(data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                      lat=slice(-50,-40),lon=slice(0,120)))

# Here, the remaining dimensions are depth and longitude
# (because we've selected a single time and single latitude)
# Note that method='nearest' is applied in a separate .sel() call, because xarray
# doesn't allow it in the same call as slice() selections
display(data['U'].sel(time=datetime(2012,1,30,20),depth=slice(200,1000),
                      lon=slice(-120,0)).sel(lat=-57,method='nearest'))

In [ ]:

# You can reduce data from an xarray DataArray by applying a NumPy-style function:

# .mean() calculates the average over both of the remaining axes (latitude and longitude),
# and .item() converts the single-value result to a regular Python number
print(data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                    lat=slice(-50,-40),lon=slice(0,120)).mean().item())

0.16497819125652313

In [ ]:

# .mean(dim='lon') calculates the average across the longitude dimension,
# leaving only latitude as the remaining dimension
# (note in the output that the averaged result no longer carries the variable's attributes, e.g. units)
display(data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                      lat=slice(-50,-40),lon=slice(0,120)).mean(dim='lon'))

xarray.DataArray

'U'

lat: 42

0.19636832 0.19726074 0.19570175 ... 0.112251155 0.108821966

array([0.19636832, 0.19726074, 0.19570175, 0.19753072, 0.1995998 ,
       0.20088746, 0.20197098, 0.20239758, 0.20189096, 0.20043223,
       0.19856165, 0.19670185, 0.19461559, 0.19198534, 0.18877912,
       0.18521579, 0.18209256, 0.18030445, 0.1799029 , 0.18019168,
       0.17991017, 0.17799716, 0.17403169, 0.1685389 , 0.16236207,
       0.15637264, 0.15085906, 0.14584213, 0.14138193, 0.13736448,
       0.13341603, 0.12943429, 0.12593448, 0.12360686, 0.12302325,
       0.12359596, 0.12365858, 0.1221991 , 0.11970461, 0.11638598,
       0.11225116, 0.10882197], dtype=float32)

Coordinates: (3)

time
()
datetime64[ns]
2012-01-30T20:00:00
long_name :
Time
standard_name :
time
axis :
T
```
array('2012-01-30T20:00:00.000000000', dtype='datetime64[ns]')
```

lat

(lat)

float32

-49.78614 -49.570454 ... -40.151318

units :: degrees_north
standard_name :: latitude
long_name :: Latitude
axis :: Y

array([-49.78614 , -49.570454, -49.3538  , -49.136192, -48.91762 , -48.698093,
       -48.4776  , -48.256153, -48.033737, -47.810356, -47.586014, -47.360703,
       -47.134426, -46.90718 , -46.67897 , -46.4498  , -46.21965 , -45.988537,
       -45.75645 , -45.523396, -45.28937 , -45.05438 , -44.818413, -44.58148 ,
       -44.34357 , -44.1047  , -43.864857, -43.62405 , -43.38228 , -43.13953 ,
       -42.895817, -42.65114 , -42.40549 , -42.15888 , -41.911297, -41.662758,
       -41.413254, -41.16278 , -40.91135 , -40.658962, -40.405617, -40.151318],
      dtype=float32)

depth
()
float32
2.1
units :
m
standard_name :
depth
long_name :
Vertical coordinate of cell center
axis :
-Z
```
array(2.1, dtype=float32)
```

Attributes: (0)

In [ ]:

# Save the result (keeping it in xarray format, not NumPy, to keep the latitude coordinate)
lat_velocities = data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                               lat=slice(-50,-40),lon=slice(0,120)).mean(dim='lon')

# This gives the eastward velocity averaged over all longitudes in the swath,
# so it's a 1-D array (a line) over latitude
plt.figure(figsize=(4,4))
plt.plot(lat_velocities['lat'],lat_velocities.values,c='k')   # x = latitude coordinate, y = averaged velocities; c='k' makes the line black
plt.xlabel('Latitude (°N)')
plt.ylabel('Eastward velocity (m/s)')
plt.grid()

Video lesson #9 notebook¶

Part 1: Pandas Series and DataFrame objects¶

Part 2: xarray DataArray and Dataset objects¶

Part 1: Pandas `Series` and `DataFrame` objects¶

Part 2: xarray `DataArray` and `Dataset` objects¶