# Import Pandas (and NumPy, because Pandas is built on NumPy)
import numpy as np
import pandas as pd

# Create two new Pandas Series objects
s1 = pd.Series(index=[2016,2017,2018,2019,2020], data=[4.1,5.2,6.3,7.4,8.5], name='Temperature')
s2 = pd.Series(index=[2016,2017,2018,2019,2020], data=[35.5,35.0,34.5,34.0,33.5], name='Salinity')

# Series still have a length, as with lists and NumPy arrays
print(len(s1))

# Extract parts of the Series object
print(s1.index)         # get index as an Index object (not very useful)
print(s1.index.values)  # get index converted into a NumPy array
print(s1.values)        # get data converted into a NumPy array

# Select data from a Series object using .iloc (Python/NumPy-style selection by position)
print('\nOption 1:\n', s1.iloc[3])                             # use a single integer index (returns the value)
print('\nOption 2:\n', s1.iloc[[2,3,4]])                       # use a list or array of integer indices (returns a Series)
print('\nOption 3:\n', s1.iloc[2:5])                           # use a slice of integer indices (returns a Series)
print('\nOption 4:\n', s1.iloc[[False,False,True,True,True]])  # use a Boolean array (returns a Series)

# Select data from a Series object using .loc (selection by label)
print('\nOption 5:\n', s1.loc[2019])              # use a single label of the index (NOT an integer position along the index)
print('\nOption 6:\n', s1.loc[[2018,2019,2020]])  # use a list or array of labels
print('\nOption 7:\n', s1.loc[2018:2020])         # use a slice of labels (UNLIKE standard Python/NumPy slices, the end value is inclusive)

# Remember that you have to use .values to convert a Series to a NumPy array:
print('\nReminder:\n', s1.loc[2018:2020].values)

# Changing values of a Series using the indexing options above
s1.loc[2018] = 5.3
print(s1)
s1.iloc[3:5] = [6.4,7.5]
print(s1)
s1.loc[2018:2020] += 1
print(s1)

# Add a new value to a Series using a new index label
s1.loc[2021] = 9.6
print(s1)

# Two ways of creating a Pandas DataFrame object
# Option 1: join two or more Series objects
df = pd.concat([s1,s2],axis=1)
# Option 2: provide a dictionary with the data lists or NumPy arrays
df = pd.DataFrame(index=[2016,2017,2018,2019,2020],
                  data={'Temperature':[4.1,5.2,6.3,7.4,8.5],
                        'Salinity':[35.5,35.0,34.5,34.0,33.5]})

# Get information about the DataFrame object
print(df.shape)   # get dimensions
print(df.size)    # get number of data values
print(df)         # print() still works, but is not as nice looking as display()
display(df)       # display() shows a more nicely formatted view of the object
df.describe()     # get useful summary statistics

# Extract parts of the DataFrame object
print(df.index.values)        # get index as a NumPy array
print(df.columns.values)      # get column names as a NumPy array
print(df.values)              # get data as a NumPy array
print(df['Salinity'].values)  # get one column as a NumPy array (similar to dictionary indexing)

# Select data from a DataFrame object using .iloc or .loc
print('\nExample 1:\n', df.iloc[3])                                # use a single integer index (returns a Series)
print('\nExample 2:\n', df.loc[2019])                              # use a single label (returns a Series)
print('\nExample 3:\n', df.iloc[2:5])                              # use a slice of integer indices (returns a DataFrame)
print('\nExample 4:\n', df.loc[2018:2020])                         # use a slice of labels (returns a DataFrame)
print('\nExample 5:\n', df['Temperature'].loc[2019])               # select a column AND choose a single row (returns the value)
print('\nExample 6:\n', df[['Temperature','Salinity']].loc[2019])  # select multiple columns AND choose a single row (returns a Series)
print('\nExample 7:\n', df[df['Temperature'] > 6.0])               # use a Boolean condition applied to one column (returns a DataFrame)

# NOTE: changing values using .iloc and .loc selection works the same way as shown above for Series
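# To illustrate that note, here is a minimal sketch (not part of the original lesson;
# the assigned numbers are arbitrary illustrative values) of changing DataFrame values in place:
df.loc[2018, 'Temperature'] = 6.4                            # change a single cell by row label and column name
df.iloc[0:2, df.columns.get_loc('Salinity')] = [35.6, 35.1]  # change the first two salinity values by integer position
df.loc[2019:2020, 'Temperature'] += 0.5                      # arithmetic updates work on label slices too
print(df)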
# Apply NumPy functions to Series and DataFrame objects
print('\nExample 1:\n', df.mean())             # take the mean along the index (axis 0)
print('\nExample 2:\n', df.mean(axis=0))       # same as above
print('\nExample 3:\n', df.mean(axis=1))       # take the mean along the columns (axis 1)
print('\nExample 4:\n', df.mean(skipna=True))  # ignore NaN values (if present) when taking the mean

# Combine column extraction, selection by label, and applying a NumPy function
print('\nExample 5:\n', df['Salinity'].loc[2017:].mean())  # returns a single value

# Save a Pandas DataFrame as a CSV file
# df.to_csv('filepath/including/filename.csv')

# Read a CSV file as a Pandas DataFrame (more powerful than np.genfromtxt()!)
# See available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
# df = pd.read_csv('filepath/including/filename.csv',delimiter=',',delim_whitespace=False,header=0)

# Read an Excel spreadsheet as a Pandas DataFrame
# See available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
# df = pd.read_excel('filepath/including/filename.xlsx',sheet_name='Sheet1')

# Import xarray (and other libraries, because they are helpful when working with xarray files)
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# You'll need to install the netCDF4 library to work with netCDF files
# You should only need to run this line of code once per Colab notebook,
# so comment it out or delete it afterwards
# !pip install netcdf4

# Give Colab access to Google Drive
from google.colab import drive
drive.mount('/content/drive')

# NOTE: you'll need to change this variable to match your own filepath
filepath = 'drive/My Drive/OCEAN 215 - Autumn \'20/OCEAN 215 - Autumn \'20 - Course documents/' \
           + 'Video lesson slides and notebooks/2020-11-03 - lesson #9 data/bsose_monthly_velocities.nc'

# This is how we load a netCDF file
# (This method is safe on Colab for files up to about 0.5 GB [500 MB] in size)
data = xr.open_dataset(filepath)

# Examine the structure of the xarray Dataset using the interactive display() interface
display(data)
# Note there are 2 variables (eastward velocities, northward velocities),
# 4 dimensions, and 4 coordinates (time, lat, lon, depth), so each variable is a 4-D array
# Note that you can click the buttons to view attributes (page icon) and actual data values (cylinder icon)

# Extract data variables from the Dataset, similar to extracting columns from a Pandas DataFrame
# display(data['V'])
display(data['U'])  # gives an xarray DataArray

# Analogy: 2+ xarray DataArrays = an xarray Dataset
#     vs.  2+ Pandas Series     = a Pandas DataFrame
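# To make that analogy concrete, here is a minimal sketch (not part of the original lesson):
# just as pd.concat() joined two Series into a DataFrame above, xr.merge() can join
# two DataArrays back into a Dataset (the variable names 'U' and 'V' come from the file loaded above)
uv = xr.merge([data['U'], data['V']])
display(uv)  # a Dataset with the same two variables and their shared coordinates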
# You can do mathematical calculations between xarray DataArrays, as long as their dimensions match
# Example: calculate current speed using the Pythagorean theorem:
#          speed = sqrt(U^2 + V^2)
speed = (data['U']**2 + data['V']**2)**0.5
display(speed)
# Note that the coordinates and dimensions remained the same:

# Note that each variable has its own attributes, which we can view above, or access using .attrs
print(data['U'].attrs)
# And the attributes themselves are a dictionary, so we retrieve values using a key:
print(data['U'].attrs['units'])
# You can also change attributes, using the key to change its value inside the dictionary:
data['U'].attrs['units'] = 'meters/second'

# Once you've selected a variable using brackets, you can index into it using .isel()
# This retrieves the value at the 0th index along the time coordinate, the 200th index along latitude, etc.
# Analogous to writing u[0,200,500,0] for a NumPy array
data['U'].isel(time=0,lat=200,lon=500,depth=0)  # returns a single value, still wrapped as an xarray DataArray
# Notice below which coordinate values we've indexed into:

# You can convert a single-value result to a number using float() or .item():
print(data['U'].isel(time=0,lat=200,lon=500,depth=0).item())
print(float(data['U'].isel(time=0,lat=200,lon=500,depth=0)))

# You can select multiple indices using .isel()
data['U'].isel(time=0,lat=200,lon=500,depth=[0,1,2,3,4])  # analogous to NumPy: u[0,200,500,[0,1,2,3,4]]
data['U'].isel(time=0,lat=200,lon=500,depth=slice(0,5))   # analogous to NumPy: u[0,200,500,0:5]
# Notice below that the result has a dimension of 5 depths, and we see the depths range from 2.1 m to 146.5 m:

# Multiple results can be converted from a DataArray to the underlying NumPy array using .values:
data['U'].isel(time=0,lat=200,lon=500,depth=slice(0,5)).values
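# A quick sanity check (a sketch added here, not part of the original lesson; it assumes the
# dimension order of 'U' is (time, lat, lon, depth), as the u[0,200,500,0] analogy above suggests):
# the .isel() selection should match indexing the underlying NumPy array directly
xr_slice = data['U'].isel(time=0,lat=200,lon=500,depth=slice(0,5)).values
np_slice = data['U'].values[0,200,500,0:5]  # note: .values here loads the full 4-D array into memory
print(np.allclose(xr_slice, np_slice, equal_nan=True))  # expect True if the assumed dimension order is right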
# This also works when the underlying NumPy array has more than one dimension (e.g. is 2-D, 3-D, etc.):
display(data['U'].isel(time=0,lat=slice(200,204),lon=slice(500,504),depth=0))
data['U'].isel(time=0,lat=slice(200,204),lon=slice(500,504),depth=0).values
# Calling .values on the result gave a 4x4 NumPy array:

# And you can select a value or multiple values along coordinate(s) using .sel():
data['U'].sel(time=datetime(2012,1,30,20),lat=-52.70605,lon=-13.0,depth=2.1)

# Slicing works similarly between .isel() (slice by index) and .sel() (slice by value):
data['U'].sel(time=datetime(2012,1,30,20,0,0),lat=-52.70605,lon=-13.0,depth=slice(2,147))  # slicing values don't have to be exact

# Sometimes you don't know the exact coordinate values, so you can ask xarray to find the 'nearest' values:
data['U'].sel(time=datetime(2012,1,30),lat=-53,lon=-13,depth=2,method='nearest')

# Other examples of slicing to get a 2-D result

# Here, the remaining dimensions are latitude and longitude
# (because we've selected a single time and single depth)
display(data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                      lat=slice(-50,-40),lon=slice(0,120)))

# Here, the remaining dimensions are depth and longitude
# (because we've selected a single time and single latitude)
display(data['U'].sel(time=datetime(2012,1,30,20),depth=slice(200,1000),
                      lon=slice(-120,0)).sel(lat=-57,method='nearest'))

# You can reduce data from an xarray DataArray by applying a NumPy function:

# .mean() calculates the average over both of the remaining dimensions (latitude and longitude)
print(data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                    lat=slice(-50,-40),lon=slice(0,120)).mean().item())

# .mean(dim='lon') calculates the average across the longitude dimension,
# leaving only latitude as the remaining dimension
display(data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                      lat=slice(-50,-40),lon=slice(0,120)).mean(dim='lon'))

# Save the result (keeping it in xarray format, not NumPy, to keep the latitude coordinate)
lat_velocities = data['U'].sel(time=datetime(2012,1,30,20),depth=2.1,
                               lat=slice(-50,-40),lon=slice(0,120)).mean(dim='lon')

# This gives the eastward velocity averaged over all longitudes in the swath,
# so it's a 1-D array (a line) over latitude
plt.figure(figsize=(4,4))
plt.plot(lat_velocities['lat'],lat_velocities.values,c='k')
plt.xlabel('Latitude (°N)')
plt.ylabel('Eastward velocity (m/s)')
plt.grid()
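# A minimal sketch extending the same pattern (not part of the original lesson):
# the derived 'speed' DataArray from earlier can be reduced in the same way, and
# .mean() also accepts a list of dimension names, leaving depth as the only remaining dimension
speed_profile = speed.sel(time=datetime(2012,1,30,20),
                          lat=slice(-50,-40),lon=slice(0,120)).mean(dim=['lat','lon'])
plt.figure(figsize=(4,4))
plt.plot(speed_profile.values,speed_profile['depth'],c='k')
plt.gca().invert_yaxis()  # show shallow depths at the top of the plot
plt.xlabel('Current speed (m/s)')
plt.ylabel('Depth (m)')
plt.grid()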