# Import Pandas (and NumPy, because Pandas is built on NumPy),
# plus xarray and the helper libraries used by the xarray cells further down
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
In [ ]:
# Create two new Pandas Series objects
# (both share the same index of years; `name` labels the data each Series holds)
s1 = pd.Series(
    index=[2016, 2017, 2018, 2019, 2020],
    data=[4.1, 5.2, 6.3, 7.4, 8.5],
    name='Temperature',
)
s2 = pd.Series(
    index=[2016, 2017, 2018, 2019, 2020],
    data=[35.5, 35.0, 34.5, 34.0, 33.5],
    name='Salinity',
)

# Series still have a length, as with lists and NumPy arrays
print(len(s1))
5
In [ ]:
# Extract parts of the Series object
print(s1.index)         # get index as Index object (not very useful)
print(s1.index.values)  # get index converted into NumPy array
print(s1.values)        # get data converted into NumPy array
# Select data from Series object using .iloc (Python/NumPy-style selection by position)
print('\nOption 1:\n', s1.iloc[3])          # use a single integer index (returns the value)
print('\nOption 2:\n', s1.iloc[[2, 3, 4]])  # use a list or array of integer indices (returns a Series)
print('\nOption 3:\n', s1.iloc[2:5])        # use a slice of integer indices (returns a Series)
print('\nOption 4:\n', s1.iloc[[False, False, True, True, True]])  # use a Boolean array (returns a Series)

# Select data from Series object using .loc (selection by label)
print('\nOption 5:\n', s1.loc[2019])                # use a single label of the index (NOT an integer position along the index)
print('\nOption 6:\n', s1.loc[[2018, 2019, 2020]])  # use a list or array of labels
print('\nOption 7:\n', s1.loc[2018:2020])           # use a slice of labels (UNLIKE standard Python/NumPy slices, the end value is inclusive)

# Remember that you have to use .values to convert a Series to a NumPy array:
print('\nReminder:\n', s1.loc[2018:2020].values)
# Two ways of creating a Pandas DataFrame object
# (both produce the same DataFrame; the second assignment simply replaces the first)

# Option 1: join two or more Series objects
df = pd.concat([s1, s2], axis=1)

# Option 2: provide a dictionary with the data lists or NumPy arrays
df = pd.DataFrame(
    index=[2016, 2017, 2018, 2019, 2020],
    data={
        'Temperature': [4.1, 5.2, 6.3, 7.4, 8.5],
        'Salinity': [35.5, 35.0, 34.5, 34.0, 33.5],
    },
)
In [ ]:
# Get information about the DataFrame object
print(df.shape)  # get dimensions
print(df.size)   # get number of data values
print(df)        # print() still works, but is not as nice looking as display()
display(df)      # display() opens the display interface, a more nicely formatted view of the object
df.describe()    # get useful summary statistics
# Extract parts of the DataFrame object
print(df.index.values)        # get index as a NumPy array
print(df.columns.values)      # get column names as a NumPy array
print(df.values)              # get data as a NumPy array
print(df['Salinity'].values)  # get one column as a NumPy array
                              # (similar to dictionary indexing)
# Select data from DataFrame object using .iloc or .loc
print('\nExample 1:\n', df.iloc[3])         # use a single index (returns a Series)
print('\nExample 2:\n', df.loc[2019])       # use a single label (returns a Series)
print('\nExample 3:\n', df.iloc[2:5])       # use a slice of integer indices (returns a DataFrame)
print('\nExample 4:\n', df.loc[2018:2020])  # use a slice of labels (returns a DataFrame)
print('\nExample 5:\n', df['Temperature'].loc[2019])                # select a column AND choose a single row (returns the value)
print('\nExample 6:\n', df[['Temperature', 'Salinity']].loc[2019])  # select multiple columns AND choose a single row (returns a Series)
print('\nExample 7:\n', df[df['Temperature'] > 6.0])                # use a Boolean condition applied to one column (returns a DataFrame)

# NOTE: changing values using .iloc and .loc selection works similarly to how it works with Series
Example 1:
Temperature 7.4
Salinity 34.0
Name: 2019, dtype: float64
Example 2:
Temperature 7.4
Salinity 34.0
Name: 2019, dtype: float64
Example 3:
Temperature Salinity
2018 6.3 34.5
2019 7.4 34.0
2020 8.5 33.5
Example 4:
Temperature Salinity
2018 6.3 34.5
2019 7.4 34.0
2020 8.5 33.5
Example 5:
7.4
Example 6:
Temperature 7.4
Salinity 34.0
Name: 2019, dtype: float64
Example 7:
Temperature Salinity
2018 6.3 34.5
2019 7.4 34.0
2020 8.5 33.5
In [ ]:
# Apply NumPy functions to Series and DataFrame objects
print('\nExample 1:\n', df.mean())             # take the mean along the index (axis 0)
print('\nExample 2:\n', df.mean(axis=0))       # same as above
print('\nExample 3:\n', df.mean(axis=1))       # take the mean along the columns (axis 1)
print('\nExample 4:\n', df.mean(skipna=True))  # ignore NaN values (if present) when taking the mean

# Combine column extraction, selection by label, and applying a NumPy function
print('\nExample 5:\n', df['Salinity'].loc[2017:].mean())  # returns a single value
Example 1:
Temperature 6.3
Salinity 34.5
dtype: float64
Example 2:
Temperature 6.3
Salinity 34.5
dtype: float64
Example 3:
2016 19.8
2017 20.1
2018 20.4
2019 20.7
2020 21.0
dtype: float64
Example 4:
Temperature 6.3
Salinity 34.5
dtype: float64
Example 5:
34.25
In [ ]:
# Save a Pandas DataFrame as a CSV file
# df.to_csv('filepath/including/filename.csv')

# Read a CSV file as a Pandas DataFrame (more powerful than np.genfromtxt()!)
# See available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
# df = pd.read_csv('filepath/including/filename.csv',delimiter=',',delim_whitespace=False,header=0)
# NOTE: delim_whitespace is deprecated in recent pandas releases; for
# whitespace-delimited files, pass sep='\s+' instead and omit delim_whitespace

# Read an Excel spreadsheet as a Pandas DataFrame
# See available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
# df = pd.read_excel('filepath/including/filename.xlsx',sheet_name='Sheet1')
# Import xarray (and other libraries, because they are helpful when working with xarray files)
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
In [ ]:
# You'll need to install the netCDF4 library to work with netCDF files
# You should only need to run this line of code once per Colab notebook,
# so comment it out or delete it afterwards
# !pip install netcdf4
In [ ]:
# Give Colab access to Google Drive
# (google.colab is only available inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [ ]:
# NOTE: you'll need to change this variable to match your own filepath
# (the path is relative, so it is resolved against the current working directory)
filepath = (
    "drive/My Drive/OCEAN 215 - Autumn '20/OCEAN 215 - Autumn '20 - Course documents/"
    "Video lesson slides and notebooks/2020-11-03 - lesson #9 data/bsose_monthly_velocities.nc"
)

# This is how we load a netCDF file
# (This method is safe on Colab for files
#  up to about 0.5 GB [500 MB] in size)
data = xr.open_dataset(filepath)
In [ ]:
# Examine structure of xarray Dataset using the interactive display() interface
display(data)

# Note there are 2 variables (eastward velocities, northward velocities)
# 4 dimensions, and 4 coordinates (time, lat, lon, depth), so each variable is a 4D array
# Note that you can click the buttons to view attributes (page icon) and actual data values (cylinder icon)
# Extract data variables from the Dataset, similar to extracting columns from a Pandas DataFrame
# display(data['V'])
display(data['U'])  # gives an xarray DataArray

# Analogy: 2+ xarray DataArrays = an xarray Dataset
#     vs.  2+ Pandas Series = a Pandas DataFrame
# You can do mathematical calculations between xarray DataArrays, as long as their dimensions match
# Example: calculate current speed using Pythagorean theorem:
#          speed = sqrt(U^2 + V^2)
speed = (data['U']**2 + data['V']**2)**0.5
display(speed)

# Note that the coordinates and dimensions remained the same:
# And the attributes themselves are a dictionary, so we retrieve values using a key:
print(data['U'].attrs['units'])

# You can also change attributes, using the key to change its value inside the dictionary:
data['U'].attrs['units'] = 'meters/second'
m/s
In [ ]:
# Once you've selected a variable using brackets, you can index into it using .isel()
#
# This retrieves the value at the 0th index along the time coordinate, 0th index along depth,
# index 200 along latitude, and index 500 along longitude.
# Analogous to writing u[0,200,500,0] for a NumPy array
data['U'].isel(time=0, lat=200, lon=500, depth=0)  # returns a single value, still wrapped in an xarray DataArray (now 0-dimensional)

# Notice below which coordinate values we've indexed into:
# You can convert a single-value DataArray result to a number using float() or .item():
print(data['U'].isel(time=0, lat=200, lon=500, depth=0).item())
print(float(data['U'].isel(time=0, lat=200, lon=500, depth=0)))
0.1258898824453354
0.1258898824453354
In [ ]:
# You can select multiple indices using .isel()
# (in a notebook cell, only the last expression's result is displayed)
data['U'].isel(time=0, lat=200, lon=500, depth=[0, 1, 2, 3, 4])  # analogous to NumPy: u[0,200,500,[0,1,2,3,4]]
data['U'].isel(time=0, lat=200, lon=500, depth=slice(0, 5))      # analogous to NumPy: u[0,200,500,0:5]

# Notice below that the result has a dimension of 5 depths, and we see the depths range from 2.1 m to 146.5 m:
# Multiple results can be converted from a DataArray to the underlying NumPy array using .values:
data['U'].isel(time=0, lat=200, lon=500, depth=slice(0, 5)).values
# This also works when the underlying NumPy array has more than one dimension (e.g. is 2-D, 3-D, etc.):
display(data['U'].isel(time=0, lat=slice(200, 204), lon=slice(500, 504), depth=0))
data['U'].isel(time=0, lat=slice(200, 204), lon=slice(500, 504), depth=0).values

# Calling .values on the result gave a 4x4 NumPy array:
# And you can select a value or multiple values along coordinate(s) using .sel():
# (without a `method` argument, the coordinate values given must match exactly)
data['U'].sel(time=datetime(2012, 1, 30, 20), lat=-52.70605, lon=-13.0, depth=2.1)
# Slicing works similarly between .isel() (slice by index) and .sel() (slice by value):
data['U'].sel(time=datetime(2012, 1, 30, 20, 0, 0), lat=-52.70605, lon=-13.0,
              depth=slice(2, 147))  # slicing values don't have to be exact
# Sometimes you don't know the exact coordinate values, so you can ask xarray to find the 'nearest' values:
data['U'].sel(time=datetime(2012, 1, 30), lat=-53, lon=-13, depth=2,
              method='nearest')  # with method='nearest', the requested coordinate values don't have to be exact
# Other examples of slicing to get a 2-D array

# Here, the remaining dimensions are latitude and longitude
# (because we've selected a single time and single depth)
display(data['U'].sel(time=datetime(2012, 1, 30, 20), depth=2.1,
                      lat=slice(-50, -40), lon=slice(0, 120)))

# Here, the remaining dimensions are depth and longitude
# (because we've selected a single time and single latitude)
# The latitude is picked in a second .sel() call so that method='nearest'
# applies only to latitude, not to the depth/longitude slices
display(data['U'].sel(time=datetime(2012, 1, 30, 20), depth=slice(200, 1000),
                      lon=slice(-120, 0)).sel(lat=-57, method='nearest'))
# You can reduce data from an xarray DataArray by applying a NumPy function:
# .mean() calculates the average over both of the remaining axes (latitude and longitude)
print(data['U'].sel(time=datetime(2012, 1, 30, 20), depth=2.1,
                    lat=slice(-50, -40), lon=slice(0, 120)).mean().item())
0.16497819125652313
In [ ]:
# .mean(dim='lon') calculates the average across the longitude dimension,
# leaving only latitude as the remaining dimension
display(data['U'].sel(time=datetime(2012, 1, 30, 20), depth=2.1,
                      lat=slice(-50, -40), lon=slice(0, 120)).mean(dim='lon'))
# Save the result (keeping it in xarray format, not NumPy, to keep the latitude coordinate)
lat_velocities = data['U'].sel(time=datetime(2012, 1, 30, 20), depth=2.1,
                               lat=slice(-50, -40), lon=slice(0, 120)).mean(dim='lon')

# So this gave the eastward velocity averaged over all longitudes in the swath,
# so it's a 1-D array (a line) over latitude
plt.figure(figsize=(4, 4))
plt.plot(lat_velocities['lat'], lat_velocities.values, c='k')
plt.xlabel('Latitude (°N)')
plt.ylabel('Eastward velocity (m/s)')
plt.grid()