# initial imports
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta
import vortexasdk as v
/var/folders/js/1d4sm_gs33n2lr86b3p7pyvm0000gn/T/ipykernel_40860/3584211857.py:2: DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), (to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) but was not found to be installed on your system. If this would cause problems for you, please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 import pandas as pd
We start by storing the datetimes and IDs needed to filter for crude flows out of the US in the last 7 weeks.
# Time window: the last 7 weeks, ending now.
# datetime.utcnow() is deprecated (Python 3.12+) and returns a *naive*
# datetime; use an explicitly UTC-aware timestamp instead.
now = datetime.now(timezone.utc)
seven_weeks_ago = now - relativedelta(weeks=7)
# Find US ID: keep only results on the 'country' layer, since the text
# search also matches ports/regions containing the phrase.
us = [g.id for g in v.Geographies().search('united states').to_list() if 'country' in g.layer]
assert len(us) == 1, "expected exactly one country match for 'united states'"
# Find crude ID: the exact-name check narrows the broader text search
# down to the single top-level 'Crude' product.
crude = [p.id for p in v.Products().search('crude').to_list() if p.name=='Crude']
assert len(crude) == 1, "expected exactly one product named 'Crude'"
# Map each raw API column onto the short name we want in the DataFrame.
# Keeping each pair together makes it impossible for the column list and
# the label list to drift out of sync.
_column_pairs = [
    # primary vessel info
    ("vessels.0.name", "vessel_name"),
    ("vessels.0.vessel_class", "vessel_class"),
    # product information and quantity
    ("product.group.label", "product_group"),
    ("product.category.label", "product_category"),
    ("quantity", "quantity"),
    ("status", "status"),
    # loading info
    ("events.cargo_port_load_event.0.location.port.label", "loading_port"),
    ("events.cargo_port_load_event.0.end_timestamp", "loading_finish"),
    # unloading info
    ("events.cargo_port_unload_event.0.location.port.label", "unloading_port"),
    ("events.cargo_port_unload_event.0.location.country.label", "unloading_country"),
    ("events.cargo_port_unload_event.0.end_timestamp", "unloading_finish"),
]
required_columns = [api_name for api_name, _ in _column_pairs]
new_labels = [short_name for _, short_name in _column_pairs]
relabel = dict(_column_pairs)
Do the search for crude flows out of the US (and not destined for another US location) in the last 7 weeks. Convert this to a DataFrame.
# Query cargo movements: crude that finished loading at a US origin in the
# last 7 weeks, excluding movements discharging back into the US.
# Quantities are reported in barrels ('b').
search_result = v.CargoMovements().search(
    filter_activity='loading_end',
    filter_origins=us,
    exclude_destinations=us,
    filter_products=crude,
    filter_time_min=seven_weeks_ago,
    filter_time_max=now,
    cm_unit='b',
)
# Flatten to a DataFrame, keeping only the columns we need, under short names.
cms = search_result.to_df(columns=required_columns).rename(columns=relabel)
# How many associated movements?
len(cms)
277
We add a new column to the DataFrame which gets the week of the year in which loading was completed:
# ISO week number in which loading finished — the key for the weekly aggregations below.
iso_weeks = cms['loading_finish'].dt.isocalendar()
cms['loading_week'] = iso_weeks.week
We then use `groupby` to aggregate the data by week, summing over the numeric columns. This shows the total volume (in barrels) exported each week.
# Aggregate by ISO loading week; summing the numeric columns gives the
# total number of barrels exported each week.
grouped_by_week = cms.groupby(by='loading_week')
weekly_quantity = grouped_by_week.sum(numeric_only=True)
weekly_quantity
quantity | |
---|---|
loading_week | |
9 | 6177861 |
10 | 26019118 |
11 | 36710668 |
12 | 28675170 |
13 | 27427342 |
14 | 17971104 |
15 | 31144565 |
16 | 22209895 |
We could also break this information down further. Let's say we wanted to know the weekly exported quantities of each category of crude separately. It's possible to group by multiple categories:
# Weekly totals broken down further by crude category; reset_index turns
# the (week, category) group keys back into ordinary columns.
quantity_by_category = (
    cms
    .groupby(['loading_week', 'product_category'])
    .sum(numeric_only=True)
    .reset_index()
)
quantity_by_category
loading_week | product_category | quantity | |
---|---|---|---|
0 | 9 | Light-Sweet | 4898641 |
1 | 9 | Medium-Sour | 1279220 |
2 | 10 | Heavy-Sour | 1292891 |
3 | 10 | Light-Sweet | 23657246 |
4 | 10 | Medium-Sour | 1068981 |
5 | 11 | Heavy-Sour | 1168289 |
6 | 11 | Light-Sweet | 31123847 |
7 | 11 | Medium-Sour | 4418532 |
8 | 12 | Heavy-Sour | 291878 |
9 | 12 | Light-Sweet | 27299960 |
10 | 12 | Medium-Sour | 1083332 |
11 | 13 | Light-Sweet | 26522943 |
12 | 13 | Medium-Sour | 904399 |
13 | 14 | Heavy-Sour | 317250 |
14 | 14 | Light-Sweet | 16403216 |
15 | 14 | Medium-Sour | 775952 |
16 | 14 | Medium-Sweet | 474686 |
17 | 15 | Heavy-Sour | 876933 |
18 | 15 | Light-Sweet | 29190571 |
19 | 15 | Medium-Sour | 1077061 |
20 | 16 | Light-Sweet | 19109858 |
21 | 16 | Medium-Sour | 3100037 |
This is not the most intuitive way of displaying the data. We can 'pivot' the table, so that each product category gets its own column:
# Reshape to wide form: one row per loading week, one column per crude
# category, cells holding the exported quantity.
pivoted = quantity_by_category.pivot(
    index='loading_week',
    columns='product_category',
    values='quantity',
)
quantity_by_category = pivoted
quantity_by_category
product_category | Heavy-Sour | Light-Sweet | Medium-Sour | Medium-Sweet |
---|---|---|---|---|
loading_week | ||||
9 | NaN | 4898641.0 | 1279220.0 | NaN |
10 | 1292891.0 | 23657246.0 | 1068981.0 | NaN |
11 | 1168289.0 | 31123847.0 | 4418532.0 | NaN |
12 | 291878.0 | 27299960.0 | 1083332.0 | NaN |
13 | NaN | 26522943.0 | 904399.0 | NaN |
14 | 317250.0 | 16403216.0 | 775952.0 | 474686.0 |
15 | 876933.0 | 29190571.0 | 1077061.0 | NaN |
16 | NaN | 19109858.0 | 3100037.0 | NaN |
This is better. However, a number of elements are 'NaN'. Clearly, these represent an absence of any exports of that category in that week - so we replace these with zeroes.
# A missing cell means no exports of that category that week, so zero is
# the correct value to substitute.
quantity_by_category = quantity_by_category.fillna(value=0)
quantity_by_category
product_category | Heavy-Sour | Light-Sweet | Medium-Sour | Medium-Sweet |
---|---|---|---|---|
loading_week | ||||
9 | 0.0 | 4898641.0 | 1279220.0 | 0.0 |
10 | 1292891.0 | 23657246.0 | 1068981.0 | 0.0 |
11 | 1168289.0 | 31123847.0 | 4418532.0 | 0.0 |
12 | 291878.0 | 27299960.0 | 1083332.0 | 0.0 |
13 | 0.0 | 26522943.0 | 904399.0 | 0.0 |
14 | 317250.0 | 16403216.0 | 775952.0 | 474686.0 |
15 | 876933.0 | 29190571.0 | 1077061.0 | 0.0 |
16 | 0.0 | 19109858.0 | 3100037.0 | 0.0 |
Another way of breaking down the exports data is by receiving country.
# Total barrels received by each destination country, largest first.
country_totals = cms.groupby('unloading_country').sum(numeric_only=True)
quantity_by_destination = country_totals[['quantity']].sort_values(
    by='quantity', ascending=False
)
quantity_by_destination.head()
quantity | |
---|---|
unloading_country | |
Netherlands | 25729101 |
South Korea | 20622091 |
China | 16348697 |
Singapore | 15721615 |
United Kingdom | 15540232 |
We might not want a separate row for every recipient country, so we can collect those outside the top 10 into a single 'Other' category.
# grab the top 10 destination countries by volume
top_destination_countries = quantity_by_destination.head(10)
# Sum the remaining countries into a single 'Other' row.
# .iloc makes the positional slice explicit — plain df[10:] would be
# label-based if the index happened to be integer-like.
other_total = quantity_by_destination.iloc[10:]['quantity'].sum()
rest = pd.DataFrame({'quantity': [other_total]}, index=['Other'])
# join the two DataFrames (astype(int) keeps whole-barrel counts)
top_destination_countries = pd.concat([top_destination_countries, rest]).astype(int)
Finally, we can add a new column containing the percentage of total exports represented by exports to that country:
# Share of total exports represented by each destination, to 2 decimal places.
total_exported = top_destination_countries['quantity'].sum()
top_destination_countries['%'] = (
    100 * top_destination_countries['quantity'] / total_exported
).round(2)
top_destination_countries
quantity | % | |
---|---|---|
Netherlands | 25729101 | 13.10 |
South Korea | 20622091 | 10.50 |
China | 16348697 | 8.33 |
Singapore | 15721615 | 8.01 |
United Kingdom | 15540232 | 7.92 |
Spain | 14611630 | 7.44 |
Canada | 12589674 | 6.41 |
India | 11868801 | 6.05 |
Malaysia | 8395456 | 4.28 |
France | 6847915 | 3.49 |
Other | 48060511 | 24.48 |
Note: In the next module, we will see how the Cargo Time Series endpoint offers another approach to this same kind of analysis
Make a pivoted table showing how exports to each of the top 5 destination countries varied over the 7 weeks.