#!/usr/bin/env python
# coding: utf-8

# # Abstract

# The aim of this notebook is to illustrate how analysts and data scientists can take advantage of [Vortexa's Python SDK](https://github.com/VorTECHsa/python-sdk) to gain real-time market insights into the oil industry and use Vortexa's data to build powerful predictive models. We will focus on United States crude oil exports, which have skyrocketed over the last couple of years and constitute one of the main drivers of the whole oil industry. In the first part of the notebook, we will familiarize ourselves with the SDK and do an initial **exploratory analysis** of our data to get a sense of their nature. In the second part, we will build a **forecasting model** to predict US crude exports for the next couple of months. Let's start!
#
# PS: Please note that the results in this notebook represent Vortexa's data as of 24th February 2020. These data are constantly evolving, refined and improved, therefore some (small) deviations in the numbers presented should be expected in future runs of the notebook.

# In[1]:


import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt

from vortexasdk import CargoMovements, VesselMovements, Products, Geographies

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 200


# # Exploratory Analysis

# ## Fetching Data

# The first step is to define the limits of the time period we are interested in. Note that the SDK can return a maximum of 4 years of historical data. In addition, since every entity that lives in the Vortexa API is associated with a unique ID, we will need to retrieve the relevant IDs for our analysis.

# In[2]:


# Define filter limits
END_DATE = datetime(2020, 2, 1)
START_DATE = datetime(2016, 2, 1)

print('Start date: {}'.format(START_DATE))
print('End date: {}'.format(END_DATE))


# In[3]:


# Find the United States ID
us = [g.id for g in Geographies().search('united states').to_list() if 'country' in g.layer]
print('United States polygon id: {}'.format(us[0]))


# In[4]:


# Check we've only got one ID for the US
assert len(us) == 1


# In[5]:


# Find the Crude ID
crude = [p.id for p in Products().search('crude').to_list() if p.name == 'Crude']
crude


# In[6]:


# Check we've only got one Crude ID
assert len(crude) == 1


# In[7]:


# Define the set of columns you want to be returned. For the complete set of available columns check the link:
# https://vortechsa.github.io/python-sdk/endpoints/cargo_movements/#notes
# Note that not all of the columns defined below are directly necessary for our analysis, but it is a good opportunity
# for beginner users of the SDK to get a grasp of the highly non-flat structure of the data and the level of detail
# the data can dig into.
required_columns = [
    # A cargo movement can be carried by multiple vessels across various STS transfers. You can find all the vessels that
    # the cargo was onboard by inspecting the 'vessels.0', 'vessels.1' columns etc.
    # The 'vessels.0' columns show the primary vessel associated with the cargo movement
    'cargo_movement_id',
    'vessels.0.name',
    'vessels.0.imo',
    'vessels.0.vessel_class',
    'vessels.0.dwt',
    'vessels.1.name',
    'vessels.1.imo',
    'vessels.1.vessel_class',
    'vessels.1.dwt',
    'vessels.2.name',
    'vessels.2.imo',
    'vessels.2.vessel_class',
    'vessels.2.dwt',

    # Bring the loading Geography and Timestamps
    'events.cargo_port_load_event.0.location.terminal.label',
    'events.cargo_port_load_event.0.location.port.label',
    'events.cargo_port_load_event.0.location.country.label',
    'events.cargo_port_load_event.0.location.trading_region.label',
    'events.cargo_port_load_event.0.location.shipping_region.label',
    'events.cargo_port_load_event.1.location.port.label',
    'events.cargo_port_load_event.2.location.port.label',
    'events.cargo_port_load_event.0.start_timestamp',
    'events.cargo_port_load_event.0.end_timestamp',
    'events.cargo_port_load_event.1.end_timestamp',
    'events.cargo_port_load_event.2.end_timestamp',

    # Bring STS associated info
    'events.cargo_sts_event.0.start_timestamp',
    'events.cargo_sts_event.0.location.sts_zone.label',
    'events.cargo_sts_event.1.start_timestamp',
    'events.cargo_sts_event.1.location.sts_zone.label',
    'events.cargo_sts_event.2.start_timestamp',
    'events.cargo_sts_event.2.location.sts_zone.label',

    # Bring the discharge Geography and Timestamps
    'events.cargo_port_unload_event.0.location.terminal.label',
    'events.cargo_port_unload_event.0.location.port.label',
    'events.cargo_port_unload_event.0.location.country.label',
    'events.cargo_port_unload_event.0.location.region.label',
    'events.cargo_port_unload_event.0.location.trading_subregion.label',
    'events.cargo_port_unload_event.0.location.shipping_region.label',
    'events.cargo_port_unload_event.0.location.trading_region.label',
    'events.cargo_port_unload_event.0.start_timestamp',
    'events.cargo_port_unload_event.0.end_timestamp',
    'events.cargo_port_unload_event.1.location.port.label',
    'events.cargo_port_unload_event.1.end_timestamp',
    'events.cargo_port_unload_event.2.location.port.label',
    'events.cargo_port_unload_event.2.end_timestamp',

    # Bring any corporate information associated with the primary vessel
    'vessels.0.corporate_entities.charterer.label',

    # Bring product information and quantity
    'product.grade.label',
    'product.category.label',
    'product.group_product.label',
    'product.group.label',
    'quantity',

    # Is the vessel in transit, has it already discharged, or is it in floating storage?
    'status'
]


# Define a function to perform some basic manipulations on the data for ease of use
def prepare_cms(cms, ignore_intra=True):
    """
    Performs some basic data manipulation on the cargo movements DataFrame.
    By default, intra cargo movements, i.e. cargo movements that start and end in the same country, are filtered out.
""" # Convert date column to pandas datetime type cols = [c for c in cms.columns if 'timestamp' in c] for c in cols: cms[c] = pd.to_datetime(cms[c]).dt.tz_localize(None) # Rename columns for ease of use cms = cms.rename(columns = {'events.cargo_port_load_event.0.start_timestamp': 'loading_timestamp', 'events.cargo_port_load_event.0.end_timestamp': 'start_timestamp', 'events.cargo_port_unload_event.0.start_timestamp': 'unloading_timestamp', 'events.cargo_port_unload_event.0.end_timestamp': 'end_timestamp', 'events.cargo_port_load_event.0.location.port.label': 'loading_port', 'events.cargo_port_load_event.0.location.trading_region.label': 'loading_trading_region', 'events.cargo_port_load_event.0.location.country.label': 'loading_country', 'events.cargo_port_unload_event.0.location.country.label': 'unloading_country', 'product.category.label': 'product_category' }) # Calculate loading week and month cms['loading_week'] = cms['start_timestamp'].map(lambda x: x.to_period('W').start_time.date() if not pd.isnull(x) else x) cms['loading_month'] = cms['start_timestamp'].map(lambda x: x.to_period('M').start_time.date() if not pd.isnull(x) else x) # Depending on user input keep or ignore intra cargo movements, i.e. movements that start and end in the # same country if ignore_intra: cms = cms.loc[cms['loading_country'] != cms['unloading_country']] return cms # In[8]: # Query the SDK and convert result to DataFrame cms = CargoMovements().search( filter_activity = 'loading_end', filter_origins = us, filter_products = crude, filter_time_min = START_DATE, filter_time_max = END_DATE, cm_unit = 'b' ).to_df(columns = required_columns) print('Fetched {} crude cargo movements from United States'.format(cms.shape[0])) cms.head() # In[9]: # Perform some basic operations in the DataFrame cms = prepare_cms(cms, ignore_intra = True) print('{:,} crude movements remaining after removing intra-movements'.format(cms.shape[0])) cms.sample(5) # In[10]: # Check minimum and maximum start_timestamps print('Minimum start timestamp: {} \nMaximum start timestamp: {}'.\ format(cms['start_timestamp'].min(), cms['start_timestamp'].max())) # Having fetched our data, we are now ready to start exploring them and see what kind of questions we can answer. # ## Monthly US Exports # In[11]: monthly_quantity = cms.groupby('loading_month').\ agg({'quantity': 'sum'}).\ rename(columns = {'quantity': 'barrels'}) monthly_quantity.tail(10) # In[12]: ax = monthly_quantity.plot(kind='bar', figsize = (15,5), rot=60, title = 'US Monthly Crude Exports') ax.set_ylabel('Quantity (barrels)') ax.set_xlabel('Month') # ## Grades Breakdown # In[13]: quantity_by_category = cms.groupby(by = ['loading_month','product_category']).\ agg({'quantity': 'sum'}).\ rename(columns = {'quantity': 'barrels'}).\ reset_index() quantity_by_category = quantity_by_category.pivot(index = 'loading_month', columns = 'product_category', values = 'barrels') quantity_by_category = quantity_by_category.fillna(0) quantity_by_category.head() # In[14]: ax2 = quantity_by_category.plot.bar(stacked=True, figsize = (12,5), rot = 60, title = 'US Crude Exports \n Breakdown by Sulphur Content & API') ax2.set_ylabel('Quantity (barrels)') ax2.set_xlabel('Month') # ## Top Destinations # Who are the main **receivers** of US crude starting from 2019 onwards? 
# In[15]:


NUM_TOP_DESTINATIONS = 10

# Filter all movements that started from 2019 onwards
cms_2019 = cms.loc[cms['start_timestamp'] >= '2019-01-01']

# Calculate quantity per destination
quantity_per_destination = cms_2019.groupby('unloading_country').\
                                    agg({'quantity': 'sum'}).\
                                    sort_values(by = 'quantity', ascending = False)

# Select top destinations and group together the remaining ones
top_destination_countries = quantity_per_destination.head(NUM_TOP_DESTINATIONS)
rest = pd.DataFrame(index = ['Other'], columns = ['quantity'])
rest.loc['Other'] = quantity_per_destination[NUM_TOP_DESTINATIONS:].sum().values
top_destination_countries = pd.concat([top_destination_countries, rest])
top_destination_countries['perc'] = round(top_destination_countries['quantity']*100 / top_destination_countries['quantity'].sum(), 2)

display(top_destination_countries)


# In[16]:


# Pick a colormap palette and generate random colors
vals = np.linspace(0, 1, top_destination_countries.shape[0])
np.random.shuffle(vals)
cmap = plt.cm.colors.ListedColormap(plt.cm.Greens(vals))

plt.figure(figsize = (10,5))
plt.pie(top_destination_countries['quantity'],
        labels = top_destination_countries.index.values,
        shadow = False,
        colors = cmap.colors,
        startangle = 90,
        autopct = '%1.2f%%')
plt.title('Top US Crude Destinations', fontsize=13)
plt.axis('equal')
plt.tight_layout()


# # Simple Forecasting Model

# Having done an initial exploration of US crude exports, it is now time to make things even more interesting and see how an analyst or data scientist could use these data to build a predictive model. For this purpose, we will try to predict **future** flows using an open source library called **Prophet**, developed by Facebook. Although there is no free lunch, Prophet is a very intuitive library, offers quick and handy results, and is ideal for an initial iteration / baseline model on a wide variety of forecasting problems.
#
# It is based on a `Generalized Additive Model (GAM)` with three main components: `trend`, `seasonality` and `holidays`:
# $$ y(t) = g(t) + s(t) + h(t) + \epsilon_{t}$$
#
# For the purposes of this notebook, we will use the default parameters to fit our model. In order to install Prophet, you can simply run `pip install fbprophet`, or `conda install -c conda-forge fbprophet` for conda users.

# In[17]:


from fbprophet import Prophet
from fbprophet.plot import add_changepoints_to_plot
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.plot import plot_cross_validation_metric


# Firstly, we need to prepare our DataFrame. The input to Prophet must always be a DataFrame with two columns: `ds` (a datestamp in the format pandas expects) and `y` (the numerical value to be predicted).

# In[18]:


df = monthly_quantity.copy().reset_index()
df = df.rename(columns = {'loading_month': 'ds', 'barrels': 'y'})
df.tail()


# ## Model Fitting and Cross-Validation

# We can now initialize and fit our model. Since we are dealing with monthly data, we explicitly ignore weekly seasonality.

# In[19]:


m = Prophet(weekly_seasonality=False)
m.fit(df)


# Prophet offers a built-in method for cross-validation using *simulated historical forecasts (SHFs)*. What this essentially does is select some *cutoff* points in the history, fit a model for each of them using data only up to the cutoff point, and then make predictions for a given forecast horizon. The user needs to specify the `horizon (H)` and optionally the size of the `initial` training period and the `period` (i.e.
# spacing) between the cutoff dates. As a heuristic, for a given horizon `H` it is suggested that a simulated forecast is run every `H/2` periods. Since we are dealing with monthly data, we will set our initial training size to 2 years (24 months) to be able to account for yearly seasonality, our forecast horizon to 6 months and the period to 3 months.
#
# One important thing to note is the following: Prophet's built-in `cross_validation` method accepts period arguments that are **timedelta** objects. In this notebook we are dealing with monthly data; however, the latest pandas version (i.e. **1.0.1**) doesn't seem to work with a month unit in the `pd.to_timedelta` method (although the pandas docs mention that an `M` option is supported as a unit, the following error pops up when trying to run it: *ValueError: Units 'M' and 'Y' are no longer supported, as they do not represent unambiguous timedelta values durations*). A workaround for this is to define periods in days and multiply by the **average number of days per month**, as defined in the Gregorian calendar (i.e. 365.2425 / 12 = 30.436875 days). With this conversion the results will be identical to what `pd.to_timedelta()` with `unit='M'` would produce in older pandas versions.

# In[20]:


# Define initial training size, horizon and period
days_per_month = 30.436875
CV_INITIAL_SIZE = 24*days_per_month
CV_PERIOD_SIZE = 3*days_per_month
CV_HORIZON_SIZE = 6*days_per_month

# Run cross-validation
df_cv = cross_validation(m,
                         initial = pd.to_timedelta(CV_INITIAL_SIZE, unit='D'),
                         period = pd.to_timedelta(CV_PERIOD_SIZE, unit='D'),
                         horizon = pd.to_timedelta(CV_HORIZON_SIZE, unit='D'))

df_cv['cutoff'] = pd.to_datetime(df_cv['cutoff'].dt.date)
df_cv


# In the above DataFrame, we can see all the predicted values per cutoff point along with the uncertainty intervals. To illustrate this more clearly, we will plot the actual and predicted values for all the simulated historical forecasts.

# In[21]:


def plot_predictions(m, df_cv):
    """
    Plots actual and predicted values for all different cutoff points

    Input args:
        m: Prophet fitted model
        df_cv: Output DataFrame of Prophet cross_validation method
    """
    i = 1
    # Iterate through the cutoff points
    for ct in df_cv['cutoff'].unique():
        dfs = df_cv.loc[df_cv['cutoff'] == ct]
        fig = plt.figure(facecolor='w', figsize=(8, 4))
        ax = fig.add_subplot(111)
        # Plot the full history as black dots, the forecast for this cutoff as a line with its uncertainty band,
        # and a vertical line marking the cutoff date
        ax.plot(m.history['ds'].values, m.history['y'], 'k.')
        ax.plot(dfs['ds'].values, dfs['yhat'], ls = '-', c = '#0072B2')
        ax.fill_between(dfs['ds'].values, dfs['yhat_lower'], dfs['yhat_upper'], color = '#0072B2', alpha = 0.4)
        ax.axvline(x=pd.to_datetime(ct), c = 'gray', lw = 4, alpha = 0.5)
        ax.set_title('Cutoff date: {} \n Training set size: {} months'.\
                     format(pd.to_datetime(ct).date(), int((CV_INITIAL_SIZE + i*CV_PERIOD_SIZE) / days_per_month)))
        ax.set_ylabel('Quantity')
        ax.set_xlabel('Date')
        i += 1


def get_cv_performance_metrics(df_cv):
    """
    Calculates basic performance metrics for the cross-validated results

    Input args:
        df_cv: Output DataFrame of Prophet cross_validation method
    """
    dfp = performance_metrics(df_cv)
    dfpg = dfp.copy()
    # Group the metrics by (rounded) horizon length in days and average them
    dfpg['horizon_days'] = dfpg['horizon'].map(lambda x: round(x.days, -1))
    dfpg = dfpg.groupby('horizon_days').agg({'horizon': 'first',
                                             'rmse': 'mean',
                                             'mae': 'mean',
                                             'mape': 'mean',
                                             'coverage': 'mean'}).\
                                        reset_index(drop=True)
    return dfp, dfpg


# In[22]:


plot_predictions(m, df_cv)


# Finally, let's extract some useful statistics for the prediction performance. We will look specifically at the performance of the model for various horizon sizes.
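# For reference, MAPE (mean absolute percentage error) is defined as $\frac{1}{n}\sum_{i=1}^{n}\left|\frac{y_i - \hat{y}_i}{y_i}\right|$. Before turning to Prophet's built-in `performance_metrics` in the next cells, the short sketch below (not part of the original analysis) computes a single overall MAPE directly from `df_cv` as a sanity check. Note that `performance_metrics` averages errors over rolling windows per horizon, so this single number will not exactly match the per-horizon values reported below.

# In[ ]:


# Minimal sanity check (assumption: df_cv contains the standard cross_validation columns 'y' and 'yhat')
manual_mape = (np.abs(df_cv['y'] - df_cv['yhat']) / np.abs(df_cv['y'])).mean()
print('Overall cross-validated MAPE: {:.2%}'.format(manual_mape))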
# In[23]:


dfp, dfpg = get_cv_performance_metrics(df_cv)
display(dfpg)


# In[24]:


# Plot MAPE with respect to horizon
plt.figure(figsize = (6,4))
plt.plot(dfpg['mape'], '*-')
labels = [str(h.days) + ' days' for h in dfpg['horizon']]
plt.xticks(np.arange(len(dfpg)), labels = labels, rotation = 20)
plt.grid()
plt.xlabel('Horizon')
plt.ylabel('MAPE')
plt.title('Cross-Validated MAPE for various horizon sizes')


# As can be seen from the above MAPE plot, using 3.5 years of Vortexa's US crude export data and a basic forecasting model with default settings and no parameter tuning, we were able to predict exports over a 6-month horizon with a MAPE of up to 8%. Not bad at all!

# ## Make Future Predictions

# After the cross-validation we can now move to the final and most exciting part: let's predict how much crude oil the United States will export in the upcoming 6 months.

# In[25]:


# Create a DataFrame with future dates
MONTHS_AHEAD_TO_PREDICT = 6
future = m.make_future_dataframe(periods = MONTHS_AHEAD_TO_PREDICT, freq = 'MS')
future.tail(MONTHS_AHEAD_TO_PREDICT)


# In[26]:


# Make predictions
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(MONTHS_AHEAD_TO_PREDICT)


# In[27]:


# Plot forecasts along with historical data
fig = m.plot(forecast)
plt.plot(forecast.set_index('ds')['yhat'].tail(MONTHS_AHEAD_TO_PREDICT), '*', color = 'black', label = 'Forecasted values')
plt.legend(loc = 'upper left')
plt.title('US Crude Exports - {} months Forecasting'.format(MONTHS_AHEAD_TO_PREDICT))
plt.ylabel('Quantity (barrels)')
plt.xlabel('Date')


# In[28]:


# Optionally, plot the trend and seasonality components of the fitted model
#fig2 = m.plot_components(forecast)


# # Conclusion

# For a baseline model with practically no fine-tuning at all, the results look promising. These data, combined with additional regressors from the SDK (e.g. diesel exports from Rotterdam or crude exports from Russia) or even with proprietary data, can create even more powerful models and shed light on the future patterns and trends of global seaborne oil flows. Hope you enjoyed it!
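# As a minimal sketch of the "additional regressors" idea above (not part of the original analysis), the cell below shows how an extra monthly series could be wired into Prophet via its `add_regressor` method. For illustration only, a toy regressor (the previous month's exports, named `prev_month_exports`) stands in for a real exogenous series such as crude exports from Russia fetched with another `CargoMovements` query; in practice the regressor must also be available (or reasonably approximated) for the future months being predicted.

# In[ ]:


# Illustrative sketch under the assumptions stated above; 'prev_month_exports' is a hypothetical toy regressor
df_reg = df.copy()
df_reg['ds'] = pd.to_datetime(df_reg['ds'])
df_reg['prev_month_exports'] = df_reg['y'].shift(1)
df_reg = df_reg.dropna()

m_reg = Prophet(weekly_seasonality=False)
m_reg.add_regressor('prev_month_exports')   # register the extra column before fitting
m_reg.fit(df_reg)

# Future rows must also contain the regressor; here we simply forward-fill the last known value as a placeholder
future_reg = m_reg.make_future_dataframe(periods = MONTHS_AHEAD_TO_PREDICT, freq = 'MS')
future_reg = future_reg.merge(df_reg[['ds', 'prev_month_exports']], on = 'ds', how = 'left')
future_reg['prev_month_exports'] = future_reg['prev_month_exports'].fillna(method='ffill')

forecast_reg = m_reg.predict(future_reg)
forecast_reg[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(MONTHS_AHEAD_TO_PREDICT)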