#!/usr/bin/env python # coding: utf-8 # # 🐼 2. With a Little Help from My Pandas API # In[ ]: import pykx as kx # ## Traffic # If we aim to enhance performance, we should let kdb handle the heavy lifting. This involves sticking to the PyKX object and avoiding the use of `pd()`. How dow it look like? # In[ ]: traffic = kx.q.read.csv("data/traffic.csv", "IPSJS", ",", True) traffic # Now, let's identify the errors to initiate the migration of the pandas code into PyKX. # In[ ]: traffic = traffic[traffic['error'] == 'N'] traffic = traffic.set_index(['fecha']) traffic_mad = traffic[['carga']].groupby(['fecha']).mean() traffic_mad # Wait a minute, it works! Fortunately, PyKX has made a significant effort to make this library Python-first. There's a Pandas API that allows us to interact with PyKX tables as if they were Pandas dataframes. # ## Weather # In[ ]: weather = kx.q.read.csv("data/weather.csv", "DUIFFFFFFFF", ",", True) # Does that mean we can keep the exact same code? Happy days! # In[ ]: weather[weather['precipitacion'].notnull()] # Not really! The Pandas API is still evolving, so you might encounter functionality that has not been implemented yet. However, you can always switch back and forth, converting to a Pandas dataframe, applying the expression as is, and then returning to the PyKX object. # In[ ]: weather = weather.pd() weather = weather[weather['precipitacion'].notnull()] weather = kx.toq(weather.reset_index()) # Unfortunately, this incurs a performance penalty. Switching objects between the q and Python spaces is not free. If you prefer to stick with the PyKX object, further training with PyKX is necessary. In this scenario, we could have used the following expression to achieve the same goal: # ```python # weather = weather[getattr(kx.q, 'not')(weather['precipitacion'] == kx.q('0n'))] # ``` # This is an exceptionally noisy case due to the peculiarities of `not`, but I thought it would be nice to illustrate it anyway. # The remaining instructions are perfectly fine and do not require further adaptations. # In[ ]: weather['fecha'] = weather['fecha'] + weather['hora'] weather = weather.set_index(['fecha']) weather_mad = weather[['precipitacion']].groupby(['fecha']).mean() weather_mad # ## All Together # What about prefix operations, such as `merge_asof`? # In[ ]: import pandas as pd traffic_weather = pd.merge_asof(traffic_mad, weather_mad, on='fecha', direction='backward') # Fortunately, PyKX supplies an infix variant to make the adaptation less painful. # In[ ]: # traffic_weather = pd.merge_asof(traffic_mad, weather_mad, on='fecha', direction='backward') traffic_weather = traffic_mad.merge_asof(weather_mad, on='fecha', direction='backward') traffic_weather # ## Model # Now it's time to leverage the rich Python ecosystem. A slight tweak is all it takes to retrieve the dataframe... # In[ ]: traffic_weather = traffic_weather.pd().reset_index() # ... and we are ready to go! # In[ ]: from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense import matplotlib.pyplot as plt to_quarter = lambda x: int((x.hour * 60 + x.minute) / 15) traffic_weather['hora'] = traffic_weather['fecha'].dt.time.apply(to_quarter) X = traffic_weather[['hora', 'precipitacion']] y = traffic_weather['carga'] scaler = MinMaxScaler() X_normalized = scaler.fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42) model = Sequential() model.add(Dense(64, input_dim=X_train.shape[1], activation='relu')) model.add(Dense(32, activation='relu')) model.add(Dense(1, activation='linear')) model.compile(optimizer='adam', loss='mean_squared_error') history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1) predictions = model.predict(X_test) plt.scatter(y_test, predictions, color='blue') plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red', linewidth=2) # Diagonal line for reference plt.xlabel('Actual Traffic Load') plt.ylabel('Predicted Traffic Load') plt.title('Actual vs. Predicted Traffic Load') plt.show() # ## What Next? # * We can go far with the Pandas API # * Understanding the core api and grasping q would become necessary # In[ ]: