#!/usr/bin/env python # coding: utf-8 # # Online prediction with BigQuery ML # # ML.Predict in BigQuery ML is primarily meant for batch predictions. What if you want to build a web application to provide online predictions? Here, I show the basic Python code to do online prediction. You can wrap this code in AppEngine or other web framework/toolkit to provide scalable, fast, online predictions. # In[ ]: get_ipython().system('pip install google-cloud # Reset Session after installing') # In[3]: PROJECT = 'cloud-training-demos' # change as needed # ## Create a model # # Let's start by creating a simple prediction model to [predict arrival delays of aircraft](https://towardsdatascience.com/how-to-train-and-predict-regression-and-classification-ml-models-using-only-sql-using-bigquery-ml-f219b180b947). I'll use this to illustrate the process. # # First, if necessary, create the BigQuery dataset that will store the output of the model. # In[ ]: get_ipython().run_line_magic('bash', '') bq mk -d flights # Then, do a "CREATE MODEL". This will take about 5 minutes. # In[3]: get_ipython().run_line_magic('bq', 'query') CREATE OR REPLACE MODEL flights.arrdelay OPTIONS (model_type='linear_reg', input_label_cols=['arr_delay']) AS SELECT arr_delay, carrier, origin, dest, dep_delay, taxi_out, distance FROM `cloud-training-demos.flights.tzcorr` WHERE arr_delay IS NOT NULL # ## Batch prediction with model # # Once you have a trained model, batch prediction can be done within BigQuery itself. # # For example, to find the predicted arrival delays for a flight from DFW to LAX for a range of departure delays # In[9]: get_ipython().run_line_magic('bq', 'query') SELECT * FROM ml.PREDICT(MODEL flights.arrdelay, ( SELECT 'AA' as carrier, 'DFW' as origin, 'LAX' as dest, dep_delay, 18 as taxi_out, 1235 as distance FROM UNNEST(GENERATE_ARRAY(-3, 10)) as dep_delay )) # ## Online prediction in Python # # The batch prediction technique above can not be used for online prediction though. Typical BigQuery queries have a latency of 1-2 seconds and that is too high for a web application. # # For online prediction, it is better to grab the weights and do the computation yourself. # In[22]: get_ipython().run_line_magic('bq', 'query') SELECT processed_input AS input, model.weight AS input_weight FROM ml.WEIGHTS(MODEL flights.arrdelay) AS model # In[16]: get_ipython().run_line_magic('bq', 'query') SELECT processed_input AS input, model.weight AS input_weight, category.category AS category_name, category.weight AS category_weight FROM ml.WEIGHTS(MODEL flights.arrdelay) AS model, UNNEST(category_weights) AS category # Here's how to do that in Python. # # p.s. I'm assuming that you are in an environment where you are already authenticated with Google Cloud. If not, see [this article on how to access BigQuery using private keys](https://towardsdatascience.com/how-to-enable-pandas-to-access-bigquery-from-a-service-account-205a216f0f68) # In[4]: def query_to_dataframe(query): import pandas as pd import pkgutil privatekey = None # pkgutil.get_data(KEYDIR, 'privatekey.json') return pd.read_gbq(query, project_id=PROJECT, dialect='standard', private_key=privatekey) # You need to pull 3 pieces of information: # * The weights for each of your numeric columns # * The scaling for each of your numeric columns # * The vocabulary and weights for each of your categorical columns # # I pull them using three separate BigQuery queries below # In[27]: numeric_query = """ SELECT processed_input AS input, model.weight AS input_weight FROM ml.WEIGHTS(MODEL flights.arrdelay) AS model """ numeric_weights = query_to_dataframe(numeric_query).dropna() numeric_weights # In[84]: scaling_query = """ SELECT input, min, max, mean, stddev FROM ml.FEATURE_INFO(MODEL flights.arrdelay) AS model """ scaling_df = query_to_dataframe(scaling_query).dropna() scaling_df # In[25]: categorical_query = """ SELECT processed_input AS input, model.weight AS input_weight, category.category AS category_name, category.weight AS category_weight FROM ml.WEIGHTS(MODEL flights.arrdelay) AS model, UNNEST(category_weights) AS category """ categorical_weights = query_to_dataframe(categorical_query) categorical_weights.head() # With the three pieces of information in-place, you can simply do the math for linear regression: # In[87]: def compute_prediction(rowdict, numeric_weights, scaling_df, categorical_weights): input_values = rowdict # numeric inputs pred = 0 for column_name in numeric_weights['input'].unique(): wt = numeric_weights[ numeric_weights['input'] == column_name ]['input_weight'].values[0] if column_name != '__INTERCEPT__': #minv = scaling_df[ scaling_df['input'] == column_name ]['min'].values[0] #maxv = scaling_df[ scaling_df['input'] == column_name ]['max'].values[0] #scaled_value = (input_values[column_name] - minv)/(maxv - minv) meanv = scaling_df[ scaling_df['input'] == column_name ]['mean'].values[0] stddev = scaling_df[ scaling_df['input'] == column_name ]['stddev'].values[0] scaled_value = (input_values[column_name] - meanv)/stddev else: scaled_value = 1.0 contrib = wt * scaled_value print('col={} wt={} scaled_value={} contrib={}'.format(column_name, wt, scaled_value, contrib)) pred = pred + contrib # categorical inputs for column_name in categorical_weights['input'].unique(): category_weights = categorical_weights[ categorical_weights['input'] == column_name ] wt = category_weights[ category_weights['category_name'] == input_values[column_name] ]['category_weight'].values[0] print('col={} wt={} value={} contrib={}'.format(column_name, wt, input_values[column_name], wt)) pred = pred + wt return pred # Here is an example of the prediction code in action: # In[88]: rowdict = { 'carrier' : 'AA', 'origin': 'DFW', 'dest': 'LAX', 'dep_delay': -3, 'taxi_out': 18, 'distance': 1235 } print(compute_prediction(rowdict, numeric_weights, scaling_df, categorical_weights)) # As you can see, this matches the batch prediction value, telling us that we got the computation correct. # # Now, all that we have to do is to wrap up the code into a web application to get online prediction. # # Copyright 2017 Google Inc. Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License