#!/usr/bin/env python # coding: utf-8 # # ISB-CGC Community Notebooks # Check out more notebooks at our [Community Notebooks Repository](https://github.com/isb-cgc/Community-Notebooks)! # # ``` # Title: How to visualize results from BigQuery # Author: David L Gibbs # Created: 2019-07-17 # Purpose: Demonstrate how visualize the results from a query. # URL: https://github.com/isb-cgc/Community-Notebooks/blob/master/Notebooks/How_to_plot_BigQuery_results.ipynb # Notes: # ``` # *** # # How to visualize results from BigQuery # In this example, we'll perform a few easy queries, and use the python Seaborn library to # visualize the results. # # Library documentation: https://seaborn.pydata.org/ # # There's many python tutorials on using the Seaborn library for plotting. # For example, https://elitedatascience.com/python-seaborn-tutorial # ## Initial setup # In[ ]: # If you start your jupyter notebook session using the # Community-Notebook/VM-Notebook-Launcher then you're already logged in! # !gcloud auth application-default login # In[2]: get_ipython().system('pip3 install seaborn') # In[3]: get_ipython().system('pip3 install google-cloud-bigquery') # ## Defining helper functions # In[8]: import seaborn as se from google.cloud import bigquery # In[17]: import matplotlib.pyplot as plt plt.rcParams['figure.figsize'] = (20.0, 10.0) plt.rcParams['font.family'] = "serif" # In[5]: def runQuery ( client, qString, dryRun=False ): print ( "\n in runQuery ... " ) if ( dryRun ): print ( " dry-run only " ) ## set up QueryJobConfig object job_config = bigquery.QueryJobConfig() job_config.dry_run = dryRun job_config.use_query_cache = True job_config.use_legacy_sql = False ## run the query try: query_job = client.query ( qString, job_config=job_config ) ## print ( " query job state: ", query_job.state ) except: print ( " FATAL ERROR: query execution failed " ) return ( None ) ## return results as a dataframe (or an empty dataframe for a dry-run) if ( not dryRun ): try: df = query_job.to_dataframe() if ( query_job.total_bytes_processed==0 ): print ( " the results for this query were previously cached " ) else: print ( " this query processed {} bytes ".format(query_job.total_bytes_processed) ) if ( len(df) < 1 ): print ( " WARNING: this query returned NO results ") return ( df ) except: print ( " FATAL ERROR: query execution failed " ) return ( None ) else: print ( " if not cached, this query will process {} bytes ".format(query_job.total_bytes_processed) ) ## return an empty dataframe return ( pd.DataFrame() ) def checkQueryResults ( qr ): print ( "\n in checkQueryResults ... " ) if ( not isinstance(qr, pd.DataFrame) ): print ( " query execution failed! " ) return ( False ) else: if ( len(qr) > 0 ): print ( " # of rows in query results: {} ".format(len(qr)) ) print ( "\n", qr.head() ) else: print ( " query returned NO results ?!? " ) return ( True ) # In[10]: # User your own project ID project_id = 'your_project_number' if project_id == 'your_project_number': print('Please update the project number with your Google Cloud Project') else: bqclient = bigquery.Client(project=project_id) # ## Making barplots # In[21]: # We define queries as strings # sql = ''' SELECT icd_10, COUNT(*) as Count FROM `isb-cgc.TCGA_bioclin_v0.Clinical` GROUP BY 1 -- this is the same as 'group by icd_10' ORDER BY Count ''' res0 = runQuery ( bqclient, sql, dryRun=False ) res0[0:5] # In[26]: # first I'll subset the results, # there's a lot of categories! res1 = res0[res0['Count'] > 200] # then we'll make a bar plot p = se.barplot(data=res1, x = 'icd_10', y = 'Count') # ## Making scatter plots # In[27]: sql = ''' SELECT avg_percent_neutrophil_infiltration, avg_percent_lymphocyte_infiltration FROM `isb-cgc.TCGA_bioclin_v0.Biospecimen` GROUP BY 1,2 ''' res3 = runQuery ( bqclient, sql, dryRun=False ) res3[0:5] # In[29]: se.scatterplot(data=res3, x='avg_percent_lymphocyte_infiltration', y='avg_percent_neutrophil_infiltration') # In[ ]: