#!/usr/bin/env python
# coding: utf-8

# # ISB-CGC Community Notebooks
# Check out more notebooks at our [Community Notebooks Repository](https://github.com/isb-cgc/Community-Notebooks)!
# 
# ```
# Title:   How to visualize results from BigQuery
# Author:  David L Gibbs
# Created: 2019-07-17
# Purpose: Demonstrate how visualize the results from a query.
# URL:     https://github.com/isb-cgc/Community-Notebooks/blob/master/Notebooks/How_to_plot_BigQuery_results.ipynb
# Notes:   
# ```
# ***

# # How to visualize results from BigQuery

# In this example, we'll perform a few easy queries, and use the python Seaborn library to 
# visualize the results.
# 
# Library documentation: https://seaborn.pydata.org/
# 
# There's many python tutorials on using the Seaborn library for plotting.
# For example, https://elitedatascience.com/python-seaborn-tutorial

# ## Initial setup

# In[ ]:


# If you start your jupyter notebook session using the 
# Community-Notebook/VM-Notebook-Launcher then you're already logged in!

# !gcloud auth application-default login


# In[2]:


get_ipython().system('pip3 install seaborn')


# In[3]:


get_ipython().system('pip3 install google-cloud-bigquery')


# ## Defining helper functions

# In[8]:


import seaborn as se
from google.cloud import bigquery


# In[17]:


import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"


# In[5]:


def runQuery ( client, qString, dryRun=False ):
  
  print ( "\n in runQuery ... " )
  if ( dryRun ):
    print ( "    dry-run only " )
    
  ## set up QueryJobConfig object
  job_config = bigquery.QueryJobConfig()
  job_config.dry_run = dryRun
  job_config.use_query_cache = True
  job_config.use_legacy_sql = False
  
  ## run the query
  try:
    query_job = client.query ( qString, job_config=job_config )
    ## print ( "    query job state: ", query_job.state )
  except:
    print ( "  FATAL ERROR: query execution failed " )
    return ( None )
  
  ## return results as a dataframe (or an empty dataframe for a dry-run) 
  if ( not dryRun ):
    try:
      df = query_job.to_dataframe()
      if ( query_job.total_bytes_processed==0 ):
        print ( "    the results for this query were previously cached " )
      else:
        print ( "    this query processed {} bytes ".format(query_job.total_bytes_processed) )
      if ( len(df) < 1 ):
        print ( "  WARNING: this query returned NO results ")
      return ( df )
    except:
      print ( "  FATAL ERROR: query execution failed " )
      return ( None )
    
  else:
    print ( "    if not cached, this query will process {} bytes ".format(query_job.total_bytes_processed) )
    ## return an empty dataframe
    return ( pd.DataFrame() )
 

def checkQueryResults ( qr ):
  print ( "\n in checkQueryResults ... " )
  if ( not isinstance(qr, pd.DataFrame) ):
    print ( " query execution failed! " )
    return ( False )
  else:
    if ( len(qr) > 0 ): 
      print ( " # of rows in query results: {} ".format(len(qr)) )
      print ( "\n", qr.head() )
    else:
      print ( " query returned NO results ?!? " )  
    return ( True )


# In[10]:


# User your own project ID

project_id = 'your_project_number'
if project_id == 'your_project_number':
    print('Please update the project number with your Google Cloud Project')
else:
    bqclient = bigquery.Client(project=project_id)


# ## Making barplots

# In[21]:


# We define queries as strings #

sql = '''
SELECT
   icd_10,
   COUNT(*) as Count
FROM
   `isb-cgc.TCGA_bioclin_v0.Clinical`
GROUP BY
   1  -- this is the same as 'group by icd_10'
ORDER BY
   Count
'''

res0 = runQuery ( bqclient, sql, dryRun=False )

res0[0:5]


# In[26]:


# first I'll subset the results,
# there's a lot of categories!
res1 = res0[res0['Count'] > 200]

# then we'll make a bar plot
p = se.barplot(data=res1, x = 'icd_10', y = 'Count')


# ## Making scatter plots

# In[27]:


sql = '''
SELECT
   avg_percent_neutrophil_infiltration,
   avg_percent_lymphocyte_infiltration
FROM
   `isb-cgc.TCGA_bioclin_v0.Biospecimen`
GROUP BY
   1,2
'''

res3 = runQuery ( bqclient, sql, dryRun=False )

res3[0:5]


# In[29]:


se.scatterplot(data=res3, x='avg_percent_lymphocyte_infiltration', y='avg_percent_neutrophil_infiltration')


# In[ ]: