#!/usr/bin/env python
# coding: utf-8

# # ISB-CGC Community Notebooks
#
# ```
# Title:   How to make a heatmap using BigQuery
# Author:  David L Gibbs
# Created: 2019-12-02
# Purpose: Demonstrate how to make a correlation matrix and create a heatmap visualization.
# Notes:   Works in Google Colaboratory notebooks.
# Repo:    https://github.com/isb-cgc/Community-Notebooks/blob/master/Notebooks/How_to_make_a_heatmap_using_BigQuery.ipynb
# ```

# In this notebook, we will cover how to use BigQuery to pull some data (in the cloud, so data transfer is free!), compute a correlation (or distance) matrix, and visualize the result in a heatmap.
#
# It's also possible to compute the correlations *inside* BigQuery, which is a good use of the technology, but for simplicity's sake we'll use python here (see the sketch at the end of the notebook).
#
# These methods are commonly used in cancer data analysis (see practically all TCGA studies), but can be applied to any type of data. Here we will be using TCGA gene expression data from two studies, KIRC (kidney cancer) and GBM (brain cancer).
#
# In this work, we'll see whether the two tissue types separate into clusters.

# # Google Auth

# In[ ]:

# first we'll log in, in order to access a project bucket.
get_ipython().system('gcloud auth login')


# In[ ]:

# in order to authenticate and get a credentials token, use the following:
get_ipython().system('gcloud auth application-default login')


# In[ ]:

get_ipython().system('gcloud config set project [MY-PROJECT]')


# ---

# # Software install

# In case it's needed, we can install Google's BigQuery python library.

# In[ ]:

get_ipython().system('pip install --upgrade google-cloud-bigquery')


# And then we can import the libraries.

# In[6]:

import seaborn as sb
import pandas as pd
import numpy as np
from google.cloud import bigquery


# # Querying for data

# OK, we're going to query for some data using a preselected gene set from MSigDB.
#
# http://software.broadinstitute.org/gsea/msigdb/cards/HALLMARK_TGF_BETA_SIGNALING.html

# In[ ]:

get_ipython().system('curl -o geneset.txt https://www.gsea-msigdb.org/gsea/msigdb/download_geneset.jsp?geneSetName=HALLMARK_TGF_BETA_SIGNALING')


# In[11]:

genes = open('geneset.txt', 'r').read().strip().split('\n')
genes = ["'" + x + "'" for x in genes]

### Make the set a little smaller ###
genelist = ' ' + ','.join(genes[2:12]) + ' '


# So, now that we have our list of genes, we'll create a client object that takes an SQL string and communicates with the Google mothership.

# In[ ]:

client = bigquery.Client()


# Here, the SQL is captured as a string. The Google BigQuery web interface is a good place to prototype SQL.

# In[13]:

sql = '''
SELECT
  project_short_name,
  sample_barcode,
  HGNC_gene_symbol,
  normalized_count
FROM
  `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
WHERE
  project_short_name IN ('TCGA-KIRC', 'TCGA-GBM')
  AND HGNC_gene_symbol IN (__GENELIST__)
GROUP BY 1,2,3,4
'''.replace('__GENELIST__', genelist)


# Now we use the client, send off the SQL, and convert the results to a pandas dataframe.

# In[18]:

dat = client.query(sql).to_dataframe()  # Make an API request.


# In[ ]:

dat[0:5]


# # Converting tidy data to a matrix, and computing a correlation matrix

# In[20]:

mat = pd.pivot_table(dat, values='normalized_count',
                     columns='HGNC_gene_symbol',
                     index='sample_barcode')


# In[21]:

mat.shape


# In[ ]:

mat[0:5]


# Note that the pandas dataframe index is the sample barcode.
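# Depending on the gene list, `pivot_table` can leave missing values where a sample has no reported value for a gene. pandas' `corr()` excludes missing pairs on its own, but it can still be worth a quick look at how complete the pivoted matrix is. The optional cell below is just a sketch that counts missing values and simply drops any incomplete gene columns.

# In[ ]:

# optional sketch: count missing values and drop any gene columns that contain them
# (a no-op if the pivoted matrix is already complete)
print(mat.isna().sum().sum(), 'missing values in the pivoted matrix')
mat = mat.dropna(axis=1)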
# Let's choose a smaller set of samples, to speed things up.

# In[23]:

# choose a random subset of samples; use the actual number of rows
# rather than a hard-coded count
idx = np.random.choice(a=range(mat.shape[0]), size=60, replace=False)
matSample = mat.iloc[idx]


# In[24]:

matSample.shape


# Now we'll compute the correlations:

# In[25]:

# transpose so samples become columns, then correlate samples across genes
cormat = matSample.transpose().corr()


# In[26]:

cormat.shape


# To organize our phenotype information...

# In[27]:

# first we get a unique map of sample barcodes to tissue types
udat = dat.loc[:, ['sample_barcode', 'project_short_name']].drop_duplicates()

# then we get the tissue designations for each randomly chosen sample
tissueType = udat.loc[udat['sample_barcode'].isin(matSample.index),
                      ['sample_barcode', 'project_short_name']]

# for use as a color bar in the heatmap, we index the table by barcode
tissueType.index = tissueType['sample_barcode']


# # Making plots

# In[28]:

# unsorted
# sb.heatmap(cormat, annot=False)


# In[29]:

# to create the color bar indicating the sample sources,
# we map each project to a color, indexed by sample barcode:
# KIRC is marked red and GBM is marked blue
lut = {'TCGA-KIRC': 'r', 'TCGA-GBM': 'b'}
row_colors = tissueType['project_short_name'].map(lut)


# To get a clustered heatmap, like we're more used to seeing, we use seaborn's 'clustermap'. Parameters allow for clustering rows and columns.

# In[30]:

cg = sb.clustermap(cormat, row_colors=row_colors)


# Cool! The tissue types separated out, and it looks like KIRC (red bar) clusters into two groups.

# # Wrapping up: writing results and figures back to our bucket

# In[ ]:

# saving the clustered heatmap
cg.savefig('tcga_clustered_heatmap.png')


# In[ ]:

get_ipython().system('gsutil cp tcga_clustered_heatmap.png gs://my-project/heatmaps')


# In[ ]:

# That's it!
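# ---

# # Appendix: computing the correlations inside BigQuery

# As mentioned at the top of the notebook, the correlation matrix could also be computed inside BigQuery rather than in pandas. The cell below is a minimal sketch of one way to do that: self-join the expression rows on gene symbol and use the standard SQL CORR() aggregate to get a sample-by-sample Pearson correlation. It assumes the same table, columns, and gene list as the query above and one value per (sample, gene) pair, and it was not run as part of this notebook.

# In[ ]:

# sketch only: sample-by-sample Pearson correlations computed in BigQuery
# by self-joining the deduplicated expression rows on gene symbol
sql_corr = '''
WITH expr AS (
  SELECT
    sample_barcode,
    HGNC_gene_symbol,
    normalized_count
  FROM
    `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
  WHERE
    project_short_name IN ('TCGA-KIRC', 'TCGA-GBM')
    AND HGNC_gene_symbol IN (__GENELIST__)
  GROUP BY 1,2,3
)
SELECT
  a.sample_barcode AS sample_1,
  b.sample_barcode AS sample_2,
  CORR(a.normalized_count, b.normalized_count) AS pearson_r
FROM expr AS a
JOIN expr AS b
  ON a.HGNC_gene_symbol = b.HGNC_gene_symbol
GROUP BY 1,2
'''.replace('__GENELIST__', genelist)

# to try it, send it through the same client and pivot the long result
# into a square matrix:
# corr_long = client.query(sql_corr).to_dataframe()
# cormat_bq = corr_long.pivot(index='sample_1', columns='sample_2', values='pearson_r')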