#!/usr/bin/env python # coding: utf-8 # # ISB-CGC Community Notebooks # # Check out more notebooks at our [Community Notebooks Repository](https://github.com/isb-cgc/Community-Notebooks)! # # ``` # Title: How to create a complext cohort # Author: Lauren Hagen # Created: 2019-08-12 # Purpose: More complex overview of creating cohorts # URL: https://github.com/isb-cgc/Community-Notebooks/blob/master/Notebooks/How_to_create_a_complex_cohort.ipynb # Notes: This notebook was adapted from work by Sheila Reynolds, 'How to Create TCGA Cohorts part 3' https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%201.ipynb. # ``` # *** # This notebook will construct a cohort for a single tumor type based on data availability, while also taking into consideration annotations about the patients or samples. # # As you've seen already, in order to work with BigQuery, the first thing we need to do is import the bigquery module: # # In[ ]: from google.cloud import bigquery # And we will need to Authenticate ourselves. # In[2]: get_ipython().system('gcloud auth application-default login') # Just so that this doesn't get buried in the code below, we are going to specify our tumor-type of interest here. In TCGA each tumor-type is also a separate *study* within the TCGA *project*. The studies are referred to based on the 2-4 letter tumor-type abbreviation. A complete list of all study abbreviations, with the full study name can be found on this [page](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations). For this particular exercise, we will look at the "Breast invasive carcinoma" study, abbreviated BRCA: # In[ ]: studyName = "TCGA-BRCA" # More information the the BRCA study can be found [here](https://portal.gdc.cancer.gov/projects/TCGA-BRCA). In this notebook, we are going to wind up making use of all of the available data types, so let's have a look at the entire **`TCGA_hg38_data_v0`** dataset in the query below. The tables and data sets available from ISB-CGC in BigQuery can also be explored without login with the [ISB-CGC BigQuery Table Searcher](https://isb-cgc.appspot.com/bq_meta_search/). # In[ ]: # Create a client to access the data within BigQuery client = bigquery.Client('isb-cgc') # Create a variable of datasets datasets = list(client.list_datasets()) # Create a variable for the name of the project project = client.project # In[5]: print("Tables:") # Create a variable with the list of tables in the dataset tables = list(client.list_tables('TCGA_hg38_data_v0')) # If there are tables then print their names, # else print that there are no tables if tables: for table in tables: print("\t{}".format(table.table_id)) else: print("\tThis dataset does not contain any tables.") # In this next code cell, we define an SQL query called **`DNU_patients`** which finds all patients in the Annotations table which have either been 'redacted' or had 'unacceptable prior treatment'. It will display the output of the query and then save the data to a Pandas DataFrame. # Now we'll use the query defined above to get the "Do Not Use" list of participants (aka patients): # # ** Note: you will need to update 'your_project_number' with your project number before continuing with the notebook ** # In[ ]: get_ipython().run_cell_magic('bigquery', 'DNU_patients --your_project_number', '\nSELECT\n case_barcode,\n category AS categoryName,\n classification AS classificationName\nFROM\n `isb-cgc.TCGA_bioclin_v0.Annotations`\nWHERE\n ( entity_type="Patient"\n AND (category="History of unacceptable prior treatment related to a prior/other malignancy"\n OR classification="Redaction" ) )\nGROUP BY\n case_barcode,\n categoryName,\n classificationName\nORDER BY\n case_barcode\n') # In[7]: DNU_patients.describe() # Now we're gong to use the query defined previously in a function that builds a "clean" list of patients in the specified study, with available molecular data, and without any disqualifying annotations. # In[ ]: def buildCleanBarcodeList ( studyName, bqDataset, barcodeType, DNUList): print("in buildCleanBarcodeList ... ", studyName) # Print a start statement ULists = [] # Define an empty list for Unique Barcodes print(" --> looping over data tables: ") # Print an indication of when the loop is beginning barcodeField = barcodeType # Set the barcodeField for each loop # For each dataset in the bqDataset table for t in bqDataset: currTable = "`isb-cgc.TCGA_hg38_data_v0." + t.table_id + "`" # Set the current table barcodeField = barcodeType # reset the barcode Field for each loop # Try the simple query that will get the barcode list from most of the tables try: # Set query string get_barcode_list = """ SELECT {} as barcode FROM {} WHERE project_short_name="{}" GROUP BY {} ORDER BY {} """.format(barcodeField, currTable, studyName, barcodeField, barcodeField) # Query BigQuery results = bigquery.Client('isb-cgc-02-0001').query(get_barcode_list) # Store the results into a dataframe results = results.result().to_dataframe() # If the simple query does not run, try joining the two tables to add the # project_short_name to the the query as it is not in some of the Methylation tables except: try: get_barcode_list = """ WITH a AS( SELECT {} FROM {}), b AS ( SELECT {}, project_short_name FROM `isb-cgc.TCGA_hg38_data_v0.Copy_Number_Segment_Masked`) SELECT {} as barcode FROM ( SELECT a.{} AS {}, b.project_short_name AS project_short_name FROM a JOIN b ON a.{} = b.{}) WHERE project_short_name='{}' GROUP BY {} ORDER BY {} """.format(barcodeField, currTable, barcodeField, barcodeField, barcodeField, barcodeField, barcodeField, barcodeField, studyName, barcodeField, barcodeField) # Query BigQuery results = bigquery.Client('isb-cgc-02-0001').query(get_barcode_list) # Store the results into a dataframe results = results.result().to_dataframe() except: try: barcodeField = "sample_barcode_tumor" get_barcode_list = """ SELECT {} as barcode FROM {} WHERE project_short_name="{}" GROUP BY {} ORDER BY {} """.format(barcodeField, currTable, studyName, barcodeField, barcodeField) # Query BigQuery results = bigquery.Client('isb-cgc-02-0001').query(get_barcode_list) # Store the results into a dataframe results = results.result().to_dataframe() except: # Create an empty dataframe if none of the above queries return values results = pd.DataFrame() # If there is data in the result dataframe, add the results to ULists if ( len(results) > 0): print(" ", t.table_id, " --> ", len(results["barcode"]), "unique barcodes.") ULists += [ results ] else: print(" ", t.table_id, " --> no data returned") print(" --> we have {} lists to merge".format(len(ULists))) masterList = [] # Create a list for the master list of barcodes # Add barcodes to the master list with no repeating barcodes for aList in ULists: for aBarcode in aList["barcode"]: if ( aBarcode not in masterList ): masterList += [aBarcode] print(" --> which results in a single list with {} barcodes".format(len(masterList))) print(" --> removing DNU barcodes") cleanList = [] # For each barcode in the master check to see if it is in the DNU # list and then add it to the clean list for aBarcode in masterList: if ( aBarcode not in DNUList[barcodeField].tolist() ): cleanList += [ aBarcode ] else: print(" excluding this barcode", aBarcode) print(" --> returning a clean list with {} barcodes".format(len(cleanList))) return(cleanList) # In[14]: barcodeType = "case_barcode" cleanPatientList = buildCleanBarcodeList ( studyName, tables, barcodeType, DNU_patients ) # Now we are going to repeat the same process, but at the sample barcode level. Most patients will have provided two samples, a "primary tumor" sample, and a "normal blood" sample, but in some cases additional or different types of samples may have been provided, and sample-level annotations may exist that should result in samples being excluded from most downstream analyses. # In[ ]: get_ipython().run_cell_magic('bigquery', 'DNUsamples --project your_project_number', '\n# there are many different types of annotations that are at the "sample" level\n# in the Annotations table, and most of them seem like they should be "disqualifying"\n# annotations, so for now we will just return all sample barcodes with sample-level\n# annotations\nSELECT\n sample_barcode\nFROM\n `isb-cgc.TCGA_bioclin_v0.Annotations`\nWHERE\n (entity_type="Sample")\nGROUP BY\n sample_barcode\nORDER BY\n sample_barcode\n') # In[16]: DNUsamples.describe() # And now we can re-use the previously defined function get a clean list of sample-level barcodes: # In[27]: barcodeType = "sample_barcode" cleanSampleList = buildCleanBarcodeList ( studyName, tables, barcodeType, DNUsamples ) # Now we're going to double-check first that we keep only sample-level barcodes that correspond to patients in the "clean" list of patients, and then we'll also filter the list of patients in case there are patients with no samples remaining in the "clean" list of samples. # In[28]: finalSampleList = [] for aSample in cleanSampleList: aPatient = aSample[:12] if ( aPatient not in cleanPatientList ): print(" excluding this sample in the final pass: ", aSample) else: finalSampleList += [aSample] print(" Length of final sample list: {} ".format(len(finalSampleList))) # In[29]: finalPatientList = [] for aSample in finalSampleList: aPatient = aSample[:12] if ( aPatient not in finalPatientList ): finalPatientList += [ aPatient ] print(" Lenth of final patient list: {} ".format(len(finalPatientList))) for aPatient in cleanPatientList: if ( aPatient not in finalPatientList ): print(" --> patient removed in final pass: ", aPatient) # We're also interested in knowing what *types* of samples we have. The codes for the major types of samples are: # - **01** : primary solid tumor # - **02** : recurrent solid tumor # - **03** : primary blood derived cancer # - **06** : metastatic # - **10** : blood derived normal # - **11** : solid tissue normal # # and a complete list of all sample type codes and their definitions can be found [here](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes). # In[30]: sampleCounts = {} for aSample in finalSampleList: sType = str(aSample[13:15]) if ( sType not in sampleCounts ): sampleCounts[sType] = 0 sampleCounts[sType] += 1 for aKey in sorted(sampleCounts): print(" {} samples of type {} ".format(sampleCounts[aKey], aKey)) # Now we are going to create a simple dataframe with all of the sample barcodes and the associated patient (participant) barcodes so that we can write this to a BigQuery "cohort" table. # In[31]: import pandas as pd patientBarcodes = [] sampleBarcodes = [] for aSample in finalSampleList: sampleBarcodes += [aSample] patientBarcodes += [aSample[:12]] df = pd.DataFrame ( { 'ParticipantBarcode': patientBarcodes, 'SampleBarcode': sampleBarcodes } ) df.describe() # As a next step you may want to consider is to put the data into your GCP. An example of how to move files in and out of GCP and BigQuery can be found [here](https://github.com/isb-cgc/Community-Notebooks/tree/master/Notebooks) along with other tutorial notebooks.