from collections import defaultdict
from glob import glob, iglob
import os
import re
import string
import sys

import gffutils
import numpy as np
import pandas as pd

ls /home/obotvinnik/scratch/mn_diff_singlecell/fastq/sailfish_k31/*.sailfish/quant_bias_corrected.sf

! head /home/obotvinnik/scratch/mn_diff_singlecell/fastq/sailfish_k31/CVN_01.sailfish/quant_bias_corrected.sf

directory = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/sailfish_k31'

tpm_dfs = []
columns = ['transcript', 'length', 'tpm', 'rpkm', 'kpkm',
           'EstimatedNumKmers', 'EstimatedNumReads']

for filename in iglob('{}/*.sailfish/quant_bias_corrected.sf'.format(directory)):
    # Read tabular data, separated by tabs.
    # Arguments:
    #   skiprows=5    Skip the first 5 rows (the Sailfish header comments)
    #   names=columns Use the column names in the list "columns"
    #   index_col=0   The first column is the row names (the row names are
    #                 called the "index" in pandas terms)
    df = pd.read_table(filename, skiprows=5, names=columns, index_col=0)

    # Get the "series" (aka single column) of TPM
    tpm = df.tpm

    # To get the sample ID, split by the folder separator, "/", and take the
    # second-to-last item (via "[-2]"), which has the sample ID, then split
    # on the period and take the first item via "[0]"
    sample_id = filename.split('/')[-2].split('.')[0]

    # Change the name of the TPM series to the sample ID
    tpm.name = sample_id
    tpm_dfs.append(tpm)

tpm = pd.concat(tpm_dfs, axis=1)
tpm.head()

tpm = tpm.T
tpm.head()

# Spike-in transcripts are everything without an Ensembl transcript ("ENST") ID
spikein_columns = tpm.columns.map(lambda x: not x.startswith('ENST'))
tpm_spikein = tpm.ix[:, spikein_columns]
tpm_transcripts = tpm.ix[:, ~spikein_columns]
tpm_spikein.head()
tpm_transcripts.head()

# Relabel each transcript column with its (unversioned) Ensembl gene ID,
# the second "|"-separated field of the Sailfish transcript name, so that
# transcripts of the same gene can be summed
ensembl_ids = tpm_transcripts.columns.map(
    lambda x: x.split('|')[1].split('.')[0])
tpm_transcripts.columns = ensembl_ids

# Sum transcript-level TPMs into gene-level TPMs
tpm_genes = tpm_transcripts.groupby(level=0, axis=1).sum()
tpm_transcripts.head()
tpm_genes.head()

ensembl_id = 'ENSG00000000419'
tpm_transcripts[ensembl_id].head()
tpm_genes[ensembl_id].head()

cat /projects/ps-yeolab/genomes/hg19/star_sjdb/genomeParameters.txt

cat /home/obotvinnik/scratch/mn_diff_singlecell/fastq/mapping_stats/M1_01.Log.final.out

from glob import glob  # We use "glob" instead of "iglob" here because we want the whole list around

def make_unique(seq, idfun=None):
    '''If an object appears more than once in a list, append a letter to it

    Modified from: http://www.peterbe.com/plog/uniqifiers-benchmark
    '''
    if idfun is None:
        def idfun(x):
            return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            # Append "a" for the second occurrence, "b" for the third, etc.
            seen[marker] += 1
            result.append(item + string.lowercase[seen[marker] - 2])
            continue
        seen[marker] = 1
        result.append(item)
    return result


def try_converting_to_float(x):
    try:
        return float(x)
    except ValueError:
        return x


def fix_duplicate_columns(mapping_stats):
    '''Merge "duplicate" columns of a mapping stats dataframe

    Actually, duplicate column names aren't allowed in pandas, but this
    assumes you have columns like M1_01 and M1_01a, where the M1_01a column
    was created from the second *.Log.final.out file from RNA-STAR. This
    detects the duplicate columns and sends them to merge_mapping_stats,
    which combines each pair of duplicate columns into a single column in a
    reasonable way.
    '''
    duplicate_columns = mapping_stats.filter(regex='[a-z]$')
    merged_columns = {}
    for col in duplicate_columns:
        original_col = col[:-1]
        merged_columns[original_col] = merge_mapping_stats(
            mapping_stats.ix[:, col], mapping_stats.ix[:, original_col])

    # Remove the duplicately-named columns, e.g. both M1_01 and M1_01a
    mapping_stats = mapping_stats.drop((x for x in duplicate_columns), axis=1)
    mapping_stats = mapping_stats.drop((x[:-1] for x in duplicate_columns),
                                       axis=1)

    merged_df = pd.DataFrame(data=merged_columns, index=mapping_stats.index)
    mapping_stats = pd.concat((mapping_stats, merged_df), axis=1)
    return mapping_stats
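# NOTE: merge_mapping_stats is called in fix_duplicate_columns above but is
# not defined anywhere in this notebook. Here is a minimal sketch of what it
# could look like, assuming raw counts from the two STAR runs are summed and
# percentages are averaged (hypothetical behavior -- the original helper may
# combine the runs differently):
def merge_mapping_stats(stats1, stats2):
    merged = {}
    for index in stats1.index:
        value1, value2 = stats1[index], stats2[index]
        if isinstance(value1, float) and isinstance(value2, float):
            if '%' in str(index):
                # Percentage stats: average the two runs
                merged[index] = (value1 + value2) / 2.0
            else:
                # Count stats: sum the two runs
                merged[index] = value1 + value2
        else:
            # Non-numeric stats (e.g. the "Started job on" timestamp):
            # keep the first run's value
            merged[index] = value1
    return pd.Series(merged, index=stats1.index)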
def log_final_out(glob_command, ids_function):
    """Given a glob command describing where all the Log.final.out files are
    from STAR, return a pd.DataFrame with each sample (id) as its own column.

    glob_command : str
        Where to look for the files, will be passed to "glob"
    ids_function : function
        A function (could be an anonymous function like a lambda) that
        specifies how to get the sample ID from the filename. Could also be
        a list of IDs, but they must be in the exact order as in the
        directories, which is why a function can be easier.

    Example:
    >>> glob_command = '/Users/olga/workspace-git/single_cell/analysis/mapping_stats/*.Log.final.out'
    >>> mapping_stats = log_final_out(glob_command,
    ...     lambda x: '_'.join(x.split('/')[-1].split('_')[:2]))
    """
    series = []
    filenames = glob(glob_command)

    if isinstance(ids_function, list):
        ids = ids_function
    else:
        ids = [ids_function(filename) for filename in filenames]

    for filename in filenames:
        s = pd.read_table(filename, header=None, index_col=0, squeeze=True)
        converted = [try_converting_to_float(x.strip('%'))
                     if type(x) != float else x for x in s]
        series.append(pd.Series(converted, index=s.index))
    mapping_stats = pd.concat(series, keys=make_unique(ids), axis=1)

    # Strip the trailing " |" off the stat names
    new_index = [str(x).strip().rstrip(' |') for x in mapping_stats.index]
    mapping_stats.index = new_index
    mapping_stats = mapping_stats.dropna(how='all')
    mapping_stats = fix_duplicate_columns(mapping_stats)

    # Turn all the numbers of splicing events into percentages for
    # statistical testing
    number_splicing_event_names = ['Number of splices: Annotated (sjdb)',
                                   'Number of splices: GT/AG',
                                   'Number of splices: GC/AG',
                                   'Number of splices: AT/AC',
                                   'Number of splices: Non-canonical']
    percent_splicing_event_names = [x.replace('Number of', '%')
                                    for x in number_splicing_event_names]

    total_splicing_events = mapping_stats.ix[
        'Number of splices: Total', :].replace(0, np.nan).values.astype(float)

    pieces = []
    for name in number_splicing_event_names:
        pieces.append(100.0 * mapping_stats.ix[name, :].values.astype(float)
                      / total_splicing_events)

    percent_splicing = pd.DataFrame(pieces,
                                    index=percent_splicing_event_names,
                                    columns=mapping_stats.columns)
    return pd.concat((mapping_stats, percent_splicing))


glob_command = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/mapping_stats/*.Log.final.out'
ids_function = lambda x: os.path.basename(x).split('.')[0]

x = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/mapping_stats/M1_01.Log.final.out'
os.path.basename(x)
os.path.basename(x).split('.')[0]

mapping_stats = log_final_out(glob_command, ids_function)
mapping_stats.head()

mapping_stats = mapping_stats.T
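# For reference, STAR writes each stat in Log.final.out as a name ending in
# " |", a tab, then the value, e.g. "Uniquely mapped reads % |<tab>89.19%"
# (value illustrative), which is why log_final_out strips " |" from the
# index and "%" from the values. A quick check of both cleanup steps:
assert 'Uniquely mapped reads % |'.strip().rstrip(' |') == 'Uniquely mapped reads %'
assert try_converting_to_float('89.19%'.strip('%')) == 89.19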
cat /home/obotvinnik/scratch/mn_diff_singlecell/fastq/M1_01.Aligned.out.sam.bam.sorted.bam.miso.sh

def read_miso_summary(filename):
    '''Read a MISO summary, with some additional columns

    Reads a MISO summary file as a pandas dataframe, and adds these columns:

    1. A copy-paste-able genome location at the end, based on the minimum
       mRNA_starts and maximum mRNA_ends (df.genome_location)
    2. The difference between df.ci_high and df.ci_low (df.ci_diff)
    3. The left and right halves of the confidence interval, e.g. the right
       half is df.ci_high - df.miso_posterior_mean (df.ci_left_half and
       df.ci_right_half)
    4. The max of the two left and right confidence interval halves
       (df.ci_halves_max)

    Parameters
    ----------
    filename : str or file buffer
        Name of the MISO summary file

    Returns
    -------
    miso_summary : pandas.DataFrame
        A DataFrame of the MISO summary for this sample and splice type
    '''
    df = pd.read_table(filename)
    genome_location = pd.DataFrame(
        ['%s:%d-%d' % (chrom, min_csv(starts), max_csv(stops))
         for chrom, starts, stops in zip(df.chrom, df.mRNA_starts,
                                         df.mRNA_ends)],
        columns=['genome_location'], index=df.index)
    ci_diff = pd.DataFrame(df.ci_high - df.ci_low, columns=['ci_diff'],
                           index=df.index)
    ci_halves = pd.DataFrame(
        {'ci_left_half': (df.ci_high - df.miso_posterior_mean),
         'ci_right_half': (df.miso_posterior_mean - df.ci_low)},
        index=df.index)
    ci_halves_max = pd.DataFrame(ci_halves.max(axis=1),
                                 columns=['ci_halves_max'])
    return pd.concat([df, genome_location, ci_diff, ci_halves,
                      ci_halves_max], axis=1)


def max_csv(x):
    '''Of integers separated by commas, take the max

    e.g. "75112684,75112684" would return 75112684,
    and "75112684,75112689" would return 75112689
    '''
    return max(map(int, x.split(',')))


def min_csv(x):
    '''Of integers separated by commas, take the minimum

    e.g. "75112684,75112684" would return 75112684,
    and "75112684,75112689" would return 75112684
    '''
    return min(map(int, x.split(',')))


dfs = []
glob_command = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/miso/*/*/summary/*.miso_summary'
afe_ale = ('AFE', 'ALE')

for filename in iglob(glob_command):
    # Try to read in the data, unless it doesn't work, in which case just
    # skip this file and continue on to the next
    try:
        df = read_miso_summary(filename)
    except Exception:
        continue

    # Add columns for the sample ID and splice type
    sample_id = filename.split('/')[7]
    splice_type = os.path.basename(filename).split('.')[0]
    df['sample_id'] = sample_id
    df['splice_type'] = splice_type

    # Prefix AFE and ALE event names with the splice type (the same
    # prefixing is applied to the isoform lengths below, so they match)
    if splice_type in afe_ale:
        df.event_name = splice_type + '|' + df.event_name
    dfs.append(df)

dfs[0].head()

miso_summary = pd.concat(dfs)
miso_summary.head()
miso_summary.shape

def counts_pair_to_ints(x):
    """Convert an isoform:counts pair to integers

    >>> x = '(1,0):5084,'
    >>> counts_pair_to_ints(x)
    ((1, 0), 5084)
    """
    x = x.split(':')
    isoforms = tuple(map(int, x[0].strip('()').split(',')))
    counts = int(x[1].rstrip(','))
    return isoforms, counts


def counts_col_to_dict(counts):
    """Convert a MISO counts column value into a dictionary

    >>> counts = '(0,0):12,(1,0):5084,(1,1):2036'
    >>> counts_col_to_dict(counts)
    {(0, 0): 12, (1, 0): 5084, (1, 1): 2036}
    """
    return dict(map(counts_pair_to_ints,
                    re.findall(r'\(\d,\d\):\d+,?', counts)))
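# Worked example of the "junction read" logic used in the filtering below:
# only reads assigned to exactly one isoform, i.e. the (1,0) and (0,1)
# categories, count as isoform-specific. The counts string here is the
# example from the MISO documentation.
example_counts = counts_col_to_dict('(1,0):28,(0,1):50,(1,1):12')
junction_reads = example_counts[(1, 0)] + example_counts[(0, 1)]
assert junction_reads == 78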
def filter_miso_summary(summary, ci_diff_thresh=0.5,
                        specific_isoform_counts_thresh=5):
    """Filter a MISO summary dataframe created by read_miso_summary on the
    maximum confidence interval size and the number of "junction reads"
    (reads that are specific to one isoform)

    From the MISO documentation:

        (1,0):28,(0,1):50,(1,1):12, where 0 and 1 correspond to the first
        and second isoforms. This indicates that 28 reads support isoform 1
        but not 2, while 50 reads support the second but not the first, and
        12 reads support both isoforms. The field (0,0):N, if outputted,
        indicates the number of reads N that mapped to the region but were
        thrown out (e.g. because they are not consistent with the isoforms
        in the annotation, violate overhang constraints, or other related
        reasons.)

    Summarized:
    - (1,0) reads support isoform1 and not isoform2
    - (0,1) reads support isoform2 and not isoform1
    - (1,1) reads support both isoforms
    - (0,0) reads support neither isoform

    For "junction reads", we filter on the number of "(1,0)" and "(0,1)"
    reads.

    Parameters
    ----------
    summary : pandas.DataFrame
        A "tall" dataframe of all samples and splice types
    ci_diff_thresh : float
        Maximum allowed confidence interval width
    specific_isoform_counts_thresh : int
        Minimum number of isoform-specific ("junction") reads per event

    Returns
    -------
    filtered_summary : pandas.DataFrame
        Only the splicing events passing the filters
    """
    original_events = summary.shape[0]
    summary = summary.ix[summary.ci_diff <= ci_diff_thresh]
    after_ci_events = summary.shape[0]

    isoform_counts = pd.DataFrame.from_dict(
        dict(zip(summary.index,
                 summary.counts.map(counts_col_to_dict).values)),
        orient='index')

    # Get counts that support only one specific isoform ("junction reads")
    specific_isoform_counts = isoform_counts.ix[:, [(0, 1), (1, 0)]].sum(
        axis=1)

    # Filter on the minimum number of "junction reads"
    summary = summary.ix[
        specific_isoform_counts >= specific_isoform_counts_thresh]
    after_counts_events = summary.shape[0]

    # Set the index as just the range now that we've filtered everything out
    summary.index = np.arange(after_counts_events)

    sys.stdout.write(' {} events removed with poor confidence (ci >{:.2f})\n'
                     .format(original_events - after_ci_events,
                             ci_diff_thresh))
    sys.stdout.write(' {} events removed with low read counts which are'
                     ' unique to individual isoforms (n < {})\n'.format(
                         after_ci_events - after_counts_events,
                         specific_isoform_counts_thresh))
    return summary


filtered_miso_summary = filter_miso_summary(miso_summary)

# Build a gffutils database for each MISO annotation and compute the total
# exonic length of each isoform
for gff in glob('/projects/ps-yeolab/genomes/hg19/miso/*.hg19.gff3'):
    print 'Creating gffutils database for {}...'.format(gff)
    db_filename = gff + '.db'
    try:
        %time db = gffutils.create_db(gff, db_filename)
    except Exception:
        # The database already exists
        db = gffutils.FeatureDB(db_filename)

    isoform_lengths = defaultdict(dict)
    for mrna in db.features_of_type('mRNA'):
        miso_id = mrna.id.rstrip('AB.')
        isoform_id = mrna.id[-1]
        isoform_lengths[isoform_id][miso_id] = sum(
            len(exon) for exon in db.children(mrna))
    isoform_lengths = pd.DataFrame(isoform_lengths)

    csv = gff.replace('.gff3', '.isoform_lengths.csv')
    isoform_lengths.to_csv(csv)

glob_command = '/projects/ps-yeolab/genomes/hg19/miso/*isoform_lengths.csv'
dfs = []
for filename in glob(glob_command):
    splice_type = os.path.basename(filename).split('.')[0]
    df = pd.read_csv(filename, index_col=0)
    # Prefix AFE and ALE event names with the splice type, to match the
    # event names in the filtered MISO summary
    if splice_type in afe_ale:
        df.index = splice_type + '|' + df.index
    dfs.append(df)
isoform_lengths = pd.concat(dfs)
isoform_lengths.head()

# Remove events where any isoform is shorter than the read length
read_length = 100
short_isoform_lengths = isoform_lengths[isoform_lengths < read_length]
short_miso_ids = short_isoform_lengths.index[short_isoform_lengths.T.any()]
filtered_miso_summary = filtered_miso_summary.ix[
    ~filtered_miso_summary.event_name.isin(short_miso_ids)]

# Make a samples-by-events matrix of the MISO posterior means (Psi scores)
splicing = filtered_miso_summary.pivot(columns='event_name',
                                       index='sample_id',
                                       values='miso_posterior_mean')
splicing.head()
splicing.to_csv('/projects/ps-yeolab/obotvinnik/mn_diff_singlecell/data/splicing/splicing.csv')

splicing.columns.map(lambda x: x.startswith('ALE')).sum()

# Keep only events detected in at least 10 samples
splicing_filtered = splicing.dropna(axis=1, thresh=10)
splicing_filtered.shape
splicing_filtered.columns.map(lambda x: x.startswith('ALE')).sum()

splicing.columns.map(lambda x: len(x.split('@')) == 2 and x.startswith('chr')).sum()
splicing.columns.map(lambda x: len(x.split('@')) == 3).sum()
splicing.columns.map(lambda x: len(x.split('@')) == 4).sum()
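# MISO encodes exon-centric events by joining exon coordinates with '@', so
# the len(x.split('@')) checks above give a rough breakdown of the remaining
# events by structure. For example, a skipped-exon (SE) event ID
# (hypothetical coordinates) has three '@'-separated exons, i.e.
# upstream @ skipped @ downstream:
example_event = 'chr1:100:200:+@chr1:300:400:+@chr1:500:600:+'
assert len(example_event.split('@')) == 3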