from collections import defaultdict
from glob import glob, iglob
import os
import re
import string
import sys

import gffutils
import numpy as np
import pandas as pd

ls /home/obotvinnik/scratch/mn_diff_singlecell/fastq/sailfish_k31/*.sailfish/quant_bias_corrected.sf

! head /home/obotvinnik/scratch/mn_diff_singlecell/fastq/sailfish_k31/CVN_01.sailfish/quant_bias_corrected.sf

directory = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/sailfish_k31'

tpm_dfs = []
columns = ['transcript', 'length', 'tpm', 'rpkm', 'kpkm',
           'EstimatedNumKmers', 'EstimatedNumReads']

for filename in iglob('{}/*.sailfish/quant_bias_corrected.sf'.format(directory)):
    # Read tabular data, separated by tabs.
    # Arguments:
    #   skiprows=5    Skip the first 5 rows (the Sailfish header comments)
    #   names=columns Use the column names in the list "columns"
    #   index_col=0   The first column is the row names (the row names are
    #                 called the "index" in pandas terms)
    df = pd.read_table(filename, skiprows=5, names=columns, index_col=0)

    # Get the "series" (aka single column) of TPM
    tpm = df.tpm

    # To get the sample ID, split by the folder separator, "/", and take the
    # second-to-last item (via "[-2]"), which has the sample ID, then split
    # on the period and take the first item via "[0]"
    sample_id = filename.split('/')[-2].split('.')[0]

    # Change the name of the TPM series to the sample ID
    tpm.name = sample_id
    tpm_dfs.append(tpm)

tpm = pd.concat(tpm_dfs, axis=1)
tpm.head()

tpm = tpm.T
tpm.head()

# Spike-in transcripts are everything without an Ensembl transcript ("ENST") ID
spikein_columns = tpm.columns.map(lambda x: not x.startswith('ENST'))
tpm_spikein = tpm.ix[:, spikein_columns]
tpm_transcripts = tpm.ix[:, ~spikein_columns]
tpm_spikein.head()
tpm_transcripts.head()

# Relabel each transcript column with its (unversioned) Ensembl gene ID,
# the second "|"-separated field of the Sailfish transcript name, so that
# transcripts of the same gene can be summed
ensembl_ids = tpm_transcripts.columns.map(
    lambda x: x.split('|')[1].split('.')[0])
tpm_transcripts.columns = ensembl_ids

# Sum transcript-level TPMs into gene-level TPMs
tpm_genes = tpm_transcripts.groupby(level=0, axis=1).sum()
tpm_transcripts.head()
tpm_genes.head()

ensembl_id = 'ENSG00000000419'
tpm_transcripts[ensembl_id].head()
tpm_genes[ensembl_id].head()

cat /projects/ps-yeolab/genomes/hg19/star_sjdb/genomeParameters.txt

cat /home/obotvinnik/scratch/mn_diff_singlecell/fastq/mapping_stats/M1_01.Log.final.out

from glob import glob  # We use "glob" instead of "iglob" here because we want the whole list around

def make_unique(seq, idfun=None):
    '''If an object appears more than once in a list, append a letter to it

    Modified from: http://www.peterbe.com/plog/uniqifiers-benchmark
    '''
    if idfun is None:
        def idfun(x):
            return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            # Append "a" for the second occurrence, "b" for the third, etc.
            seen[marker] += 1
            result.append(item + string.lowercase[seen[marker] - 2])
            continue
        seen[marker] = 1
        result.append(item)
    return result


def try_converting_to_float(x):
    try:
        return float(x)
    except ValueError:
        return x


def fix_duplicate_columns(mapping_stats):
    '''Merge "duplicate" columns of a mapping stats dataframe

    Actually, duplicate column names aren't allowed in pandas, but this
    assumes you have columns like M1_01 and M1_01a, where the M1_01a column
    was created from the second *.Log.final.out file from RNA-STAR. This
    detects the duplicate columns and sends them to merge_mapping_stats,
    which combines each pair of duplicate columns into a single column in a
    reasonable way.
    '''
    duplicate_columns = mapping_stats.filter(regex='[a-z]$')
    merged_columns = {}
    for col in duplicate_columns:
        original_col = col[:-1]
        merged_columns[original_col] = merge_mapping_stats(
            mapping_stats.ix[:, col], mapping_stats.ix[:, original_col])

    # Remove the duplicately-named columns, e.g. both M1_01 and M1_01a
    mapping_stats = mapping_stats.drop((x for x in duplicate_columns), axis=1)
    mapping_stats = mapping_stats.drop((x[:-1] for x in duplicate_columns),
                                       axis=1)

    merged_df = pd.DataFrame(data=merged_columns, index=mapping_stats.index)
    mapping_stats = pd.concat((mapping_stats, merged_df), axis=1)
    return mapping_stats
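# NOTE: merge_mapping_stats is called in fix_duplicate_columns above but is
# not defined anywhere in this notebook. Here is a minimal sketch of what it
# could look like, assuming raw counts from the two STAR runs are summed and
# percentages are averaged (hypothetical behavior -- the original helper may
# combine the runs differently):
def merge_mapping_stats(stats1, stats2):
    merged = {}
    for index in stats1.index:
        value1, value2 = stats1[index], stats2[index]
        if isinstance(value1, float) and isinstance(value2, float):
            if '%' in str(index):
                # Percentage stats: average the two runs
                merged[index] = (value1 + value2) / 2.0
            else:
                # Count stats: sum the two runs
                merged[index] = value1 + value2
        else:
            # Non-numeric stats (e.g. the "Started job on" timestamp):
            # keep the first run's value
            merged[index] = value1
    return pd.Series(merged, index=stats1.index)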
def log_final_out(glob_command, ids_function):
    """Given a glob command describing where all the Log.final.out files are
    from STAR, return a pd.DataFrame with each sample (id) as its own column.

    glob_command : str
        Where to look for the files, will be passed to "glob"
    ids_function : function
        A function (could be an anonymous function like a lambda) that
        specifies how to get the sample ID from the filename. Could also be
        a list of IDs, but they must be in the exact order as in the
        directories, which is why a function can be easier.

    Example:
    >>> glob_command = '/Users/olga/workspace-git/single_cell/analysis/mapping_stats/*.Log.final.out'
    >>> mapping_stats = log_final_out(glob_command,
    ...     lambda x: '_'.join(x.split('/')[-1].split('_')[:2]))
    """
    series = []
    filenames = glob(glob_command)

    if isinstance(ids_function, list):
        ids = ids_function
    else:
        ids = [ids_function(filename) for filename in filenames]

    for filename in filenames:
        s = pd.read_table(filename, header=None, index_col=0, squeeze=True)
        converted = [try_converting_to_float(x.strip('%'))
                     if type(x) != float else x for x in s]
        series.append(pd.Series(converted, index=s.index))
    mapping_stats = pd.concat(series, keys=make_unique(ids), axis=1)

    # Strip the trailing " |" off the stat names
    new_index = [str(x).strip().rstrip(' |') for x in mapping_stats.index]
    mapping_stats.index = new_index
    mapping_stats = mapping_stats.dropna(how='all')
    mapping_stats = fix_duplicate_columns(mapping_stats)

    # Turn all the numbers of splicing events into percentages for
    # statistical testing
    number_splicing_event_names = ['Number of splices: Annotated (sjdb)',
                                   'Number of splices: GT/AG',
                                   'Number of splices: GC/AG',
                                   'Number of splices: AT/AC',
                                   'Number of splices: Non-canonical']
    percent_splicing_event_names = [x.replace('Number of', '%')
                                    for x in number_splicing_event_names]

    total_splicing_events = mapping_stats.ix[
        'Number of splices: Total', :].replace(0, np.nan).values.astype(float)

    pieces = []
    for name in number_splicing_event_names:
        pieces.append(100.0 * mapping_stats.ix[name, :].values.astype(float)
                      / total_splicing_events)

    percent_splicing = pd.DataFrame(pieces,
                                    index=percent_splicing_event_names,
                                    columns=mapping_stats.columns)
    return pd.concat((mapping_stats, percent_splicing))


glob_command = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/mapping_stats/*.Log.final.out'
ids_function = lambda x: os.path.basename(x).split('.')[0]

x = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/mapping_stats/M1_01.Log.final.out'
os.path.basename(x)
os.path.basename(x).split('.')[0]

mapping_stats = log_final_out(glob_command, ids_function)
mapping_stats.head()

mapping_stats = mapping_stats.T
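# For reference, STAR writes each stat in Log.final.out as a name ending in
# " |", a tab, then the value, e.g. "Uniquely mapped reads % |<tab>89.19%"
# (value illustrative), which is why log_final_out strips " |" from the
# index and "%" from the values. A quick check of both cleanup steps:
assert 'Uniquely mapped reads % |'.strip().rstrip(' |') == 'Uniquely mapped reads %'
assert try_converting_to_float('89.19%'.strip('%')) == 89.19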
cat /home/obotvinnik/scratch/mn_diff_singlecell/fastq/M1_01.Aligned.out.sam.bam.sorted.bam.miso.sh

def read_miso_summary(filename):
    '''Read a MISO summary, with some additional columns

    Reads a MISO summary file as a pandas dataframe, and adds these columns:

    1. A copy-paste-able genome location at the end, based on the minimum
       mRNA_starts and maximum mRNA_ends (df.genome_location)
    2. The difference between df.ci_high and df.ci_low (df.ci_diff)
    3. The left and right halves of the confidence interval, e.g. the right
       half is df.ci_high - df.miso_posterior_mean (df.ci_left_half and
       df.ci_right_half)
    4. The max of the two left and right confidence interval halves
       (df.ci_halves_max)

    Parameters
    ----------
    filename : str or file buffer
        Name of the MISO summary file

    Returns
    -------
    miso_summary : pandas.DataFrame
        A DataFrame of the MISO summary for this sample and splice type
    '''
    df = pd.read_table(filename)
    genome_location = pd.DataFrame(
        ['%s:%d-%d' % (chrom, min_csv(starts), max_csv(stops))
         for chrom, starts, stops in zip(df.chrom, df.mRNA_starts,
                                         df.mRNA_ends)],
        columns=['genome_location'], index=df.index)
    ci_diff = pd.DataFrame(df.ci_high - df.ci_low, columns=['ci_diff'],
                           index=df.index)
    ci_halves = pd.DataFrame(
        {'ci_left_half': (df.ci_high - df.miso_posterior_mean),
         'ci_right_half': (df.miso_posterior_mean - df.ci_low)},
        index=df.index)
    ci_halves_max = pd.DataFrame(ci_halves.max(axis=1),
                                 columns=['ci_halves_max'])
    return pd.concat([df, genome_location, ci_diff, ci_halves,
                      ci_halves_max], axis=1)


def max_csv(x):
    '''Of integers separated by commas, take the max

    e.g. "75112684,75112684" would return 75112684,
    and "75112684,75112689" would return 75112689
    '''
    return max(map(int, x.split(',')))


def min_csv(x):
    '''Of integers separated by commas, take the minimum

    e.g. "75112684,75112684" would return 75112684,
    and "75112684,75112689" would return 75112684
    '''
    return min(map(int, x.split(',')))


dfs = []
glob_command = '/home/obotvinnik/scratch/mn_diff_singlecell/fastq/miso/*/*/summary/*.miso_summary'
afe_ale = ('AFE', 'ALE')

for filename in iglob(glob_command):
    # Try to read in the data, unless it doesn't work, in which case just
    # skip this file and continue on to the next
    try:
        df = read_miso_summary(filename)
    except Exception:
        continue

    # Add columns for the sample ID and splice type
    sample_id = filename.split('/')[7]
    splice_type = os.path.basename(filename).split('.')[0]
    df['sample_id'] = sample_id
    df['splice_type'] = splice_type

    # Prefix AFE and ALE event names with the splice type (the same
    # prefixing is applied to the isoform lengths below, so they match)
    if splice_type in afe_ale:
        df.event_name = splice_type + '|' + df.event_name
    dfs.append(df)

dfs[0].head()

miso_summary = pd.concat(dfs)
miso_summary.head()
miso_summary.shape

def counts_pair_to_ints(x):
    """Convert an isoform:counts pair to integers

    >>> x = '(1,0):5084,'
    >>> counts_pair_to_ints(x)
    ((1, 0), 5084)
    """
    x = x.split(':')
    isoforms = tuple(map(int, x[0].strip('()').split(',')))
    counts = int(x[1].rstrip(','))
    return isoforms, counts


def counts_col_to_dict(counts):
    """Convert a MISO counts column value into a dictionary

    >>> counts = '(0,0):12,(1,0):5084,(1,1):2036'
    >>> counts_col_to_dict(counts)
    {(0, 0): 12, (1, 0): 5084, (1, 1): 2036}
    """
    return dict(map(counts_pair_to_ints,
                    re.findall(r'\(\d,\d\):\d+,?', counts)))
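# Worked example of the "junction read" logic used in the filtering below:
# only reads assigned to exactly one isoform, i.e. the (1,0) and (0,1)
# categories, count as isoform-specific. The counts string here is the
# example from the MISO documentation.
example_counts = counts_col_to_dict('(1,0):28,(0,1):50,(1,1):12')
junction_reads = example_counts[(1, 0)] + example_counts[(0, 1)]
assert junction_reads == 78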
def filter_miso_summary(summary, ci_diff_thresh=0.5,
                        specific_isoform_counts_thresh=5):
    """Filter a MISO summary dataframe created by read_miso_summary on the
    maximum confidence interval size and the number of "junction reads"
    (reads that are specific to one isoform)

    From the MISO documentation:

        (1,0):28,(0,1):50,(1,1):12, where 0 and 1 correspond to the first
        and second isoforms. This indicates that 28 reads support isoform 1
        but not 2, while 50 reads support the second but not the first, and
        12 reads support both isoforms. The field (0,0):N, if outputted,
        indicates the number of reads N that mapped to the region but were
        thrown out (e.g. because they are not consistent with the isoforms
        in the annotation, violate overhang constraints, or other related
        reasons.)

    Summarized:
    - (1,0) reads support isoform1 and not isoform2
    - (0,1) reads support isoform2 and not isoform1
    - (1,1) reads support both isoforms
    - (0,0) reads support neither isoform

    For "junction reads", we filter on the number of "(1,0)" and "(0,1)"
    reads.

    Parameters
    ----------
    summary : pandas.DataFrame
        A "tall" dataframe of all samples and splice types
    ci_diff_thresh : float
        Maximum allowed confidence interval width
    specific_isoform_counts_thresh : int
        Minimum number of isoform-specific ("junction") reads per event

    Returns
    -------
    filtered_summary : pandas.DataFrame
        Only the splicing events passing the filters
    """
    original_events = summary.shape[0]
    summary = summary.ix[summary.ci_diff <= ci_diff_thresh]
    after_ci_events = summary.shape[0]

    isoform_counts = pd.DataFrame.from_dict(
        dict(zip(summary.index,
                 summary.counts.map(counts_col_to_dict).values)),
        orient='index')

    # Get counts that support only one specific isoform ("junction reads")
    specific_isoform_counts = isoform_counts.ix[:, [(0, 1), (1, 0)]].sum(
        axis=1)

    # Filter on the minimum number of "junction reads"
    summary = summary.ix[
        specific_isoform_counts >= specific_isoform_counts_thresh]
    after_counts_events = summary.shape[0]

    # Set the index as just the range now that we've filtered everything out
    summary.index = np.arange(after_counts_events)

    sys.stdout.write(' {} events removed with poor confidence (ci >{:.2f})\n'
                     .format(original_events - after_ci_events,
                             ci_diff_thresh))
    sys.stdout.write(' {} events removed with low read counts which are'
                     ' unique to individual isoforms (n < {})\n'.format(
                         after_ci_events - after_counts_events,
                         specific_isoform_counts_thresh))
    return summary


filtered_miso_summary = filter_miso_summary(miso_summary)

# Build a gffutils database for each MISO annotation and compute the total
# exonic length of each isoform
for gff in glob('/projects/ps-yeolab/genomes/hg19/miso/*.hg19.gff3'):
    print 'Creating gffutils database for {}...'.format(gff)
    db_filename = gff + '.db'
    try:
        %time db = gffutils.create_db(gff, db_filename)
    except Exception:
        # The database already exists
        db = gffutils.FeatureDB(db_filename)

    isoform_lengths = defaultdict(dict)
    for mrna in db.features_of_type('mRNA'):
        miso_id = mrna.id.rstrip('AB.')
        isoform_id = mrna.id[-1]
        isoform_lengths[isoform_id][miso_id] = sum(
            len(exon) for exon in db.children(mrna))
    isoform_lengths = pd.DataFrame(isoform_lengths)

    csv = gff.replace('.gff3', '.isoform_lengths.csv')
    isoform_lengths.to_csv(csv)

glob_command = '/projects/ps-yeolab/genomes/hg19/miso/*isoform_lengths.csv'
dfs = []
for filename in glob(glob_command):
    splice_type = os.path.basename(filename).split('.')[0]
    df = pd.read_csv(filename, index_col=0)
    # Prefix AFE and ALE event names with the splice type, to match the
    # event names in the filtered MISO summary
    if splice_type in afe_ale:
        df.index = splice_type + '|' + df.index
    dfs.append(df)
isoform_lengths = pd.concat(dfs)
isoform_lengths.head()

# Remove events where any isoform is shorter than the read length
read_length = 100
short_isoform_lengths = isoform_lengths[isoform_lengths < read_length]
short_miso_ids = short_isoform_lengths.index[short_isoform_lengths.T.any()]
filtered_miso_summary = filtered_miso_summary.ix[
    ~filtered_miso_summary.event_name.isin(short_miso_ids)]

# Make a samples-by-events matrix of the MISO posterior means (Psi scores)
splicing = filtered_miso_summary.pivot(columns='event_name',
                                       index='sample_id',
                                       values='miso_posterior_mean')
splicing.head()
splicing.to_csv('/projects/ps-yeolab/obotvinnik/mn_diff_singlecell/data/splicing/splicing.csv')

splicing.columns.map(lambda x: x.startswith('ALE')).sum()

# Keep only events detected in at least 10 samples
splicing_filtered = splicing.dropna(axis=1, thresh=10)
splicing_filtered.shape
splicing_filtered.columns.map(lambda x: x.startswith('ALE')).sum()

splicing.columns.map(lambda x: len(x.split('@')) == 2 and x.startswith('chr')).sum()
splicing.columns.map(lambda x: len(x.split('@')) == 3).sum()
splicing.columns.map(lambda x: len(x.split('@')) == 4).sum()
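# MISO encodes exon-centric events by joining exon coordinates with '@', so
# the len(x.split('@')) checks above give a rough breakdown of the remaining
# events by structure. For example, a skipped-exon (SE) event ID
# (hypothetical coordinates) has three '@'-separated exons, i.e.
# upstream @ skipped @ downstream:
example_event = 'chr1:100:200:+@chr1:300:400:+@chr1:500:600:+'
assert len(example_event.split('@')) == 3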