#!/usr/bin/env python
# coding: utf-8

# # Computing a PCA of tile-level CHARM features
# 
# This notebook exemplifies working with the computed CHARM features. 
# 
# We will load several Wells of time resolved tile-level features, compute their PCA, and show that single cell biologically relevant information are indeed present in unsupervised features computed on tiles without segmentation.

# **Some cells take a very long time to run (querying the rows from the table in particular)**

# ### Dependencies
# 
#  * [Matplotlib](http://matplotlib.org/)
#  * [NumPy](http://www.numpy.org/)
#  * [Pandas](http://pandas.pydata.org/)
#  * [PyTables](http://www.pytables.org/)
#  * [Scikit-learn](http://scikit-learn.org/)

# In[1]:


import os
import numpy as np
import matplotlib.pyplot as plt
import shelve
import random

from pandas import Series
from pandas import DataFrame
from pandas import read_csv
from pandas import merge
from pandas import concat
from pandas import read_hdf

import omero
from matplotlib.widgets import Slider
from matplotlib import gridspec
from sklearn.externals import joblib


from sklearn import random_projection
import sklearn.neighbors as nn
import sklearn.manifold as man
import sklearn.decomposition as dec
from sklearn.preprocessing import scale,robust_scale
from sklearn.metrics.pairwise import euclidean_distances

from idr import connection
import tables

get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['image.cmap']='gray'


# ### Method definitions

# In[2]:


def getBulkAnnotationAsDf(screenID, conn):
    """
    Download the annotation file from a screen as a Pandas DataFrame
    """
    sc = conn.getObject('Screen',screenID)
    for ann in sc.listAnnotations():
        if isinstance(ann, omero.gateway.FileAnnotationWrapper):
            if (ann.getFile().getName() == 'bulk_annotations'):
                if (ann.getFile().getSize()> 147625090):
                    print "that's a big file..."
                    return None
                ofId = ann.getFile().getId()
                break


    original_file = omero.model.OriginalFileI(ofId, False)

    table = conn.c.sf.sharedResources().openTable(original_file)
    try:
        rowCount = table.getNumberOfRows()
        column_names = [col.name for col in table.getHeaders()]

        black_list = []
        column_indices = []
        for column_name in column_names:
            if column_name in black_list:
                continue
            column_indices.append(column_names.index(column_name))

        table_data = table.slice(column_indices, None)
    finally:
        table.close()
    data = []
    for index in range(rowCount):
        row_values = [column.values[index] for column in table_data.columns]
        data.append(row_values)

    dfAnn = DataFrame(data)
    dfAnn.columns = column_names
    return dfAnn


# In[22]:


def goneFishing(iid,x,y,t,df,nbrs,conn):
    """
    Find and display nearest neighbours of a given tile
    """

    qry = df[(df.x==x) & (df.y==y) & (df.ImageID==iid) & (df.t==t)].iloc[:,12:]

    dfq = df[df.ImageID!=iid]
    w = dfq.w.iloc[0]
    h = dfq.h.iloc[0]
    chan = 0
    
 
    hook = getCondensationSubStack(iid,x,y,w,h,t,t+1,conn)    
    
    distances, indices = nbrs.kneighbors(qry)
    nnn = len(indices[0])
    tiles = np.zeros((h,w,nnn))
    for ind, ii in zip(indices[0], range(nnn)):
        iidcur = dfq.ImageID.iloc[ind]
        x = dfq.x.iloc[ind]
        y = dfq.y.iloc[ind]
        t = int(dfq.t.iloc[ind])
        tiles[:, :, ii] = getCondensationSubStack(iidcur, x, y, w, h, t, t+1, conn) 

    
    d, r = divmod(nnn, 4)
    plt.figure(figsize = (12, 30))
    gs = gridspec.GridSpec(2, 1, height_ratios=[1, 3]) 
    ax0 = plt.subplot(gs[0,0])
    ax0.set_title('Original frame')
    ax0.imshow(hook)

    imc = buildComposite(tiles, d+(1&r), 4, smpl=1)
    ax1 = plt.subplot(gs[1,0])
    ax1.set_title('Nearest neighbours')
    ax1.imshow(imc)
    

# In[4]:


def getCondensationSubStack(imId, x, y, w, h, ti, tf, conn, chan=0):
#  Extract a substack of an image as a numpy array

    im = conn.getObject("Image", imId)
    pix = im.getPrimaryPixels()

    z = 0
    c = chan
    tile = (x, y, w, h)
    zctList = [(z, c, it, tile) for it in range(ti,tf)]
    planes = pix.getTiles(zctList)
    st = []
    for plane in planes:
        st.append(plane)

    st = np.asarray(st)
    if tf > ti+1:
        st = np.rollaxis(st, 0, 3)
    else:
        st = np.squeeze(st)
    
    return st


# In[5]:


def buildComposite(st, n, m, smpl=None):
    # nxm shots from st in a grid, as an image
    nr = st.shape[0]
    nc = st.shape[1]
    if smpl == None:
        smpl = st.shape[2]/(n*m)
    res = np.zeros((nr*n,nc*m))
    for i in range(n):
        for j in range(m):
            try:
                res[i*nr:i*nr+nr,j*nc:j*nc+nc]=st[:,:,(i*m+j)*smpl]
            except:
                break
    return res


# In[26]:


def outOneStack(imId, x, y, df, con, alg='PCA', chan=0, title='First 5 PCA components of a stack'):
#    Plot PCA components along with tiles from the stack

    df = df[(df.x==x) & (df.y==y) & (df.ImageID==imId)].sort_values('t', ascending=True)

    plt.figure(figsize=(12,30))
    gs = gridspec.GridSpec(2, 1, height_ratios=[1, 2]) 
    ax0 = plt.subplot(gs[0,0])
    ax0.set_title(title)
    ax0.set_xlabel('Time (frame)')
    ax0.set_ylabel('PCA components (a.u.)')
    if alg == 'PCA':
        ax0.plot(df.PCA_0.values)
        ax0.plot(df.PCA_1.values, 'r')
        ax0.plot(df.PCA_2.values, 'g')
        ax0.plot(df.PCA_3.values, 'y')
        ax0.plot(df.PCA_4.values, 'm')
        ax0.plot(df.PCA_5.values, 'c')
    elif alg == 'DR':
        ax0.plot(df.DR_0.values)
        ax0.plot(df.DR_1.values, 'r')
        ax0.plot(df.DR_2.values, 'g')
        ax0.plot(df.DR_3.values, 'y')
        ax0.plot(df.DR_4.values, 'm')


    x, y, w, h = x, y, df.w.min(), df.h.min()
    ti, tf = int(df.t.min()), int(df.t.max())
    st = getCondensationSubStack(imId, x, y, w, h, ti, tf, conn, chan)
    
    imc = buildComposite(st, 5, 5)
    ax1 = plt.subplot(gs[1,0])
    ax1.imshow(imc)
    ax1.set_title('Mosaic from the corresponding stack, begins in the upper left, ends in the lower right')
    return st


# In[7]:


def data2df(data):
# convert an OMERO.table data structure to a dataframe, unpacking features array
    nrow = len(data.columns[0].values)
    nfeat = 0
    for i in range(len(data.columns)):
        try:
            s = len(data.columns[i].values[0])
        except:
            s = 1
        nfeat = nfeat+s

    featsAll = np.zeros((nrow,nfeat))

    nfeat = 0
    featNames = []
    for i in range(len(data.columns)):
        try:
            s = len(data.columns[i].values[0])
            for k in range(len(data.columns[i].values)):
                featsAll[k, nfeat:nfeat+s] = np.array(data.columns[i].values[k])

            for k in range(s):
                featNames.append(data.columns[i].name+str(k))

        except:
            s = 1
            featsAll[:, nfeat] = np.array(data.columns[i].values)
            featNames.append(data.columns[i].name)
        nfeat=nfeat+s

    dfFeat = DataFrame(featsAll)
    dfFeat.columns = featNames
    
    return dfFeat


# In[8]:


def rows2features(qr, concatenateC=True):
# Extract selected rows from an OMERO.table
    maxr =10000  #else memory errors
    dfFeatPhen = DataFrame()
    k = -1 #in case there is less that maxr rows
    for k in range(len(qr)/maxr):
        data = featTable.readCoordinates(qr[k*maxr:(k+1)*maxr])
        data = data2df(data)
        dfFeatPhen = concat([dfFeatPhen,data])
        print str(k*maxr)+' to '+str((k+1)*maxr)

    if len(qr)%maxr !=0 :
        data = featTable.readCoordinates(qr[(k+1)*maxr:])
        data = data2df(data)
        dfFeatPhen = concat([dfFeatPhen,data])
        print 'from '+str((k+1)*maxr)

        
    dfFeatPhen = dfFeatPhen.reset_index(drop=True)
    
    if concatenateC:
        featNames = dfFeatPhen.columns[0:12]
        for i in [0, 1, 2]:
            featNames = featNames.append(str(i)+'_'+dfFeatPhen.columns[12:])

        nf = dfFeatPhen.shape[1]-12
        dfFeat = np.zeros((dfFeatPhen.shape[0]/3, 12+3*(nf)))
        for k, (name, grp) in enumerate(dfFeatPhen.groupby(['ImageID', u'x', u'y'])):
            dfFeat[k, 0:12] = grp.iloc[0, 0:12].values
            grp.sort_values('c', inplace=True)
            for i, (nr, r) in enumerate(grp.iterrows()):
                dfFeat[k, 12+i*nf:(12+(i+1)*nf)] = r.iloc[12:].values


        dfFeaPhent = DataFrame(dfFeat)
        dfFeatPhen.columns = featNames

    
    return dfFeatPhen


# ## Loading Data

# In[9]:


conn = connection()


# In[10]:


scId = 102 #Heriche et al., DNA condentation screen
dfAnn = getBulkAnnotationAsDf(scId,conn)


# In[11]:


# Retrieving the original file corresponding to the feature table 

sc = conn.getObject('Screen',scId)
# Printing names of all files attached to the screen
for ann in sc.listAnnotations():
    if isinstance(ann, omero.gateway.FileAnnotationWrapper):
        print ann.getFile().getName()

# Getting the fileAnnotation corresponding to the features
# !!! not the full file
for ann in sc.listAnnotations():
    if isinstance(ann, omero.gateway.FileAnnotationWrapper):
            if (ann.getFile().getName() == 'idr0002-heriche-condensation-screenA-plate-all.h5'):
                ofId = ann.getFile().getId()
            break
            
original_file = omero.model.OriginalFileI(ofId, False)


# In[12]:


# Cherry picking a few wells
plateCP = ['plate1_1_013', 'plate2_2_007', 'plate2_2_007', 'plate3_4_034']
wellCP = [5, 53, 63, 24]

we = [dfAnn[(dfAnn['Plate Name'] == plateCP[i]) & (dfAnn['Well Number'] == str(wellCP[i]))]['Well'] for i in range (4)]
we = concat(we)
print we


# In[13]:


# Finding the rows of the table we are interested in
featTable = conn.c.sf.sharedResources().openTable(original_file)
try:
    featRowCount = featTable.getNumberOfRows()
    print 'number of tiles with computed features:'+str(featRowCount)
    where = "(WellID=="+str(we.values[0])+")|(WellID=="+str(we.values[1])+")|(WellID=="+str(we.values[2])+")|(WellID=="+str(we.values[3])+")"
    qr = featTable.getWhereList(where, variables={}, start=0, stop=featRowCount, step=0)

    #downloading them as a dataframe
    maxr = 10000  #else memory errors
    df = DataFrame()
    k = -1 #in case there is less that maxr rows
    for k in range(len(qr)/maxr):
        data = featTable.readCoordinates(qr[k*maxr:(k+1)*maxr])
        data = data2df(data)
        df = concat([df,data])
        print str(k*maxr) + ' to '+ str((k+1)*maxr)

    if len(qr)%maxr !=0 :
        data = featTable.readCoordinates(qr[(k+1)*maxr:])
        data = data2df(data)
        df = concat([df,data])
        print 'from '+str((k+1)*maxr)


    df = df[df.c==0]   #first channel is the one with chromosome information
    df = df.reset_index(drop=True)
finally:
    featTable.close()


# ## computing PCA

# In[14]:


#PCA of 4 wells
    
df = df[df.c==0]
dat = df.iloc[:,12:]
# keeping only the features which are not too discrete
#nbv=[len(dat[x].unique()) for x in df.iloc[:,12:]]
#dat=dat.iloc[:,np.array(nbv)>10]

dat=scale(dat)


pca = dec.PCA(n_components=250,svd_solver='randomized')
#pca = dec.RandomizedPCA(250)
#pca = dec.SparsePCA(250)
#pca = dec.KernelPCA(n_components=10,kernel='rbf')

pca.fit(dat)
datPCA = pca.transform(dat)
datPCA = DataFrame(datPCA)
datPCA.columns = ['PCA_'+str(i) for i in range(datPCA.shape[1])]
df = df.reset_index(drop=True)
df =concat((df.iloc[:, 0:12], datPCA), axis=1)


# ## Visualization

# In[27]:


# Visualization of one stack. 
# a Division is happening around t = 220, and seems to have a clear signature, even with only 5 components
x, y = 504, 384 
imId = 179719

st = outOneStack(imId, x, y, df, conn, alg='PCA', chan=0)


# In[28]:


## Looking for nearest neighbours of a given frame

iid, x, y, t = 179719, 504, 384, 220  #division at 220

dfq = df[df.ImageID!=iid] #excluding the image the query frame comes from
nbrs = nn.NearestNeighbors(n_neighbors=12, algorithm='ball_tree').fit(dfq.iloc[:,12:]) 


goneFishing(iid, x, y, t, df, nbrs, conn)


# ### Disconnect when done loading data

# In[ ]:


conn.close()


# ### License
# 
# Copyright (C) 2016 University of Dundee. All Rights Reserved.
# 
# This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.