library(DIscBIO)
library(partykit)
library(enrichR)
library(ggplot2)

load("FGcdiffBinomial.RData")

#### To show the result table
head(FGcdiffBinomial[[1]])                  # The first component 
head(FGcdiffBinomial[[2]])                  # The second component

name<-"UpRegulated-DEG-cluster3"  ############ Selecting the DEGs'
U<-read.csv(file="UpRegulated-DEG-cluster3.csv",head=TRUE,sep=",")
Vplot<-VolcanoPlot(U,value=0.0001,name=name,FS=0.7,fc=0.75)

###################### Finding biomarker genes between cluster 3 and cluster 4
First="CL3"
Second="CL4"
load("FGDATAforDT.RData")            # Preparing the dataset for the decision trees

j48dt<-J48DT(FGDATAforDT)           #J48 Decision Tree
summary(j48dt)
rm(j48dt)

rpartDT<-RpartDT(FGDATAforDT)
rm(rpartDT)

############ Selecting the DEGs' table  ##############
data<-read.csv(file="UpRegulated-DEG-cluster3.csv",head=TRUE,sep=",")
data<-as.character(data[1:200,3])                              ## Checking the first 200 DEGs             

###### Getting the protein-protein interactions  #############
ppi<-PPI(data)

#############   Networking Analysis  ################
networking<-NetAnalysis(ppi)
networking                            ##### In case the Examine response components = 200 and an error "linkmat[i, ]" appeared, that means there are no PPI.

############ Plotting the Network  ##################
Data<-data[1:100]           ## Plotting the first 100 DEGs      
network<-Networking(Data)

############ Selecting the DEGs' table  ##############
data<-read.csv(file="UpRegulated-DEG-cluster3.csv",head=TRUE,sep=",")  # Up-regulated genes in cluster 3 (from the Binomial analysis)
data<-as.character(data[,3])                              ## Checking the first 200 DEGs             

dbs <- c("KEGG_2019_Human","GO_Biological_Process_2018")
enriched <- enrichr(data, dbs)
KEGG_2019_Human<-enriched[[1]][,c(1,2,3,9)]
GO_Biological_Process_2018<-enriched[[2]][,c(1,2,3,9)]

GEA<-rbind(KEGG_2019_Human,GO_Biological_Process_2018)
GEA

load("fg.RData")               # Loading the "fg" object that has include the data of the k-means clustering 
FG<-fg  
FG<-pseudoTimeOrdering(FG,quiet = TRUE, export = FALSE)
FG@kordering<-FG@kordering[order(-FG@kordering)]
NewNames<-names(FG@kordering)
OrderedDataSet<-FG@ndata
for (i in 1:length(FG@kordering)){
	OrderedDataSet[1:length(FG@ndata[,1]),i]<-FG@ndata[1:length(FG@ndata[,1]),FG@kordering[i]]
}
colnames(OrderedDataSet)<-NewNames
COL=rainbow(20)

## Spearman correlation between TGFB1 and KRT18
KRT18<-as.numeric(OrderedDataSet["ENSG00000111057",])
TGFB1<-as.numeric(OrderedDataSet["ENSG00000105329",])
cor.test(TGFB1,KRT18, method = "spearman",exact=F)

## Spearman correlation between MMD and ZEB1
MMD<-as.numeric(OrderedDataSet["ENSG00000108960",])
ZEB1<-as.numeric(OrderedDataSet["ENSG00000148516",])
cor.test(MMD, ZEB1, method = "spearman",,exact=F)

################## Loading the EMT gene list
S <- read.csv(file = "EMT_List.csv", sep = ",",header=F)    

#######################  Plotting the expression of each gene
gene<-S[1,1]
cells<-c(1:length(FG@ndata[1,]))
expression=log(t(OrderedDataSet[gene,]))
names(expression)=NULL
ID<-c(rep(gene,1448))
NewDF1=data.frame(expression,cells,ID)
rownames(NewDF1)=NULL
colnames(NewDF1)=c("expression","cells","GeneID")
########################################################### to get the smooth line
p=ggplot(NewDF1,aes(x=cells,y=expression),colour=col) + 
    # Line for each gene
    geom_smooth(size = 2, se = F, color = "orange")+ 
	labs(y="Gene Expression", x = "Cells' pseudotime order")

g<-ggplot_build(p)
par(mar = c(5, 4, 1, 3.6))
plot(g$data[[1]][,1],g$data[[1]][,2],col=COL[1], type="l",
	las=1,xlab="Pseudo-time-ordering",ylab="Log Gene Expression",ylim=c(-3,3),lwd = 3,xaxt = "n")

############################### to show cluster ID at the x axis
part=FG@kmeans$kpart  
col=c()
for (i in NewNames){
	col=c(col,as.numeric(part[i]))
}
coll = c("black", "blue", "green", "red", "yellow", "gray")
for (i in 1: length(part)){
	points(i,-3.16,col = coll[col[i]],pch = 15, cex = 1.5)
} 

genes<-S[,1]
SYM<-S[,2]
for ( i in 1:length(S[,1])){
	gene<-genes[i]
	cells<-c(1:1448)
	expression=log(t(OrderedDataSet[gene,]))
	names(expression)=NULL
	ID<-c(rep(gene,1448))
	NewDF3=data.frame(expression,cells,ID)
	rownames(NewDF3)=NULL
	colnames(NewDF3)=c("expression","cells","GeneID")
	p=ggplot(NewDF3,aes(x=cells,y=expression),colour=col) + 
	    # Line for each gene
	    geom_smooth(size = 2, se = F, color = COL[i])+ 
		labs(y="Gene Expression", x = "Cells' pseudotime order")
	ggg<-ggplot_build(p) ############################# to extract the xy
	lines(ggg$data[[1]][,1],ggg$data[[1]][,2],col=COL[i],lwd = 3)
}

add_legend <- function(...) {
  opar <- par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), 
    mar=c(0, 0, 0, 0), new=TRUE)
  on.exit(par(opar))
  plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
  legend(...)
}
add_legend("topright", legend=SYM, pch=19, 
   col=COL[1:length(S[,1])],   horiz=F, bty='n', cex=0.7)

#### Plotting the heatmap

load("fg.RData")               # Loading the "fg" object that includes the data of the k-means clustering 
FG<-fg                         # Storing the data of fg in the FG 
rm(fg)                         # Removing the unneeded objects

clustheatmap(FG)