list.of.packages <- c("gdata","R.utils","FactoMineR","factoextra","data.table","stringr","sp","cluster","plyr",
                      "stringi","proj4","ggplot2","plotly","rgdal","rgeos","raster","randomForest")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages) > 0) install.packages(new.packages)

lapply(list.of.packages, FUN = function(X) {
    do.call("require", list(X)) 
})

path <- getwd()
setwd(path)
set.seed(1601720)

#proj <- CRS("+init=epsg:9040 +datum=WGS84")
proj <- CRS("+proj=lcc +lat_0=52 +lon_0=10 +lat_1=35 +lat_2=65 +x_0=4000000 +y_0=2800000
                    +datum=WGS84 +units=m +no_defs")

if (file.exists("Provided_Results.gz")) {
    arc <- fread("Provided_Results.gz", encoding="UTF-8")
} else {
    source("Translator.R")
    arc <- arc.dataconvert("results.json.gz")
}

setnames(arc,names(arc),c("NUMBER","ID","X","Y","DATASET","CATEGORY",
                          "ENTRY","CONCEPT","COMBINATOR","RELATED","CONTEXT"))

arc[, names(arc) := lapply(.SD,trimws,which='left')]
head(arc)

arc[,c("X","Y") := lapply(.SD,as.numeric), .SDcols = c("X","Y")] # Convert to numeric
arc[, `:=`(X = X * ..pi / 180, Y = Y * ..pi / 180)] # Convert to radians
arc[,`:=`(X = ptransform(.(X,Y),src.proj=CRS("+proj=longlat +datum=WGS84"), # Project the coordinates
                         dst.proj=proj)[[1]],
          Y = ptransform(.(X,Y),src.proj=CRS("+proj=longlat +datum=WGS84"),
                         dst.proj=proj)[[2]])]

concepts <- read.csv("CONCEPT_DATA_FRAME.csv",
                     stringsAsFactors = FALSE)
concepts <- as.data.table(concepts)
concepts[,names(concepts) := lapply(.SD,trimws,which='left')]

head(concepts)

datasets <- read.csv("DATASET_HASH_OUT.csv",
                     stringsAsFactors = FALSE,
                     encoding='UTF-8')
datasets <- as.data.table(datasets)
datasets[,names(datasets) := lapply(.SD,trimws,which='left')]

datasets

query <- "Farm Histories"
fhd_hash <- datasets[str_detect(NAME,query),HASH]
fhd_hash <- toupper(fhd_hash) # Since the dataARC hashes are all capitalized

fhd <- arc[DATASET == fhd_hash,] # Filter dataARC and assign to new variable

fhd

query <- "Sagas"
saga_hash <- datasets[str_detect(NAME,query),HASH]
saga_hash <- toupper(saga_hash)

saga <- arc[DATASET == saga_hash,]
head(saga)

entries <- c("ENTITY_TYPE_ALIAS", "RELATIONSHIPS_LOOKUP_RESOURCE_EN", # Entries by which to filter
             "FODDER_PRODUCTIVITY", "1861_ADJUSTED_VALUE")
fhd_socio <- fhd[CATEGORY %in% entries,][,.(ID,CATEGORY,ENTRY)]  # Apply the filter, keeping only the ID and new cols

head(fhd_socio)

fhd_socio <- dcast(fhd_socio, ID+ENTRY~CATEGORY, value.var="ENTRY", # Cast the rows
                   fun.aggregate = function (x) paste(x,collapse = "; "))

fhd_socio <- setDT(fhd_socio)[order(ID), # And smush them
                              lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), 
                              by=ID]
head(fhd_socio)

fhd_socio[is.na(RELATIONSHIPS_LOOKUP_RESOURCE_EN), 
          RELATIONSHIPS_LOOKUP_RESOURCE_EN := "NOTHING"]
fhd_socio <- na.omit(fhd_socio)

head(fhd_socio)

unique(fhd_socio$RELATIONSHIPS_LOOKUP_RESOURCE_EN)

fhd_socio[, RELATIONSHIPS_LOOKUP_RESOURCE_EN := 
            list(strsplit(RELATIONSHIPS_LOOKUP_RESOURCE_EN, "; "))] # Split any multi-resource entries
fhd_socio[, RELATIONSHIPS_LOOKUP_RESOURCE_EN := 
            lapply(RELATIONSHIPS_LOOKUP_RESOURCE_EN, function(x) x[1])] # Keep only the first one
                   
unique(fhd_socio$RELATIONSHIPS_LOOKUP_RESOURCE_EN)

fhd_socio[, c("FODDER_PRODUCTIVITY","1861_ADJUSTED_VALUE") :=
           lapply(.SD,as.numeric), .SDcols = c("FODDER_PRODUCTIVITY","1861_ADJUSTED_VALUE")]
fhd_socio <- na.omit(fhd_socio)

vars <- c(entries) # The entries variable already has stored the variables of interest
fhd_famd <- fhd_socio[,..vars] # Extract only the columns we want
fhd_famd[, "RELATIONSHIPS_LOOKUP_RESOURCE_EN" := # Convert list to character type
           as.character(RELATIONSHIPS_LOOKUP_RESOURCE_EN)]
fhd_famd[, c("ENTITY_TYPE_ALIAS","RELATIONSHIPS_LOOKUP_RESOURCE_EN") := 
           lapply(.SD,as.factor), # Convert character to factor
         .SDcols = c("ENTITY_TYPE_ALIAS", "RELATIONSHIPS_LOOKUP_RESOURCE_EN")]
famd <- FAMD(fhd_famd,ncp=nrow(fhd_famd),graph=FALSE) # Perform the FAMD
famd

famd$eig

famd$var$contrib

fviz_famd_var(famd, "quanti.var", repel = TRUE,col.var = "black", axes=c(1,2))
fviz_famd_var(famd, "quanti.var", repel = TRUE,col.var = "black", axes=c(2,3))
fviz_famd_var(famd, "quanti.var", repel = TRUE,col.var = "black", axes=c(1,3))

fviz_famd_var(famd, "quali.var", repel = TRUE,col.var = "black", axes=c(1,2))
fviz_famd_var(famd, "quali.var", repel = TRUE,col.var = "black", axes=c(2,3))
fviz_famd_var(famd, "quali.var", repel = TRUE,col.var = "black", axes=c(1,3))

famd_out <- as.data.table(famd$call$X) 
setnames(famd_out,c("1861_ADJUSTED_VALUE","ENTITY_TYPE_ALIAS","RELATIONSHIPS_LOOKUP_RESOURCE_EN"),
         c("VALUE","OWNER","RIGHTS_TO"))
famd_out[,colnames(famd$ind$coord) := as.data.table(..famd$ind$coord)]

p1 <- plot_ly(x = famd_out$Dim.1, y = famd_out$Dim.2, z = famd_out$Dim.3, 
          type = "scatter3d",
          mode = "markers",
          color = famd_out$OWNER,
          text = paste("RIGHTS TO:",famd_out$RIGHTS_TO))

p1

fhd_byconcept <- fhd[, .(CONCEPT = unlist(tstrsplit(CONCEPT,":"))), by=c("ID")]
fhd_byconcept <- fhd_byconcept[,.(CONCEPT = unique(CONCEPT)),by=ID][, WEIGHT := 3]
fhd_concepts <- fhd_byconcept
fhd_concepts[,CONCEPT := ..concepts$NAME[match(CONCEPT,..concepts$HASH)]]

fhd_byrelated <- fhd[, .(RELATED = unlist(tstrsplit(RELATED,":"))), by=c("ID")]
fhd_byrelated <- fhd_byrelated[,.(RELATED = unique(RELATED)),by=ID][,WEIGHT := 2]
fhd_related <- fhd_byrelated
fhd_related[,RELATED := ..concepts$NAME[match(RELATED,..concepts$HASH)]]

fhd_bycontext <- fhd[, .(CONTEXT = unlist(tstrsplit(CONTEXT,":"))), by=c("ID")]
fhd_bycontext <- fhd_bycontext[,.(CONTEXT = unique(CONTEXT)),by=ID][,WEIGHT := 1]
fhd_context <- fhd_bycontext
fhd_context[,CONTEXT := ..concepts$NAME[match(CONTEXT,..concepts$HASH)]]

head(fhd_concepts)

saga_byconcept <- saga[, .(CONCEPT = unlist(tstrsplit(CONCEPT,":"))), by=c("ID")]
saga_byconcept <- saga_byconcept[,.(CONCEPT = unique(CONCEPT)),by=ID][, WEIGHT := 3]
saga_concepts <- saga_byconcept
saga_concepts[,CONCEPT := ..concepts$NAME[match(CONCEPT,..concepts$HASH)]]

saga_byrelated <- saga[, .(RELATED = unlist(tstrsplit(RELATED,":"))), by=c("ID")]
saga_byrelated <- saga_byrelated[,.(RELATED = unique(RELATED)),by=ID][,WEIGHT := 2]
saga_related <- saga_byrelated
saga_related[,RELATED := ..concepts$NAME[match(RELATED,..concepts$HASH)]]

saga_bycontext <- saga[, .(CONTEXT = unlist(tstrsplit(CONTEXT,":"))), by=c("ID")]
saga_bycontext <- saga_bycontext[,.(CONTEXT = unique(CONTEXT)),by=ID][,WEIGHT := 1]
saga_context <- saga_bycontext
saga_context[,CONTEXT := ..concepts$NAME[match(CONTEXT,..concepts$HASH)]]

head(saga_concepts)

n = length(unique(fhd$ID))
fhd_concepts <- fhd_concepts[,.N,by=CONCEPT][N != n]
fhd_related <- fhd_related[,.N, by=RELATED][N != n]
setnames(fhd_byrelated,"RELATED","CONCEPT")
fhd_context <- fhd_context[,.N, by=CONTEXT][N != n]
setnames(fhd_bycontext,"CONTEXT","CONCEPT")

fhd_byconcept <- do.call(rbind,list(fhd_byconcept,fhd_byrelated,fhd_bycontext))
fhd_byconcept[,WEIGHT := sum(WEIGHT),by=.(ID,CONCEPT)]

head(fhd_byconcept)

n = length(unique(saga$ID))
saga_concepts <- saga_concepts[,.N,by=CONCEPT][N != n]
saga_related <- saga_related[,.N, by=RELATED][N != n]
setnames(saga_byrelated,"RELATED","CONCEPT")
saga_context <- saga_context[,.N, by=CONTEXT][N != n]
setnames(saga_bycontext,"CONTEXT","CONCEPT")

saga_byconcept <- do.call(rbind,list(saga_byconcept,saga_byrelated,saga_bycontext))
saga_byconcept[,WEIGHT := sum(WEIGHT),by=.(ID,CONCEPT)]

head(saga_byconcept)

var_list <- unique(saga_byconcept$CONCEPT[saga_byconcept$CONCEPT %in% fhd_byconcept$CONCEPT])
var_list <- na.omit(var_list)

as.list(var_list)

fhd_byconcept <- fhd_byconcept[CONCEPT %in% var_list] # Get only the concepts of interest
fhd_byconcept <- dcast(fhd_byconcept, ID+WEIGHT~CONCEPT, value.var="WEIGHT",
                   fun.aggregate = mean) # Cast the concepts into their own rows

fhd_byconcept <- setDT(fhd_byconcept)[order(ID), 
                              lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), 
                              by=ID] # Smush them so that there's only one row per observation
setnafill(fhd_byconcept,fill=0,cols=var_list) # If a concept doesn't apply, give it weight of 1
fhd_byconcept[, SUM := rowSums(.SD),.SDcols=var_list][, c("WEIGHT","SUM") := NULL]
head(fhd_byconcept)

name <- c("RELATIONSHIPS_LOOKUP_RESOURCE_EN")
fhd_rf <- fhd[CATEGORY %in% name, .(ID,X,Y,ENTRY)]
fhd_rf <- fhd_byconcept[fhd_rf,on="ID"]
fhd_rf[,ENTRY := as.factor(ENTRY)]
fhd_rf

count(as.character(fhd_rf$ENTRY))
fhd_rf[,ENTRY := unlist(ENTRY)]
fhd_rf <- na.omit(fhd_rf)

strategies <- read.csv('Strategies.csv')
fhd_rf[,ENTRY := toupper(ENTRY)]
fhd_rf <- fhd_rf[strategies,on="ENTRY"]
fhd_rf[, STRATEGY := as.factor(STRATEGY)]
fhd_rf <- na.omit(fhd_rf)

head(fhd_rf[,.(ID,ENTRY,STRATEGY)])

test <- sample(1:nrow(fhd_rf), nrow(fhd_rf) %/% 10, replace=F)
train <- fhd_rf[!test,]
test <- fhd_rf[test,]

train

rf <- randomForest(x = train[,..var_list], y= train$STRATEGY,
                   xtest = test[,..var_list], ytest = test$STRATEGY,
                   ntree = 1000, importance = TRUE, keep.forest = T)

rf$importance

conf <- as.data.table(rf$conf) # Gets confusion matrix
classes <- names(conf)[1:(length(conf)-1)] # Gets strategy names
conf[, N := rowSums(.SD),.SDcols = classes] # Gets the total number of test observations
conf[, (classes) := lapply(.SD, function(x) x / N * 100), .SDcols = classes] # Normalizes
conf <- as.matrix(conf[,..classes]) # Converts from data.table to matrix
rownames(conf) <- classes # re-applies strategy names
conf <- melt(conf) # Converts from matrix to i,j list
names(conf) <- c("ACTUAL","PREDICTED","PERCENT") 


p2 <- ggplot(conf,aes(PREDICTED, ACTUAL)) + geom_tile(aes(fill=PERCENT)) +
  theme(axis.text.x = element_text(angle=90, vjust = 0.5, hjust = 1)) + 
  scale_fill_gradientn(colors = c("navy","blanchedalmond","red4")) 

ggplotly(p2)

saga_byconcept <- saga_byconcept[CONCEPT %in% var_list] # Get concepts of interest
saga_byconcept <- dcast(saga_byconcept, ID+WEIGHT~CONCEPT, value.var="WEIGHT",
                       fun.aggregate = mean) # Cast concepts into their own column

saga_byconcept <- setDT(saga_byconcept)[order(ID), 
                                      lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), 
                                      by=ID] # Smush them into one row per observation
setnafill(saga_byconcept,fill=0,cols=var_list[var_list %in% names(saga_byconcept)])
                                           
saga_rf <- saga[,lapply(.SD,mean),by=ID,.SDcols=c("X","Y")]
saga_rf <- saga_byconcept[saga_rf,on="ID"]
saga_rf <- na.omit(saga_rf)
                                             
head(saga_rf)

saga_rf$Predictions <- predict(rf,saga_rf[,1:(length(saga_rf))]) # Make predictions
saga_rf <- cbind(saga_rf,predict(rf,saga_rf[,1:(length(saga_rf)-1)], type='prob')) # Add to data.table

head(saga_rf[,.(ID,MARINE,MATERIAL,OTHER,PASTURE)])
tail(saga_rf[,.(ID,MARINE,MATERIAL,OTHER,PASTURE)])

entries = c("SAGANAME","CHAPTER") # Set the query

saga_refs <- saga[CATEGORY %in% entries, .(ID, CATEGORY, ENTRY, X,Y)]  # Filter by Entry

saga_refs <- dcast(saga_refs, ID+ENTRY~CATEGORY, value.var="ENTRY", # Cast into columns
                   fun.aggregate = function (x) paste(x,collapse = "; "))

saga_refs <- setDT(saga_refs)[order(ID), # Smush into one row
              lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), 
              by=ID]

head(saga_refs)