list.of.packages <- c("gdata","R.utils","FactoMineR","factoextra","data.table","stringr","sp","cluster","plyr", "stringi","proj4","ggplot2","plotly","rgdal","rgeos","raster","randomForest") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages) > 0) install.packages(new.packages) lapply(list.of.packages, FUN = function(X) { do.call("require", list(X)) }) path <- getwd() setwd(path) set.seed(1601720) #proj <- CRS("+init=epsg:9040 +datum=WGS84") proj <- CRS("+proj=lcc +lat_0=52 +lon_0=10 +lat_1=35 +lat_2=65 +x_0=4000000 +y_0=2800000 +datum=WGS84 +units=m +no_defs") if (file.exists("Provided_Results.gz")) { arc <- fread("Provided_Results.gz", encoding="UTF-8") } else { source("Translator.R") arc <- arc.dataconvert("results.json.gz") } setnames(arc,names(arc),c("NUMBER","ID","X","Y","DATASET","CATEGORY", "ENTRY","CONCEPT","COMBINATOR","RELATED","CONTEXT")) arc[, names(arc) := lapply(.SD,trimws,which='left')] head(arc) arc[,c("X","Y") := lapply(.SD,as.numeric), .SDcols = c("X","Y")] # Convert to numeric arc[, `:=`(X = X * ..pi / 180, Y = Y * ..pi / 180)] # Convert to radians arc[,`:=`(X = ptransform(.(X,Y),src.proj=CRS("+proj=longlat +datum=WGS84"), # Project the coordinates dst.proj=proj)[[1]], Y = ptransform(.(X,Y),src.proj=CRS("+proj=longlat +datum=WGS84"), dst.proj=proj)[[2]])] concepts <- read.csv("CONCEPT_DATA_FRAME.csv", stringsAsFactors = FALSE) concepts <- as.data.table(concepts) concepts[,names(concepts) := lapply(.SD,trimws,which='left')] head(concepts) datasets <- read.csv("DATASET_HASH_OUT.csv", stringsAsFactors = FALSE, encoding='UTF-8') datasets <- as.data.table(datasets) datasets[,names(datasets) := lapply(.SD,trimws,which='left')] datasets query <- "Farm Histories" fhd_hash <- datasets[str_detect(NAME,query),HASH] fhd_hash <- toupper(fhd_hash) # Since the dataARC hashes are all capitalized fhd <- arc[DATASET == fhd_hash,] # Filter dataARC and assign to new variable fhd query <- "Sagas" saga_hash <- datasets[str_detect(NAME,query),HASH] saga_hash <- toupper(saga_hash) saga <- arc[DATASET == saga_hash,] head(saga) entries <- c("ENTITY_TYPE_ALIAS", "RELATIONSHIPS_LOOKUP_RESOURCE_EN", # Entries by which to filter "FODDER_PRODUCTIVITY", "1861_ADJUSTED_VALUE") fhd_socio <- fhd[CATEGORY %in% entries,][,.(ID,CATEGORY,ENTRY)] # Apply the filter, keeping only the ID and new cols head(fhd_socio) fhd_socio <- dcast(fhd_socio, ID+ENTRY~CATEGORY, value.var="ENTRY", # Cast the rows fun.aggregate = function (x) paste(x,collapse = "; ")) fhd_socio <- setDT(fhd_socio)[order(ID), # And smush them lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), by=ID] head(fhd_socio) fhd_socio[is.na(RELATIONSHIPS_LOOKUP_RESOURCE_EN), RELATIONSHIPS_LOOKUP_RESOURCE_EN := "NOTHING"] fhd_socio <- na.omit(fhd_socio) head(fhd_socio) unique(fhd_socio$RELATIONSHIPS_LOOKUP_RESOURCE_EN) fhd_socio[, RELATIONSHIPS_LOOKUP_RESOURCE_EN := list(strsplit(RELATIONSHIPS_LOOKUP_RESOURCE_EN, "; "))] # Split any multi-resource entries fhd_socio[, RELATIONSHIPS_LOOKUP_RESOURCE_EN := lapply(RELATIONSHIPS_LOOKUP_RESOURCE_EN, function(x) x[1])] # Keep only the first one unique(fhd_socio$RELATIONSHIPS_LOOKUP_RESOURCE_EN) fhd_socio[, c("FODDER_PRODUCTIVITY","1861_ADJUSTED_VALUE") := lapply(.SD,as.numeric), .SDcols = c("FODDER_PRODUCTIVITY","1861_ADJUSTED_VALUE")] fhd_socio <- na.omit(fhd_socio) vars <- c(entries) # The entries variable already has stored the variables of interest fhd_famd <- fhd_socio[,..vars] # Extract only the columns we want fhd_famd[, "RELATIONSHIPS_LOOKUP_RESOURCE_EN" := # Convert list to character type as.character(RELATIONSHIPS_LOOKUP_RESOURCE_EN)] fhd_famd[, c("ENTITY_TYPE_ALIAS","RELATIONSHIPS_LOOKUP_RESOURCE_EN") := lapply(.SD,as.factor), # Convert character to factor .SDcols = c("ENTITY_TYPE_ALIAS", "RELATIONSHIPS_LOOKUP_RESOURCE_EN")] famd <- FAMD(fhd_famd,ncp=nrow(fhd_famd),graph=FALSE) # Perform the FAMD famd famd$eig famd$var$contrib fviz_famd_var(famd, "quanti.var", repel = TRUE,col.var = "black", axes=c(1,2)) fviz_famd_var(famd, "quanti.var", repel = TRUE,col.var = "black", axes=c(2,3)) fviz_famd_var(famd, "quanti.var", repel = TRUE,col.var = "black", axes=c(1,3)) fviz_famd_var(famd, "quali.var", repel = TRUE,col.var = "black", axes=c(1,2)) fviz_famd_var(famd, "quali.var", repel = TRUE,col.var = "black", axes=c(2,3)) fviz_famd_var(famd, "quali.var", repel = TRUE,col.var = "black", axes=c(1,3)) famd_out <- as.data.table(famd$call$X) setnames(famd_out,c("1861_ADJUSTED_VALUE","ENTITY_TYPE_ALIAS","RELATIONSHIPS_LOOKUP_RESOURCE_EN"), c("VALUE","OWNER","RIGHTS_TO")) famd_out[,colnames(famd$ind$coord) := as.data.table(..famd$ind$coord)] p1 <- plot_ly(x = famd_out$Dim.1, y = famd_out$Dim.2, z = famd_out$Dim.3, type = "scatter3d", mode = "markers", color = famd_out$OWNER, text = paste("RIGHTS TO:",famd_out$RIGHTS_TO)) p1 fhd_byconcept <- fhd[, .(CONCEPT = unlist(tstrsplit(CONCEPT,":"))), by=c("ID")] fhd_byconcept <- fhd_byconcept[,.(CONCEPT = unique(CONCEPT)),by=ID][, WEIGHT := 3] fhd_concepts <- fhd_byconcept fhd_concepts[,CONCEPT := ..concepts$NAME[match(CONCEPT,..concepts$HASH)]] fhd_byrelated <- fhd[, .(RELATED = unlist(tstrsplit(RELATED,":"))), by=c("ID")] fhd_byrelated <- fhd_byrelated[,.(RELATED = unique(RELATED)),by=ID][,WEIGHT := 2] fhd_related <- fhd_byrelated fhd_related[,RELATED := ..concepts$NAME[match(RELATED,..concepts$HASH)]] fhd_bycontext <- fhd[, .(CONTEXT = unlist(tstrsplit(CONTEXT,":"))), by=c("ID")] fhd_bycontext <- fhd_bycontext[,.(CONTEXT = unique(CONTEXT)),by=ID][,WEIGHT := 1] fhd_context <- fhd_bycontext fhd_context[,CONTEXT := ..concepts$NAME[match(CONTEXT,..concepts$HASH)]] head(fhd_concepts) saga_byconcept <- saga[, .(CONCEPT = unlist(tstrsplit(CONCEPT,":"))), by=c("ID")] saga_byconcept <- saga_byconcept[,.(CONCEPT = unique(CONCEPT)),by=ID][, WEIGHT := 3] saga_concepts <- saga_byconcept saga_concepts[,CONCEPT := ..concepts$NAME[match(CONCEPT,..concepts$HASH)]] saga_byrelated <- saga[, .(RELATED = unlist(tstrsplit(RELATED,":"))), by=c("ID")] saga_byrelated <- saga_byrelated[,.(RELATED = unique(RELATED)),by=ID][,WEIGHT := 2] saga_related <- saga_byrelated saga_related[,RELATED := ..concepts$NAME[match(RELATED,..concepts$HASH)]] saga_bycontext <- saga[, .(CONTEXT = unlist(tstrsplit(CONTEXT,":"))), by=c("ID")] saga_bycontext <- saga_bycontext[,.(CONTEXT = unique(CONTEXT)),by=ID][,WEIGHT := 1] saga_context <- saga_bycontext saga_context[,CONTEXT := ..concepts$NAME[match(CONTEXT,..concepts$HASH)]] head(saga_concepts) n = length(unique(fhd$ID)) fhd_concepts <- fhd_concepts[,.N,by=CONCEPT][N != n] fhd_related <- fhd_related[,.N, by=RELATED][N != n] setnames(fhd_byrelated,"RELATED","CONCEPT") fhd_context <- fhd_context[,.N, by=CONTEXT][N != n] setnames(fhd_bycontext,"CONTEXT","CONCEPT") fhd_byconcept <- do.call(rbind,list(fhd_byconcept,fhd_byrelated,fhd_bycontext)) fhd_byconcept[,WEIGHT := sum(WEIGHT),by=.(ID,CONCEPT)] head(fhd_byconcept) n = length(unique(saga$ID)) saga_concepts <- saga_concepts[,.N,by=CONCEPT][N != n] saga_related <- saga_related[,.N, by=RELATED][N != n] setnames(saga_byrelated,"RELATED","CONCEPT") saga_context <- saga_context[,.N, by=CONTEXT][N != n] setnames(saga_bycontext,"CONTEXT","CONCEPT") saga_byconcept <- do.call(rbind,list(saga_byconcept,saga_byrelated,saga_bycontext)) saga_byconcept[,WEIGHT := sum(WEIGHT),by=.(ID,CONCEPT)] head(saga_byconcept) var_list <- unique(saga_byconcept$CONCEPT[saga_byconcept$CONCEPT %in% fhd_byconcept$CONCEPT]) var_list <- na.omit(var_list) as.list(var_list) fhd_byconcept <- fhd_byconcept[CONCEPT %in% var_list] # Get only the concepts of interest fhd_byconcept <- dcast(fhd_byconcept, ID+WEIGHT~CONCEPT, value.var="WEIGHT", fun.aggregate = mean) # Cast the concepts into their own rows fhd_byconcept <- setDT(fhd_byconcept)[order(ID), lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), by=ID] # Smush them so that there's only one row per observation setnafill(fhd_byconcept,fill=0,cols=var_list) # If a concept doesn't apply, give it weight of 1 fhd_byconcept[, SUM := rowSums(.SD),.SDcols=var_list][, c("WEIGHT","SUM") := NULL] head(fhd_byconcept) name <- c("RELATIONSHIPS_LOOKUP_RESOURCE_EN") fhd_rf <- fhd[CATEGORY %in% name, .(ID,X,Y,ENTRY)] fhd_rf <- fhd_byconcept[fhd_rf,on="ID"] fhd_rf[,ENTRY := as.factor(ENTRY)] fhd_rf count(as.character(fhd_rf$ENTRY)) fhd_rf[,ENTRY := unlist(ENTRY)] fhd_rf <- na.omit(fhd_rf) strategies <- read.csv('Strategies.csv') fhd_rf[,ENTRY := toupper(ENTRY)] fhd_rf <- fhd_rf[strategies,on="ENTRY"] fhd_rf[, STRATEGY := as.factor(STRATEGY)] fhd_rf <- na.omit(fhd_rf) head(fhd_rf[,.(ID,ENTRY,STRATEGY)]) test <- sample(1:nrow(fhd_rf), nrow(fhd_rf) %/% 10, replace=F) train <- fhd_rf[!test,] test <- fhd_rf[test,] train rf <- randomForest(x = train[,..var_list], y= train$STRATEGY, xtest = test[,..var_list], ytest = test$STRATEGY, ntree = 1000, importance = TRUE, keep.forest = T) rf$importance conf <- as.data.table(rf$conf) # Gets confusion matrix classes <- names(conf)[1:(length(conf)-1)] # Gets strategy names conf[, N := rowSums(.SD),.SDcols = classes] # Gets the total number of test observations conf[, (classes) := lapply(.SD, function(x) x / N * 100), .SDcols = classes] # Normalizes conf <- as.matrix(conf[,..classes]) # Converts from data.table to matrix rownames(conf) <- classes # re-applies strategy names conf <- melt(conf) # Converts from matrix to i,j list names(conf) <- c("ACTUAL","PREDICTED","PERCENT") p2 <- ggplot(conf,aes(PREDICTED, ACTUAL)) + geom_tile(aes(fill=PERCENT)) + theme(axis.text.x = element_text(angle=90, vjust = 0.5, hjust = 1)) + scale_fill_gradientn(colors = c("navy","blanchedalmond","red4")) ggplotly(p2) saga_byconcept <- saga_byconcept[CONCEPT %in% var_list] # Get concepts of interest saga_byconcept <- dcast(saga_byconcept, ID+WEIGHT~CONCEPT, value.var="WEIGHT", fun.aggregate = mean) # Cast concepts into their own column saga_byconcept <- setDT(saga_byconcept)[order(ID), lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), by=ID] # Smush them into one row per observation setnafill(saga_byconcept,fill=0,cols=var_list[var_list %in% names(saga_byconcept)]) saga_rf <- saga[,lapply(.SD,mean),by=ID,.SDcols=c("X","Y")] saga_rf <- saga_byconcept[saga_rf,on="ID"] saga_rf <- na.omit(saga_rf) head(saga_rf) saga_rf$Predictions <- predict(rf,saga_rf[,1:(length(saga_rf))]) # Make predictions saga_rf <- cbind(saga_rf,predict(rf,saga_rf[,1:(length(saga_rf)-1)], type='prob')) # Add to data.table head(saga_rf[,.(ID,MARINE,MATERIAL,OTHER,PASTURE)]) tail(saga_rf[,.(ID,MARINE,MATERIAL,OTHER,PASTURE)]) entries = c("SAGANAME","CHAPTER") # Set the query saga_refs <- saga[CATEGORY %in% entries, .(ID, CATEGORY, ENTRY, X,Y)] # Filter by Entry saga_refs <- dcast(saga_refs, ID+ENTRY~CATEGORY, value.var="ENTRY", # Cast into columns fun.aggregate = function (x) paste(x,collapse = "; ")) saga_refs <- setDT(saga_refs)[order(ID), # Smush into one row lapply(.SD, function(x) x[!is.na(x) & x != ""][1]), by=ID] head(saga_refs)