This file is an IPython notebook that uses R magic (the rpy2 extension)
to convert the data from Rds (the R programming language's data storage format) to CSV
so that it can be read into Python. If you ever find yourself in a bind with only R code available to you, give R magic
a try.
All code and data is available on GitHub: https://github.com/Chicago/food-inspections-evaluation
import rpy2
import pandas as pd
%load_ext rpy2.ipython
The rpy2.ipython extension is already loaded. To reload it, use: %reload_ext rpy2.ipython
%%R
# Session setup: paths, packages, and working directory.
#
# data_dir: root of your local clone of the food-inspections-evaluation repo.
# out_dir:  folder that receives the exported CSV files.
# Change both to match your machine.
data_dir <- "~/food-inspections-evaluation/"
out_dir <- "~"

# library() attaches a single package per call; its second positional
# argument is `help`, not another package name. The previous call,
# library("data.table", "ggplot2"), therefore only ever attached data.table.
# Attach it explicitly here. Add a separate library(ggplot2) call if plotting
# is needed.
library(data.table)

# The other cells read inputs with paths relative to the repository root
# (e.g. "DATA/food_inspections.Rds"), so the working directory must point at
# the clone for them to run.
setwd(data_dir)
%%R
# Read the food-inspection table from the repository's DATA folder and write
# it to out_dir as food_inspections.csv, without a row-name column.
food <- readRDS("DATA/food_inspections.Rds")
write.csv(
  food,
  row.names = FALSE,
  file = paste0(out_dir, "/food_inspections.csv")
)
%%R
# Read the model data set from the repository's DATA folder and write it,
# unfiltered, to out_dir as dat_model.csv.
# NOTE(review): row.names is left at its default here, so the CSV gets a
# leading row-name column, unlike the other exports in this notebook --
# confirm the Python reader expects that.
dat <- readRDS("DATA/dat_model.Rds")
write.csv(
  dat,
  file = paste0(out_dir, "/dat_model.csv")
)
%%R
# Build the modelling inputs from DATA/dat_model.Rds:
#   dat     - filtered, NA-free inspection table keyed by Inspection_ID
#   xmat    - selected / transformed predictor columns plus criticalFound
#   mm      - design matrix (as a data.table) built from xmat
#   iiTrain - row indices of dat before the split date
#   iiTest  - row indices of dat after the split date
#
# Only the value of the last expression of this cell is shown automatically
# (the captured output of the original cell contains just the str() text and
# a single row count), so every diagnostic below is wrapped in print().
dat <- readRDS("DATA/dat_model.Rds")

## Only keep "Retail Food Establishment"
dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"]
## Remove License Description (it holds a single value after the filter)
dat$LICENSE_DESCRIPTION <- NULL
## Drop every row that has a missing value in any column
dat <- na.omit(dat)
## Add criticalFound variable to dat: criticalCount capped at 1
dat$criticalFound <- pmin(1, dat$criticalCount)
## Set the key for dat
setkey(dat, Inspection_ID)

# Match time period of original results
# dat <- dat[Inspection_Date < "2013-09-01" | Inspection_Date > "2014-07-01"]

#==============================================================================
# CREATE MODEL DATA
#==============================================================================
# sort(colnames(dat))
xmat <- dat[
  ,
  list(
    Inspector = Inspector_Assigned,
    # Past violation counts are capped at 1.
    pastSerious = pmin(pastSerious, 1),
    pastCritical = pmin(pastCritical, 1),
    timeSinceLast,
    # Indicator: 1L when ageAtInspection is greater than 4, otherwise 0L.
    ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L),
    consumption_on_premises_incidental_activity,
    tobacco_retail_over_counter,
    temperatureMax,
    # Heat-map values are capped (70 / 70 / 50).
    heat_burglary = pmin(heat_burglary, 70),
    heat_sanitation = pmin(heat_sanitation, 70),
    heat_garbage = pmin(heat_garbage, 50),
    # Facility_Type,
    criticalFound
  ),
  keyby = Inspection_ID
]

# keyby puts Inspection_ID in column 1 of xmat; drop it so the identifier is
# not used as a predictor. "- 1" in the formula removes the intercept column.
mm <- model.matrix(criticalFound ~ . - 1, data = xmat[, -1L, with = FALSE])
mm <- as.data.table(mm)
str(mm)
print(colnames(mm))

#==============================================================================
# CREATE TEST / TRAIN PARTITIONS
#==============================================================================
# 2014-07-01 is an easy separator
# NOTE(review): both comparisons are strict, so rows dated exactly 2014-07-01
# land in neither partition -- confirm that is intended.
print(dat[Inspection_Date < "2014-07-01", range(Inspection_Date)])
print(dat[Inspection_Date > "2014-07-01", range(Inspection_Date)])
iiTrain <- dat[, which(Inspection_Date < "2014-07-01")]
iiTest <- dat[, which(Inspection_Date > "2014-07-01")]

## Check to see if any rows didn't make it through the model.matrix formula
print(c(dat = nrow(dat), xmat = nrow(xmat), mm = nrow(mm)))
# mm and xmat$criticalFound are exported as separate, row-aligned files, so a
# row-count mismatch would silently pair features with the wrong targets.
if (nrow(mm) != nrow(xmat)) {
  stop(
    "model.matrix() returned ", nrow(mm), " rows but xmat has ", nrow(xmat),
    "; the model matrix and the target would be misaligned.",
    call. = FALSE
  )
}
Classes ‘data.table’ and 'data.frame': 18712 obs. of 16 variables: $ Inspectorblue : num 0 1 1 1 1 0 0 0 0 0 ... $ Inspectorbrown : num 0 0 0 0 0 0 0 0 0 0 ... $ Inspectorgreen : num 1 0 0 0 0 0 0 0 0 0 ... $ Inspectororange : num 0 0 0 0 0 1 1 1 1 1 ... $ Inspectorpurple : num 0 0 0 0 0 0 0 0 0 0 ... $ Inspectoryellow : num 0 0 0 0 0 0 0 0 0 0 ... $ pastSerious : num 0 0 0 0 0 0 0 0 0 0 ... $ pastCritical : num 0 0 0 0 0 0 0 0 0 0 ... $ timeSinceLast : num 2 2 2 2 2 2 2 2 2 2 ... $ ageAtInspection : num 1 1 1 1 1 1 0 1 1 0 ... $ consumption_on_premises_incidental_activity: num 0 0 0 0 0 0 0 0 0 0 ... $ tobacco_retail_over_counter : num 1 0 0 0 0 0 0 0 0 0 ... $ temperatureMax : num 53.5 59 59 56.2 52.7 ... $ heat_burglary : num 26.99 13.98 12.61 35.91 9.53 ... $ heat_sanitation : num 37.75 15.41 8.32 38.19 2.13 ... $ heat_garbage : num 12.8 12.9 8 26.2 3.4 ... - attr(*, ".internal.selfref")=<externalptr> [1] 18712
%%R
# Export the design matrix and the matching target vector to out_dir.
# Both files are written without a row-name column; rows of TARGET.csv
# correspond positionally to rows of model_matrix.csv.
write.csv(
  mm,
  row.names = FALSE,
  file = paste0(out_dir, "/model_matrix.csv")
)
write.csv(
  xmat$criticalFound,
  row.names = FALSE,
  file = paste0(out_dir, "/TARGET.csv")
)