using DataFrames
using DataStructures

using Gadfly
using Color

# Palette of eight distinguishable colors (sliced later for the density plots).
colors = distinguishable_colors(8)

# Shared Gadfly theme used by every plot in this script: black ink for the
# default color, both label classes and the panel stroke; zero-width grid
# lines and highlight.
gajomidark = let ink = color("black")
    Theme(
        default_color = ink,
        major_label_color = ink,
        major_label_font_size = 18px,
        minor_label_color = ink,
        minor_label_font_size = 14px,
        panel_stroke = ink,
        default_point_size = 0.5mm,
        line_width = 1mm,
        grid_line_width = 0px,
        highlight_width = 0px
    )
end

# Default canvas size for the single-panel plots.
set_default_plot_size(20cm, 15cm)


# Fetch the two CSV exports from data.cityofchicago.org into the working
# directory: energy.csv first, then socio.csv.
for (url, dest) in (("https://data.cityofchicago.org/api/views/8yq3-m6wp/rows.csv?accessType=DOWNLOAD", "energy.csv"),
                    ("https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.csv?accessType=DOWNLOAD", "socio.csv"))
    download(url, dest)
end

# Load the socioeconomic table and list its column names.
# The table is bound to `fullecondata`, the name the following lines read;
# it was previously bound to `fulleconsummary`, leaving `fullecondata` undefined.
fullecondata = readtable("socio.csv")
map(display, names(fullecondata));

# Keep three columns, give them short names, and sort by poverty rate,
# highest first.
fullnames = [:COMMUNITY_AREA_NAME, :PERCENT_HOUSEHOLDS_BELOW_POVERTY, :PER_CAPITA_INCOME]
shortnames = [:community, :percent_poverty, :per_capita_income]
econsummary = sort!(names!(fullecondata[:, fullnames], shortnames),
                    cols = :percent_poverty, rev = true)

# Labeled scatter of per-capita income against poverty rate, one point per row.
plot(econsummary, x = "percent_poverty", y = "per_capita_income", label = :community,
     Geom.label, Geom.point, gajomidark)

# Load the energy usage table and list its column names.
fullenergydetail = readtable("energy.csv")
map(display, names(fullenergydetail));

# Columns to keep (original name, then short name in the same position).
# NOTE(review): `:sqft` is taken from THERMS_TOTAL_SQFT, yet it is later used
# as the denominator for kWh as well -- confirm that is intended.
fullnames = [
    :COMMUNITY_AREA_NAME,
    :CENSUS_BLOCK,
    :TOTAL_POPULATION,
    :BUILDING_TYPE,
    :THERMS_TOTAL_SQFT,
    :TOTAL_KWH,
    :TOTAL_THERMS
]
shortnames = [
    :community,
    :block,
    :population,
    :building_type,
    :sqft,
    :kwh,
    :therms
]

# Select, rename, then apply `complete_cases!` to the selected columns.
energydetail = complete_cases!(
    names!(fullenergydetail[:, fullnames], shortnames)
)

# Tally how many rows carry each census block id.
dupcounts = counter(energydetail[:block])

# Row mask: residential buildings, with a positive population, whose census
# block id occurs exactly once in the table.
rows = (&)(array(energydetail[:building_type] .== "Residential"),
           array(energydetail[:population] .> 0),
           array(map(i -> dupcounts[i], energydetail[:block]) .== 1))

# Apply the mask and drop the two columns that were only needed for filtering.
# The result is bound to `energydata`, the name every later step reads; it
# was previously assigned back to `energydetail`, leaving `energydata` undefined.
energydata = delete!(energydetail[rows, :], [:block, :building_type])

# Element-wise ratio of two columns of `data`: column `numerator` divided by
# column `denominator`. Note the argument order: the denominator comes first.
function per(denominator::Symbol, numerator::Symbol, data::DataFrame)
    return data[numerator] ./ data[denominator]
end

# Add the four derived ratio columns to `energydata`, in this order:
# (new column, numerator column, denominator column).
for (newcol, numerator, denominator) in ((:kwh_per_capita,    :kwh,    :population),
                                         (:therms_per_capita, :therms, :population),
                                         (:kwh_per_sqft,      :kwh,    :sqft),
                                         (:therms_per_sqft,   :therms, :sqft))
    energydata[newcol] = per(denominator, numerator, energydata)
end
energydata

# The derived ratio columns to summarize per community.
stats = [
    :kwh_per_capita,
    :therms_per_capita,
    :kwh_per_sqft,
    :therms_per_sqft
]

# Mean of each stat within each community; the plots further down refer to
# the resulting columns as `<stat>_mean`.
energysummary = aggregate(energydata[vcat([:community], stats)], :community, mean)

# Combine the socioeconomic summary with the per-community energy means,
# matching rows on the community name.
# Two name fixes: the socioeconomic table is `econsummary` (`econdata` is never
# defined), and the result is bound to `fulldata`, which every plot below
# reads, instead of `summary`, which also shadowed `Base.summary`.
fulldata = join(econsummary, energysummary, on = :community)

# Labeled scatter of `fulldata` with one point per community and a log10
# y-axis; `xcol` and `ycol` are the column names to plot.
communityscatter(xcol, ycol) = plot(fulldata, x = xcol, y = ycol, label = :community,
                                    Geom.label, Geom.point, Scale.y_log10, gajomidark)

# Therms per square foot: against poverty rate (left) and income (right).
set_default_plot_size(25cm, 10cm)
hstack(communityscatter(:percent_poverty, :therms_per_sqft_mean),
       communityscatter(:per_capita_income, :therms_per_sqft_mean))

# kWh per square foot: against income (left) and poverty rate (right).
set_default_plot_size(25cm, 10cm)
hstack(communityscatter(:per_capita_income, :kwh_per_sqft_mean),
       communityscatter(:percent_poverty, :kwh_per_sqft_mean))

# kWh per capita: against poverty rate (left) and income (right).
set_default_plot_size(25cm, 10cm)
hstack(communityscatter(:percent_poverty, :kwh_per_capita_mean),
       communityscatter(:per_capita_income, :kwh_per_capita_mean))

# Therms per capita: against poverty rate (left) and income (right).
set_default_plot_size(25cm, 10cm)
hstack(communityscatter(:percent_poverty, :therms_per_capita_mean),
       communityscatter(:per_capita_income, :therms_per_capita_mean))

# Back to the single-panel canvas size.
set_default_plot_size(20cm, 15cm)

# kWh vs. therms per square foot on log-log axes, colored by poverty rate.
# The color function maps p to the gray level 1-p (p = 0 gives white,
# p = 1 gives black).
plot(
    fulldata,
    x = :therms_per_sqft_mean,
    y = :kwh_per_sqft_mean,
    color = :percent_poverty,
    label = :community,
    Geom.label,
    Geom.point,
    Scale.ContinuousColorScale(p -> RGB(1 - p, 1 - p, 1 - p)),
    Scale.y_log10,
    Scale.x_log10,
    gajomidark
)

# Same plot for the per-capita means.
plot(
    fulldata,
    x = :therms_per_capita_mean,
    y = :kwh_per_capita_mean,
    color = :percent_poverty,
    label = :community,
    Geom.label,
    Geom.point,
    Scale.ContinuousColorScale(p -> RGB(1 - p, 1 - p, 1 - p)),
    Scale.y_log10,
    Scale.x_log10,
    gajomidark
)

# An ugly hack to get Gadfly to plot the bulk statistic alongside select
# communities: every row is duplicated under the single label "All Chicago",
# and the rows of three named communities are appended with their own labels.

# Rows belonging to the three communities shown individually.
indexes = findin(energydata[:community], ["Oakland", "Washington Park", "Riverdale"])
labledsubset = deepcopy(energydata[indexes, :])

# Copy of the whole table with every row relabeled as the city-wide group.
unlabeledall = deepcopy(energydata)
unlabeledall[:community] = utf8("All Chicago")

# `append!` extends `unlabeledall` in place; the combined table is also bound
# to its own name.
superhackyappendeddata = append!(unlabeledall, labledsubset);

#indexes = findin(energydata[:community],["Lincoln Park","Hyde Park","Washington Park"])
# Density plots of each "<units>_per_<normalizer>" column on a log10 x-axis,
# one curve per community label. `plots[1]` holds the per-capita pair and
# `plots[2]` the per-sqft pair, each ordered kwh then therms.
plots = map(["capita", "sqft"]) do normalizer
    map(["kwh", "therms"]) do units
        plot(superhackyappendeddata,
             x = units * "_per_" * normalizer,
             color = :community,
             Scale.discrete_color_manual("black", colors[6:end]...),
             Geom.density,
             Scale.x_log10,
             gajomidark)
    end
end

# 2x2 grid: per-capita plots on the top row, per-sqft plots on the bottom row.
vstack(hstack(plots[1]...), hstack(plots[2]...))