# we only need the package Gadfly today
using Gadfly

# First read in your files

# readdlm splits each line on whitespace unless a delimiter is passed, and it
# does not treat the first row as a header row
# each of today's files has a single column, so the defaults are fine
# if it was a tab separated file we would pass the delimiter explicitly:
#   mydat = readdlm("filename.tsv", '\t')

# open the 3 files and store them
# readdlm returns a 2d array even when the file has only one column of data,
#  so vec() is used to reshape each result into a one dimensional array
#  (collect() only flattens on very old Julia versions; newer ones hand back
#   a 2d copy, so vec() is the spelling that works everywhere)
#  If we'd had two columns in f_sex.csv then we'd skip the vec() and address
#    the columns as d_sex[:,1] and d_sex[:,2]
# note the semicolon on the last line to stop Julia printing the final output
d_age = vec(readdlm("f_age.csv"))
d_sex = vec(readdlm("f_sex.csv"))
d_dbp = vec(readdlm("f_dBP.csv")) ;

# sanity check: report the dimensions of the three arrays we just read
print("sa ", size(d_age), " ss ", size(d_sex), " sd ", size(d_dbp))
# then peek at the first six ages
d_age[1:6]

# rather than peeking at one array at a time there is a shortcut:
# hcat(a, b, c) - which can also be written [a b c], with spaces between the
# arrays - glues them together as 3 columns and displays them
hcat(d_age[1:6], d_sex[1:6], d_dbp[1:6])

# I'm interested in how age is distributed, so start with a histogram
plot(x=d_age, Geom.histogram)

# it's good practice to check coarse and fine histograms,
# so draw it again with the bin count fixed at 25
plot(x=d_age,
     Geom.histogram(bincount=25))

# next a box plot of the ages; narrow the plot first
set_default_plot_size(6cm, 10cm)
plot(y=d_age, Theme(boxplot_spacing=10mm), Geom.boxplot)

# to compare men with women, put sex on the x axis and age on the y axis
set_default_plot_size(8cm, 10cm)
plot(x=d_sex, y=d_age, Theme(boxplot_spacing=15mm), Geom.boxplot)

# If I want a summary of the statistics for the sample it's easy to get it:
# the mean, standard deviation and mode of the ages, then an empty string
# (presumably just a visual separator), then the 0.75, 0.5 and 0.25 quantiles,
# i.e. upper quartile, median and lower quartile
# NOTE(review): mode() is not a Base function on current Julia - presumably it
#   comes from an older Base or a statistics package; confirm it is in scope
[mean(d_age), std(d_age), mode(d_age), "", quantile(d_age,[0.75,0.5,0.25])]

# ok let's do a scatter plot of age against respondent number
# resize the plot to something larger
set_default_plot_size(20cm, 12cm)

# note that rather than typing the number of rows (1:50 for a 50 row file) we
# use the size function: size(d_age,1) is the number of rows that were read,
# so the x axis keeps matching the data if the file grows or shrinks
plot(x=1:size(d_age,1), y=d_age)

# So having looked at the plot we decide to plot it with an estimated confidence interval
# and a loess smoothing of the data.  Plus let's make the labels more relevant.
# For the intervals we add Geom.errorbar and calculate a min and max: ymin and ymax
#   sit 1.96 sample standard deviations below and above each point.
#   The dotted operators .- and .+ apply the scalar to every element of the array;
#   plain array - scalar is rejected by newer Julia versions, the dotted form
#   works on old and new versions alike.
# For the smoothing we add Geom.smooth but we have to add Geom.point because although
#   it's the default it will be replaced by any other Geom.  If we wanted a line
#   we could use Geom.line.
# Let's show which respondents are male and which are female with the color function
# If we do that then 2 smoothing lines are drawn (comment out the color= line of the
#   plot command and only one is drawn)
# Finally notice that the plot command isn't on one line anymore - the brackets contain it.

plot(x=1:size(d_age,1), y=d_age,
  Guide.xlabel("Respondent"), Guide.ylabel("Age"),
  Geom.errorbar, ymin=d_age .- 1.96*std(d_age), ymax=d_age .+ 1.96*std(d_age),
  color=d_sex, Guide.colorkey("Sex"),
  Geom.smooth, Geom.point)

# The other major chart type is the bar chart, which compares y with x, so
# let's compare blood pressure with age, with a smoothed trend drawn as well
set_default_plot_size(20cm, 12cm)
plot(y=d_dbp, x=d_age, Geom.bar, Geom.smooth)

# the same bars again, using color to identify sex and the :dodge position
plot(y=d_dbp, x=d_age, color=d_sex,
     Geom.bar(position=:dodge))

# We don't have pie charts currently but you might prefer a normalized stacked bar chart anyway
#plot(x=d_age, y=d_dbp, Geom.normbar)


# It's easy to save your visualizations as png, pdf or ps files
#  draw(format("filename.formatsuffix", width, height), (plot object or command))
draw(PNG("myplot.png", 6inch, 3inch), plot(x=d_age, y=d_dbp, Geom.bar))

# or with a plot object:
# (the dotted .- and .+ apply the scalar 1.96*std(d_age) to every element of
#  d_age; plain array - scalar is rejected by newer Julia versions)
p = plot(x=1:size(d_age,1), y=d_age,
      Guide.xlabel("Respondent"), Guide.ylabel("Age"),
      Geom.errorbar, ymin=d_age .- 1.96*std(d_age), ymax=d_age .+ 1.96*std(d_age),
      color=d_sex, Guide.colorkey("Sex"),
      Geom.smooth, Geom.point)

draw(PDF("myplot.pdf", 6inch, 3inch), p)

# we want to use Gadfly and DataFrames today
using Gadfly; using DataFrames

# First read your file into a dataframe with readtable
df = readtable("filename.csv")

# a comma is the default separator, so it doesn't need to be specified
# for a tab separated file with no header row we would use:
#   df = readtable("filenameNoHeader.csv", separator='\t', header=false)

# let's just check what we read into df
print("size is ", size(df))
# and have a look at the first three rows across every column;
#  because it's a single frame no tricks are needed to display it
df[1:3, :]


# I'd prefer F and M to 1 and 2 for Sex, and similarly Y and N for Drink:
# a 1 becomes "F" (or "Y"), any other value becomes "M" (or "N")
# run this only once - after recoding, the columns no longer hold 1s, so a
# second run would turn every row into "M" / "N"
df["Sex"] = ifelse(df["Sex"] .== 1, "F", "M")
df["Drink"] = ifelse(df["Drink"] .== 1, "Y", "N")
df[1:6, :]

# I'm interested in the age distribution so let's plot a histogram with 6 bins
plot(df, x="Age",
     Geom.histogram(bincount=6))

# it's good practice to check coarse and fine histograms, so now 15 bins
# note that here the column is picked by its number instead of its heading
# (presumably column 3 is Age - check against the file)
plot(df, x=3,
     Geom.histogram(bincount=15))

# and let's look at box plots again, this time side by side with hstack:
# age on its own, age by sex and age by drink
# (Sex and Drink were already recoded to letters earlier, so no conversion
#  is needed here)
hstack(
    plot(df, y="Age", Geom.boxplot),
    plot(df, y="Age", x="Sex", Geom.boxplot),
    plot(df, y="Age", x="Drink", Geom.boxplot))

# It's quite a bit easier with dataframes: describe displays all the stats
describe(df)

# And the standard deviations, laid out as a single row of label, value pairs
# (hcat(a, b, ...) is the long spelling of [a b ...])
hcat("Age", std(df["Age"]),
     "sBP", std(df["sBP"]),
     "dBP", std(df["dBP"]),
     "BMI", std(df["BMI"]))

# ok let's do a scatter plot of sBP against the IX column
# first resize the plot to something larger
set_default_plot_size(20cm, 12cm)
# Geom.point is what plot uses when no Geom is given; it is spelled out here
plot(df, x="IX", y="sBP", Geom.point)

# Having looked at the plot we decide to redraw it with an estimated confidence
# interval and a loess smoothing of the data, and with more relevant labels.
#  - the intervals come from Geom.errorbar plus a calculated ymin and ymax,
#    1.96 standard deviations of sBP below and above each point
#  - the smoothing comes from Geom.smooth; Geom.point has to be listed too,
#    because although it's the default it is replaced by any other Geom
#    (Geom.line would give a line instead)
#  - color="Sex" shows which respondents are male and which are female; with it
#    2 smoothing lines are drawn, without it only one
# The arguments are grouped by kind: column mappings, then guides, then geoms,
# and the brackets let the plot command run over several lines.

plot(df,
  x="IX", y="sBP", color="Sex",
  ymin=df["sBP"]-1.96*std(df["sBP"]),
  ymax=df["sBP"]+1.96*std(df["sBP"]),
  Guide.xlabel("Respondent"), Guide.ylabel("Blood Pressure"),
  Geom.errorbar, Geom.smooth, Geom.point)

# The other major chart type is the bar chart, which compares y with x, so
# let's compare blood pressure with age, with a smoothed trend drawn as well
set_default_plot_size(20cm, 12cm)
plot(df, y="sBP", x="Age", Geom.bar, Geom.smooth)

# the same bars again, using color to identify sex and the :dodge position
plot(df, y="sBP", x="Age", color="Sex",
     Geom.bar(position=:dodge))

# We don't have pie charts currently but you might prefer a normalized stacked bar chart anyway
#plot(x=d_age, y=d_dbp, Geom.normbar)


# It's easy to save your visualizations as png, pdf or ps files
#  draw(format("filename.formatsuffix", width, height), (plot object or command))
draw(PNG("myplot.png", 6inch, 3inch),
     plot(df, x="Age", y="dBP", Geom.bar))

# or build a plot object first and hand that to draw
# (only the x axis label is set here, and there is no Guide.colorkey)
p = plot(df,
      x="IX", y="Age", color="Sex",
      ymin=df["Age"]-1.96*std(df["Age"]),
      ymax=df["Age"]+1.96*std(df["Age"]),
      Guide.xlabel("Respondent"),
      Geom.errorbar, Geom.smooth, Geom.point)

draw(PDF("myplot.pdf", 6inch, 3inch), p)