# Part 1: explore three single-column data files with plain arrays and Gadfly.
# We only need the package Gadfly for this part.
using Gadfly

# First read in your files.
# If the file's separator was a comma then you don't need to specify it;
# similarly, it's a default to read the first row as a header row.
# If it was a tab separated file with a header row we would use:
# mydat = readtable("filenameNoHeader.csv", separator='\t')
# NOTE(review): the remarks above describe readtable, but the code below calls
# readdlm with its default arguments - confirm the files have no header row.
d_age = collect(readdlm("f_age.csv"))
d_sex = collect(readdlm("f_sex.csv"))
d_dbp = collect(readdlm("f_dBP.csv"));
# Open 3 files and store them.
# collect is used to create one dimensional arrays instead of 2d arrays because
# each of these files has one column of data.
# NOTE(review): on newer Julia versions collect presumably keeps the 2d shape,
# whereas vec(...) flattens on every version - confirm the target version.
# If we'd had two columns in f_sex.csv then we'd skip the collect() and address
# the columns as d_sex[:, 1] and d_sex[:, 2].
# Note the semicolon on the last line to stop Julia printing the final output.

# Let's just check what we read into the arrays.
print("sa ", size(d_age), " ss ", size(d_sex), " sd ", size(d_dbp))

# And let's have a look at the first few rows of column 1 for each one.
d_age[1:6]
# I can do that one at a time or use a trick instead:
# [array1 array2 array3] with spaces between the arrays
# concatenates them into 3 columns and displays them.
[d_age[1:6] d_sex[1:6] d_dbp[1:6]]

# I'm interested in the age distribution so let's plot a histogram.
plot(x=d_age, Geom.histogram)
# It's good practice to check coarse and fine histograms.
plot(x=d_age, Geom.histogram(bincount=25))

# And let's look at a box plot. First narrow the plot:
set_default_plot_size(6cm, 10cm)
plot(y=d_age, Geom.boxplot, Theme(boxplot_spacing=10mm))

# What if I want to compare men with women?
# Box plot of age split by sex, summary statistics, then a first scatter plot.
set_default_plot_size(8cm, 10cm)
plot(x=d_sex, y=d_age, Geom.boxplot, Theme(boxplot_spacing=15mm))

# If I want a summary of the statistics for the sample it's easy to get it.
[mean(d_age), std(d_age), mode(d_age), "", quantile(d_age,[0.75,0.5,0.25])]

# OK, let's do a scatter plot.
# Resize the plot to something larger.
set_default_plot_size(20cm, 12cm)
plot(x=1:50, y=d_age)
# Note that rather than enter the number of rows I could have used the size
# function and entered plot(x=1:size(d_age,1), y=d_age)

# So having looked at the plot we decide to plot it with an estimated confidence
# interval and a loess smoothing of the data. Plus let's make the labels more
# relevant.
# For the confidence intervals we add Geom.errorbar and calculate a min and max.
# For the smoothing we add Geom.smooth but we have to add Geom.point because
# although it's the default it will be replaced by any other Geom. If we wanted
# a line we could use Geom.line.
# Let's show which respondents are male and which are female with the color
# argument. If we do that then 2 smoothing lines are drawn (if we commented out
# the color argument, then only one).
# Finally notice that the plot command isn't on one line anymore - the brackets
# contain it.
# Age per respondent with error bars, a smoothing line and points, colored by sex.
plot(x=1:size(d_age,1), y=d_age,
     Guide.xlabel("Respondent"), Guide.ylabel("Age"),
     Geom.errorbar, ymin=d_age-1.96*std(d_age), ymax=d_age+1.96*std(d_age),
     color=collect(d_sex), Guide.colorkey("Sex"),
     Geom.smooth, Geom.point)

# The other major chart type is the bar chart, comparing y with x so let's
# compare blood pressure with age.
set_default_plot_size(20cm, 12cm)
plot(x=d_age, y=d_dbp, Geom.bar, Geom.smooth)
# And using color to identify sex:
plot(x=d_age, y=d_dbp, color=d_sex, Geom.bar(position=:dodge))

# We don't have pie charts currently but you might prefer a normalized stacked
# bar chart anyway.
#plot(x=d_age, y=d_dbp, Geom.normbar)

# Save a plot to a file, passing the plot command directly to draw:
draw(PNG("myplot.png", 6inch, 3inch), plot(x=d_age, y=d_dbp, Geom.bar))
# or with a plot object:
p = plot(x=1:size(d_age,1), y=d_age,
         Guide.xlabel("Respondent"), Guide.ylabel("Age"),
         Geom.errorbar, ymin=d_age-1.96*std(d_age), ymax=d_age+1.96*std(d_age),
         color=collect(d_sex), Guide.colorkey("Sex"),
         Geom.smooth, Geom.point)
draw(PDF("myplot.pdf", 6inch, 3inch), p)


# Part 2: the same exploration, this time with a DataFrame.
# We want to use Gadfly and DataFrames today.
using Gadfly; using DataFrames

# First read in your file into a dataframe with readtable.
# NOTE(review): readtable and df["col"] indexing look like an early DataFrames
# API - confirm the package version this script targets.
df = readtable("filename.csv")
# If the file's separator was a comma then you don't need to specify it.
# If it was a tab separated file with no header row we would use:
# mydat = readtable("filenameNoHeader.csv", separator='\t', header=false)

# Let's just check what we read into df.
print("size is ", size(df))

# And let's have a look at the first few rows and columns.
# Because it's a single frame no tricks are needed to display it.
df[1:3, 1:size(df,2)]

# I'd prefer M and F to 2 and 1, similarly Y and N for Drink.
df["Sex"]=ifelse(df["Sex"].==1, "F", "M")
df["Drink"]=ifelse(df["Drink"].==1, "Y", "N")
df[1:6, 1:size(df,2)]

# I'm interested in the age distribution so let's plot a histogram.
plot(df, x="Age", Geom.histogram(bincount=6))
# It's good practice to check coarse and fine histograms.
# Note that I just entered the column number instead of its heading.
plot(df, x=3, Geom.histogram(bincount=15))

# And let's look at box plots again but let's do the original two side by side.
# But first let's convert the 1s and 2s in sex to "F" and "M".
#df = df[df["Sex"].==1 ? "F" : "M",:]
hstack(
    plot(df, y="Age", Geom.boxplot),
    plot(df, x="Sex", y="Age", Geom.boxplot),
    plot(df, x="Drink", y="Age", Geom.boxplot)
)

# It's quite a bit easier with dataframes. Similarly we can display all stats.
describe(df)
# And the standard deviations:
["Age" std(df["Age"]) "sBP" std(df["sBP"]) "dBP" std(df["dBP"]) "BMI" std(df["BMI"])]

# OK, let's do a scatter plot.
# Resize the plot to something larger.
set_default_plot_size(20cm, 12cm)
plot(df, x="IX", y="sBP")

# So having looked at the plot we decide to plot it with an estimated confidence
# interval and a loess smoothing of the data. Plus let's make the labels more
# relevant.
# For the confidence intervals we add Geom.errorbar and calculate a min and max.
# For the smoothing we add Geom.smooth but we have to add Geom.point because
# although it's the default it will be replaced by any other Geom. If we wanted
# a line we could use Geom.line.
# Let's show which respondents are male and which are female with the color
# argument. If we do that then 2 smoothing lines are drawn (if we commented out
# the color argument, then only one).
# Finally notice that the plot command isn't on one line anymore - the brackets
# contain it.
# sBP per respondent with error bars, a smoothing line and points, colored by sex.
# The trailing comment on the color line must end that line, otherwise it would
# swallow the remaining arguments and the closing bracket.
plot(df, x="IX", y="sBP",
     Guide.xlabel("Respondent"), Guide.ylabel("Blood Pressure"),
     Geom.errorbar, ymin=df["sBP"]-1.96*std(df["sBP"]), ymax=df["sBP"]+1.96*std(df["sBP"]),
     color="Sex", # Guide.colorkey("Sex"),
     Geom.smooth, Geom.point)

# The other major chart type is the bar chart, comparing y with x so let's
# compare blood pressure with age.
set_default_plot_size(20cm, 12cm)
plot(df, x="Age", y="sBP", Geom.bar, Geom.smooth)
# And using color to identify sex:
plot(df, x="Age", y="sBP", color="Sex", Geom.bar(position=:dodge))

# We don't have pie charts currently but you might prefer a normalized stacked
# bar chart anyway.
#plot(x=d_age, y=d_dbp, Geom.normbar)

# It's easy to save your visualizations as png, pdf or ps files:
# draw(format("filename.formatsuffix", width, height), (plot object or command))
draw(PNG("myplot.png", 6inch, 3inch), plot(df, x="Age", y="dBP", Geom.bar))
# or with a plot object:
p = plot(df, x="IX", y="Age",
         Guide.xlabel("Respondent"), # Guide.ylabel("Age"),
         Geom.errorbar, ymin=df["Age"]-1.96*std(df["Age"]), ymax=df["Age"]+1.96*std(df["Age"]),
         color="Sex", # Guide.colorkey("Sex"),
         Geom.smooth, Geom.point)
draw(PDF("myplot.pdf", 6inch, 3inch), p)