#%install_ext https://raw.github.com/minrk/ipython_extensions/master/extensions/nbtoc.py
#%load_ext nbtoc
#%nbtoc
"""Plot the delay frequency table and compare three ways of summarising it.

The script reads ``sorted_freq.txt``, draws a truncated and a full view of
the frequency table, then compares the statistics and the wall-clock
runtime reported by three methods, and finally prints the source of the
helper scripts that were used.
"""
import sys

import matplotlib.pyplot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Vertical marker positions for the frequency plots.  They must be numbers:
# a string passed to ``axvline`` is not interpreted as a data coordinate.
MEAN_DELAY = 6.566504
MEDIAN_DELAY = 0.00

# ``%autosave`` only exists inside IPython/Jupyter; skip it when the file is
# run as a plain Python script.
try:
    get_ipython().run_line_magic('autosave', '60')
except NameError:
    pass


def _plot_frequency(delay, freq, title, xlim=None):
    """Scatter ``freq`` against ``delay`` in a new figure.

    Draws the mean and median markers and a legend.  ``xlim`` is an optional
    ``(low, high)`` pair restricting the visible x range.
    """
    plt.figure()
    plt.plot(delay, freq, '.b')
    plt.xlabel('Delay time (min)', size=14)
    plt.ylabel('Frequency', size=14)
    plt.title(title, size=15)
    if xlim is not None:
        plt.xlim(*xlim)
    plt.axvline(MEAN_DELAY, label='mean', color='r', ls=':', lw=3)
    plt.axvline(MEDIAN_DELAY, label='median', color='g', ls='--', lw=3)
    plt.legend(loc='best')


def _cat(path):
    """Write the contents of ``path`` to stdout (portable ``!cat``).

    A missing or unreadable file is reported on stderr instead of raising,
    so the remaining files are still shown.
    """
    try:
        with open(path) as handle:
            sys.stdout.write(handle.read())
    except (IOError, OSError) as err:
        sys.stderr.write('cat: {0}: {1}\n'.format(path, err.strerror))


# somehow pandas has trouble reading the sorted frequency table, so every
# line is read as a single string column and split by hand.
a = pd.read_table("sorted_freq.txt", names=['col'])

rows = []
for line in a['col']:
    fields = line.split()
    if len(fields) != 2:
        continue
    count, value = fields
    # Skip the missing-value row and the header row.
    if value in ('NA', 'ArrDelay'):
        continue
    rows.append([float(count), float(value)])

# After transposing, row 0 holds the frequencies and row 1 the delays.
c = np.array(rows).transpose()
delay = c[1]
freq = c[0]

# --- Frequency table, truncated and full view -----------------------------
xlim_low = -100
xlim_high = 100
_plot_frequency(
    delay,
    freq,
    'Truncated view of frequency table from '
    '{0} to {1}'.format(xlim_low, xlim_high),
    xlim=(xlim_low, xlim_high),
)
_plot_frequency(delay, freq, 'Total view of frequency table')

# NOTE(review): presumably a 30-minute budget spread over 1e6 lines - confirm.
print('each line should at most take ' +
      '{0}s'.format(30 * 60 / (1e6)))

# --- Statistics reported by each method ------------------------------------
y = [1, 2, 3]
result1 = [6.566504, 0.00000, 31.556326]
result2 = [6.56650421703, 0.0, 31.5563262623]
result3 = [6.591345, 0.000000, 31.465116]

plt.figure()
plt.plot(result1, y, 'rs', markeredgewidth=1, alpha=.25, markersize=10,
         label="Method 1")
plt.plot(result2, y, 'gx', markeredgewidth=3, alpha=.25, markersize=10,
         label="Method 2")
plt.plot(result3, y, 'b*', markeredgewidth=1, alpha=.25, markersize=10,
         label="Method 3")
plt.legend(loc='lower right')
plt.title('Comparison of statistics from different approaches', size=15)
label = ['mean', 'median', 'std. \ndev']
plt.yticks(y, label, size=15)
plt.margins(0.2)
plt.xlabel('Arrival delay (min)', size=15)

# --- Wall-clock runtime of each method -------------------------------------
plt.figure()
plt.title('Comparison of runtime of different methods', size=16)
y = [1, 2, 3]
# Timings are recorded in seconds and converted to minutes.
time_taken = [282.900 / 60., 181.094 / 60., 269.420 / 60.]
# One point per method, plotted against the tick positions in ``y``.
plt.plot(time_taken, y, 'o')
label = ['Freq table - Shell + R ', 'Python Pandas', 'FastCSVSample + R']
plt.yticks(y, label, size=12)
plt.margins(0.2)
plt.xlim(0, 6)
plt.xlabel('Wall clock time (min)', size=15)

# Needed to display the figures when run outside an inline notebook backend.
plt.show()

# --- Source of the helper scripts ------------------------------------------
for script_path in (
    '../freq_count.sh',
    '../method1.R',
    '../compute_stat.py',
    '../method2.R',
    '../method3.R',
    '../NotSoFastCSVSample/R/csvSample.R',
):
    _cat(script_path)