%pylab inline import pandas as pd import numpy as np import sys import matplotlib.pyplot as plt import apachelog sample_string = '178.137.91.215 - - [21/Feb/2014:06:44:53 +0000] "GET /work/homepages-maths-year-6/ HTTP/1.0" \ 200 10427 "http://authoralcove.com/work/homepages-maths-year-6/" "Opera/9.80 (Windows NT 6.1; WOW64; U; ru) \ Presto/2.10.289 Version/12.00" "-"' nformat = r'%h %l %u %t \"%r\" %>s %b \"%i\" \"%{User-Agent}i\" \"%V\"' p = apachelog.parser(nformat) data = p.parse(sample_string) data from apachelog import ApacheLogParserError log_list = [] with open('./private-data/access.log') as f: for line in f.readlines(): try: data = p.parse(line) except ApacheLogParserError: sys.stderr.write("Unable to parse %s" % line) data['%t'] = data['%t'][1:12]+' '+data['%t'][13:21]+' '+data['%t'][22:27] log_list.append(data) from pandas import Series, DataFrame, Panel df = DataFrame(log_list) df[0:2] del df['%V'] del df['%h'] del df['%i'] del df['%l'] del df['%u'] del df['%{User-Agent}i'] df[0:2] df = df.rename(columns={'%>s': 'Status', '%b':'b', '%r':'Request', '%t': 'Time'}) df[0:2] df.index = pd.to_datetime(df.pop('Time')) df['Status'] = df['Status'].astype('int') def dash2nan(x): if x == '-': x = np.nan else: x = float(x)/1048576. return x df['b'] = df['b'].apply(dash2nan) from pylab import * rcParams['figure.figsize'] = 10, 5 # width, height in inches df_s = df['Status'].resample('10t', how='count') df_s.plot() df['Status'].value_counts() grouped_status = df.groupby('Status') grouped_status.size().plot(kind='bar') t_span = '60t' df_200 = df['Status'][df['Status'] == 200].resample(t_span, how='count') df_502 = df['Status'][df['Status'] == 502].resample(t_span, how='count') df_499 = df['Status'][df['Status'] == 499].resample(t_span, how='count') df_304 = df['Status'][df['Status'] == 304].resample(t_span, how='count') df_302 = df['Status'][df['Status'] == 302].resample(t_span, how='count') df_404 = df['Status'][df['Status'] == 404].resample(t_span, how='count') df_504 = df['Status'][df['Status'] == 504].resample(t_span, how='count') df_301 = df['Status'][df['Status'] == 301].resample(t_span, how='count') status_df = DataFrame({'OK': df_200, 'Bad Gateway': df_502, 'Client Closed': df_499, 'Not Modified': df_304, 'Found': df_302, 'Not Found': df_404, 'Gateway Timeout': df_504, 'Moved Permenantely': df_301}) status_df.fillna(0, inplace=True) status_df[0:5] status_df.plot() legend() success_df = df[df['Status'].isin([200, 304, 302, 404, 301, 401, 403, 206, 444, 400, 408, 405])].resample(t_span, how='count') fail_df = df[df['Status'].isin([502, 499, 504, 500, 444])].resample(t_span, how='count') success_df.plot(label="Success") fail_df.plot(label="Failure") legend() dynamic_df = df[df['Request'].str.contains(r'^GET /(static|favicon.ico)') == False] dynamic_df[0:10] success_df = dynamic_df[dynamic_df['Status'].isin([200, 304, 302, 301, 401])].resample(t_span, how='count') fail_df = dynamic_df[dynamic_df['Status'].isin([502, 499, 504, 500])].resample(t_span, how='count') success_df.plot(label="Success", color="g") fail_df.plot(label="Failure", color="r") legend() responses_df = DataFrame({'Success': success_df, 'Failure': fail_df, }) responses_df.plot(kind="bar", stacked=True, xticks=(), colors=("r", "g",)) dynamic_df['Request'].value_counts().head(40) dynamic_df[dynamic_df['Request'].str.contains('GET / HTTP/1.1')]['Status'].value_counts() dynamic_df.count() dynamic_df[dynamic_df['Request'].str.contains(r'(png |xml |gif )')]['Request'].value_counts().head(10) dynamic_df[dynamic_df['Request'].str.contains(r'(png |xml |gif )')].count()