# Fetch the Nitroba case PCAP into a working directory and verify it.
!mkdir -p pcap
cd pcap
url = "http://digitalcorpora.org/corp/nps/packets/2008-nitroba/nitroba.pcap"
# If you have curl installed, we can get nice progress bars:
#!curl -o nitroba.pcap $url
# Or use pure Python:
# import urllib
# urllib.urlretrieve(url, "nitroba.pcap")

ls -l nitroba.pcap
!md5sum nitroba.pcap
!tshark -v

# Export frame numbers and frame lengths with tshark, then load them into pandas.
!tshark -n -r nitroba.pcap -T fields -Eheader=y -e frame.number -e frame.len > frame.len
!head -10 frame.len

import pandas as pd

df = pd.read_table("frame.len")
df

df["frame.len"].describe()

# Plot the frame lengths over the whole capture.
%pylab inline
figsize(10, 6)
df["frame.len"].plot(style=".", alpha=0.2)
title("Frame length")
ylabel("bytes")
xlabel("frame number")

# A reusable helper: run tshark and load selected fields into a DataFrame.
import subprocess
import datetime
import pandas as pd

def read_pcap(filename, fields=[], display_filter="",
              timeseries=False, strict=False):
    """Read PCAP file into a pandas DataFrame object.
    Uses the tshark command-line tool from Wireshark.

    filename:       name or full path of the PCAP file to read
    fields:         list of fields to include as columns
    display_filter: additional filter to restrict frames
    strict:         only include frames that contain all given fields
                    (default: False)
    timeseries:     create a DatetimeIndex from frame.time_epoch
                    (default: False)

    The syntax for fields and display_filter is specified in Wireshark's
    Display Filter Reference: http://www.wireshark.org/docs/dfref/
    """
    if timeseries:
        fields = ["frame.time_epoch"] + fields
    fieldspec = " ".join("-e %s" % f for f in fields)

    display_filters = fields if strict else []
    if display_filter:
        display_filters.append(display_filter)
    filterspec = "-R '%s'" % " and ".join(f for f in display_filters)

    options = "-r %s -n -T fields -Eheader=y" % filename
    cmd = "tshark %s %s %s" % (options, filterspec, fieldspec)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    if timeseries:
        df = pd.read_table(proc.stdout,
                           index_col="frame.time_epoch",
                           parse_dates=True,
                           date_parser=datetime.datetime.fromtimestamp)
    else:
        df = pd.read_table(proc.stdout)
    return df

# Frame lengths as a time series, resampled to bytes per second.
framelen = read_pcap("nitroba.pcap", ["frame.len"], timeseries=True)
framelen

bytes_per_second = framelen.resample("S", how="sum")
bytes_per_second.head()

bytes_per_second.plot()

# Per-frame TCP fields, indexed by time, keeping only frames that have them all.
fields = ["tcp.stream", "ip.src", "ip.dst", "tcp.seq", "tcp.ack",
          "tcp.window_size", "tcp.len"]
ts = read_pcap("nitroba.pcap", fields, timeseries=True, strict=True)
ts

# Look at a single TCP stream.
stream = ts[ts["tcp.stream"] == 10]
stream

print stream.to_string()

# Label each frame as coming from the client or the server side of the stream.
stream["type"] = stream.apply(lambda x: "client" if x["ip.src"] == stream.irow(0)["ip.src"] else "server", axis=1)
print stream.to_string()

# Plot the client's TCP sequence numbers, first against time, then against frame count.
client_stream = stream[stream.type == "client"]
client_stream["tcp.seq"].plot(style="r-o")

client_stream.index = arange(len(client_stream))
client_stream["tcp.seq"].plot(style="r-o")

# Group by TCP stream and find the stream that carried the most TCP payload bytes.
per_stream = ts.groupby("tcp.stream")
per_stream.head()

bytes_per_stream = per_stream["tcp.len"].sum()
bytes_per_stream.head()

bytes_per_stream.plot()

bytes_per_stream.max()

biggest_stream = bytes_per_stream.idxmax()
biggest_stream

bytes_per_stream.ix[biggest_stream]

# Ethernet trailers: frames shorter than the Ethernet minimum are padded,
# and eth.trailer holds whatever bytes were used as padding.
trailer_df = read_pcap("nitroba.pcap", ["eth.src", "eth.trailer"], timeseries=True)
trailer_df

trailer = trailer_df["eth.trailer"]
trailer

trailer.value_counts()

# Decode the colon-separated hex strings into raw bytes.
import binascii

def unhex(s, sep=":"):
    return binascii.unhexlify("".join(s.split(sep)))

s = unhex("3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02")
s

padding = trailer_df.dropna()
padding["unhex"] = padding["eth.trailer"].map(unhex)

# Render the decoded bytes so that alphanumeric characters stay visible
# and everything else becomes ".".
def printable(s):
    chars = []
    for c in s:
        if c.isalnum():
            chars.append(c)
        else:
            chars.append(".")
    return "".join(chars)

printable("\x95asd\x33")
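# Note: unhex() and printable() above assume Python 2 byte strings. On Python 3,
# binascii.unhexlify() returns bytes and iterating over bytes yields ints, so a
# byte-oriented variant would look roughly like this (a minimal sketch; the
# printable_bytes name is just for illustration, unhex() itself works unchanged):
def printable_bytes(b):
    # Keep ASCII alphanumerics visible, replace every other byte with ".".
    return "".join(chr(c) if c < 128 and chr(c).isalnum() else "." for c in b)

printable_bytes(unhex("3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02"))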
padding["printable"]=padding["unhex"].map(printable) padding["printable"].value_counts() def ratio_printable(s): printable = sum(1.0 for c in s if c.isalnum()) return printable / len(s) ratio_printable("a\x93sdfs") padding["ratio_printable"] = padding["unhex"].map(ratio_printable) padding[padding["ratio_printable"] > 0.5] _.printable.value_counts() padding[padding["ratio_printable"] > 0.5]['eth.src'].drop_duplicates() HTML('')