# Notebook boilerplate: emit an (empty) HTML fragment into the output cell.
from IPython.display import HTML
HTML('')
# IPython shell magics: set up a working directory and fetch/inspect the
# Nitroba University Harassment Scenario packet capture.
!mkdir -p pcap
cd pcap
# Source URL of the capture file.
url="http://digitalcorpora.org/corp/nps/packets/2008-nitroba/nitroba.pcap"
# If you have curl installed, we can get nice progress bars:
#!curl -o nitroba.pcap $url
# Or use pure Python:
# import urllib
# urllib.urlretrieve(url, "nitroba.pcap")
# Confirm the file is present and verify its integrity.
ls -l nitroba.pcap
!md5sum nitroba.pcap
# Make sure tshark (Wireshark's CLI) is available, and do a first extraction:
# frame number and frame length for every frame, as tab-separated text.
!tshark -v
!tshark -n -r nitroba.pcap -T fields -Eheader=y -e frame.number -e frame.len > frame.len
!head -10 frame.len
import pandas as pd
# Load the tshark output (tab-separated, header row included via -Eheader=y).
df=pd.read_table("frame.len")
df
# Summary statistics of frame sizes across the whole capture.
df["frame.len"].describe()
# %pylab pulls numpy/matplotlib names (figsize, title, ylabel, ...) into scope.
%pylab inline
figsize(10,6)
# Scatter-style plot of frame length vs. frame number; low alpha shows density.
df["frame.len"].plot(style=".", alpha=0.2)
title("Frame length")
ylabel("bytes")
xlabel("frame number")
import subprocess
import datetime
import pandas as pd
def read_pcap(filename, fields=[], display_filter="",
              timeseries=False, strict=False):
    """ Read PCAP file into Pandas DataFrame object.
    Uses tshark command-line tool from Wireshark (must be on PATH).

    filename:       Name or full path of the PCAP file to read
    fields:         List of fields to include as columns
    display_filter: Additional filter to restrict frames
    strict:         Only include frames that contain all given fields
                    (Default: false)
    timeseries:     Create DatetimeIndex from frame.time_epoch
                    (Default: false)

    Syntax for fields and display_filter is specified in
    Wireshark's Display Filter Reference:
      http://www.wireshark.org/docs/dfref/
    """
    # Copy before modifying: the original code could mutate the caller's
    # list (and the shared default []) through the strict/display_filter
    # aliasing below, leaking fields across calls.
    fields = list(fields)
    if timeseries:
        fields = ["frame.time_epoch"] + fields
    # In strict mode every requested field doubles as a read filter,
    # so frames missing any field are dropped.  Copy again so appending
    # display_filter does not also append to `fields`.
    display_filters = list(fields) if strict else []
    if display_filter:
        display_filters.append(display_filter)
    # Build the command as an argument list (no shell): immune to quoting
    # problems and shell injection via filename or filter strings.
    cmd = ["tshark", "-r", filename, "-n", "-T", "fields", "-Eheader=y"]
    # Only pass -R when there is actually a filter; an empty "-R ''" is
    # rejected by some tshark versions.
    if display_filters:
        cmd += ["-R", " and ".join(display_filters)]
    for f in fields:
        cmd += ["-e", f]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    if timeseries:
        # NOTE(review): date_parser is deprecated in pandas >= 2.0 —
        # kept here because the notebook targets the older API.
        df = pd.read_table(proc.stdout,
                           index_col="frame.time_epoch",
                           parse_dates=True,
                           date_parser=datetime.datetime.fromtimestamp)
    else:
        df = pd.read_table(proc.stdout)
    return df
# Per-frame lengths indexed by capture timestamp.
framelen=read_pcap("nitroba.pcap", ["frame.len"], timeseries=True)
framelen
# Resample into 1-second buckets and sum: traffic volume over time.
# NOTE(review): resample(..., how=...) is the old pandas API; newer pandas
# spells this framelen.resample("S").sum().
bytes_per_second=framelen.resample("S", how="sum")
bytes_per_second.head()
bytes_per_second.plot()
# Pull the core TCP bookkeeping fields for every TCP frame (strict=True
# drops non-TCP frames, which lack these fields).
fields=["tcp.stream", "ip.src", "ip.dst", "tcp.seq", "tcp.ack", "tcp.window_size", "tcp.len"]
ts=read_pcap("nitroba.pcap", fields, timeseries=True, strict=True)
ts
# Focus on a single TCP conversation (tshark numbers streams from 0).
stream=ts[ts["tcp.stream"] == 10]
stream
print stream.to_string()
# Label each frame by direction: the sender of the first frame is "client".
# NOTE(review): DataFrame.irow is long-deprecated; newer pandas uses .iloc[0].
stream["type"] = stream.apply(lambda x: "client" if x["ip.src"] == stream.irow(0)["ip.src"] else "server", axis=1)
print stream.to_string()
# Client-side sequence numbers over time, then re-plotted against frame
# ordinals so gaps in time don't distort the curve.
client_stream=stream[stream.type == "client"]
client_stream["tcp.seq"].plot(style="r-o")
client_stream.index = arange(len(client_stream))
client_stream["tcp.seq"].plot(style="r-o")
# Aggregate payload bytes per TCP stream and find the largest conversation.
per_stream=ts.groupby("tcp.stream")
per_stream.head()
bytes_per_stream = per_stream["tcp.len"].sum()
bytes_per_stream.head()
bytes_per_stream.plot()
bytes_per_stream.max()
biggest_stream=bytes_per_stream.idxmax()
biggest_stream
# NOTE(review): .ix is removed in modern pandas; use .loc instead.
bytes_per_stream.ix[biggest_stream]
# Ethernet trailers: padding bytes some NICs append after the IP payload.
# These can leak memory contents of the sending machine.
trailer_df = read_pcap("nitroba.pcap", ["eth.src", "eth.trailer"], timeseries=True)
trailer_df
trailer=trailer_df["eth.trailer"]
trailer
trailer.value_counts()
import binascii
def unhex(s, sep=":"):
    """Decode a separator-delimited hex string (e.g. "3b:02") to raw bytes."""
    hex_digits = "".join(s.split(sep))
    return binascii.unhexlify(hex_digits)
# Sanity check: decode one trailer value observed above.
s=unhex("3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02")
s
# Keep only frames that actually carry an Ethernet trailer, then decode
# each trailer's hex representation into raw bytes.
padding = trailer_df.dropna()
padding["unhex"]=padding["eth.trailer"].map(unhex)
def printable(s):
    """Return s with every non-alphanumeric character replaced by a dot."""
    return "".join(c if c.isalnum() else "." for c in s)
printable("\x95asd\x33")
# Human-readable view of each trailer's payload bytes, and how often
# each distinct trailer pattern occurs.
padding["printable"]=padding["unhex"].map(printable)
padding["printable"].value_counts()
def ratio_printable(s):
    """Return the fraction (0.0-1.0) of characters in s that are alphanumeric.

    Returns 0.0 for an empty input instead of raising ZeroDivisionError
    (the original divided by len(s) unconditionally).
    """
    # Guard: an empty trailer has no characters to classify.
    if not s:
        return 0.0
    # Renamed from `printable` to avoid shadowing the sibling function
    # printable() defined above.
    alnum_count = sum(1.0 for c in s if c.isalnum())
    return alnum_count / len(s)
ratio_printable("a\x93sdfs")
padding["ratio_printable"] = padding["unhex"].map(ratio_printable)
# Trailers that are mostly printable text -- likely leaked memory contents
# rather than zero padding.
padding[padding["ratio_printable"] > 0.5]
# "_" is the previous cell's result (IPython convention).
_.printable.value_counts()
# Which source MAC addresses emitted the text-bearing trailers?
padding[padding["ratio_printable"] > 0.5]['eth.src'].drop_duplicates()
HTML('')