#!/usr/bin/env python
# coding: utf-8

# # NFStream: a Flexible Network Data Analysis Framework

# In[1]:

import nfstream
print(nfstream.__version__)


# [**NFStream**][repo] is a multiplatform Python framework providing fast, flexible, and expressive data structures designed to make
# working with **online** or **offline** network data easy and intuitive. It aims to be Python's fundamental high-level
# building block for doing practical, **real-world** network flow data analysis. Additionally, it has the broader
# goal of becoming **a unifying network data analytics framework for researchers**, providing data reproducibility
# across experiments.
#
# * **Performance:** NFStream is designed to be fast: [**AF_PACKET_V3/FANOUT**][packet] on Linux, multiprocessing, a native
# [**CFFI based**][cffi] computation engine, and full [**PyPy**][pypy] support.
# * **Encrypted layer-7 visibility:** NFStream deep packet inspection is based on [**nDPI**][ndpi].
# It allows NFStream to perform [**reliable**][reliable] encrypted application identification and metadata
# fingerprinting (e.g. TLS, SSH, DHCP, HTTP).
# * **System visibility:** NFStream probes the monitored system's kernel to obtain information on open Internet sockets
# and collects guaranteed ground truth (process name, PID, etc.) at the application level.
# * **Statistical features extraction:** NFStream provides state-of-the-art flow-based statistical feature extraction.
# It includes post-mortem statistical features (e.g., minimum, mean, standard deviation, and maximum of packet size and
# inter-arrival time) and early flow features (e.g. sequence of the first n packets' sizes, inter-arrival times, and directions).
# * **Flexibility:** NFStream is easily extensible using [**NFPlugins**][nfplugin]. It allows the creation of a new flow
# feature within a few lines of Python.
# * **Machine Learning oriented:** NFStream aims to make machine learning approaches for network traffic management
# reproducible and deployable. By using NFStream as a common framework, researchers ensure that models are trained using
# the same feature computation logic, and thus, a fair comparison is possible. Moreover, trained models can be deployed
# and evaluated on live networks using [**NFPlugins**][nfplugin].
#
#
# In this notebook, we demonstrate a subset of the features provided by [**NFStream**][repo].
#
# [ndpi]: https://github.com/ntop/nDPI
# [nfplugin]: https://nfstream.github.io/docs/api#nfplugin
# [reliable]: http://people.ac.upc.edu/pbarlet/papers/ground-truth.pam2014.pdf
# [repo]: https://nfstream.org/
# [pypy]: https://www.pypy.org/
# [cffi]: https://cffi.readthedocs.io/en/latest/index.html

# In[2]:

from nfstream import NFStreamer, NFPlugin
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)


# ## Flow aggregation made simple

# In the following, we are going to use the main object provided by nfstream, `NFStreamer`, which has the following parameters:
#
# * `source` [default=None]: Packet capture source. Pcap file path or network interface name.
# * `decode_tunnels` [default=True]: Enable/Disable GTP/TZSP tunnel decoding.
# * `bpf_filter` [default=None]: Specify a [BPF filter][bpf] for selecting traffic.
# * `promiscuous_mode` [default=True]: Enable/Disable promiscuous capture mode.
# * `snapshot_length` [default=1500]: Control packet slicing size (truncation) in bytes.
# * `idle_timeout` [default=120]: Flows that are idle (no packets received) for more than this value in seconds are expired.
# * `active_timeout` [default=1800]: Flows that are active for more than this value in seconds are expired.
# * `accounting_mode` [default=0]: Specify the accounting mode used to report byte-related features (0: link layer, 1: IP layer, 2: transport layer, 3: payload).
# * `udps` [default=None]: Specify user-defined NFPlugins used to extend NFStreamer.
# * `n_dissections` [default=20]: Number of per-flow packets to dissect for the L7 visibility feature. When set to 0, L7 visibility is disabled.
# * `statistical_analysis` [default=False]: Enable/Disable post-mortem flow statistical analysis.
# * `splt_analysis` [default=0]: Specify the length of the sequence of first packets used for early statistical analysis. When set to 0, splt_analysis is disabled.
# * `max_nflows` [default=0]: Specify the maximum number of flows to capture before returning. Unset when equal to 0.
# * `n_meters` [default=0]: Specify the number of parallel metering processes. When set to 0, NFStreamer will automatically scale metering according to the available physical cores on the running host.
# * `performance_report` [default=0]: [**Performance report**](https://github.com/nfstream/nfstream/blob/master/assets/PERFORMANCE_REPORT.md) interval in seconds. Disabled when set to 0. Ignored for offline capture.
# * `system_visibility_mode` [default=0]: Enable system process mapping by probing the host machine.
# * `system_visibility_poll_ms` [default=100]: Set the polling interval in milliseconds for the system process mapping feature (0 is the maximum achievable rate).
#
# `NFStreamer` returns a flow iterator. We can iterate over flows or convert them directly to a pandas DataFrame using the `to_pandas()` method. A sketch combining several of the parameters above is shown below; the subsequent cells start from the default configuration.
#
# [bpf]: https://biot.com/capstats/bpf.html
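# For reference, here is a purely illustrative sketch combining several of the parameters documented
# above in a single call. The parameter values are arbitrary examples; only `source` points to the
# pcap file used throughout this notebook.

# In[ ]:

# Combine capture, filtering, timeout, and accounting options in one NFStreamer call.
example_df = NFStreamer(source="pcap/instagram.pcap",  # offline capture source
                        bpf_filter="tcp port 443",     # keep only TCP/443 traffic
                        promiscuous_mode=True,         # default value, shown for completeness
                        snapshot_length=1500,          # truncate packets at 1500 bytes
                        idle_timeout=60,               # expire flows idle for 60 seconds
                        active_timeout=600,            # expire flows active for 10 minutes
                        accounting_mode=1,             # report byte features at the IP layer
                        n_meters=1                     # single metering process
                        ).to_pandas()
example_df.shape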
# In[3]:

df = NFStreamer(source="pcap/instagram.pcap").to_pandas()


# In[4]:

df.head()


# In[5]:

df.shape


# We can enable post-mortem statistical flow feature extraction as follows:

# In[6]:

df = NFStreamer(source="pcap/instagram.pcap", statistical_analysis=True).to_pandas()


# In[7]:

df.head()


# We can enable early statistical flow feature extraction as follows:

# In[8]:

df = NFStreamer(source="pcap/instagram.pcap", splt_analysis=10).to_pandas()


# In[9]:

df.head()


# We can anonymize selected columns (here, IP and MAC addresses) as follows:

# In[10]:

df = NFStreamer(source="pcap/instagram.pcap",
                statistical_analysis=True).to_pandas(columns_to_anonymize=["src_ip", "src_mac", "dst_ip", "dst_mac"])


# In[11]:

df.head()


# Now that we have our DataFrame, we can analyze it like any other dataset. For example, we can compute additional features:
#
# * Compute the data ratio in both directions (src2dst and dst2src)

# In[12]:

df["src2dst_bytes_data_ratio"] = df['src2dst_bytes'] / df['bidirectional_bytes']
df["dst2src_bytes_data_ratio"] = df['dst2src_bytes'] / df['bidirectional_bytes']


# In[13]:

df.head()


# * Filter data according to some criteria:

# In[14]:

df[df["dst_port"] == 443].head()


# ## Extend nfstream

# In some use cases, we need to add features that are computed at packet level. nfstream handles such scenarios using [**NFPlugin**][nfplugin].
#
# [nfplugin]: https://nfstream.github.io/docs/api#nfplugin
#
# * Let's suppose that we want to count, for each flow, the bidirectional packets whose IP size is exactly 40 bytes.
# In[15]:

class Packet40Count(NFPlugin):
    def on_init(self, pkt, flow):
        # Flow creation: called with the first packet of the flow.
        if pkt.ip_size == 40:
            flow.udps.packet_with_40_ip_size = 1
        else:
            flow.udps.packet_with_40_ip_size = 0

    def on_update(self, pkt, flow):
        # Flow update: called for each subsequent packet belonging to the flow.
        if pkt.ip_size == 40:
            flow.udps.packet_with_40_ip_size += 1


# In[16]:

df = NFStreamer(source="pcap/google_ssl.pcap", udps=[Packet40Count()]).to_pandas()


# In[17]:

df.head()


# Our DataFrame has a new column named `udps.packet_with_40_ip_size`. The same value is also available as a flow attribute when iterating over flows directly, as sketched below.
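# As noted earlier, `NFStreamer` also behaves as a flow iterator, so flows can be consumed one by one
# without going through pandas. A minimal sketch reusing the plugin defined above; the flow attribute
# names mirror the DataFrame columns shown earlier.

# In[ ]:

# Iterate over flows directly and read the plugin-defined feature from the udps namespace.
for flow in NFStreamer(source="pcap/google_ssl.pcap", udps=[Packet40Count()]):
    print(flow.src_ip,
          flow.dst_ip,
          flow.bidirectional_packets,
          flow.udps.packet_with_40_ip_size)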