Notebook

Cross-dataset evaluation of universal embedding function for traffic classification¶

Install faiss with GPU support¶

Installing faiss with GPU support can be a bit complicated. See the official instructions - https://github.com/facebookresearch/faiss/blob/main/INSTALL.md. To run this notebook, we recommend starting with a fresh conda env by running the following:

conda create -n cross_dataset_faiss_env python=3.10 ipykernel ipywidgets
conda activate cross_dataset_faiss_env
conda install -c pytorch -c nvidia pytorch pytorch-cuda=12.4
conda install -c conda-forge faiss-gpu=1.8.0 numpy=1.26.4

This installs faiss and PyTorch with conda; the remaining dependencies are installed with pip. For Linux, unofficial faiss wheels are available at https://pypi.org/project/faiss-gpu-cu12/#description, but we have not tested them.

If installing the GPU version is not possible, use:

pip install faiss-cpu

Install common dependencies for both Windows and Linux¶

In [ ]:

# import sys
# !{sys.executable} -m pip install cesnet_models cesnet_datazoo tqdm torchinfo

# import faiss
# if hasattr(faiss, "StandardGpuResources"):
#     print("Faiss with GPU support is available")

Collecting cesnet_models
  Using cached cesnet_models-0.4.0-py3-none-any.whl.metadata (3.6 kB)
Collecting cesnet_datazoo
  Using cached cesnet_datazoo-0.1.10-py3-none-any.whl.metadata (12 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torchinfo
  Using cached torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Requirement already satisfied: numpy<2.0 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from cesnet_models) (1.26.4)
Collecting scikit-learn (from cesnet_models)
  Using cached scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Requirement already satisfied: torch>=1.10 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from cesnet_models) (2.5.1)
Collecting matplotlib (from cesnet_datazoo)
  Using cached matplotlib-3.10.0-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting pandas (from cesnet_datazoo)
  Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pydantic<=2.8.2,>=2.0 (from cesnet_datazoo)
  Using cached pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
Requirement already satisfied: PyYAML in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from cesnet_datazoo) (6.0.2)
Collecting requests (from cesnet_datazoo)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting seaborn (from cesnet_datazoo)
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting tables<=3.9.2,>=3.8.0 (from cesnet_datazoo)
  Using cached tables-3.9.2-cp310-cp310-win_amd64.whl.metadata (2.3 kB)
Requirement already satisfied: colorama in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from tqdm) (0.4.6)
Collecting annotated-types>=0.4.0 (from pydantic<=2.8.2,>=2.0->cesnet_datazoo)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.20.1 (from pydantic<=2.8.2,>=2.0->cesnet_datazoo)
  Using cached pydantic_core-2.20.1-cp310-none-win_amd64.whl.metadata (6.7 kB)
Requirement already satisfied: typing-extensions>=4.6.1 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from pydantic<=2.8.2,>=2.0->cesnet_datazoo) (4.12.2)
Collecting numexpr>=2.6.2 (from tables<=3.9.2,>=3.8.0->cesnet_datazoo)
  Using cached numexpr-2.10.2-cp310-cp310-win_amd64.whl.metadata (8.3 kB)
Requirement already satisfied: packaging in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from tables<=3.9.2,>=3.8.0->cesnet_datazoo) (24.2)
Collecting py-cpuinfo (from tables<=3.9.2,>=3.8.0->cesnet_datazoo)
  Using cached py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting blosc2>=2.3.0 (from tables<=3.9.2,>=3.8.0->cesnet_datazoo)
  Using cached blosc2-2.7.1-cp310-cp310-win_amd64.whl.metadata (9.3 kB)
Requirement already satisfied: filelock in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from torch>=1.10->cesnet_models) (3.13.1)
Requirement already satisfied: networkx in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from torch>=1.10->cesnet_models) (3.3)
Requirement already satisfied: jinja2 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from torch>=1.10->cesnet_models) (3.1.4)
Collecting fsspec (from torch>=1.10->cesnet_models)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch>=1.10->cesnet_models)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from sympy==1.13.1->torch>=1.10->cesnet_models) (1.3.0)
Collecting contourpy>=1.0.1 (from matplotlib->cesnet_datazoo)
  Using cached contourpy-1.3.1-cp310-cp310-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib->cesnet_datazoo)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->cesnet_datazoo)
  Using cached fonttools-4.55.3-cp310-cp310-win_amd64.whl.metadata (168 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->cesnet_datazoo)
  Using cached kiwisolver-1.4.8-cp310-cp310-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib->cesnet_datazoo)
  Using cached pillow-11.1.0-cp310-cp310-win_amd64.whl.metadata (9.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib->cesnet_datazoo)
  Using cached pyparsing-3.2.1-py3-none-any.whl.metadata (5.0 kB)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from matplotlib->cesnet_datazoo) (2.9.0.post0)
Collecting pytz>=2020.1 (from pandas->cesnet_datazoo)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->cesnet_datazoo)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting charset-normalizer<4,>=2 (from requests->cesnet_datazoo)
  Using cached charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests->cesnet_datazoo)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests->cesnet_datazoo)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests->cesnet_datazoo)
  Using cached certifi-2024.12.14-py3-none-any.whl.metadata (2.3 kB)
Collecting scipy>=1.6.0 (from scikit-learn->cesnet_models)
  Using cached scipy-1.15.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn->cesnet_models)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->cesnet_models)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Collecting ndindex>=1.4 (from blosc2>=2.3.0->tables<=3.9.2,>=3.8.0->cesnet_datazoo)
  Using cached ndindex-1.9.2-cp310-cp310-win_amd64.whl.metadata (3.5 kB)
Collecting msgpack (from blosc2>=2.3.0->tables<=3.9.2,>=3.8.0->cesnet_datazoo)
  Using cached msgpack-1.1.0-cp310-cp310-win_amd64.whl.metadata (8.6 kB)
Requirement already satisfied: six>=1.5 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from python-dateutil>=2.7->matplotlib->cesnet_datazoo) (1.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\janlu\miniconda3\envs\cross_dataset_faiss_env\lib\site-packages (from jinja2->torch>=1.10->cesnet_models) (2.1.3)
Using cached cesnet_models-0.4.0-py3-none-any.whl (38 kB)
Using cached cesnet_datazoo-0.1.10-py3-none-any.whl (51 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Using cached torchinfo-1.8.0-py3-none-any.whl (23 kB)
Using cached pydantic-2.8.2-py3-none-any.whl (423 kB)
Using cached pydantic_core-2.20.1-cp310-none-win_amd64.whl (1.9 MB)
Using cached tables-3.9.2-cp310-cp310-win_amd64.whl (4.4 MB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Using cached matplotlib-3.10.0-cp310-cp310-win_amd64.whl (8.0 MB)
Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Using cached scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)
Using cached blosc2-2.7.1-cp310-cp310-win_amd64.whl (2.4 MB)
Using cached certifi-2024.12.14-py3-none-any.whl (164 kB)
Using cached charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl (102 kB)
Using cached contourpy-1.3.1-cp310-cp310-win_amd64.whl (218 kB)
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Using cached fonttools-4.55.3-cp310-cp310-win_amd64.whl (2.2 MB)
Using cached idna-3.10-py3-none-any.whl (70 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached kiwisolver-1.4.8-cp310-cp310-win_amd64.whl (71 kB)
Using cached numexpr-2.10.2-cp310-cp310-win_amd64.whl (144 kB)
Using cached pillow-11.1.0-cp310-cp310-win_amd64.whl (2.6 MB)
Using cached pyparsing-3.2.1-py3-none-any.whl (107 kB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached scipy-1.15.1-cp310-cp310-win_amd64.whl (43.9 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Using cached urllib3-2.3.0-py3-none-any.whl (128 kB)
Using cached fsspec-2024.12.0-py3-none-any.whl (183 kB)
Using cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Using cached ndindex-1.9.2-cp310-cp310-win_amd64.whl (159 kB)
Using cached msgpack-1.1.0-cp310-cp310-win_amd64.whl (74 kB)
Installing collected packages: pytz, py-cpuinfo, urllib3, tzdata, tqdm, torchinfo, threadpoolctl, sympy, scipy, pyparsing, pydantic-core, pillow, numexpr, ndindex, msgpack, kiwisolver, joblib, idna, fsspec, fonttools, cycler, contourpy, charset-normalizer, certifi, annotated-types, scikit-learn, requests, pydantic, pandas, matplotlib, blosc2, tables, seaborn, cesnet_models, cesnet_datazoo
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.13.3:
      Successfully uninstalled sympy-1.13.3
Successfully installed annotated-types-0.7.0 blosc2-2.7.1 certifi-2024.12.14 cesnet_datazoo-0.1.10 cesnet_models-0.4.0 charset-normalizer-3.4.1 contourpy-1.3.1 cycler-0.12.1 fonttools-4.55.3 fsspec-2024.12.0 idna-3.10 joblib-1.4.2 kiwisolver-1.4.8 matplotlib-3.10.0 msgpack-1.1.0 ndindex-1.9.2 numexpr-2.10.2 pandas-2.2.3 pillow-11.1.0 py-cpuinfo-9.0.0 pydantic-2.8.2 pydantic-core-2.20.1 pyparsing-3.2.1 pytz-2024.2 requests-2.32.3 scikit-learn-1.6.1 scipy-1.15.1 seaborn-0.13.2 sympy-1.13.1 tables-3.9.2 threadpoolctl-3.5.0 torchinfo-1.8.0 tqdm-4.67.1 tzdata-2024.2 urllib3-2.3.0
Faiss with GPU support is available

Tcbench install on Linux¶

In [ ]:

# import sys
# !{sys.executable} -m pip install tcbench 

Tcbench install on Windows¶

The tcbench framework depends on Aim for experiment tracking. Aim is not supported on Windows - https://github.com/aimhubio/aim/issues/2064. The workaround is to install tcbench dependencies without Aim, which works because the experiment tracking functionality is not needed for downloading datasets and obtaining the provided train, validation, and test splits.

After installing tcbench like this, you need to comment out all imports of Aim. For tcbench==0.0.22, Aim imports need to be commented out in the following files:

cli/command_aimrepo.py

In [2]:

# import sys
# !{sys.executable} -m pip install --no-deps tcbench rich rich_click click click_plugins pyarrow==12.0.0

Collecting tcbench
  Using cached tcbench-0.0.22-py3-none-any.whl.metadata (7.5 kB)
Collecting rich
  Using cached rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting rich_click
  Using cached rich_click-1.8.5-py3-none-any.whl.metadata (7.9 kB)
Collecting click
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting click_plugins
  Using cached click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Collecting pyarrow==12.0.0
  Using cached pyarrow-12.0.0-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Using cached pyarrow-12.0.0-cp310-cp310-win_amd64.whl (21.5 MB)
Using cached tcbench-0.0.22-py3-none-any.whl (115 kB)
Using cached rich-13.9.4-py3-none-any.whl (242 kB)
Using cached rich_click-1.8.5-py3-none-any.whl (35 kB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Using cached click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: click_plugins, tcbench, rich_click, rich, pyarrow, click
Successfully installed click-8.1.8 click_plugins-1.1.1 pyarrow-12.0.0 rich-13.9.4 rich_click-1.8.5 tcbench-0.0.22

Download datasets¶

Install the tcbench datasets from the command line within the activated conda env:

tcbench datasets import --name ucdavis-icdm19
tcbench datasets import --name utmobilenet21
tcbench datasets install --name mirage19
tcbench datasets install --name mirage22

The installation of the MIRAGE22 dataset can sometimes fail while generating the splits because it runs out of RAM (for a 32GB machine). A possible solution is installing this dataset on a remote machine with more RAM and downloading the parquet files.

Check that all datasets are installed (data splits filled) with:

tcbench datasets info

And proceed with installing the CESNET-TLS22 dataset.

In [2]:

# from cesnet_datazoo.datasets import CESNET_TLS22
# dataset = CESNET_TLS22(data_root="data/CESNET-TLS22/", size="S", silent=False)

Downloading CESNET-TLS22-S dataset
File size: 3.01GB
Remaining: 3.01GB

100%|██████████| 3.01G/3.01G [05:56<00:00, 9.07MB/s]

In [3]:

from functools import partial
from typing import Optional

import numpy as np
import pandas as pd
import tcbench as tcb
from tcbench.libtcdatasets.utmobilenet21_generate_splits import _verify_splits

TCBENCH_APP_COLUMN = "app"
PPI_MAX_LEN = 30
PPI_IPT_POS = 0
PPI_DIR_POS = 1
PPI_SIZE_POS = 2


def tcbench_convert_ppi(row, is_utmobilenet: bool = False):
    """Convert one tcbench flow record into a fixed-width PPI array.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            "pkts_dir", "pkts_size" and "timetofirst" sequences. The optional
            "pkts_iat" and "duration" entries are only used for sanity checks.
        is_utmobilenet: When True, "timetofirst" is used directly as the
            sequence of time differences instead of being differenced.

    Returns:
        Array with rows ordered (IPT, DIR, SIZE), truncated or zero-padded on
        the right to exactly PPI_MAX_LEN columns.

    Raises:
        AssertionError: If one of the consistency checks on the record fails.
    """
    # Direction value 0 is mapped to -1, every other value to +1
    pkt_dirs = np.where(row["pkts_dir"] == 0, -1, 1)
    pkt_sizes = row["pkts_size"]
    if not is_utmobilenet:
        ipt = np.diff(row["timetofirst"], prepend=0)
        assert len(pkt_dirs) == len(pkt_sizes) == len(ipt)
        assert np.isclose(ipt.cumsum(), row["timetofirst"]).all()
        if "pkts_iat" in row:
            assert np.isclose(ipt, row["pkts_iat"]).all()
        # The first packet has no predecessor, so its inter-packet time is forced to zero
        ipt[0] = 0.0
    else:
        # For UTMOBILENET21, the time differences are already in the "timetofirst" column
        ipt = row["timetofirst"].copy()
    if "duration" in row:
        assert np.isclose(row["duration"], ipt.sum())
    ipt_ms = ipt * 1000 # convert to ms
    # cesnet-models expects the following PPI format: (IPT, DIR, SIZE)
    stacked = np.array((ipt_ms, pkt_dirs, pkt_sizes))
    truncated = stacked[:, :PPI_MAX_LEN]
    missing_packets = PPI_MAX_LEN - truncated.shape[1]
    return np.pad(truncated, pad_width=((0, 0), (0, missing_packets)))

def get_data_from_tcbench(dataset_enum: tcb.DATASETS, split_id: int = 0, ucdavis_test_set: Optional[str] = None, use_val_as_train: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load a tcbench dataset and return PPI arrays and app labels for train and test.

    Args:
        dataset_enum: The tcbench dataset to load.
        split_id: Row of the tcbench split table to use. Not used for
            UCDAVIS19, whose training set is the whole "pretraining" partition.
        ucdavis_test_set: Name of the UCDAVIS19 test split passed to
            ``tcb.load_parquet``; required for UCDAVIS19.
        use_val_as_train: When True, the validation rows are appended to the
            training rows. Has no effect for UCDAVIS19.

    Returns:
        Tuple ``(train_data, test_data, train_labels, test_labels)``.

    Raises:
        ValueError: If UCDAVIS19 is requested without ``ucdavis_test_set``.
    """
    is_ucdavis = dataset_enum == tcb.DATASETS.UCDAVISICDM19
    if not is_ucdavis:
        df = tcb.load_parquet(dataset_enum, min_pkts=10)
        df_splits = tcb.load_parquet(dataset_enum, min_pkts=10, split=True) # type: ignore
        _verify_splits(df, df_splits)
        selected_split = df_splits.iloc[split_id]
        train_indices = selected_split["train_indexes"]
        val_indices = selected_split["val_indexes"]
        test_indices = selected_split["test_indexes"]
        df_train = df.iloc[train_indices]
        df_val = df.iloc[val_indices]
        df_test = df.iloc[test_indices]
        if use_val_as_train:
            df_train = pd.concat([df_train, df_val])
    else:
        if ucdavis_test_set is None:
            raise ValueError("ucdavis_test_set must be either 'script' or 'human' when using the UCDAVIS19 dataset")
        # Use following to start using the tcbench prepared train splits for UCDAVIS19
        # df_train = tcb.load_parquet(dataset_enum, split=split_id) # type: ignore
        df = tcb.load_parquet(dataset_enum)
        df_train = df[df["partition"] == "pretraining"]
        df_test = tcb.load_parquet(dataset_enum, split=ucdavis_test_set)

    # Every flow row is converted to a PPI array; the per-row results are stacked into one array
    ppi_fn = partial(tcbench_convert_ppi, is_utmobilenet=dataset_enum==tcb.DATASETS.UTMOBILENET21)
    train_data = np.stack(df_train.apply(ppi_fn, axis=1))
    test_data = np.stack(df_test.apply(ppi_fn, axis=1))
    train_labels = df_train[TCBENCH_APP_COLUMN].to_numpy()
    test_labels = df_test[TCBENCH_APP_COLUMN].to_numpy()
    return train_data, test_data, train_labels, test_labels

In [4]:

import numpy as np
from cesnet_datazoo.config import DatasetConfig
from cesnet_datazoo.constants import APP_COLUMN, PPI_COLUMN
from cesnet_datazoo.datasets import CESNET_TLS22


def load_cesnet_tls22_from_datazoo(dataset_size: str = "S", split_id: int = 0, train_size: int = 1_000_000, test_size: int = 1_000_000) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load train and test PPI data of CESNET-TLS22 through cesnet_datazoo.

    Training samples come from period "W-2021-40" and test samples from
    "W-2021-41". The test dataframe is subsampled to ``test_size`` rows with a
    seed derived from ``split_id``.

    Args:
        dataset_size: Dataset size identifier passed to ``CESNET_TLS22``.
        split_id: Used as the datazoo ``fold_id`` and to seed the test sampling.
        train_size: Number of training samples requested from datazoo.
        test_size: Number of test rows to sample; must be below 10,000,000.

    Returns:
        Tuple ``(train_data, test_data, train_labels, test_labels)`` where the
        labels are decoded back with the dataset's label encoder.
    """
    assert test_size < 10_000_000
    dataset = CESNET_TLS22(data_root="data/CESNET-TLS22/", size=dataset_size, silent=True)
    config_kwargs = dict(
        dataset=dataset,
        fold_id=split_id,
        batch_size=16384,
        test_batch_size=16384,
        train_period_name="W-2021-40",
        test_period_name="W-2021-41",
        train_size=train_size,
        test_known_size="all",
        train_workers=0,
        test_workers=0,
        need_val_set=False,
    )
    dataset.set_dataset_config_and_initialize(DatasetConfig(**config_kwargs))
    assert dataset.class_info is not None

    df_train = dataset.get_train_df()
    # The full test set is loaded and then subsampled reproducibly per split
    df_test = dataset.get_test_df().sample(test_size, random_state=42 + split_id)

    train_data = np.stack(df_train[PPI_COLUMN])
    test_data = np.stack(df_test[PPI_COLUMN])
    label_encoder = dataset.class_info.encoder
    train_labels = label_encoder.inverse_transform(df_train[APP_COLUMN])
    test_labels = label_encoder.inverse_transform(df_test[APP_COLUMN])
    return train_data, test_data, train_labels, test_labels

In [5]:

from typing import Callable

import faiss
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


class DatasetWithTransform(Dataset):
    """Torch dataset holding PPI data that is transformed once at construction.

    Indexing yields only the transformed data; the labels are kept alongside
    in the ``labels`` attribute and are not returned by ``__getitem__``.
    """

    ppi_transform: Callable  # transform applied to the whole data array in __init__
    labels: np.ndarray       # labels, one per sample
    data: torch.Tensor       # transformed data stored as float32

    def __init__(self, data: np.ndarray, labels: np.ndarray, ppi_transform: Callable) -> None:
        """Apply ``ppi_transform`` to ``data`` and store the result as a float32 tensor.

        Raises:
            AssertionError: If ``data`` and ``labels`` differ in length.
        """
        assert len(data) == len(labels)
        self.ppi_transform = ppi_transform
        self.labels = labels
        transformed = self.ppi_transform(data)
        self.data = torch.from_numpy(transformed.astype("float32"))

    def __getitem__(self, index) -> torch.Tensor:
        """Return the transformed sample(s) selected by ``index``."""
        sample = self.data[index]
        return sample

    def __len__(self) -> int:
        """Return the number of samples, taken from the labels."""
        num_samples = len(self.labels)
        return num_samples

def compute_embeddings_from_loaded_dataset(model: nn.Module, dataloader: DataLoader, device, silent: bool = False) -> tuple[np.ndarray, np.ndarray]:
    """Run ``model`` over every batch of ``dataloader`` and collect the embeddings.

    The model is switched to eval mode (a side effect that persists after the
    call) and gradients are disabled during the forward passes.

    Args:
        model: Embedding model; expected to already live on ``device``.
        dataloader: Loader whose dataset must be a ``DatasetWithTransform``.
        device: Device the input batches are moved to before the forward pass.
        silent: Disable the progress bar.

    Returns:
        Tuple ``(embeddings, labels)``: the concatenated model outputs as a
        numpy array and the ``labels`` attribute of the underlying dataset.

    Raises:
        AssertionError: If the dataloader does not wrap a ``DatasetWithTransform``.
    """
    assert isinstance(dataloader.dataset, DatasetWithTransform)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch_ppi in tqdm(dataloader, total=len(dataloader), disable=silent):
            batch_embeddings = model(batch_ppi.to(device))
            # Move each batch to host memory immediately. Accumulating all
            # outputs on the GPU and concatenating them there makes peak GPU
            # memory grow with the dataset size, which is wasteful when the
            # result is converted to numpy anyway.
            embeddings.append(batch_embeddings.cpu())
    embeddings = torch.cat(embeddings).numpy()
    return embeddings, dataloader.dataset.labels

def prepare_dataloader(data, labels, ppi_transform, batch_size=2048):
    """Wrap data and labels in a ``DatasetWithTransform`` and return a ``DataLoader`` over it.

    The loader keeps the last incomplete batch (``drop_last=False``).
    """
    return DataLoader(
        DatasetWithTransform(data=data, labels=labels, ppi_transform=ppi_transform),
        batch_size=batch_size,
        drop_last=False,
    )

def find_ranks_faiss(vecs, qvecs, device: torch.device, metric: str = "cosine", N: int = 5, batch_size: Optional[int] = None, silent: bool = False) -> tuple[np.ndarray, np.ndarray]:
    """Find the ``N`` nearest database vectors for every query vector with a flat faiss index.

    Args:
        vecs: Database vectors added to the index.
        qvecs: Query vectors.
        device: Torch device; a CUDA device moves the index to that GPU when
            the installed faiss build has GPU support.
        metric: "cosine" (inner-product index) or "L1".
        N: Number of neighbours returned per query.
        batch_size: When given, queries are searched in chunks of at most this
            many vectors; otherwise all queries are searched in one call.
        silent: Disable the progress bar of the batched search.

    Returns:
        Tuple ``(scores, ranks)`` as returned by ``index.search``.

    Raises:
        ValueError: If ``metric`` is not one of the supported values.
    """
    # Validate first so an unsupported metric fails with a clear message
    # instead of an unbound ``index`` variable further down.
    if metric not in ("cosine", "L1"):
        raise ValueError(f"Unsupported metric '{metric}', expected 'cosine' or 'L1'")

    dim = vecs.shape[-1]
    if metric == "cosine":
        # NOTE(review): inner product equals cosine similarity only for
        # L2-normalized vectors - confirm that callers pass normalized embeddings.
        index = faiss.IndexFlatIP(dim)
    else:
        index = faiss.IndexFlat(dim, faiss.METRIC_L1)
    if device.type == "cuda" and hasattr(faiss, "StandardGpuResources"):
        # Release cached torch allocations before faiss allocates on the GPU
        torch.cuda.empty_cache()
        gpu_res = faiss.StandardGpuResources()
        # Respect the requested GPU; a bare "cuda" device keeps using GPU 0
        gpu_id = device.index if device.index is not None else 0
        index = faiss.index_cpu_to_gpu(gpu_res, gpu_id, index)

    index.add(vecs) # type: ignore
    if batch_size is None:
        scores, ranks = index.search(qvecs, N) # type: ignore
    else:
        # Ceil division: no superfluous (possibly empty) batch when the number
        # of queries is an exact multiple of batch_size; at least one batch.
        num_batches = max(1, -(-len(qvecs) // batch_size))
        scores_list = []
        ranks_list = []
        for batch in tqdm(np.array_split(qvecs, num_batches), total=num_batches, disable=silent):
            batch_scores, batch_ranks = index.search(batch, N) # type: ignore
            scores_list.append(batch_scores)
            ranks_list.append(batch_ranks)
        scores = np.concatenate(scores_list, axis=0)
        ranks = np.concatenate(ranks_list, axis=0)
    return scores, ranks

In [6]:

def prepare_input_space_embeddings(data, num_packets: int = 30, ipt_max_clip: int = 1000, ipt_scale: float = 1.0, dir_scale: float = 1.0):
    """Build baseline "embeddings" directly from the PPI input features.

    Args:
        data: PPI array indexed as (sample, feature, packet), with the feature
            rows located at PPI_IPT_POS, PPI_DIR_POS and PPI_SIZE_POS.
        num_packets: Number of leading packets kept per sample.
        ipt_max_clip: Upper clip value for inter-packet times.
        ipt_scale: Multiplier applied to the clipped inter-packet times.
        dir_scale: Multiplier applied to the packet directions.

    Returns:
        Array with one row per sample, made of the scaled directions, the
        clipped sizes and the scaled times concatenated in this order.
    """
    first_packets = data[:, :, :num_packets]
    scaled_dirs = first_packets[:, PPI_DIR_POS] * dir_scale
    # Sizes are limited to the range [0, 1500]
    clipped_sizes = first_packets[:, PPI_SIZE_POS].clip(min=0, max=1500)
    clipped_times = first_packets[:, PPI_IPT_POS].clip(min=0, max=ipt_max_clip)
    scaled_times = clipped_times * ipt_scale
    return np.hstack((scaled_dirs, clipped_sizes, scaled_times))

def replace_unseen_packet_embeddings(embedding_model, replace_threshold: int = 1, small_packets_replace_with: int = 0, silent: bool = False) -> None:
    """Overwrite packet-size embeddings of rarely seen sizes with embeddings of seen sizes.

    The histogram ``backbone_model.psizes_hist`` is read as "number of training
    observations per packet size"; sizes with fewer than ``replace_threshold``
    observations are treated as unseen. The embedding weights are modified in place:

    * unseen sizes below 100 get the embedding of ``small_packets_replace_with``;
    * unseen sizes of 1250 and above get the embedding of the closest seen size
      in the range 1250-1500;
    * unseen sizes in between are left untouched.

    Args:
        embedding_model: Model exposing ``backbone_model`` with a
            ``packet_size_nn_embedding`` layer and, optionally, ``psizes_hist``.
        replace_threshold: Minimum observation count for a size to count as seen.
        small_packets_replace_with: Packet size whose embedding replaces unseen small sizes.
        silent: Suppress the per-size replacement messages.
    """
    backbone_model = embedding_model.backbone_model
    if not hasattr(backbone_model, "psizes_hist"):
        print("Histogram of training packet sizes is not available")
        return
    counts = pd.DataFrame(backbone_model.psizes_hist, columns=["Count"])["Count"]
    packets_to_replace = counts[counts < replace_threshold].index
    if len(packets_to_replace) == 0:
        print(f"All packet sizes were seen at least {replace_threshold} times")
        return
    weights = backbone_model.packet_size_nn_embedding.weight.data

    # Small <100 unseen packets are replaced with the embedding of 'small_packets_replace_with'
    for i in packets_to_replace[packets_to_replace < 100]: # type: ignore
        weights[i] = weights[small_packets_replace_with]
        if not silent: print(f"Setting the packet size embedding of {i} ({counts.iloc[i]} obs) to {small_packets_replace_with} ({counts.iloc[small_packets_replace_with]} obs)")

    # Big >=1250 unseen packets are replaced with their closest seen packet
    big_packets_to_replace = packets_to_replace[packets_to_replace >= 1250]
    if len(big_packets_to_replace) == 0:
        return
    unseen_sizes = set(packets_to_replace.tolist())
    # Candidates are capped to sizes that actually exist in the histogram
    seen_big_packets = [i for i in range(1250, min(1501, len(counts))) if i not in unseen_sizes]
    if not seen_big_packets:
        # Without any seen big packet there is nothing to copy from; leave the
        # embeddings untouched instead of failing on min() of an empty list.
        print(f"No packet size >= 1250 was seen at least {replace_threshold} times; big packet embeddings are left unchanged")
        return
    for i in big_packets_to_replace: # type: ignore
        replace_with = min(seen_big_packets, key=lambda x: abs(x - i)) # type: ignore
        if not silent: print(f"Setting the packet size embedding of {i} ({counts.iloc[i]} obs) to {replace_with} ({counts.iloc[replace_with]} obs)")
        weights[i] = weights[replace_with]

In [7]:

from cesnet_models.models import (Model_30pktTCNET_256_Weights,
                                  model_30pktTCNET_256)
from torchinfo import summary

# Packet size whose embedding replaces the embeddings of unseen small packets
SMALL_PACKETS_REPLACE_WITH = 0
# Packet sizes observed fewer than this many times are treated as unseen
REPLACE_THRESHOLD = 1
# Fall back to the CPU when CUDA is unavailable (e.g. with the faiss-cpu setup)
SUMMARY_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# Load the pre-trained embedding model together with its matching PPI transform
pretrained_weights = Model_30pktTCNET_256_Weights.DEFAULT
ppi_transform = pretrained_weights.transforms["ppi_transform"]
embedding_model = model_30pktTCNET_256(weights=pretrained_weights)
replace_unseen_packet_embeddings(embedding_model, replace_threshold=REPLACE_THRESHOLD, small_packets_replace_with=SMALL_PACKETS_REPLACE_WITH, silent=True)
# Last expression of the cell: the layer summary is rendered as the cell output
summary(embedding_model.to(SUMMARY_DEVICE), input_size=(2048, 3, 30), depth=5)

Out[7]:

===============================================================================================
Layer (type:depth-idx)                        Output Shape              Param #
===============================================================================================
EmbeddingModel                                [2048, 256]               --
├─Multimodal_CESNET_Enhanced: 1-1             --                        --
│    └─Embedding: 2-1                         [2048, 30, 20]            30,020
│    └─Embedding: 2-2                         [2048, 30, 10]            2,000
│    └─Identity: 2-3                          [2048, 32, 30]            --
│    └─Sequential: 2-4                        [2048, 448, 30]           --
│    │    └─Bottleneck: 3-1                   [2048, 192, 30]           --
│    │    │    └─Sequential: 4-1              [2048, 192, 30]           --
│    │    │    │    └─Identity: 5-1           [2048, 32, 30]            --
│    │    │    │    └─PadConv1d: 5-2          [2048, 192, 30]           6,144
│    │    │    │    └─BatchNorm1d: 5-3        [2048, 192, 30]           384
│    │    │    └─PadConv1d: 4-2               [2048, 48, 30]            1,536
│    │    │    └─BatchNorm1d: 4-3             [2048, 48, 30]            96
│    │    │    └─ReLU: 4-4                    [2048, 48, 30]            --
│    │    │    └─PadConv1d: 4-5               [2048, 48, 30]            16,128
│    │    │    └─BatchNorm1d: 4-6             [2048, 48, 30]            96
│    │    │    └─ReLU: 4-7                    [2048, 48, 30]            --
│    │    │    └─PadConv1d: 4-8               [2048, 192, 30]           9,216
│    │    │    └─BatchNorm1d: 4-9             [2048, 192, 30]           384
│    │    │    └─Identity: 4-10               [2048, 192, 30]           --
│    │    │    └─ReLU: 4-11                   [2048, 192, 30]           --
│    │    └─Bottleneck: 3-2                   [2048, 256, 30]           --
│    │    │    └─Sequential: 4-12             [2048, 256, 30]           --
│    │    │    │    └─Identity: 5-4           [2048, 192, 30]           --
│    │    │    │    └─PadConv1d: 5-5          [2048, 256, 30]           49,152
│    │    │    │    └─BatchNorm1d: 5-6        [2048, 256, 30]           512
│    │    │    └─PadConv1d: 4-13              [2048, 64, 30]            12,288
│    │    │    └─BatchNorm1d: 4-14            [2048, 64, 30]            128
│    │    │    └─ReLU: 4-15                   [2048, 64, 30]            --
│    │    │    └─PadConv1d: 4-16              [2048, 64, 30]            28,672
│    │    │    └─BatchNorm1d: 4-17            [2048, 64, 30]            128
│    │    │    └─ReLU: 4-18                   [2048, 64, 30]            --
│    │    │    └─PadConv1d: 4-19              [2048, 256, 30]           16,384
│    │    │    └─BatchNorm1d: 4-20            [2048, 256, 30]           512
│    │    │    └─Dropout: 4-21                [2048, 256, 30]           --
│    │    │    └─ReLU: 4-22                   [2048, 256, 30]           --
│    │    └─Bottleneck: 3-3                   [2048, 384, 30]           --
│    │    │    └─Sequential: 4-23             [2048, 384, 30]           --
│    │    │    │    └─Identity: 5-7           [2048, 256, 30]           --
│    │    │    │    └─PadConv1d: 5-8          [2048, 384, 30]           98,304
│    │    │    │    └─BatchNorm1d: 5-9        [2048, 384, 30]           768
│    │    │    └─PadConv1d: 4-24              [2048, 96, 30]            24,576
│    │    │    └─BatchNorm1d: 4-25            [2048, 96, 30]            192
│    │    │    └─ReLU: 4-26                   [2048, 96, 30]            --
│    │    │    └─PadConv1d: 4-27              [2048, 96, 30]            46,080
│    │    │    └─BatchNorm1d: 4-28            [2048, 96, 30]            192
│    │    │    └─ReLU: 4-29                   [2048, 96, 30]            --
│    │    │    └─PadConv1d: 4-30              [2048, 384, 30]           36,864
│    │    │    └─BatchNorm1d: 4-31            [2048, 384, 30]           768
│    │    │    └─Dropout: 4-32                [2048, 384, 30]           --
│    │    │    └─ReLU: 4-33                   [2048, 384, 30]           --
│    │    └─Bottleneck: 3-4                   [2048, 448, 30]           --
│    │    │    └─Sequential: 4-34             [2048, 448, 30]           --
│    │    │    │    └─Identity: 5-10          [2048, 384, 30]           --
│    │    │    │    └─PadConv1d: 5-11         [2048, 448, 30]           172,032
│    │    │    │    └─BatchNorm1d: 5-12       [2048, 448, 30]           896
│    │    │    └─PadConv1d: 4-35              [2048, 112, 30]           43,008
│    │    │    └─BatchNorm1d: 4-36            [2048, 112, 30]           224
│    │    │    └─ReLU: 4-37                   [2048, 112, 30]           --
│    │    │    └─PadConv1d: 4-38              [2048, 112, 30]           37,632
│    │    │    └─BatchNorm1d: 4-39            [2048, 112, 30]           224
│    │    │    └─ReLU: 4-40                   [2048, 112, 30]           --
│    │    │    └─PadConv1d: 4-41              [2048, 448, 30]           50,176
│    │    │    └─BatchNorm1d: 4-42            [2048, 448, 30]           896
│    │    │    └─Dropout: 4-43                [2048, 448, 30]           --
│    │    │    └─ReLU: 4-44                   [2048, 448, 30]           --
│    └─Sequential: 2-5                        [2048, 448]               --
│    │    └─AdaptiveGeM: 3-5                  [2048, 448, 1]            1
│    │    └─Flatten: 3-6                      [2048, 448]               --
│    │    └─Identity: 3-7                     [2048, 448]               --
│    │    └─Identity: 3-8                     [2048, 448]               --
│    └─Sequential: 2-6                        [2048, 448]               --
│    │    └─Linear: 3-9                       [2048, 448]               201,152
│    │    └─BatchNorm1d: 3-10                 [2048, 448]               896
│    │    └─Identity: 3-11                    [2048, 448]               --
│    │    └─ReLU: 3-12                        [2048, 448]               --
├─Linear: 1-2                                 [2048, 256]               114,944
├─BatchNorm1d: 1-3                            [2048, 256]               512
===============================================================================================
Total params: 1,004,117
Trainable params: 1,004,117
Non-trainable params: 0
Total mult-adds (G): 40.55
===============================================================================================
Input size (MB): 0.74
Forward/backward pass size (MB): 3190.88
Params size (MB): 4.02
Estimated Total Size (MB): 3195.64
===============================================================================================

In [9]:

import collections
import time
from collections import defaultdict

import pandas as pd
from sklearn.metrics import recall_score

# --- Configuration -----------------------------------------------------------
EVALUATE_EMBEDDING_MODEL = True
RANKING_N = 5  # forwarded as N to find_ranks_faiss
SPLITS = [0, 1, 2, 3, 4]
MAJVOTE_K = 3  # number of leading ranking columns that take part in the majority vote

EVALUATE_INPUT_SPACE_BASELINE = True
BASELINE_PACKETS = 10
BASELINE_IPT_MAX_CLIP = 1000 # ms
BASELINE_IPT_SCALE = 0.1
BASELINE_DIR_SCALE = 1

# The vote slices ranks[:, :MAJVOTE_K]; numpy slicing would silently return fewer
# columns if MAJVOTE_K exceeded the ranking width (assumed to be RANKING_N columns —
# confirm against find_ranks_faiss), so fail loudly on an inconsistent configuration.
assert 1 <= MAJVOTE_K <= RANKING_N, "MAJVOTE_K must be between 1 and RANKING_N"

datasets = {
    "UCDAVIS19-script": tcb.DATASETS.UCDAVISICDM19,
    "UCDAVIS19-human": tcb.DATASETS.UCDAVISICDM19,
    "UTMOBILENET21": tcb.DATASETS.UTMOBILENET21,
    "MIRAGE19": tcb.DATASETS.MIRAGE19,
    "MIRAGE22": tcb.DATASETS.MIRAGE22,
    # "CESNET-TLS22": "CESNET-TLS22",
}

# Which UCDAVIS19 test partition each dataset entry evaluates on (None for every other dataset).
UCDAVIS_TEST_SETS = {"UCDAVIS19-human": "human", "UCDAVIS19-script": "script"}

METRIC_NAMES = ("top1-acc", "top1-recall", "maj-acc", "maj-recall")


def _new_metric_lists() -> dict:
    """Return an empty accumulator holding one per-split list for every metric name."""
    return {name: [] for name in METRIC_NAMES}


def _ranking_metrics(train_labels, test_labels, ranks, majvote_k: int) -> dict:
    """Compute top-1 and majority-vote accuracy/recall from a nearest-neighbour ranking.

    Args:
        train_labels: Array of training labels, indexed by the entries of ``ranks``.
        test_labels: Array of ground-truth labels, one per row of ``ranks``.
        ranks: 2-D integer array; row ``i`` holds indices into ``train_labels`` for
            test sample ``i`` (column 0 is used as the top-1 prediction).
        majvote_k: Number of leading columns of ``ranks`` that vote.

    Returns:
        Dict with the keys listed in ``METRIC_NAMES``.
    """
    closest_1 = train_labels[ranks[:, 0]]
    # Counter.most_common orders equal counts by first occurrence, so a tied vote
    # resolves to the label that appears earliest in the ranking row.
    maj_vote = [collections.Counter(row).most_common(1)[0][0] for row in train_labels[ranks[:, :majvote_k]]]
    return {
        "top1-acc": (test_labels == closest_1).mean(),
        "top1-recall": recall_score(test_labels, closest_1, average="macro", zero_division=0),
        "maj-acc": (test_labels == maj_vote).mean(),
        "maj-recall": recall_score(test_labels, maj_vote, average="macro", zero_division=0),
    }


# --- Evaluation --------------------------------------------------------------
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
embedding_model = embedding_model.to(device)
per_dataset_model_metrics: dict = defaultdict(_new_metric_lists)
per_dataset_baseline_metrics: dict = defaultdict(_new_metric_lists)
for dataset_name, dataset_enum in datasets.items():
    print(f"\nProcessing dataset {dataset_name} with splits {SPLITS} (printing output for the first split)")
    for split_id in SPLITS:
        verbose = split_id == 0  # only split 0 prints progress and results
        faiss_batch_size = 10_000 if dataset_name == "CESNET-TLS22" else None

        if dataset_name == "CESNET-TLS22":
            train_data, test_data, train_labels, test_labels = load_cesnet_tls22_from_datazoo(split_id=split_id)
        else:
            # UCDAVIS19 currently uses the entire pretraining partition as the training set
            ucdavis_test_set = UCDAVIS_TEST_SETS.get(dataset_name)
            train_data, test_data, train_labels, test_labels = get_data_from_tcbench(dataset_enum, split_id=split_id, ucdavis_test_set=ucdavis_test_set)

        if EVALUATE_EMBEDDING_MODEL:
            # Dataloaders and model embeddings are only consumed by this branch, so they
            # are built here and skipped entirely when the embedding evaluation is off.
            train_dataloader = prepare_dataloader(data=train_data, labels=train_labels, ppi_transform=ppi_transform)
            test_dataloader = prepare_dataloader(data=test_data, labels=test_labels, ppi_transform=ppi_transform)
            if verbose: print("Creating embeddings for train and test sets")
            train_embeddings, _ = compute_embeddings_from_loaded_dataset(embedding_model, dataloader=train_dataloader, device=device, silent=not verbose)
            test_embeddings, _ = compute_embeddings_from_loaded_dataset(embedding_model, dataloader=test_dataloader, device=device, silent=not verbose)

            if verbose: print("Computing ranking with faiss")
            start_time = time.time()
            distances, ranks = find_ranks_faiss(vecs=train_embeddings,
                                                qvecs=test_embeddings,
                                                N=RANKING_N,
                                                batch_size=faiss_batch_size,
                                                silent=not verbose,
                                                device=device,)
            if verbose: print(f"Time elapsed for faiss ranking: {time.time() - start_time:.2f} s")
            # Compute metrics based on the ranking
            model_metrics = _ranking_metrics(train_labels, test_labels, ranks, MAJVOTE_K)
            for metric_name, metric_value in model_metrics.items():
                per_dataset_model_metrics[dataset_name][metric_name].append(metric_value)
            if verbose: print(f"Embedding model top1-acc {model_metrics['top1-acc'] * 100:.2f}, maj-acc {model_metrics['maj-acc'] * 100:.2f}")

        if EVALUATE_INPUT_SPACE_BASELINE:
            baseline_train_embeddings = prepare_input_space_embeddings(train_data,
                                                                       num_packets=BASELINE_PACKETS,
                                                                       ipt_max_clip=BASELINE_IPT_MAX_CLIP,
                                                                       ipt_scale=BASELINE_IPT_SCALE,
                                                                       dir_scale=BASELINE_DIR_SCALE,)
            baseline_test_embeddings = prepare_input_space_embeddings(test_data,
                                                                      num_packets=BASELINE_PACKETS,
                                                                      ipt_max_clip=BASELINE_IPT_MAX_CLIP,
                                                                      ipt_scale=BASELINE_IPT_SCALE,
                                                                      dir_scale=BASELINE_DIR_SCALE,)
            if verbose: print("Computing input space ranking with faiss")
            # Start the timer only now so the reported time covers the faiss ranking alone,
            # matching both the printed message and the embedding-model branch.
            start_time = time.time()
            baseline_distances, baseline_ranks = find_ranks_faiss(baseline_train_embeddings,
                                                                  baseline_test_embeddings,
                                                                  N=RANKING_N,
                                                                  metric="L1",
                                                                  batch_size=faiss_batch_size,
                                                                  silent=not verbose,
                                                                  device=device,)
            if verbose: print(f"Time elapsed for faiss ranking: {time.time() - start_time:.2f} s")
            # Compute metrics based on the ranking
            baseline_metrics = _ranking_metrics(train_labels, test_labels, baseline_ranks, MAJVOTE_K)
            for metric_name, metric_value in baseline_metrics.items():
                per_dataset_baseline_metrics[dataset_name][metric_name].append(metric_value)
            if verbose: print(f"Input space baseline top1-acc {baseline_metrics['top1-acc'] * 100:.2f}, maj-acc {baseline_metrics['maj-acc'] * 100:.2f}")
    # Average metrics across splits (replaces the per-split lists with a single mean per metric)
    per_dataset_model_metrics[dataset_name] = {metric: np.mean(per_split_values) for metric, per_split_values in per_dataset_model_metrics[dataset_name].items()}
    per_dataset_baseline_metrics[dataset_name] = {metric: np.mean(per_split_values) for metric, per_split_values in per_dataset_baseline_metrics[dataset_name].items()}

Processing dataset UCDAVIS19-script with splits [0, 1, 2, 3, 4] (printing output for the first split)
Creating embeddings for train and test sets

100%|██████████| 4/4 [00:00<00:00, 12.20it/s]
100%|██████████| 1/1 [00:00<00:00, 181.10it/s]

Computing ranking with faiss
Time elapsed for faiss ranking: 0.12 s
Embedding model top1-acc 100.00, maj-acc 99.33
Computing input space ranking with faiss
Time elapsed for faiss ranking: 0.12 s
Input space baseline top1-acc 98.00, maj-acc 97.33

Processing dataset UCDAVIS19-human with splits [0, 1, 2, 3, 4] (printing output for the first split)
Creating embeddings for train and test sets

100%|██████████| 4/4 [00:00<00:00, 10.19it/s]
100%|██████████| 1/1 [00:00<00:00, 226.87it/s]

Computing ranking with faiss
Time elapsed for faiss ranking: 0.11 s
Embedding model top1-acc 81.93, maj-acc 81.93
Computing input space ranking with faiss
Time elapsed for faiss ranking: 0.11 s
Input space baseline top1-acc 71.08, maj-acc 71.08

Processing dataset UTMOBILENET21 with splits [0, 1, 2, 3, 4] (printing output for the first split)
Creating embeddings for train and test sets

100%|██████████| 4/4 [00:00<00:00, 35.47it/s]
100%|██████████| 1/1 [00:00<00:00, 105.93it/s]

Computing ranking with faiss

Time elapsed for faiss ranking: 0.11 s
Embedding model top1-acc 87.00, maj-acc 87.95
Computing input space ranking with faiss
Time elapsed for faiss ranking: 0.12 s
Input space baseline top1-acc 84.25, maj-acc 84.99

Processing dataset MIRAGE19 with splits [0, 1, 2, 3, 4] (printing output for the first split)
Creating embeddings for train and test sets

100%|██████████| 26/26 [00:01<00:00, 22.29it/s]
100%|██████████| 4/4 [00:00<00:00, 36.94it/s]

Computing ranking with faiss
Time elapsed for faiss ranking: 0.15 s
Embedding model top1-acc 84.04, maj-acc 83.69
Computing input space ranking with faiss
Time elapsed for faiss ranking: 0.18 s
Input space baseline top1-acc 80.32, maj-acc 79.92

Processing dataset MIRAGE22 with splits [0, 1, 2, 3, 4] (printing output for the first split)
Creating embeddings for train and test sets

100%|██████████| 11/11 [00:00<00:00, 19.28it/s]
100%|██████████| 2/2 [00:00<00:00, 34.18it/s]

Computing ranking with faiss
Time elapsed for faiss ranking: 0.13 s
Embedding model top1-acc 97.87, maj-acc 97.39
Computing input space ranking with faiss
Time elapsed for faiss ranking: 0.12 s
Input space baseline top1-acc 95.48, maj-acc 94.96

In [10]:

from IPython.display import display

def prepare_metrics_df(per_dataset_model_metrics, per_dataset_baseline_metrics, metric: str, column_name: str) -> pd.DataFrame:
    """Build a per-dataset comparison table for one metric, expressed in percent.

    Args:
        per_dataset_model_metrics: Mapping ``dataset name -> {metric name -> value}``
            for the embedding model; values are fractions and are multiplied by 100.
        per_dataset_baseline_metrics: Mapping with the same layout for the input-space
            baseline; it is indexed with every dataset name of the model mapping.
        metric: Key of the metric to extract from both mappings.
        column_name: Header of the column holding the embedding-model values.

    Returns:
        DataFrame indexed by dataset name with the columns ``"Input Space"``,
        ``"Input Space Delta"`` and ``column_name`` (in that order), all rounded
        to two decimals.
    """
    df = pd.DataFrame.from_dict({d: m[metric] for d, m in per_dataset_model_metrics.items()}, orient="index", columns=[column_name])
    df[column_name] = (df[column_name] * 100).round(2)
    df.insert(0, "Input Space", df.index.map(lambda d: round(per_dataset_baseline_metrics[d][metric] * 100, 2)))
    # The delta is taken from the rounded columns so it agrees with the displayed values;
    # it is rounded again because subtracting two floats leaves artifacts such as
    # 10.850000000000009 in the stored value.
    df.insert(1, "Input Space Delta", (df[column_name] - df["Input Space"]).round(2))
    return df

# One (metric key, column header) pair per summary table, in construction order.
_table_specs = (
    ("top1-acc", "Top-1 Accuracy"),
    ("top1-recall", "Top-1 Recall"),
    ("maj-acc", f"Maj-{MAJVOTE_K} Accuracy"),
    ("maj-recall", f"Maj-{MAJVOTE_K} Recall"),
)
df_top1_acc, df_top1_recall, df_maj_acc, df_maj_recall = (
    prepare_metrics_df(per_dataset_model_metrics, per_dataset_baseline_metrics, metric_key, header)
    for metric_key, header in _table_specs
)

# Show both accuracy tables first, followed by both recall tables.
display(df_top1_acc, df_maj_acc, df_top1_recall, df_maj_recall)

	Input Space	Input Space Delta	Top-1 Accuracy
UCDAVIS19-script	98.00	2.00	100.00
UCDAVIS19-human	71.08	10.85	81.93
UTMOBILENET21	83.55	3.13	86.68
MIRAGE19	80.01	3.72	83.73
MIRAGE22	95.63	2.14	97.77

	Input Space	Input Space Delta	Maj-3 Accuracy
UCDAVIS19-script	97.33	2.00	99.33
UCDAVIS19-human	71.08	10.85	81.93
UTMOBILENET21	84.04	2.94	86.98
MIRAGE19	79.20	3.95	83.15
MIRAGE22	95.21	2.24	97.45

	Input Space	Input Space Delta	Top-1 Recall
UCDAVIS19-script	98.00	2.00	100.00
UCDAVIS19-human	70.00	11.11	81.11
UTMOBILENET21	72.75	3.62	76.37
MIRAGE19	75.80	4.32	80.12
MIRAGE22	95.70	2.11	97.81

	Input Space	Input Space Delta	Maj-3 Recall
UCDAVIS19-script	97.33	2.00	99.33
UCDAVIS19-human	70.33	10.56	80.89
UTMOBILENET21	72.90	3.43	76.33
MIRAGE19	74.96	4.37	79.33
MIRAGE22	95.28	2.22	97.50