import sys
!{sys.executable} -m pip install "torch>=1.10" --index-url https://download.pytorch.org/whl/cu118
!{sys.executable} -m pip install cesnet-datazoo cesnet-models tqdm
Prepare data transformations for the model.
from cesnet_models.transforms import ClipAndScaleFlowstats, ClipAndScalePPI, NormalizeHistograms, ScalerEnum
# Standard-scale the per-packet information (PPI) sequences: packet sizes and
# inter-packet times each get their own standard scaler.
ppi_transform = ClipAndScalePPI(psizes_scaler_enum=ScalerEnum.STANDARD,
ipt_scaler_enum=ScalerEnum.STANDARD,)
# Robust-scale the flow statistics and clip outliers at the 0.99 quantile.
flowstats_transform = ClipAndScaleFlowstats(flowstats_scaler_enum=ScalerEnum.ROBUST, quantile_clip=0.99)
# Normalize the packet-histogram features.
packet_histograms_transform = NormalizeHistograms()
Initialize the dataset class and prepare its configuration.
import logging
from cesnet_datazoo.config import AppSelection, DatasetConfig, ValidationApproach
from cesnet_datazoo.datasets import CESNET_QUIC22
# Show dataset-initialization progress (index processing, scaler fitting) in the output.
logging.basicConfig(
level=logging.INFO,
format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s")
# Use the extra-small variant of the CESNET-QUIC22 dataset.
DATASET_SIZE = "XS"
dataset = CESNET_QUIC22(data_root="data/CESNET-QUIC22", size=DATASET_SIZE)
dataset_config = DatasetConfig(
dataset=dataset,
# Train on week 46 of 2022 and test on the following week.
train_period_name="W-2022-46",
test_period_name="W-2022-47",
# train_size=500_000, # Uncomment to limit the number of training samples to speed up this example
# Carve the validation set out of the training period (20% of the train samples).
val_approach=ValidationApproach.SPLIT_FROM_TRAIN,
train_val_split_fraction=0.2,
# Keep all known applications as classification targets.
apps_selection=AppSelection.ALL_KNOWN,
# Return PyTorch tensors and include packet-histogram features,
# with the transforms prepared above applied to each modality.
return_tensors=True,
use_packet_histograms=True,
ppi_transform=ppi_transform,
flowstats_transform=flowstats_transform,
flowstats_phist_transform=packet_histograms_transform,)
dataset.set_dataset_config_and_initialize(dataset_config)
[2024-04-08 17:40:19,224][cesnet_datazoo.pytables_data.indices_setup][INFO] - Processing train indices [2024-04-08 17:40:19,774][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221114 took 0.51 seconds [2024-04-08 17:40:20,281][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221115 took 0.51 seconds [2024-04-08 17:40:20,696][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221116 took 0.42 seconds [2024-04-08 17:40:20,870][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221117 took 0.17 seconds [2024-04-08 17:40:21,101][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221118 took 0.23 seconds [2024-04-08 17:40:21,236][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221119 took 0.13 seconds [2024-04-08 17:40:21,413][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221120 took 0.18 seconds [2024-04-08 17:40:21,431][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Found applications with less than 100 train samples: ['livescore']. Disabling these applications [2024-04-08 17:40:21,442][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Selected 101 known applications and 0 unknown applications [2024-04-08 17:40:23,261][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Processing indices took 1.85 seconds [2024-04-08 17:40:27,834][cesnet_datazoo.pytables_data.data_scalers][INFO] - Reading data and fitting scalers took 3.68 seconds
Show dataset classes in the current configuration, together with train, validation, and test counts.
# Per-application sample counts, largest training classes first.
dataset.known_app_counts.sort_values(by="Train", ascending=False)
| | Train | Validation | Test |
|---|---|---|---|
google-www | 121836 | 30459 | 205010 |
google-ads | 116419 | 29105 | 195979 |
google-services | 109998 | 27499 | 177295 |
google-play | 97905 | 24476 | 161546 |
google-gstatic | 92789 | 23197 | 150633 |
... | ... | ... | ... |
toggl | 150 | 37 | 247 |
ebay-kleinanzeigen | 150 | 38 | 176 |
alza-identity | 130 | 32 | 215 |
bitdefender-nimbus | 118 | 29 | 204 |
uber | 87 | 22 | 118 |
101 rows × 3 columns
Reuse the neural network architecture from the cesnet-models
package without pre-trained weights, i.e., start the training from scratch.
from cesnet_models.models import mm_cesnet_v2
# Build the multimodal CESNET v2 architecture; weights=None starts from random
# initialization (no pre-trained weights). Input sizes are derived from the
# dataset configuration so they match what the dataloaders produce.
model = mm_cesnet_v2(weights=None, num_classes=dataset.get_num_classes(), ppi_input_channels=len(dataset_config.get_ppi_channels()), flowstats_input_size=dataset_config.get_flowstats_features_len())
print(model)
Multimodal_CESNET( (cnn_ppi): Sequential( (0): Conv1d(3, 200, kernel_size=(7,), stride=(1,), padding=(3,)) (1): ReLU() (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (3): Sequential( (0): Conv1d(200, 200, kernel_size=(5,), stride=(1,), padding=(2,)) (1): ReLU() (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (4): Sequential( (0): Conv1d(200, 200, kernel_size=(5,), stride=(1,), padding=(2,)) (1): ReLU() (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (5): Sequential( (0): Conv1d(200, 200, kernel_size=(5,), stride=(1,), padding=(2,)) (1): ReLU() (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (6): Conv1d(200, 300, kernel_size=(5,), stride=(1,)) (7): ReLU() (8): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (9): Conv1d(300, 300, kernel_size=(5,), stride=(1,)) (10): ReLU() (11): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (12): Conv1d(300, 300, kernel_size=(4,), stride=(2,)) (13): ReLU() ) (cnn_global_pooling): Sequential( (0): GeM(kernel_size=10, p=3.0000, eps=1e-06) (1): Flatten(start_dim=1, end_dim=-1) (2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (3): Dropout(p=0.1, inplace=False) ) (mlp_flowstats): Sequential( (0): Linear(in_features=43, out_features=225, bias=True) (1): ReLU() (2): BatchNorm1d(225, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (3): Sequential( (0): Linear(in_features=225, out_features=225, bias=True) (1): ReLU() (2): BatchNorm1d(225, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (4): Sequential( (0): Linear(in_features=225, out_features=225, bias=True) (1): ReLU() (2): BatchNorm1d(225, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (5): Linear(in_features=225, out_features=225, bias=True) (6): ReLU() (7): BatchNorm1d(225, 
eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): Dropout(p=0.1, inplace=False) ) (mlp_shared): Sequential( (0): Linear(in_features=525, out_features=600, bias=True) (1): ReLU() (2): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (3): Dropout(p=0.2, inplace=False) ) (classifier): Linear(in_features=600, out_features=101, bias=True) )
Train the model with a standard training loop using the cross-entropy loss, the AdamW optimizer, and the OneCycleLR learning scheduler.
The number of epochs is set to five, and the model is validated after each epoch.
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
def train_one_epoch(model: nn.Module, train_dataloader: DataLoader, optimizer: optim.Optimizer, scheduler, loss_fn, device) -> None:
    """Run a single optimization pass over the training data.

    For each batch: move tensors to ``device``, forward the (PPI, flowstats)
    pair through ``model``, backpropagate ``loss_fn``, and advance both the
    optimizer and the scheduler (a per-batch scheduler step, as OneCycleLR
    expects). Batches are 4-tuples; the first element is ignored here.
    """
    model.train()
    for batch in train_dataloader:
        _, ppi, flowstats, labels = batch
        ppi = ppi.to(device)
        flowstats = flowstats.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model((ppi, flowstats))
        loss_fn(logits, labels).backward()
        optimizer.step()
        scheduler.step()
def test(model: nn.Module, dataloader: DataLoader, device, progress: bool = False) -> float:
    """Return classification accuracy of ``model`` over ``dataloader``.

    Runs in eval mode with gradients disabled, collects per-batch argmax
    predictions, and compares them against the true labels on the CPU.
    Set ``progress=True`` to show a tqdm progress bar.
    """
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        iterator = tqdm(dataloader, total=len(dataloader), disable=not progress)
        for __, ppi, flowstats, labels in iterator:
            ppi, flowstats, labels = ppi.to(device), flowstats.to(device), labels.to(device)
            logits = model((ppi, flowstats))
            all_labels.append(labels)
            all_preds.append(logits.argmax(dim=1))
    y_true = torch.cat(all_labels).cpu().numpy()
    y_pred = torch.cat(all_preds).cpu().numpy()
    return (y_true == y_pred).mean()
# Train for five epochs, validating after each one.
EPOCHS = 5
train_dataloader = dataset.get_train_dataloader()
val_dataloader = dataset.get_val_dataloader()
# AdamW with default hyperparameters; the learning rate is driven entirely by
# the OneCycleLR schedule below.
optimizer = optim.AdamW(model.parameters())
# OneCycleLR is stepped once per batch, so it needs the number of batches per epoch.
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, steps_per_epoch=len(train_dataloader), epochs=EPOCHS)
loss_fn = nn.CrossEntropyLoss()
# Prefer the first GPU when available; otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# tqdm.write keeps the per-epoch messages from clobbering the progress bar.
for i in tqdm(range(1, EPOCHS + 1), total=EPOCHS, file=sys.stdout):
train_one_epoch(model, train_dataloader, optimizer, scheduler, loss_fn, device)
validation_accuracy = test(model, val_dataloader, device)
tqdm.write(f"Epoch {i}, validation accuracy: {validation_accuracy:.4f}")
Epoch 1, validation accuracy: 0.8280 Epoch 2, validation accuracy: 0.8897 Epoch 3, validation accuracy: 0.8751 Epoch 4, validation accuracy: 0.9321 Epoch 5, validation accuracy: 0.9413 100%|██████████| 5/5 [09:41<00:00, 116.35s/it]
Evaluate the trained model on the test set.
# Final evaluation on the held-out test period (a different week than training).
test_dataloader = dataset.get_test_dataloader()
print("Computing model predictions on the test set.")
test_accuracy = test(model, test_dataloader, device, progress=True)
print(f"The trained model achieved an accuracy of {test_accuracy:.5f} on the test period {dataset_config.test_period_name} of the {dataset.name} dataset.")
Computing model predictions on the test set.
100%|██████████| 1289/1289 [00:38<00:00, 33.07it/s]
The trained model achieved an accuracy of 0.93962 on the test period W-2022-47 of the CESNET-QUIC22-XS dataset.