import tenseal as ts
import torch
import pandas as pd
import random
import numpy as np
import pytest
from time import time
import sys
import os
The TenSEALContext is a special object that holds different encryption keys and parameters for you, so that you only need to use a single object to make your encrypted computation instead of managing all the keys and the HE details. Basically, you will want to create a single TenSEALContext before doing your encrypted computation. Let's see how to create one !
context = ts.Context(
ts.SCHEME_TYPE.CKKS,
poly_modulus_degree=8192,
coeff_mod_bit_sizes=[60, 40, 40, 60]
)
context.global_scale = 2**40
# this key is needed for doing dot-product operations
context.generate_galois_keys()
context
<tenseal.enc_context.Context at 0x1aba5ad2160>
v1 = [0, 1, 2, 3, 4]
v2 = [4, 3, 2, 1, 0]
enc_v1 = ts.ckks_vector(context, v1)
enc_v2 = ts.ckks_vector(context, v2)
(enc_v1, enc_v2)
(<tenseal.tensors.ckksvector.CKKSVector at 0x1aba5ad21f0>, <tenseal.tensors.ckksvector.CKKSVector at 0x1aba5ad26a0>)
print(enc_v1.shape)
[5]
result_add = enc_v1 + enc_v2
decrypted_result = result_add.decrypt()
decrypted_result
[4.000000000112872, 4.000000000242888, 4.000000000972386, 3.999999999642842, 3.9999999994070214]
assert pytest.approx(decrypted_result, abs=10**-3) == [v1 + v2 for v1, v2 in zip(v1, v2)]
result_mul = enc_v1 * enc_v2
decrypted_result = result_mul.decrypt()
decrypted_result
[3.920404223478613e-09, 3.000000406002841, 4.000000537622926, 3.000000400956933, -3.3927722853377418e-09]
assert pytest.approx(decrypted_result, abs=10**-3) == [v1 * v2 for v1, v2 in zip(v1, v2)]
result_poly = enc_v1.polyval([1,0,1,1]) # 1 + X^2 + X^3
decrypted_result = result_poly.decrypt()
decrypted_result
[1.0000000022713067, 3.0000009493306408, 13.000006974897476, 37.00002292364773, 81.00005366743699]
assert pytest.approx(decrypted_result, abs=10**-3) == [1 + v**2 + v**3 for v in v1]
data = pd.read_csv("framingham.csv")
data.head()
male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 39 | 4.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 195.0 | 106.0 | 70.0 | 26.97 | 80.0 | 77.0 | 0 |
1 | 0 | 46 | 2.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 250.0 | 121.0 | 81.0 | 28.73 | 95.0 | 76.0 | 0 |
2 | 1 | 48 | 1.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 245.0 | 127.5 | 80.0 | 25.34 | 75.0 | 70.0 | 0 |
3 | 0 | 61 | 3.0 | 1 | 30.0 | 0.0 | 0 | 1 | 0 | 225.0 | 150.0 | 95.0 | 28.58 | 65.0 | 103.0 | 1 |
4 | 0 | 46 | 3.0 | 1 | 23.0 | 0.0 | 0 | 0 | 0 | 285.0 | 130.0 | 84.0 | 23.10 | 85.0 | 85.0 | 0 |
We now prepare the training and test data, the dataset was downloaded from Kaggle. T his dataset provides patients' information along with a 10-year risk of future coronary heart disease (CHD) as a label, and the goal is to build a model that can predict this 10-year CHD risk based on patients' information.
torch.random.manual_seed(73)
random.seed(73)
def split_train_test(x, y, test_ratio=0.3):
idxs = [i for i in range(len(x))]
random.shuffle(idxs)
# delimiter between test and train data
delim = int(len(x) * test_ratio)
test_idxs, train_idxs = idxs[:delim], idxs[delim:]
return x[train_idxs], y[train_idxs], x[test_idxs], y[test_idxs]
def heart_disease_data():
data = pd.read_csv("framingham.csv")
# drop rows with missing values
data = data.dropna()
# drop some features
data = data.drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
# balance data
grouped = data.groupby('TenYearCHD')
data = grouped.apply(lambda x: x.sample(grouped.size().min(), random_state=73).reset_index(drop=True))
# extract labels
y = torch.tensor(data["TenYearCHD"].values).float().unsqueeze(1)
data = data.drop(labels="TenYearCHD", axis='columns')
# standardize data
data = (data - data.mean()) / data.std()
x = torch.tensor(data.values).float()
return split_train_test(x, y)
x_train, y_train, x_test, y_test = heart_disease_data()
print("############# Data summary #############")
print(f"x_train has shape: {x_train.shape}")
print(f"y_train has shape: {y_train.shape}")
print(f"x_test has shape: {x_test.shape}")
print(f"y_test has shape: {y_test.shape}")
print("#######################################")
############# Data summary ############# x_train has shape: torch.Size([780, 9]) y_train has shape: torch.Size([780, 1]) x_test has shape: torch.Size([334, 9]) y_test has shape: torch.Size([334, 1]) #######################################
We will start by training a logistic regression model (without any encryption), which can be viewed as a single layer neural network with a single node.
class LR(torch.nn.Module):
def __init__(self, n_features):
super(LR, self).__init__()
self.lr = torch.nn.Linear(n_features, 1)
def forward(self, x):
out = torch.sigmoid(self.lr(x))
return out
n_features = x_train.shape[1]
model = LR(n_features)
# use gradient descent with a learning_rate=1
optim = torch.optim.SGD(model.parameters(), lr=1)
# use Binary Cross Entropy Loss
criterion = torch.nn.BCELoss()
EPOCHS = 20
def train(model, optim, criterion, x, y, epochs=EPOCHS):
for e in range(1, epochs + 1):
optim.zero_grad()
out = model(x)
loss = criterion(out, y)
loss.backward()
optim.step()
print(f"Loss at epoch {e}: {loss.data}")
return model
model = train(model, optim, criterion, x_train, y_train)
Loss at epoch 1: 0.8504331707954407 Loss at epoch 2: 0.6863384246826172 Loss at epoch 3: 0.6358115673065186 Loss at epoch 4: 0.6193529367446899 Loss at epoch 5: 0.6124349236488342 Loss at epoch 6: 0.6089244484901428 Loss at epoch 7: 0.6069258451461792 Loss at epoch 8: 0.6057038307189941 Loss at epoch 9: 0.6049202084541321 Loss at epoch 10: 0.604399561882019 Loss at epoch 11: 0.6040432453155518 Loss at epoch 12: 0.6037929058074951 Loss at epoch 13: 0.6036127805709839 Loss at epoch 14: 0.6034800410270691 Loss at epoch 15: 0.6033799648284912 Loss at epoch 16: 0.6033029556274414 Loss at epoch 17: 0.6032425165176392 Loss at epoch 18: 0.6031941175460815 Loss at epoch 19: 0.603154718875885 Loss at epoch 20: 0.6031221747398376
def accuracy(model, x, y):
out = model(x)
correct = torch.abs(y - out) < 0.5
return correct.float().mean()
plain_accuracy = accuracy(model, x_test, y_test)
print(f"Accuracy on plain test_set: {plain_accuracy}")
Accuracy on plain test_set: 0.7005987763404846
We will be comparing accuracies over encrypted data against the plain_accuracy we got here.
In this part, we will just focus on evaluating the logistic regression model with plain parameters (optionally encrypted parameters) on the encrypted test set. We first create a PyTorch-like LR model that can evaluate encrypted data:
enc_x_test = [ts.ckks_vector(context, x.tolist()) for x in x_test]
class EncryptedLR:
def __init__(self, torch_lr):
# TenSEAL processes lists and not torch tensors
# so we take out parameters from the PyTorch model
self.weight = torch_lr.lr.weight.data.tolist()[0]
self.bias = torch_lr.lr.bias.data.tolist()
def forward(self, enc_x):
# We don't need to perform sigmoid as this model
# will only be used for evaluation, and the label
# can be deduced without applying sigmoid
enc_out = enc_x.dot(self.weight) + self.bias
return enc_out
def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)
## You can use the functions below to perform
## the evaluation with an encrypted model
def encrypt(self, context):
self.weight = ts.ckks_vector(context, self.weight)
self.bias = ts.ckks_vector(context, self.bias)
def decrypt(self, context):
self.weight = self.weight.decrypt()
self.bias = self.bias.decrypt()
eelr = EncryptedLR(model)
# encrypt the model's parameters
eelr.encrypt(context)
def encrypted_evaluation(model, enc_x_test, y_test):
correct = 0
for enc_x, y in zip(enc_x_test, y_test):
# encrypted evaluation
enc_out = model(enc_x)
# plain comparaison
out = enc_out.decrypt()
out = torch.tensor(out)
out = torch.sigmoid(out)
if torch.abs(out - y) < 0.5:
correct += 1
print(f"Evaluated test_set of {len(x_test)} entries. Accuracy: {correct}/{len(x_test)} = {correct / len(x_test)}")
return correct / len(x_test)
encrypted_accuracy = encrypted_evaluation(eelr, enc_x_test, y_test)
diff_accuracy = plain_accuracy - encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
if diff_accuracy < 0:
print("Oh! We got a better accuracy on the encrypted test-set! The noise was on our side...")
Evaluated test_set of 334 entries. Accuracy: 234/334 = 0.7005988023952096 Difference between plain and encrypted accuracies: 0.0
We saw that evaluating on the encrypted test set doesn't affect the accuracy.