Simple Applications of Homomorphic Encryption¶

pip install tenseal torch pandas numpy pytest¶

In [1]:

import tenseal as ts
import torch
import pandas as pd
import random
import numpy as np

import pytest
from time import time
import sys
import os

TenSEAL Context¶

The TenSEALContext is a special object that holds different encryption keys and parameters for you, so that you only need to use a single object to make your encrypted computation instead of managing all the keys and the HE details. Basically, you will want to create a single TenSEALContext before doing your encrypted computation. Let's see how to create one !

In [2]:

context = ts.Context(
    ts.SCHEME_TYPE.CKKS,
    poly_modulus_degree=8192,
    coeff_mod_bit_sizes=[60, 40, 40, 60]
)
context.global_scale = 2**40
# this key is needed for doing dot-product operations
context.generate_galois_keys()
context

Out[2]:

<tenseal.enc_context.Context at 0x1aba5ad2160>

Encrypt the data¶

In [3]:

v1 = [0, 1, 2, 3, 4]
v2 = [4, 3, 2, 1, 0]

enc_v1 = ts.ckks_vector(context, v1)
enc_v2 = ts.ckks_vector(context, v2)
(enc_v1, enc_v2)

Out[3]:

(<tenseal.tensors.ckksvector.CKKSVector at 0x1aba5ad21f0>,
 <tenseal.tensors.ckksvector.CKKSVector at 0x1aba5ad26a0>)

In [4]:

print(enc_v1.shape)

[5]

Compute different operations over the two vectors locally and decrypt the results¶

Addition¶

In [5]:

result_add = enc_v1 + enc_v2
decrypted_result = result_add.decrypt()
decrypted_result

Out[5]:

[4.000000000112872,
 4.000000000242888,
 4.000000000972386,
 3.999999999642842,
 3.9999999994070214]

In [6]:

assert pytest.approx(decrypted_result, abs=10**-3) == [v1 + v2 for v1, v2 in zip(v1, v2)]

Multiplication¶

In [7]:

result_mul = enc_v1 * enc_v2
decrypted_result = result_mul.decrypt()
decrypted_result

Out[7]:

[3.920404223478613e-09,
 3.000000406002841,
 4.000000537622926,
 3.000000400956933,
 -3.3927722853377418e-09]

In [8]:

assert pytest.approx(decrypted_result, abs=10**-3) == [v1 * v2 for v1, v2 in zip(v1, v2)]

Polynomial conversion to 1 + X^2 + X^3¶

In [9]:

result_poly = enc_v1.polyval([1,0,1,1]) # 1 + X^2 + X^3
decrypted_result = result_poly.decrypt()
decrypted_result

Out[9]:

[1.0000000022713067,
 3.0000009493306408,
 13.000006974897476,
 37.00002292364773,
 81.00005366743699]

In [10]:

assert pytest.approx(decrypted_result, abs=10**-3) == [1 + v**2 + v**3 for v in v1]

Training a Logistic Regression Model on non-encrypted training data¶

In [11]:

data = pd.read_csv("framingham.csv")
data.head()

Out[11]:

	male	age	education	currentSmoker	cigsPerDay	prevalentHyp	totChol	sysBP	diaBP	BMI	heartRate	glucose	TenYearCHD
0	1	39	4.0	0	0.0	0	195.0	106.0	70.0	26.97	80.0	77.0	0
1	0	46	2.0	0	0.0	0	250.0	121.0	81.0	28.73	95.0	76.0	0
2	1	48	1.0	1	20.0	0	245.0	127.5	80.0	25.34	75.0	70.0	0
3	0	61	3.0	1	30.0	1	225.0	150.0	95.0	28.58	65.0	103.0	1
4	0	46	3.0	1	23.0	0	285.0	130.0	84.0	23.10	85.0	85.0	0

Prepare the data¶

We now prepare the training and test data, the dataset was downloaded from Kaggle. T his dataset provides patients' information along with a 10-year risk of future coronary heart disease (CHD) as a label, and the goal is to build a model that can predict this 10-year CHD risk based on patients' information.

In [12]:

torch.random.manual_seed(73)
random.seed(73)

def split_train_test(x, y, test_ratio=0.3):
    idxs = [i for i in range(len(x))]
    random.shuffle(idxs)
    # delimiter between test and train data
    delim = int(len(x) * test_ratio)
    test_idxs, train_idxs = idxs[:delim], idxs[delim:]
    return x[train_idxs], y[train_idxs], x[test_idxs], y[test_idxs]
            
def heart_disease_data():    
    data = pd.read_csv("framingham.csv")
    # drop rows with missing values
    data = data.dropna()
    # drop some features
    data = data.drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
    # balance data
    grouped = data.groupby('TenYearCHD')
    data = grouped.apply(lambda x: x.sample(grouped.size().min(), random_state=73).reset_index(drop=True))
    # extract labels
    y = torch.tensor(data["TenYearCHD"].values).float().unsqueeze(1)
    data = data.drop(labels="TenYearCHD", axis='columns')
    # standardize data
    data = (data - data.mean()) / data.std()
    x = torch.tensor(data.values).float()
    return split_train_test(x, y)

x_train, y_train, x_test, y_test = heart_disease_data()

print("############# Data summary #############")
print(f"x_train has shape: {x_train.shape}")
print(f"y_train has shape: {y_train.shape}")
print(f"x_test has shape: {x_test.shape}")
print(f"y_test has shape: {y_test.shape}")
print("#######################################")

############# Data summary #############
x_train has shape: torch.Size([780, 9])
y_train has shape: torch.Size([780, 1])
x_test has shape: torch.Size([334, 9])
y_test has shape: torch.Size([334, 1])
#######################################

Here we define the models used.¶

LR is a simple logistic regression for plain data

We will start by training a logistic regression model (without any encryption), which can be viewed as a single layer neural network with a single node.

In [13]:

class LR(torch.nn.Module):

    def __init__(self, n_features):
        super(LR, self).__init__()
        self.lr = torch.nn.Linear(n_features, 1)
        
    def forward(self, x):
        out = torch.sigmoid(self.lr(x))
        return out

Train the logistic regression¶

In [14]:

n_features = x_train.shape[1]
model = LR(n_features)

# use gradient descent with a learning_rate=1
optim = torch.optim.SGD(model.parameters(), lr=1)

# use Binary Cross Entropy Loss
criterion = torch.nn.BCELoss()

In [15]:

EPOCHS = 20

def train(model, optim, criterion, x, y, epochs=EPOCHS):
    for e in range(1, epochs + 1):
        optim.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optim.step()
        print(f"Loss at epoch {e}: {loss.data}")
    return model

model = train(model, optim, criterion, x_train, y_train)

Loss at epoch 1: 0.8504331707954407
Loss at epoch 2: 0.6863384246826172
Loss at epoch 3: 0.6358115673065186
Loss at epoch 4: 0.6193529367446899
Loss at epoch 5: 0.6124349236488342
Loss at epoch 6: 0.6089244484901428
Loss at epoch 7: 0.6069258451461792
Loss at epoch 8: 0.6057038307189941
Loss at epoch 9: 0.6049202084541321
Loss at epoch 10: 0.604399561882019
Loss at epoch 11: 0.6040432453155518
Loss at epoch 12: 0.6037929058074951
Loss at epoch 13: 0.6036127805709839
Loss at epoch 14: 0.6034800410270691
Loss at epoch 15: 0.6033799648284912
Loss at epoch 16: 0.6033029556274414
Loss at epoch 17: 0.6032425165176392
Loss at epoch 18: 0.6031941175460815
Loss at epoch 19: 0.603154718875885
Loss at epoch 20: 0.6031221747398376

In [16]:

def accuracy(model, x, y):
    out = model(x)
    correct = torch.abs(y - out) < 0.5
    return correct.float().mean()

plain_accuracy = accuracy(model, x_test, y_test)
print(f"Accuracy on plain test_set: {plain_accuracy}")

Accuracy on plain test_set: 0.7005987763404846

We will be comparing accuracies over encrypted data against the plain_accuracy we got here.

Encrypted Evaluation on Encrypted Test Data¶

In this part, we will just focus on evaluating the logistic regression model with plain parameters (optionally encrypted parameters) on the encrypted test set. We first create a PyTorch-like LR model that can evaluate encrypted data:

In [17]:

enc_x_test = [ts.ckks_vector(context, x.tolist()) for x in x_test]

We create the model variant that can handle encrypted tensors¶

EncryptedLR adapts the forward method to the API exposed by the encrypted ciphertexts.

In [19]:

class EncryptedLR:
    def __init__(self, torch_lr):
        # TenSEAL processes lists and not torch tensors
        # so we take out parameters from the PyTorch model
        self.weight = torch_lr.lr.weight.data.tolist()[0]
        self.bias = torch_lr.lr.bias.data.tolist()
        
    def forward(self, enc_x):
        # We don't need to perform sigmoid as this model
        # will only be used for evaluation, and the label
        # can be deduced without applying sigmoid
        enc_out = enc_x.dot(self.weight) + self.bias
        return enc_out
    
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)
    
    ## You can use the functions below to perform 
    ## the evaluation with an encrypted model   
    
    def encrypt(self, context):
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)
        
    def decrypt(self, context):
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()

eelr = EncryptedLR(model)
 
# encrypt the model's parameters
eelr.encrypt(context)

In [20]:

def encrypted_evaluation(model, enc_x_test, y_test):    
    correct = 0
    for enc_x, y in zip(enc_x_test, y_test):
        # encrypted evaluation
        enc_out = model(enc_x)
        # plain comparaison
        out = enc_out.decrypt()
        out = torch.tensor(out)
        out = torch.sigmoid(out)
        if torch.abs(out - y) < 0.5:
            correct += 1
    
    print(f"Evaluated test_set of {len(x_test)} entries. Accuracy: {correct}/{len(x_test)} = {correct / len(x_test)}")
    return correct / len(x_test)

In [21]:

encrypted_accuracy = encrypted_evaluation(eelr, enc_x_test, y_test)
diff_accuracy = plain_accuracy - encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")

if diff_accuracy < 0:
    print("Oh! We got a better accuracy on the encrypted test-set! The noise was on our side...")

Evaluated test_set of 334 entries. Accuracy: 234/334 = 0.7005988023952096
Difference between plain and encrypted accuracies: 0.0

We saw that evaluating on the encrypted test set doesn't affect the accuracy.