This notebook shows how to implement Low-Rank Tensor Completion with Truncated Nuclear Norm minimization (LRTC-TNN) on some real-world data sets. For an in-depth discussion of LRTC-TNN, please see our article [1].
This notebook is publicly available for any usage as part of our data imputation project. Please check out the transdim repository on GitHub.
We start by importing the necessary dependencies.
import numpy as np
from numpy.linalg import inv as inv
def ten2mat(tensor, mode):
    """Unfold `tensor` along axis `mode` into a matrix, column-major.

    The result has tensor.shape[mode] rows; the remaining axes are
    flattened into the columns in Fortran (column-major) order.
    """
    front = np.moveaxis(tensor, mode, 0)
    return np.reshape(front, (tensor.shape[mode], -1), order = 'F')
# Small worked example: a 3 x 2 x 4 tensor and its three mode-k unfoldings.
X = np.array([[[1, 2, 3, 4], [3, 4, 5, 6]],
              [[5, 6, 7, 8], [7, 8, 9, 10]],
              [[9, 10, 11, 12], [11, 12, 13, 14]]])
print('tensor size:')
print(X.shape)
print('original tensor:')
print(X)
headers = ['(1) mode-1 tensor unfolding:',
           '(2) mode-2 tensor unfolding:',
           '(3) mode-3 tensor unfolding:']
for mode, header in enumerate(headers):
    print()
    print(header)
    print(ten2mat(X, mode))
tensor size: (3, 2, 4) original tensor: [[[ 1 2 3 4] [ 3 4 5 6]] [[ 5 6 7 8] [ 7 8 9 10]] [[ 9 10 11 12] [11 12 13 14]]] (1) mode-1 tensor unfolding: [[ 1 3 2 4 3 5 4 6] [ 5 7 6 8 7 9 8 10] [ 9 11 10 12 11 13 12 14]] (2) mode-2 tensor unfolding: [[ 1 5 9 2 6 10 3 7 11 4 8 12] [ 3 7 11 4 8 12 5 9 13 6 10 14]] (3) mode-3 tensor unfolding: [[ 1 5 9 3 7 11] [ 2 6 10 4 8 12] [ 3 7 11 5 9 13] [ 4 8 12 6 10 14]]
def mat2ten(mat, tensor_size, mode):
    """Fold a mode-`mode` unfolding back into a tensor of shape `tensor_size`.

    Exact inverse of ten2mat: reshape `mat` (column-major) with the mode's
    axis leading, then move that axis back into position.
    """
    order = [mode] + [ax for ax in range(tensor_size.shape[0]) if ax != mode]
    folded = np.reshape(mat, list(tensor_size[order]), order = 'F')
    return np.moveaxis(folded, 0, mode)
def svt_tnn(mat, alpha, rho, theta):
    """Singular value thresholding with truncated nuclear norm.

    The leading `theta` singular values are kept untouched; the rest are
    soft-thresholded by alpha / rho (clipped at zero).
    """
    u, s, v = np.linalg.svd(mat, full_matrices = 0)
    shrunk = np.concatenate([s[:theta], np.maximum(s[theta:] - alpha / rho, 0)])
    # u * shrunk broadcasts the shrunk values over u's columns (== u @ diag)
    return (u * shrunk) @ v
Understanding these codes:
line 1
: Necessary inputs including any input matrix $\boldsymbol{X}$, weight of Truncated Nuclear Norm (TNN) regularization $\alpha$, learning rate $\rho$, and positive integer number $\theta$ for nuclear norm truncation.
line 2
: Compute the Singular Value Decomposition (SVD) of the matrix $\boldsymbol{X}$ with numpy.linalg.svd (i.e., the SVD function in NumPy's linear algebra package).
line 3-5
: Truncate singular values $\sigma_{\theta+1},...$ with the following rule:
line 6
: Return the resulting matrix.
A potential alternative for this:
This is a competitively efficient algorithm for implementing SVT-TNN.
def svt_tnn(mat, alpha, rho, theta):
    """Faster SVT-TNN variant.

    For very wide matrices the SVD is taken on the small Gram matrix
    mat @ mat.T; very tall matrices are handled via the transpose.
    Otherwise a plain thin SVD is used. Behaves like the basic version
    but drops singular values at or below tau = alpha / rho entirely.
    """
    tau = alpha / rho
    m, n = mat.shape
    if m > 2 * n:
        # tall case: reduce to the wide case on the transpose
        return svt_tnn(mat.T, alpha, rho, theta).T
    if 2 * m < n:
        # wide case: eigen-decompose the m x m Gram matrix instead
        u, s, _ = np.linalg.svd(mat @ mat.T, full_matrices = 0)
        s = np.sqrt(s)
        idx = np.sum(s > tau)
        scale = np.ones(idx)
        scale[theta:idx] = (s[theta:idx] - tau) / s[theta:idx]
        return (u[:, :idx] * scale) @ (u[:, :idx].T @ mat)
    u, s, v = np.linalg.svd(mat, full_matrices = 0)
    idx = np.sum(s > tau)
    vec = s[:idx].copy()
    vec[theta:idx] -= tau
    return (u[:, :idx] * vec) @ v[:idx, :]
compute_mape
: Compute the value of Mean Absolute Percentage Error (MAPE).
compute_rmse
: Compute the value of Root Mean Square Error (RMSE).
Note that $$\mathrm{MAPE}=\frac{1}{n} \sum_{i=1}^{n} \frac{\left|y_{i}-\hat{y}_{i}\right|}{y_{i}} \times 100, \quad \mathrm{RMSE}=\sqrt{\frac{1}{n} \sum_{i=1}^{n}\left(y_{i}-\hat{y}_{i}\right)^{2}},$$ where $n$ is the total number of estimated values, and $y_i$ and $\hat{y}_i$ are the actual value and its estimation, respectively. (The implementation below returns MAPE as a fraction, without the $\times 100$ factor.)
def compute_rmse(var, var_hat):
    """Root Mean Square Error, averaged over var.shape[0] entries."""
    squared_error = (var - var_hat) ** 2
    return np.sqrt(squared_error.sum() / var.shape[0])
def compute_mape(var, var_hat):
    """Mean Absolute Percentage Error as a fraction (not multiplied by 100)."""
    relative_error = np.abs(var - var_hat) / var
    return relative_error.sum() / var.shape[0]
Implementing LRTC-TNN with NumPy:
def LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter):
    """Low-Rank Tensor Completion with Truncated Nuclear Norm (LRTC-TNN).

    Imputes the zero entries of `sparse_tensor` by ADMM: each mode-k
    unfolding is shrunk with truncated SVT, the mode-wise estimates are
    averaged back into the missing positions, and the dual tensors are
    updated by a gradient (multiplier) step.

    Parameters
    ----------
    dense_tensor : ground truth, used only for reporting MAPE/RMSE on
        entries that are nonzero here but zero in `sparse_tensor`.
    sparse_tensor : partially observed tensor; zeros mark missing entries.
    alpha : per-mode weights; also weight the final einsum combination.
    rho : initial penalty, grown by 5% per iteration and capped at 1e5.
    theta : truncation ratio; mode k keeps ceil(theta * dim[k]) leading
        singular values unshrunk.
    epsilon : tolerance on the relative change of the estimate.
    maxiter : iteration cap.

    Returns the completed tensor estimate.

    NOTE(review): the einsum subscripts 'k, kmnt -> mnt' hard-code a
    third-order tensor, so this implementation supports 3-way inputs only.
    """
    dim = np.array(sparse_tensor.shape)
    pos_missing = np.where(sparse_tensor == 0)  # entries to impute
    # test positions: observed in the ground truth, missing in the input
    pos_test = np.where((dense_tensor != 0) & (sparse_tensor == 0))
    X = np.zeros(np.insert(dim, 0, len(dim)))  # \boldsymbol{\mathcal{X}}: one estimate per mode
    T = np.zeros(np.insert(dim, 0, len(dim)))  # \boldsymbol{\mathcal{T}}: dual tensors
    Z = sparse_tensor.copy()
    last_tensor = sparse_tensor.copy()
    snorm = np.sqrt(np.sum(sparse_tensor ** 2))  # normalizer for the convergence test
    it = 0
    while True:
        rho = min(rho * 1.05, 1e5)  # gradually increase the penalty
        for k in range(len(dim)):
            # mode-k update: truncated SVT on the unfolding of Z - T[k]/rho
            X[k] = mat2ten(svt_tnn(ten2mat(Z - T[k] / rho, k), alpha[k], rho, int(np.ceil(theta * dim[k]))), dim, k)
        # consensus step: fill missing entries with the mean of the mode estimates
        Z[pos_missing] = np.mean(X + T / rho, axis = 0)[pos_missing]
        # dual ascent on the multipliers
        T = T + rho * (X - np.broadcast_to(Z, np.insert(dim, 0, len(dim))))
        tensor_hat = np.einsum('k, kmnt -> mnt', alpha, X)  # weighted combination (3-way only)
        tol = np.sqrt(np.sum((tensor_hat - last_tensor) ** 2)) / snorm
        last_tensor = tensor_hat.copy()
        it += 1
        if (it + 1) % 50 == 0:  # periodic progress report (labels iteration it + 1)
            print('Iter: {}'.format(it + 1))
            print('RMSE: {:.6}'.format(compute_rmse(dense_tensor[pos_test], tensor_hat[pos_test])))
            print()
        if (tol < epsilon) or (it >= maxiter):
            break
    print('Imputation MAPE: {:.6}'.format(compute_mape(dense_tensor[pos_test], tensor_hat[pos_test])))
    print('Imputation RMSE: {:.6}'.format(compute_rmse(dense_tensor[pos_test], tensor_hat[pos_test])))
    print()
    return tensor_hat
Understanding these codes:
line 18-19
: Update $\boldsymbol{\mathcal{Z}}_{k}^{l+1},k=1,2,3$.
line 20-22
: Update $\boldsymbol{\mathcal{X}}_{k}^{l+1}$ by
line 23
: Update $\boldsymbol{\mathcal{T}}_{k}^{l+1}$ by the dual (multiplier) ascent step.
import numpy as np
import time
import scipy.io
## Random missing (RM) on the Guangzhou data at 30% / 70% / 90% rates.
# Each rate keeps the (rho, theta) setting of the original three cells.
for missing_rate, rho, theta in [(0.3, 1e-5, 0.25),
                                 (0.7, 1e-5, 0.2),
                                 (0.9, 1e-4, 0.1)]:
    print('Missing rate = {}'.format(missing_rate))
    dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor'].transpose(0, 2, 1)
    dim1, dim2, dim3 = dense_tensor.shape
    np.random.seed(1000)
    # Bernoulli mask: each entry kept with probability 1 - missing_rate.
    sparse_tensor = dense_tensor * np.round(np.random.rand(dim1, dim2, dim3) + 0.5 - missing_rate)
    t0 = time.time()
    alpha = np.ones(3) / 3
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 5.51797 Iter: 100 RMSE: 3.00147 Imputation MAPE: 0.070093 Imputation RMSE: 2.99615 Running time: 115 seconds Missing rate = 0.7 Iter: 50 RMSE: 5.32424 Iter: 100 RMSE: 3.58887 Imputation MAPE: 0.0836949 Imputation RMSE: 3.57911 Running time: 151 seconds Missing rate = 0.9 Iter: 50 RMSE: 4.05595 Iter: 100 RMSE: 4.05 Imputation MAPE: 0.0950771 Imputation RMSE: 4.05026 Running time: 167 seconds
import numpy as np
import time
import scipy.io
for missing_rate in [0.3, 0.7]:
    print('Missing rate = {}'.format(missing_rate))
    ## Non-random missing (NM): one mask value per (axis-0, axis-2) pair,
    ## broadcast over axis 1, so whole fibers disappear at once.
    dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor'].transpose(0, 2, 1)
    dim1, dim2, dim3 = dense_tensor.shape
    np.random.seed(1000)
    fiber_mask = np.round(np.random.rand(dim1, dim3) + 0.5 - missing_rate)[:, None, :]
    sparse_tensor = dense_tensor * fiber_mask
    t0 = time.time()
    alpha = np.ones(3) / 3
    rho = 1e-5
    theta = 0.05
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 5.06925 Iter: 100 RMSE: 4.08494 Imputation MAPE: 0.0965249 Imputation RMSE: 4.08516 Running time: 152 seconds Missing rate = 0.7 Iter: 50 RMSE: 5.14929 Iter: 100 RMSE: 4.30086 Imputation MAPE: 0.101477 Imputation RMSE: 4.30102 Running time: 123 seconds
import numpy as np
import time
import scipy.io
np.random.seed(1000)
missing_rate = 0.3
## Block-out missing (BM): the flattened time axis is masked in runs of
## `block_window` consecutive steps; the mask is shared by every row of
## the mode-1 unfolding.
dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor'].transpose(0, 2, 1)
dim1, dim2, dim3 = dense_tensor.shape
block_window = 6
# one random draw per block, repeated so a block shares a single mask value
vec = np.repeat(np.random.rand(int(dim2 * dim3 / block_window)), block_window)
sparse_tensor = mat2ten(ten2mat(dense_tensor, 0) * np.round(vec + 0.5 - missing_rate)[None, :], np.array([dim1, dim2, dim3]), 0)
t0 = time.time()
alpha = np.ones(3) / 3
rho = 1e-5
theta = 0.10
epsilon = 1e-4
maxiter = 100
LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
print('Running time: %d seconds'%(time.time() - t0))
print()
Iter: 50 RMSE: 5.27782 Iter: 100 RMSE: 3.97509 Imputation MAPE: 0.094045 Imputation RMSE: 3.96451 Running time: 147 seconds
import numpy as np
import time
import scipy.io
for missing_rate in [0.3, 0.7, 0.9]:
    print('Missing rate = {}'.format(missing_rate))
    ## Random missing (RM): entries dropped independently at the given rate.
    dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor'].transpose(0, 2, 1)
    dim1, dim2, dim3 = dense_tensor.shape
    np.random.seed(1000)
    binary_mask = np.round(np.random.rand(dim1, dim2, dim3) + 0.5 - missing_rate)
    sparse_tensor = dense_tensor * binary_mask
    t0 = time.time()
    alpha = np.ones(3) / 3
    rho = 1e-5
    theta = 0.10
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 25.2411 Iter: 100 RMSE: 24.944 Imputation MAPE: 0.186277 Imputation RMSE: 24.9491 Running time: 15 seconds Missing rate = 0.7 Iter: 50 RMSE: 29.1678 Iter: 100 RMSE: 29.5341 Imputation MAPE: 0.201632 Imputation RMSE: 29.5459 Running time: 19 seconds Missing rate = 0.9 Iter: 50 RMSE: 37.7603 Iter: 100 RMSE: 38.0407 Imputation MAPE: 0.229517 Imputation RMSE: 38.0515 Running time: 23 seconds
import numpy as np
import time
import scipy.io
for missing_rate in [0.3, 0.7]:
    print('Missing rate = {}'.format(missing_rate))
    ## Non-random missing (NM): mask drawn per (axis-0, axis-2) pair and
    ## broadcast over axis 1.
    dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor'].transpose(0, 2, 1)
    dim1, dim2, dim3 = dense_tensor.shape
    np.random.seed(1000)
    fiber_mask = np.round(np.random.rand(dim1, dim3) + 0.5 - missing_rate)[:, None, :]
    sparse_tensor = dense_tensor * fiber_mask
    t0 = time.time()
    alpha = np.ones(3) / 3
    rho = 1e-5
    theta = 0.10
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 49.406 Iter: 100 RMSE: 47.6061 Imputation MAPE: 0.193862 Imputation RMSE: 47.5992 Running time: 21 seconds Missing rate = 0.7 Iter: 50 RMSE: 42.8749 Iter: 100 RMSE: 41.8305 Imputation MAPE: 0.226381 Imputation RMSE: 41.8327 Running time: 14 seconds
import numpy as np
import time
import scipy.io
np.random.seed(1000)
missing_rate = 0.3
## Block-out missing (BM): runs of `block_window` consecutive steps on the
## flattened time axis are masked together, shared across all rows.
dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor'].transpose(0, 2, 1)
dim1, dim2, dim3 = dense_tensor.shape
block_window = 6
vec = np.repeat(np.random.rand(int(dim2 * dim3 / block_window)), block_window)
sparse_tensor = mat2ten(ten2mat(dense_tensor, 0) * np.round(vec + 0.5 - missing_rate)[None, :], np.array([dim1, dim2, dim3]), 0)
t0 = time.time()
alpha = np.ones(3) / 3
rho = 1e-5
theta = 0.10
epsilon = 1e-4
maxiter = 100
LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
print('Running time: %d seconds'%(time.time() - t0))
print()
Iter: 50 RMSE: 28.8777 Iter: 100 RMSE: 29.2744 Imputation MAPE: 0.21374 Imputation RMSE: 29.2814 Running time: 20 seconds
import numpy as np
import pandas as pd
import time
import scipy.io
for missing_rate in [0.3, 0.7, 0.9]:
    print('Missing rate = {}'.format(missing_rate))
    ## Random missing (RM) on the Seattle data set.
    dense_tensor = np.load('../datasets/Seattle-data-set/tensor.npz')['arr_0'].transpose(0, 2, 1)
    dim1, dim2, dim3 = dense_tensor.shape
    np.random.seed(1000)
    sparse_tensor = dense_tensor * np.round(np.random.rand(dim1, dim2, dim3) + 0.5 - missing_rate)
    t0 = time.time()
    alpha = np.ones(3) / 3
    # the 90% run uses a larger penalty and lighter truncation
    rho, theta = (5e-5, 0.10) if missing_rate > 0.8 else (1e-5, 0.30)
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 5.86304 Iter: 100 RMSE: 3.1126 Imputation MAPE: 0.0480528 Imputation RMSE: 3.10965 Running time: 263 seconds Missing rate = 0.7 Iter: 50 RMSE: 5.89289 Iter: 100 RMSE: 3.79913 Imputation MAPE: 0.0613629 Imputation RMSE: 3.7982 Running time: 228 seconds Missing rate = 0.9 Iter: 50 RMSE: 4.86097 Iter: 100 RMSE: 4.81283 Imputation MAPE: 0.081929 Imputation RMSE: 4.81347 Running time: 260 seconds
import numpy as np
import pandas as pd
import time
import scipy.io
for missing_rate in [0.3, 0.7]:
    print('Missing rate = {}'.format(missing_rate))
    ## Non-random missing (NM): mask drawn per (axis-0, axis-2) pair and
    ## broadcast over axis 1.
    dense_tensor = np.load('../datasets/Seattle-data-set/tensor.npz')['arr_0'].transpose(0, 2, 1)
    dim1, dim2, dim3 = dense_tensor.shape
    np.random.seed(1000)
    fiber_mask = np.round(np.random.rand(dim1, dim3) + 0.5 - missing_rate)[:, None, :]
    sparse_tensor = dense_tensor * fiber_mask
    t0 = time.time()
    alpha = np.ones(3) / 3
    rho = 1e-5
    theta = 0.05
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 5.10438 Iter: 100 RMSE: 4.43494 Imputation MAPE: 0.074171 Imputation RMSE: 4.43533 Running time: 249 seconds Missing rate = 0.7 Iter: 50 RMSE: 6.14372 Iter: 100 RMSE: 5.4049 Imputation MAPE: 0.0928467 Imputation RMSE: 5.40446 Running time: 239 seconds
import numpy as np
import time  # FIX: this cell calls time.time() but did not import time
import scipy.io
np.random.seed(1000)
missing_rate = 0.3
## Block-out Missing (BM)
dense_tensor = np.load('../datasets/Seattle-data-set/tensor.npz')['arr_0'].transpose(0, 2, 1)
dim1, dim2, dim3 = dense_tensor.shape
block_window = 12
# one random value per block of `block_window` consecutive time steps,
# repeated so every step of a block shares the same mask value
vec = np.random.rand(int(dim2 * dim3 / block_window))
temp = np.array([vec] * block_window)
vec = temp.reshape([dim2 * dim3], order = 'F')
# FIX: the original multiplied `dense_mat`, a name not defined at this point
# in the notebook (NameError when cells are run in order). The mode-0
# unfolding of dense_tensor is what the other BM experiments use.
sparse_tensor = mat2ten(ten2mat(dense_tensor, 0) * np.round(vec + 0.5 - missing_rate)[None, :], np.array([dim1, dim2, dim3]), 0)
start = time.time()
alpha = np.ones(3) / 3
rho = 1e-5
theta = 0.30
epsilon = 1e-4
maxiter = 100
LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
end = time.time()
print('Running time: %d seconds'%(end - start))
print()
Iter: 50 RMSE: 6.60377 Iter: 100 RMSE: 5.69047 Imputation MAPE: 0.0981021 Imputation RMSE: 5.69791 Running time: 220 seconds
import numpy as np
import pandas as pd
import time
import scipy.io
for missing_rate in [0.3, 0.7, 0.9]:
    print('Missing rate = {}'.format(missing_rate))
    # Random missing (RM) on the Portland volume matrix, folded into a
    # (dim1, 96, 31) tensor — assumes each row has 96 * 31 columns.
    dense_mat = np.load('../datasets/Portland-data-set/volume.npy')
    dim1, dim2 = dense_mat.shape
    dim = np.array([dim1, 96, 31])
    dense_tensor = mat2ten(dense_mat, dim, 0)
    np.random.seed(1000)
    sparse_tensor = mat2ten(dense_mat * np.round(np.random.rand(dim1, dim2) + 0.5 - missing_rate), dim, 0)
    t0 = time.time()
    alpha = np.ones(3) / 3
    rho = 1e-5
    theta = 0.10
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 16.6038 Iter: 100 RMSE: 15.6579 Imputation MAPE: 0.172064 Imputation RMSE: 15.6594 Running time: 693 seconds Missing rate = 0.7 Iter: 50 RMSE: 19.5941 Iter: 100 RMSE: 19.2446 Imputation MAPE: 0.204501 Imputation RMSE: 19.2494 Running time: 698 seconds Missing rate = 0.9 Iter: 50 RMSE: 23.7422 Iter: 100 RMSE: 24.1852 Imputation MAPE: 0.244044 Imputation RMSE: 24.1911 Running time: 646 seconds
import numpy as np
import pandas as pd
import time
import scipy.io
for missing_rate in [0.3, 0.7]:
    print('Missing rate = {}'.format(missing_rate))
    # Non-random missing (NM): one mask value per (axis-0, axis-2) pair,
    # broadcast over axis 1 of the folded tensor.
    dense_mat = np.load('../datasets/Portland-data-set/volume.npy')
    dim1, dim2 = dense_mat.shape
    dim = np.array([dim1, 96, 31])
    dense_tensor = mat2ten(dense_mat, dim, 0)
    np.random.seed(1000)
    fiber_mask = np.round(np.random.rand(dim1, dim[2]) + 0.5 - missing_rate)[:, None, :]
    sparse_tensor = dense_tensor * fiber_mask
    t0 = time.time()
    alpha = np.ones(3) / 3
    rho = 1e-5
    theta = 0.10
    epsilon = 1e-4
    maxiter = 100
    LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
    print('Running time: %d seconds'%(time.time() - t0))
    print()
Missing rate = 0.3 Iter: 50 RMSE: 19.2963 Iter: 100 RMSE: 18.5759 Imputation MAPE: 0.193184 Imputation RMSE: 18.5804 Running time: 722 seconds Missing rate = 0.7 Iter: 50 RMSE: 38.5421 Iter: 100 RMSE: 38.2218 Imputation MAPE: 0.269787 Imputation RMSE: 38.2252 Running time: 656 seconds
import numpy as np
import time  # FIX: this cell calls time.time() but did not import time
import scipy.io
np.random.seed(1000)
missing_rate = 0.3
## Block-out Missing (BM): blocks of `block_window` consecutive columns of
## the raw matrix are masked together before folding into (dim1, 96, 31).
dense_mat = np.load('../datasets/Portland-data-set/volume.npy')
dim1, dim2 = dense_mat.shape
dim = np.array([dim1, 96, 31])
dense_tensor = mat2ten(dense_mat, dim, 0)
block_window = 4
# one random value per block, repeated across the block (F-order reshape)
vec = np.random.rand(int(dim2 / block_window))
temp = np.array([vec] * block_window)
vec = temp.reshape([dim2], order = 'F')
sparse_tensor = mat2ten(dense_mat * np.round(vec + 0.5 - missing_rate)[None, :], dim, 0)
start = time.time()
alpha = np.ones(3) / 3
rho = 1e-5
theta = 0.05
epsilon = 1e-4
maxiter = 100
LRTC(dense_tensor, sparse_tensor, alpha, rho, theta, epsilon, maxiter)
end = time.time()
print('Running time: %d seconds'%(end - start))
print()
Iter: 50 RMSE: 23.3329 Iter: 100 RMSE: 23.0543 Imputation MAPE: 0.230816 Imputation RMSE: 23.0534 Running time: 697 seconds