sessionInfo()
R version 4.2.2 (2022-10-31) Platform: x86_64-apple-darwin17.0 (64-bit) Running under: macOS Catalina 10.15.7 Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 attached base packages: [1] stats graphics grDevices utils datasets methods base loaded via a namespace (and not attached): [1] fansi_1.0.4 crayon_1.5.2 digest_0.6.31 utf8_1.2.2 [5] IRdisplay_1.1 repr_1.1.5 lifecycle_1.0.3 jsonlite_1.8.4 [9] evaluate_0.20 pillar_1.8.1 rlang_1.0.6 cli_3.6.0 [13] uuid_1.1-0 vctrs_0.5.2 IRkernel_1.3.2 tools_4.2.2 [17] glue_1.6.2 fastmap_1.1.0 compiler_4.2.2 base64enc_0.1-3 [21] pbdZMQ_0.3-9 htmltools_0.5.4
library(Matrix)
library(mvtnorm)
library(microbenchmark)
Named after André-Louis Cholesky, 1875-1918.
Our mantra 2:
The structure should be exploited whenever possible in solving a problem.
Common structures include: symmetry, positive (semi)definiteness, sparsity, Kronecker product, low rank, ...
We can always assume that a postive (semi)definite matrix is symmetric (why?)
Example: normal equation
for linear regression, the coefficient matrix $\mathbf{X}^T \mathbf{X}$ is symmetric and positive semidefinite. How to exploit this structure?
Proof (by induction):
If $n=1$, then $0 < a = \sqrt{a}\sqrt{a}$. For $n>1$, the block equation
$$
\begin{eqnarray*}
\begin{bmatrix}
a_{11} & \mathbf{a}^T \\ \mathbf{a} & \mathbf{A}_{22}
\end{bmatrix} =
\begin{bmatrix}
\ell_{11} & \mathbf{0}_{n-1}^T \\ \mathbf{b} & \mathbf{L}_{22}
\end{bmatrix}
\begin{bmatrix}
\ell_{11} & \mathbf{b}^T \\ \mathbf{0}_{n-1} & \mathbf{L}_{22}^T
\end{bmatrix}
\end{eqnarray*}
$$
entails
$$
\begin{eqnarray*}
a_{11} &=& \ell_{11}^2 \\
\mathbf{a} &=& \ell_{11} \mathbf{b} \\
\mathbf{A}_{22} &=& \mathbf{b} \mathbf{b}^T + \mathbf{L}_{22} \mathbf{L}_{22}^T
.
\end{eqnarray*}
$$
Since $a_{11}>0$ (why?), $\ell_{11}=\sqrt{a_{11}}$ and $\mathbf{b}=a_{11}^{-1/2}\mathbf{a}$ are uniquely determined.
$\mathbf{A}_{22} - \mathbf{b} \mathbf{b}^T = \mathbf{A}_{22} - a_{11}^{-1} \mathbf{a} \mathbf{a}^T$ is positive definite of size $(n-1)\times(n-1)$ because $\mathbf{A}_{22}$ is positive definite (why? homework). By the induction hypothesis, lower triangular $\mathbf{L}_{22}$ such that $\mathbf{A}_{22} - \mathbf{b} \mathbf{b}^T = \mathbf{L}_{22}^T\mathbf{L}_{22}$ exists and is unique.
for (k in seq_len(n)) {
for (j in k + seq_len(n - k)) {
a_jk_div_a_kk <- A[j, k] / A[k, k]
for (i in j - 1 + seq_len(n - j + 1)) {
A[i, j] <- A[i, j] - A[i, k] * a_jk_div_a_kk # L_22
}
}
sqrt_akk <- sqrt(A[k, k])
for (i in k - 1 + seq_len(n - k + 1)) {
A[i, k] = A[i, k] / sqrt_akk # l_11 and b
}
}
Source: http://www.netlib.org
plus $n$ square roots. Half the cost of LU decomposition.
Pivoting is only needed if you want the Cholesky factor of a positive semidefinite matrix.
When $\mathbf{A}$ does not have full rank, e.g., $\mathbf{X}^T \mathbf{X}$ with a non-full column rank $\mathbf{X}$, we encounter $a_{kk} = 0$ during the procedure.
Symmetric pivoting. At each stage $k$, we permute both row and column such that $\max_{k \le i \le n} a_{ii}$ becomes the pivot. If we encounter $\max_{k \le i \le n} a_{ii} = 0$, then $\mathbf{A}[k:n,k:n] = \mathbf{0}$ (why?) and the algorithm terminates.
With symmetric pivoting:
where $\mathbf{P}$ is a permutation matrix and $\mathbf{L} \in \mathbb{R}^{n \times r}$, $r = \text{rank}(\mathbf{A})$.
A <- t(Matrix(c(4, 12, -16, 12, 37, -43, -16, -43, 98), nrow=3)) # check out this is pd!
A
3 x 3 Matrix of class "dsyMatrix" [,1] [,2] [,3] [1,] 4 12 -16 [2,] 12 37 -43 [3,] -16 -43 98
# Cholesky without pivoting
Achol <- Matrix::chol(Matrix::symmpart(A))
Achol
3 x 3 Matrix of class "Cholesky" [,1] [,2] [,3] [1,] 2 6 -8 [2,] . 1 5 [3,] . . 3
class(Achol)
getSlots("Cholesky")
str(Achol)
Formal class 'Cholesky' [package "Matrix"] with 5 slots ..@ Dim : int [1:2] 3 3 ..@ Dimnames:List of 2 .. ..$ : NULL .. ..$ : NULL ..@ x : num [1:9] 2 0 0 6 1 0 -8 5 3 ..@ uplo : chr "U" ..@ diag : chr "N"
b <- c(1.0, 2.0, 3.0)
solve(A, b) # this does LU; wasteful!; 2/3 n^3 + 2n^2
3 x 1 Matrix of class "dgeMatrix" [,1] [1,] 28.583333 [2,] -7.666667 [3,] 1.333333
y <- solve(t(Achol), b) # two triangular solves; only 2n^2 flops
solve(Achol, y)
3 x 1 Matrix of class "dgeMatrix" [,1] [1,] 28.583333 [2,] -7.666667 [3,] 1.333333
det(A) # this actually does LU; wasteful!
det(Achol)^2 # cheap
solve(A) # this does LU!
3 x 3 Matrix of class "dsyMatrix" [,1] [,2] [,3] [1,] 49.361111 -13.5555556 2.1111111 [2,] -13.555556 3.7777778 -0.5555556 [3,] 2.111111 -0.5555556 0.1111111
solve(Achol, solve(t(Achol))) # 2n triangular solves
3 x 3 Matrix of class "dgeMatrix" [,1] [,2] [,3] [1,] 49.361111 -13.5555556 2.1111111 [2,] -13.555556 3.7777778 -0.5555556 [3,] 2.111111 -0.5555556 0.1111111
(A <- Matrix(c(1,1,.5,1,1,.5,.5,.5,1), nrow=3))
eigen(A, symmetric = TRUE)$values
3 x 3 Matrix of class "dsyMatrix" [,1] [,2] [,3] [1,] 1.0 1.0 0.5 [2,] 1.0 1.0 0.5 [3,] 0.5 0.5 1.0
Matrix::chol(A, pivot=FALSE) # 2nd argument requests pivoting
Error in value[[3L]](cond): chol(x) is undefined: 'x' is not positive definite Traceback: 1. Matrix::chol(A, pivot = FALSE) 2. Matrix::chol(A, pivot = FALSE) 3. tryCatch(.Call(dpoMatrix_trf, if (x@uplo == "U") x else t(x), . 2L), error = function(e) stop("chol(x) is undefined: 'x' is not positive definite")) 4. tryCatchList(expr, classes, parentenv, handlers) 5. tryCatchOne(expr, names, parentenv, handlers[[1L]]) 6. value[[3L]](cond) 7. stop("chol(x) is undefined: 'x' is not positive definite")
(Achol <- base::chol(A, pivot=TRUE)) # turn off checking pd
attr(Achol, "pivot")
attr(Achol, "rank")
Warning message in chol.default(A, pivot = TRUE): “the matrix is either rank-deficient or indefinite”
1 | 0.5000000 | 1 |
0 | 0.8660254 | 0 |
0 | 0.0000000 | 0 |
Matrix::rankMatrix(A) # determine rank from SVD, which is more numerically stable
Achol[, attr(Achol, "pivot")]
1 | 1 | 0.5000000 |
0 | 0 | 0.8660254 |
0 | 0 | 0.0000000 |
(P <- as(attr(Achol, "pivot"), "pMatrix")) # permutation matrix, actually P' in our notation
3 x 3 sparse Matrix of class "pMatrix" [1,] | . . [2,] . . | [3,] . | .
# P A P' = L L'
Matrix::norm(t(P) %*% A %*% P - t(Achol) %*% Achol, type="F") # Frobenius norm
Multivariate normal density $\mathcal{N}(0, \Sigma)$, where $\Sigma$ is $n\times n$ positive definite, is $$ \frac{1}{(2\pi)^{n/2}\det(\Sigma)^{1/2}}\exp\left(-\frac{1}{2}\mathbf{y}^T\Sigma^{-1}\mathbf{y}\right). $$
Hence the log likelihood is $$
$$
Method 1:
Method 2:
Which method is better?
# word-by-word transcription of mathematical expression
# Matrix::determinant computes the logarithm of the determinant
logpdf_mvn_1 <- function(y, Sigma) {
n <- length(y)
- (n / 2) * log(2 * pi) - (1 / 2) * Matrix::determinant(Sigma)$modulus - (1 / 2) * t(y) %*% solve(Sigma) %*% y
}
# you learnt why you should avoid inversion
logpdf_mvn_2 <- function(y, Sigma) {
n <- length(y)
Sigmachol <- Matrix::chol(Matrix::symmpart(Sigma))
- (n / 2) * log(2 * pi) - Matrix::determinant(Sigmachol)$modulus - (1 / 2) * sum(abs(solve(t(Sigmachol), y))^2)
}
# this is for the efficiency-concerned
logpdf_mvn_3 <- function(y, Sigma) {
n <- length(y)
Sigmachol <- Matrix::chol(Matrix::symmpart(Sigma))
- 0.5 * n * log(2 * pi) - sum(log(diag(Sigmachol))) - 0.5 * sum(abs(solve(t(Sigmachol), y))^2)
}
set.seed(123) # seed
n <- 1000
# a pd matrix
Sigma_half <- Matrix(rnorm(n * (2 * n)), nrow=n)
Sigma <- Sigma_half %*% t(Sigma_half)
Sigma <- Matrix::symmpart(Sigma)
Sigmachol <- Matrix::chol(Sigma)
y <- Sigmachol %*% rnorm(n) # one random sample from N(0, Sigma). Homework
# at least they give same answer
(logpdf_mvn_1(y, Sigma))
(logpdf_mvn_2(y, Sigma))
(logpdf_mvn_3(y, Sigma))
1 x 1 Matrix of class "dgeMatrix" [,1] [1,] -5272.565
res <- microbenchmark::microbenchmark(logpdf_mvn_1(y, Sigma), logpdf_mvn_2(y, Sigma), logpdf_mvn_3(y, Sigma))
print(res)
Unit: milliseconds expr min lq mean median uq logpdf_mvn_1(y, Sigma) 194.263957 197.941488 202.391593 202.178377 205.562591 logpdf_mvn_2(y, Sigma) 2.657588 3.105828 5.582544 3.476632 4.333807 logpdf_mvn_3(y, Sigma) 2.536665 3.143556 5.134440 3.367640 3.741760 max neval cld 223.3571 100 b 111.4274 100 a 111.8951 100 a
Total computational cost is $np^2 + (1/3) p^3$ (without s.e.) or $np^2 + (5/3) p^3$ (with s.e.) flops.