import statsmodels.api as sm
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
from scipy import stats
from sklearn.datasets import make_regression
from numpy import linalg as la
Model fitting:
$$F(d_i, w) = y_i \\ d_i = \text{input data} \\ w = \text{model weights} \\ F = \text{model} \\ y_i = \text{labels/outputs}$$
Linear model example:
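For a linear model, $F$ is simply an inner product of the features with a weight vector (consistent with the least-squares loss below):
$$F(d_i, w) = d_i^T w$$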
Loss function:
Least-squares loss function:
$$\min_w \|Dw - b\|^2 \\ \ell(d_i, w, b_i) = (d_i^T w - b_i)^2$$
where $d_i^T$ is the $i$-th row of the matrix $D$, so $d_i^T w$ is an inner product (the matrix-vector product taken row by row), and the loss is its squared difference from $b_i$.
Penalized regressions: ridge penalty
$$\min_w \|w\|_2^2 + \|Dw - b\|^2$$
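Setting the gradient to zero gives the closed-form solution $w = (D^T D + \lambda I)^{-1} D^T b$ (with $\lambda = 1$ in the formulation above). A minimal numpy sketch, where the data matrix, true weights, and penalty value are all made up for illustration:
np.random.seed(0)
D = np.random.randn(50, 3)                     # made-up data matrix (50 samples, 3 features)
w_true = np.array([1.0, -2.0, 0.5])            # made-up true weights
b = D @ w_true + 0.1*np.random.randn(50)       # noisy labels
lam = 1.0                                      # ridge penalty (lambda = 1 as above)
# Ridge solution: minimizes lam*||w||^2 + ||D w - b||^2
w_ridge = np.linalg.solve(D.T @ D + lam*np.eye(3), D.T @ b)
print(w_ridge)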
A = np.random.randint(20, size=(3,3))
A
array([[ 8, 18,  4],
       [ 4,  9, 13],
       [17, 19,  5]])
values, vectors = np.linalg.eig(A)   # eigenvalues and eigenvectors of A
U = vectors                          # matrix of eigenvectors
U_inv = np.linalg.inv(vectors)       # its inverse
D = np.diag(values)                  # diagonal matrix of eigenvalues
eigen_decomp = U@D@U_inv             # reconstruct A from its eigendecomposition
print(np.real(np.round(eigen_decomp,2)))
[[ 8. 18.  4.]
 [ 4.  9. 13.]
 [17. 19.  5.]]
Example of eigenvalue decomposition in optimization: $$x = \beta_1 e_1 + \beta_2 e_2 \\ Ax = A(\beta_1 e_1 + \beta_2 e_2) \\ = A\beta_1e_1 + A\beta_2 e_2 $$
Multiplying $A$ by an eigenvector scales it by the corresponding eigenvalue, so $$Ax = \beta_1 \lambda_1 e_1 + \beta_2 \lambda_2 e_2 \tag{2.1}$$
Applying $A^{-1}$ instead inverts the eigenvalues: $$A^{-1}x = \beta_1 \lambda_1^{-1} e_1 + \beta_2 \lambda_2^{-1} e_2 \tag{2.2}$$
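A quick numerical check of (2.1) and (2.2); a sketch with a made-up symmetric matrix so the eigenvectors form a convenient basis:
M = np.array([[2., 1.], [1., 2.]])                 # made-up symmetric matrix
vals, vecs = np.linalg.eig(M)
beta = np.linalg.solve(vecs, np.array([1., 3.]))   # coefficients of x in the eigenbasis
x = vecs @ beta                                    # x = beta_1 e_1 + beta_2 e_2
print(np.allclose(M @ x, vecs @ (vals * beta)))                  # (2.1): eigenvalues scale each term
print(np.allclose(np.linalg.inv(M) @ x, vecs @ (beta / vals)))   # (2.2): the inverse divides by them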
Using slightly different notation:
$$AV = cIV$$ where $V$ is an eigenvector of $A$, $c$ is the corresponding eigenvalue, and $I$ is the identity matrix.
Rearranged:
$$AV - cIV = 0 \\ (A - cI)V = 0$$ where $A - cI$ is called the characteristic matrix of $A$.
Matrix A is:
A = np.array([[-6,3],[3,-6]])
np.linalg.eig(A)[0]
array([-3., -9.])
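As a quick check of the characteristic-matrix idea, $\det(A - cI)$ should vanish at each eigenvalue; a small sketch using the matrix above:
A = np.array([[-6, 3], [3, -6]])
for c in np.linalg.eig(A)[0]:                               # eigenvalues -3 and -9
    print(np.isclose(np.linalg.det(A - c*np.eye(2)), 0))    # True for each eigenvalue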
def eigen(matrix):
    A = np.diag(matrix)                # main diagonal entries
    I = np.diag(np.fliplr(matrix))     # anti-diagonal entries
    # Characteristic polynomial: lambda^2 + b*lambda + c = 0
    c = np.prod(A) - np.prod(I)        # c-term (the determinant)
    b = -np.sum(A)                     # b-term (minus the trace)
    root_term = math.sqrt(b**2 - 4*c)  # quadratic formula root term
    solu1 = (-b + root_term)/2         # quadratic formula solution 1
    solu2 = (-b - root_term)/2         # quadratic formula solution 2
    vec1 = matrix - np.diag(np.repeat(solu1, 2))  # A - lambda_1 * I (for eigenvectors)
    vec2 = matrix - np.diag(np.repeat(solu2, 2))  # A - lambda_2 * I (for eigenvectors)
    print("Original matrix: \n", matrix)
    print("Eigen-values: {}, {}\n".format(round(solu1, 3), round(solu2, 3)))
    # Classification of matrix by the signs of its eigenvalues
    if solu1 > 0 and solu2 > 0:
        print('Pos definite')
    elif solu1 < 0 and solu2 < 0:
        print('Neg definite')
    elif (solu1 == 0 or solu2 == 0) and solu1 >= 0 and solu2 >= 0:
        print('Pos semi-def')
    elif (solu1 == 0 or solu2 == 0) and solu1 <= 0 and solu2 <= 0:
        print('Neg semi-def')
    elif (solu1 < 0 and solu2 > 0) or (solu1 > 0 and solu2 < 0):
        print('Indefinite')
A = np.random.randint(5, size=(2,2))
eigen(A)
Original matrix: 
 [[0 0]
 [0 4]]
Eigen-values: 4.0, 0.0

Pos semi-def
Suppose:
$$\lambda_1 = 1, \quad \lambda_2 = 0.1, \quad \lambda_3 = 0.01 \\ Ax = b = \beta_1 e_1 + 0.1\beta_2 e_2 + 0.01\beta_3 e_3 \\ \hat{b} = b + \eta$$
The noise $\eta$ shows up uniformly in every direction (frequency).
Example: adding random Gaussian noise of size 0.02, with no preferred direction:
$$\hat{b} = 1.02\,\beta_1 e_1 + 0.12\,\beta_2 e_2 + 0.03\,\beta_3 e_3$$
Inverting $A$ to recover $x$ divides each term by its eigenvalue, which blows up the error in the small-eigenvalue directions.
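A small numeric sketch of this blow-up, using a made-up diagonal $A$ with exactly these eigenvalues, $\beta_i = 1$, and noise of size 0.02 in every direction:
A = np.diag([1.0, 0.1, 0.01])       # eigenvalues 1, 0.1, 0.01
x = np.ones(3)                      # made-up true coefficients (beta_i = 1)
b = A @ x                           # [1.0, 0.1, 0.01]
b_hat = b + 0.02                    # noise of size 0.02 in every direction
x_hat = np.linalg.inv(A) @ b_hat    # inversion divides the noise by each eigenvalue
print(x_hat)                        # [1.02, 1.2, 3.0]: the 0.01-direction error explodes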
This issue crops up all the time
Condition number:
Ratio of the largest to the smallest singular value
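In symbols, $\kappa(A) = \sigma_{\max} / \sigma_{\min}$. A quick sanity check that np.linalg.cond agrees with this ratio (the matrix here is made up):
M = np.random.randn(4, 4)                             # made-up matrix
s = np.linalg.svd(M, compute_uv=False)                # singular values, sorted largest first
print(np.isclose(np.linalg.cond(M), s[0] / s[-1]))    # True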
Code adapted from CMSC 764, homework 2:
def condition_blowup(n):
    A = sp.linalg.hilbert(n)    # construct an n-by-n Hilbert matrix (badly conditioned)
    x = np.random.randn(n,1)    # construct an n-by-1 signal
    b = A@x
    # 1. Recover x by inverting A (could also use la.solve(A, b))
    x_recovered = la.inv(A)@b
    print('1. Recovery error with no noise = {:0.3}'.format(la.norm(x_recovered-x)))
    # 2.1 Now add a small amount of noise to the vector b
    b_noise = b + np.random.randn(n,1)*0.0000001
    # 2.2 The error blows up when we try to recover x
    x_recovered_withnoise = la.inv(A)@b_noise
    print('2. Recovery error with noise of {:.03} = {:.03f}'.format(la.norm(b_noise-b), la.norm(x_recovered_withnoise-x)))
np.random.seed(0)
condition_blowup(8)
1. Recovery error with no noise = 1.63e-06
2. Recovery error with noise of 1.79e-07 = 178.930
import numpy as np
def buildmat(m, n, condition_number):
    A = np.random.randint(10, size=(m,n))
    print("The {}x{} matrix A with condition number={}:\n{}".format(m, n, condition_number, A))
    #print("Np.linalg.cond", np.linalg.cond(A))
    U, S, V = np.linalg.svd(A)   # note: V returned by numpy is actually V-transpose
    S = np.array([[S[j] if i==j else 0 for j in range(n)] for i in range(m)])  # diagonal matrix of singular values
    S[S!=0] = np.linspace(condition_number, 1, min(m,n))  # replace singular values so sigma_max/sigma_min = condition_number
    A = U@S@V
    print("Constructed matrix which will yield a condition number={}:\n{}".format(condition_number, A))
    return A
m, n, condNumber = 2, 2, 2
np.random.seed(100)
A = buildmat(m, n, condNumber)
print(round(np.linalg.cond(A), 3), "\n")
The 2x2 matrix A with condition number=2:
[[8 8]
 [3 7]]
Constructed matrix which will yield a condition number=2:
[[1.46829282 0.97228825]
 [0.02360495 1.37775707]]
2.0
What if the matrix is not full-rank?
$$A \in \mathbb{R}^{M\times N}, \quad M < N \\ b = Ax + \eta$$
If the error is bounded, $\|\eta\| \leq \varepsilon$, solve:
$$\min \|x\| \quad \text{s.t.} \quad \|Ax - b\| \leq \varepsilon$$
This is equivalent to:
$$\min_x \; \lambda\|x\|^2 + \|Ax - b\|^2$$
One way to analyze the ridge penalty is to look at what it does to the condition number of the system.
The new condition number:
$$\frac{\sigma^2_{\max}+\lambda}{\sigma^2_{\min}+\lambda}$$
Anything that involves a smooth $\ell_2$ penalty gives linear systems that are easy to solve, and the improved condition number means less noise sensitivity.
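A small numeric sketch of this improvement (the Hilbert matrix from earlier and the value of $\lambda$ are chosen here just for illustration):
from scipy.linalg import hilbert
H = hilbert(8)                              # badly conditioned matrix
s = np.linalg.svd(H, compute_uv=False)      # singular values
lam = 1e-3                                  # assumed ridge penalty
print('condition number without ridge:', s[0]/s[-1])
print('condition number with ridge   :', (s[0]**2 + lam)/(s[-1]**2 + lam))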
Parameter $\lambda$ can be set in two ways: by cross-validation, or by a Bayesian (MAP) argument, as below.
Suppose we pick one of two bowls of cookies at random, where $p(V|B_1) = 3/4$ and $p(V|B_2) = 1/2$, and draw a vanilla cookie. What is the probability it came from bowl 1?
We cannot read this off directly, but the answer can be found using Bayes' rule:
$$P(\text{Bowl 1}|\text{Vanilla}) = \frac{p(B_1)\,p(V|B_1)}{p(V)} \\ = \frac{1/2 \cdot 3/4}{5/8} \\ = 0.6$$
Assumption: each bowl is equally likely to be chosen, $p(B_1) = p(B_2) = 1/2$.
Bayes' rule: $$P(X|Y) \propto P(Y|X)\times P(X)$$
$$\max_x P(x|b) \propto p(b|x)\,p(x)$$
Probability of the data (likelihood):
$$P(b|x) = \prod_i \frac{1}{\sigma \sqrt{2\pi}}e^{-\frac{1}{2\sigma^2}(b_i-(Ax)_i)^2} = (2\pi\sigma^2)^{-m/2}\, e^{-\frac{1}{2\sigma^2}\|b-Ax\|^2}$$
Prior: $$P(x) = (2\pi E^2)^{-n/2}\, e^{-\frac{1}{2E^2}\|x\|^2}$$
Maximize:
$$\exp\!\left(\frac{-\|b-Ax\|^2}{2\sigma^2}\right)\exp\!\left(\frac{-\|x\|^2}{2E^2}\right)$$
Taking the log gives
$$-\frac{\|b-Ax\|^2}{2\sigma^2}-\frac{\|x\|^2}{2E^2}$$
Multiply by $-1$ (and rescale by $2\sigma^2$) to turn this into a minimization problem:
$$\frac{\sigma^2}{E^2}\|x\|^2+\|b-Ax\|^2$$
This is exactly the ridge problem with $\lambda = \sigma^2/E^2$.
from sklearn.datasets import make_regression
np.random.seed(1)
x, y = make_regression(n_samples=100, n_features=1, noise=10,effective_rank=1, bias=5)
x = pd.DataFrame(x)
# Create polynomial model
for i in range(0,20):
    x['x'+str(i)] = x.iloc[:,[0]]**i    # polynomial features x^0 ... x^19
x = x.drop(0, axis=1)                   # drop the original raw feature column
results = sm.OLS(y, x.iloc[:, 1:]).fit()
plt.scatter(x['x1'],y)
plt.scatter(x.x1, results.predict())
plt.title(r'Example of overfitting: $y=a_1x+\cdots+a_{19}x^{19}$')
plt.legend(['Observations','Fitted'])
plt.show()
A large condition number $\rightarrow$ risk of overfitting
When the condition number is big, any noise or rounding error will likely lead to overfitting. Solving the linear system (e.g., by row elimination) accumulates small rounding errors, and inverting an ill-conditioned matrix amplifies them until they become part of the solution. Polynomial design matrices like the one above have very large condition numbers, so these errors explode. Conversely, a well-conditioned problem is difficult to overfit, because the solution depends only weakly on the noise.
np.random.seed(1)
x, y = make_regression(n_samples=20, n_features=1, noise=10,effective_rank=1, bias=5)
x = pd.DataFrame(x)
# Create polynomial model
rsq = np.zeros(20)
degree = np.arange(1,20)
for i in degree:
    x['x'+str(i)] = x.iloc[:,[0]]**i          # add the degree-i polynomial feature
    results = sm.OLS(y, x.iloc[:, 1:]).fit()
    rsq[i] = results.rsquared
print('Condition number: {}'.format(la.cond(x,2)))
ax = plt.plot(degree, rsq[1:])
plt.ylabel('R-squared')
plt.xlabel('Polynomial degree fit')
# Mark the first polynomial degree at which the R-squared decreases
for idx, i in enumerate(np.diff(rsq[1:])):
    if i < 0:
        plt.axvline(degree[idx], color='red')
        plt.title('Model is best fit @ degree '+str(degree[idx]))
        break
Condition number: 9.285999564131141e+17
Sparsity ($\ell_0$ / $\ell_1$) penalties are also used to control over-fitting.
They are usually used when the matrix is fat ($M < N$: more unknowns than equations).
$$\min \|x\|_0 \quad \text{s.t.} \quad Ax = b$$
The problem is that the $\ell_0$ norm is not convex, but we can still find a solution by relaxing it to the convex $\ell_1$ norm:
Basis Pursuit: $$\min \|x\|_1 \quad \text{s.t.} \quad Ax = b$$
Basis Pursuit Denoising: $$\min \|x\|_1 \quad \text{s.t.} \quad \|Ax - b\| \leq \varepsilon$$
Lasso: $$\min \|Ax - b\|^2 \quad \text{s.t.} \quad \|x\|_1 \leq \tau$$
The penalized form of Basis Pursuit Denoising:
$$\lambda\|x\|_1 + \frac{1}{2}\|Ax - b\|^2$$
The prior here is often a Laplace distribution, which is more robust to outliers.
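A minimal sklearn sketch of the Lasso form above; the design matrix, the sparse true coefficients, and the penalty value alpha are all made up here, and sklearn's Lasso scales the squared-error term by 1/(2·n_samples):
from sklearn.linear_model import Lasso
np.random.seed(0)
A = np.random.randn(50, 10)                      # made-up design matrix
x_true = np.zeros(10)
x_true[:3] = [2.0, -1.0, 0.5]                    # sparse true coefficients
b = A @ x_true + 0.1*np.random.randn(50)         # noisy observations
lasso = Lasso(alpha=0.1, fit_intercept=False)    # alpha plays the role of lambda
lasso.fit(A, b)
print(lasso.coef_)                               # the small coefficients are driven to zero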
Laplace distribution PDF:
$$f(x; \mu, \lambda)= \frac{1}{2\lambda}\exp(-\frac{|x - \mu|}{\lambda})$$
loc, scale = 0, 1
laplace = np.random.laplace(loc, scale, 1000)
x = np.arange(-8., 8., .01)
laplace_pdf = np.exp(-abs(x-loc)/scale)/(2.*scale)
plt.plot(x, laplace_pdf, label='Laplacian')
gaussian_pdf = (1/(scale * np.sqrt(2 * np.pi)) *
np.exp(-(x - loc)**2 / (2 * scale**2)))
plt.plot(x,gaussian_pdf, label='Gaussian')
plt.legend()
plt.show()
The most straightforward form of cross-validation (CV) is a train vs. test split.
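A minimal sketch of using a train/test split to choose $\lambda$ for ridge; the data, the Ridge model, and the candidate $\lambda$ values are all made up here:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
np.random.seed(0)
X = np.random.randn(100, 5)                                  # made-up features
y = X @ np.array([1.0, 0.0, -2.0, 0.0, 0.5]) + 0.5*np.random.randn(100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
for lam in [0.01, 0.1, 1.0, 10.0]:                           # candidate penalties
    model = Ridge(alpha=lam).fit(X_train, y_train)
    print(lam, model.score(X_test, y_test))                  # keep the lambda with the best test R^2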