using Requests

longleyurl = get("http://hua-zhou.github.io/teaching/biostatm280-2018spring/slides/13-cond/longley.txt")
longley = readdlm(IOBuffer(longleyurl.data))

using StatPlots
gr()

corrplot(longley, 
    label = ["Employ" "Price" "GNP" "Jobless" "Military" "PopSize" "Year"])

# response: Employment
y = longley[:, 1]
# predictor matrix
X = [ones(length(y)) longley[:, 2:end]]

# Julia function for obtaining condition number
cond(X)

# what's the implementation?
@which cond(X)

# we see the smallest singular value (aka trouble number) is very small
xsvals = svdvals(X)

# condition number of the design matrix
xcond = maximum(xsvals) / minimum(xsvals)

# X is full rank from SVD
xrksvd = rank(X)

@which rank(X)

# least squares from QR
X \ y

# Gram matrix
G = X'X

# rank of Gram matrix from SVD
# rank deficient!
rank(G)

svdvals(G)

# rank of Gram matrix from cholesky
# full!
gchol = cholfact(G, :L, Val{true})
rank(gchol)

# least squares solution from Cholesky matches those from QR
gchol \ (X'y)

Xsp = convert(Matrix{Float32}, X)
ysp = convert(Vector{Float32}, y)

# least squares by QR: dramatically different from Float64 solution
Xsp \ ysp

# least squares by Cholesky: failed
G = Xsp'Xsp
gchol = cholfact(G, :L, Val{true})
gchol \ (Xsp'ysp)

rank(Xsp)

# rank of Gram matrix by Cholesky
rank(gchol)

# rank of Gram matrix by SVD
rank(G)

using StatsBase

# center and standardize matrix along dimension 1
Xcs = zscore(X[:, 2:end], 1)
Xcs = [ones(length(y)) Xcs]

cond(Xcs)

rank(Xcs)

rank(Xcs'Xcs)

versioninfo()