using LinearAlgebra
using Plots
default(linewidth=4, legendfontsize=12)
function vander(x, k=nothing)
    if isnothing(k)
        k = length(x)  # Square by default
    end
    m = length(x)
    V = ones(m, k)
    for j in 2:k
        V[:, j] = V[:, j-1] .* x
    end
    V
end
function vander_chebyshev(x, n=nothing)
    if isnothing(n)
        n = length(x) # Square by default
    end
    m = length(x)
    T = ones(m, n)
    if n > 1
        T[:, 2] = x
    end
    for k in 3:n
        # Three-term recurrence: T_k(x) = 2x T_{k-1}(x) - T_{k-2}(x)
        T[:, k] = 2 * x .* T[:, k-1] - T[:, k-2]
    end
    T
end
function chebyshev_regress_eval(x, xx, n)
    V = vander_chebyshev(x, n)
    vander_chebyshev(xx, n) / V
end
runge(x) = 1 / (1 + 10*x^2)
runge_noisy(x, sigma) = runge.(x) + randn(size(x)) * sigma
CosRange(a, b, n) = (a + b)/2 .+ (b - a)/2 * cos.(LinRange(-pi, 0, n))
vcond(mat, points, nmax) = [cond(mat(points(-1, 1, n))) for n in 2:nmax]
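As a quick sanity check of these helpers (a sketch; the names xfit and xx are chosen here, and its output is not captured below), we can fit the Runge function on cosine-spaced points and evaluate the fit on a fine grid:
xfit = CosRange(-1, 1, 20)  # Chebyshev points mitigate the Runge phenomenon
xx = LinRange(-1, 1, 200)
plot(xx, chebyshev_regress_eval(xfit, xx, 20) * runge.(xfit), label="Chebyshev fit")
plot!(runge, color=:black, label="Runge")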
Instead of solving the least squares problem using linear algebra (QR factorization), we could solve it using gradient descent. That is, on each iteration, we'll take a step in the direction of the negative gradient.
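That is, with learning rate $\gamma$, each iteration applies the update $$ c_{k+1} = c_k - \gamma \nabla_c L(c_k), $$ stopping when the gradient is small.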
function grad_descent(loss, grad, c0; gamma=1e-3, tol=1e-5)
    """Minimize loss(c) via gradient descent with initial guess c0
    using learning rate gamma. Declares convergence when the norm
    of the gradient is less than tol, or stops after 500 steps.
    """
    c = copy(c0)
    chist = [copy(c)]
    lhist = [loss(c)]
    for it in 1:500
        g = grad(c)
        c -= gamma * g
        push!(chist, copy(c))
        push!(lhist, loss(c))
        if norm(g) < tol
            break
        end
    end
    (c, hcat(chist...), lhist)
end
A = [1 1; 1 8]
@show cond(A)
loss(c) = .5 * c' * A * c
grad(c) = A * c
c, chist, lhist = grad_descent(loss, grad, [.9, .9], gamma=.22)
plot(lhist, yscale=:log10, xlims=(0, 80))
cond(A) = 9.46578492882319
plot(chist[1, :], chist[2, :], marker=:circle)
x = LinRange(-1, 1, 30)
contour!(x, x, (x,y) -> loss([x, y]))
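Since the gradient here is $Ac$, the iteration is $c_{k+1} = (I - \gamma A) c_k$, so the error contracts each step by the factor $\max_i |1 - \gamma \lambda_i|$, where $\lambda_i$ are the eigenvalues of $A$. A quick check of that predicted rate for the parameters above (a sketch; lam and rate are names chosen here):
lam = eigvals(A)                        # eigenvalues of A (from LinearAlgebra)
rate = maximum(abs.(1 .- 0.22 .* lam))  # predicted contraction per step, ≈ 0.81
Stability requires $\gamma < 2/\lambda_{\max}$, so an ill-conditioned $A$ forces a small step and a contraction factor near 1; this is why cond(A) matters above.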
x = LinRange(-1, 1, 200)
sigma = 0.5; n = 8
y = runge_noisy(x, sigma)
V = vander(x, n) # or vander_chebyshev
function loss(c)
    r = V * c - y
    .5 * r' * r
end
function grad(c)
    r = V * c - y
    V' * r
end
c, _, lhist = grad_descent(loss, grad, ones(n), gamma=0.008)
c
8-element Vector{Float64}:
  0.7805671076161028
  0.07125885267568355
 -1.8025038787243435
  0.03382484774772874
  0.8464406682785461
 -0.17304293350946157
  0.6962824216982956
  0.0711130108623033
c0 = V \ y
l0 = 0.5 * norm(V * c0 - y)^2
@show cond(V)
@show cond(V' * V)
plot(lhist, yscale=:log10, ylim=(15, 50))
plot!(i -> l0, color=:black)
cond(V) = 230.00549982014527
cond(V' * V) = 52902.52994792632
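Note that $\operatorname{cond}(V^T V) = \operatorname{cond}(V)^2$. The Hessian of the least squares loss is $V^T V$, so gradient descent sees the squared condition number, which explains its slow convergence relative to the direct QR-based solve. A one-line check:
cond(V)^2  # ≈ 52902.53, matching cond(V' * V)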
Instead of the linear model $$ f(x,c) = V(x) c = c_0 + c_1 \underbrace{x}_{T_1(x)} + c_2 T_2(x) + \dotsb $$ let's consider a rational model with only three parameters $$ f(x,c) = \frac{1}{c_1 + c_2 x + c_3 x^2} = (c_1 + c_2 x + c_3 x^2)^{-1} . $$ We'll use the same loss function $$ L(c; x,y) = \frac 1 2 \lVert f(x,c) - y \rVert^2 . $$
We will also need the gradient $$ \nabla_c L(c; x,y) = \big( f(x,c) - y \big)^T \nabla_c f(x,c) $$ where \begin{align} \frac{\partial f(x,c)}{\partial c_1} &= -(c_1 + c_2 x + c_3 x^2)^{-2} = - f(x,c)^2 \\ \frac{\partial f(x,c)}{\partial c_2} &= -(c_1 + c_2 x + c_3 x^2)^{-2} x = - f(x,c)^2 x \\ \frac{\partial f(x,c)}{\partial c_3} &= -(c_1 + c_2 x + c_3 x^2)^{-2} x^2 = - f(x,c)^2 x^2 . \end{align}
f(x, c) = 1 ./ (c[1] .+ c[2].*x + c[3].*x.^2)
function gradf(x, c)
    f2 = f(x, c).^2
    [-f2 -f2.*x -f2.*x.^2]  # columns: ∂f/∂c₁, ∂f/∂c₂, ∂f/∂c₃
end
function loss(c)
    r = f(x, c) - y
    0.5 * r' * r
end
function gradient(c)
    r = f(x, c) - y
    vec(r' * gradf(x, c))
end
c, _, lhist = grad_descent(loss, gradient, 1ones(3), gamma=5e-2)
plot(lhist, yscale=:identity, ylim=(20, 60), title="Loss $(lhist[end])")
scatter(x, y)
V = vander_chebyshev(x, 20)
plot!(x -> runge(x), color=:black, label="Runge")
plot!(x, V * (V \ y), label="Chebyshev fit")
plot!(x -> f(x, c), label="Rational fit")
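The rational model can fit this well because the Runge function is itself a member of the model class: $\operatorname{runge}(x) = 1/(1 + 10x^2)$ corresponds exactly to $c = (1, 0, 10)$.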
Classical accuracy analysis assumes that functions are sufficiently smooth, meaning that derivatives exist and Taylor expansions are valid within a neighborhood. In particular, $$ f(x+h) = f(x) + f'(x) h + f''(x) \frac{h^2}{2!} + \underbrace{f'''(x) \frac{h^3}{3!} + \dotsb}_{O(h^3)} . $$
The big-$O$ notation is meant in the limit $h\to 0$. Specifically, a function $g(h) \in O(h^p)$ (sometimes written $g(h) = O(h^p)$) when there exists a constant $C$ such that $$ |g(h)| \le C h^p $$ for all sufficiently small $h$.
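For example, $g(h) = 3h^2 + h^3$ is in $O(h^2)$ because $|g(h)| \le 4 h^2$ whenever $0 < h \le 1$.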
We have an additional source of error, rounding error, which comes from not being able to compute $f(x)$ or $f(x+h)$ exactly, nor subtract them exactly. Suppose that we can, however, compute these functions with a relative error on the order of $\epsilon_{\text{machine}}$. This leads to $$ \begin{split} \tilde f(x) &= f(x)(1 + \epsilon_1) \\ \tilde f(x \oplus h) &= \tilde f((x+h)(1 + \epsilon_2)) \\ &= f((x + h)(1 + \epsilon_2))(1 + \epsilon_3) \\ &= [f(x+h) + f'(x+h)(x+h)\epsilon_2 + O(\epsilon_2^2)](1 + \epsilon_3) \\ &= f(x+h)(1 + \epsilon_3) + f'(x+h)x\epsilon_2 + O(\epsilon_{\text{machine}}^2 + \epsilon_{\text{machine}} h) \end{split} $$
where we have assumed that $h \ge \epsilon_{\text{machine}}$. This rounding error becomes large relative to $f'$ (we are concerned with relative error, after all) when $h$ is small, while the truncation error of the one-sided difference is $O(h)$ and grows with $h$; a good step size balances the two contributions.
diff(f, x; h=1e-8) = (f(x+h) - f(x)) / h

function diff_wp(f, x; h=1e-8)
    """Diff using Walker and Pernice (1998) choice of step"""
    h *= (1 + abs(x))
    (f(x+h) - f(x)) / h
end
x = 1000
diff_wp(sin, x) - cos(x)
-4.139506408429305e-6
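The error is around $10^{-6}$, far from $\epsilon_{\text{machine}}$, reflecting the tradeoff above. To see the whole tradeoff (a sketch; hs and err are names chosen here, and the exact curve depends on f and x), sweep the step size and plot the error, which shrinks like $O(h)$ until rounding takes over near $h \approx \sqrt{\epsilon_{\text{machine}}}$:
hs = 10.0 .^ LinRange(-14, -1, 40)
err = [abs(diff(sin, 1000; h=h) - cos(1000)) for h in hs]
plot(hs, err, xscale=:log10, yscale=:log10, xlabel="h", ylabel="error", legend=:none)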
using Symbolics
@variables x
Dx = Differential(x)
y = sin(x)
Dx(y)                      # an unevaluated derivative applied to sin(x)
expand_derivatives(Dx(y))  # applies the chain rule, yielding cos(x)
y = x
for _ in 1:2
    y = cos(y^pi) * log(y)  # build a nested symbolic expression
end
expand_derivatives(Dx(y))
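The expanded derivative expression grows quickly as the composition is nested more deeply (expression swell). An alternative is to propagate numeric derivative values alongside the primal computation; below we first write the same computation as a plain function, then differentiate it by hand.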
function f(x)
    y = x
    for _ in 1:2
        a = y^pi
        b = cos(a)
        c = log(y)
        y = b * c
    end
    y
end
f(1.9), diff_wp(f, 1.9)
(-1.5346823414986814, -34.032439961925064)
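The finite difference estimate agrees with the exact derivative (about $-34.0324196$, computed below) to only about six significant digits; the hand-written derivative code that follows avoids that truncation and rounding error.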
function df(x, dx)
    y = x
    dy = dx
    for _ in 1:2
        a = y ^ pi
        da = pi * y^(pi - 1) * dy  # power rule
        b = cos(a)
        db = -sin(a) * da          # chain rule through cos
        c = log(y)
        dc = dy / y                # derivative of log
        y = b * c
        dy = db * c + b * dc       # product rule
    end
    y, dy
end
df(1.9, 1)
(-1.5346823414986814, -34.032419599140475)
We can differentiate a composition $h(g(f(x)))$ as
\begin{align} \operatorname{d} h &= h' \operatorname{d} g \\ \operatorname{d} g &= g' \operatorname{d} f \\ \operatorname{d} f &= f' \operatorname{d} x. \end{align}What we've done above is called "forward mode", and amounts to placing the parentheses in the chain rule like
$$ \operatorname d h = \frac{dh}{dg} \left(\frac{dg}{df} \left(\frac{df}{dx} \operatorname d x \right) \right) .$$
The expression means the same thing if we rearrange the parentheses,
$$ \operatorname d h = \left( \left( \left( \frac{dh}{dg} \right) \frac{dg}{df} \right) \frac{df}{dx} \right) \operatorname d x ,$$
which evaluates the chain rule from the outside in. This ordering is called "reverse mode" (or backpropagation), and it is what Zygote implements.
import Zygote
Zygote.gradient(f, 1.9)
(-34.03241959914049,)
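The reverse-mode result matches the hand-written forward-mode df(1.9, 1) to machine precision; only the order in which the chain rule is evaluated differs.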
g(x) = exp(x) + x^2
@code_llvm Zygote.gradient(g, 1.9)
;  @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:95 within `gradient`
define [1 x double] @julia_gradient_12050(double %0) #0 {
top:
;  @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:96 within `gradient`
; ┌ @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:42 within `pullback` @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:44
; │┌ @ In[21]:1 within `_pullback` @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl:9
; ││┌ @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface2.jl within `macro expansion`
; │││┌ @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/chainrules.jl:218 within `chain_rrule`
; ││││┌ @ /home/jed/.julia/packages/ChainRulesCore/C73ay/src/rules.jl:134 within `rrule` @ /home/jed/.julia/packages/ChainRules/hVHC4/src/rulesets/Base/fastmath_able.jl:56
      %1 = call double @j_exp_12052(double %0) #0
; └└└└└
;  @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:97 within `gradient`
; ┌ @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:45 within `#60`
; │┌ @ In[21]:1 within `Pullback`
; ││┌ @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/chainrules.jl:206 within `ZBack`
; │││┌ @ /home/jed/.julia/packages/Zygote/dABKa/src/lib/number.jl:12 within `literal_pow_pullback`
; ││││┌ @ promotion.jl:389 within `*` @ float.jl:385
      %2 = fmul double %0, 2.000000e+00
; │└└└└
; │┌ @ /home/jed/.julia/packages/Zygote/dABKa/src/lib/lib.jl:17 within `accum`
; ││┌ @ float.jl:383 within `+`
      %3 = fadd double %2, %1
; └└└
;  @ /home/jed/.julia/packages/Zygote/dABKa/src/compiler/interface.jl:98 within `gradient`
   %.fca.0.insert = insertvalue [1 x double] zeroinitializer, double %3, 0
   ret [1 x double] %.fca.0.insert
}
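Note what the compiled code actually does: a single call to exp, a multiply by 2, and an add. Zygote has compiled the gradient down to the analytic derivative $g'(x) = e^x + 2x$, with no runtime tape or interpreter overhead.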