Principal Component Analysis from scratch - preparations¶

Libraries and helper functions¶

In [1]:
import math as m
import random
import pandas as pd
import numpy as np

import altair as alt

In [2]:
from typing import List
# Type alias used by the helper signatures below: a vector is a list of floats.
Vector = List[float]

In [3]:
def add(vector1: Vector, vector2: Vector) -> Vector:
    """Return the element-wise sum of two vectors.

    Args:
        vector1: Left operand.
        vector2: Right operand; must have the same length as `vector1`.

    Returns:
        A new list whose i-th entry is ``vector1[i] + vector2[i]``.

    Raises:
        AssertionError: If the two vectors differ in length.
    """
    assert len(vector1) == len(vector2)
    total = []
    for left, right in zip(vector1, vector2):
        total.append(left + right)
    return total

In [4]:
def subtract(vector1: Vector, vector2: Vector) -> Vector:
    """Return the element-wise difference ``vector1 - vector2``.

    Args:
        vector1: Vector to subtract from.
        vector2: Vector to subtract; must have the same length as `vector1`.

    Returns:
        A new list whose i-th entry is ``vector1[i] - vector2[i]``.

    Raises:
        AssertionError: If the two vectors differ in length.
    """
    size = len(vector1)
    assert size == len(vector2)
    return [vector1[i] - vector2[i] for i in range(size)]

In [5]:
def vector_sum(vectors: List[Vector]) -> Vector:
    """Sum a collection of vectors component by component.

    Args:
        vectors: Non-empty list of vectors that all share the same length.

    Returns:
        A list whose i-th entry is the sum of the i-th entries of every
        input vector.

    Raises:
        AssertionError: If `vectors` is empty or the vectors differ in length.
    """
    assert vectors

    vector_length = len(vectors[0])
    assert all(len(v) == vector_length for v in vectors)

    # zip(*vectors) groups the i-th components of all vectors together, so each
    # group can be summed directly (the original loop had no body and never
    # accumulated anything).
    return [sum(components) for components in zip(*vectors)]

In [6]:
def scalar_multiply(c: float, vector: Vector) -> Vector:
    """Scale every component of `vector` by the factor `c`.

    Args:
        c: Scalar multiplier.
        vector: Vector to scale.

    Returns:
        A new list whose i-th entry is ``c * vector[i]``.
    """
    return list(map(lambda component: c * component, vector))

In [7]:
def vector_mean(vector: Vector) -> float:
    """Return the arithmetic mean of the components of `vector`.

    The previous body returned every component divided by ``n`` (a list),
    which contradicted both the function name and the declared ``float``
    return type; the components are now summed before dividing.

    Args:
        vector: Non-empty vector of numbers.

    Returns:
        ``sum(vector) / len(vector)``.

    Raises:
        ZeroDivisionError: If `vector` is empty.
    """
    n = len(vector)
    return sum(vector) / n

In [8]:
def dot(vector1: Vector, vector2: Vector) -> float:
    """Compute the dot product of two equal-length vectors.

    Args:
        vector1: First vector.
        vector2: Second vector; must have the same length as `vector1`.

    Returns:
        The sum of the pairwise products ``vector1[i] * vector2[i]``
        (``0`` for two empty vectors).

    Raises:
        AssertionError: If the two vectors differ in length.
    """
    assert len(vector1) == len(vector2)
    total = 0
    for left, right in zip(vector1, vector2):
        total = total + left * right
    return total

In [9]:
def sum_of_squares(v: Vector) -> float:
    """Return the sum of the squared components of `v`.

    Computed as the dot product of `v` with itself, so the result is a
    scalar (the return annotation previously claimed a ``Vector``).

    Args:
        v: Input vector.

    Returns:
        ``v[0] ** 2 + v[1] ** 2 + ...`` as a single number.
    """
    return dot(v, v)

In [10]:
def magnitude(v: Vector) -> float:
    """Return the magnitude (Euclidean length) of `v`.

    This is the square root of ``sum_of_squares(v)``, i.e. a scalar (the
    return annotation previously claimed a ``Vector``).

    Args:
        v: Input vector.

    Returns:
        The non-negative length of `v` as a float.
    """
    return m.sqrt(sum_of_squares(v))

In [11]:
def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Move `step_size` along `gradient`, starting from `v`.

    NOTE(review): the original cell contained only an unterminated docstring
    and no body; this implements the step implied by the signature
    (``v + step_size * gradient``) - confirm against the intended version.

    Args:
        v: Starting point.
        gradient: Direction to move in; must have the same length as `v`.
        step_size: Multiplier applied to `gradient` (pass a negative value
            to move against the gradient).

    Returns:
        A new list whose i-th entry is ``v[i] + step_size * gradient[i]``.

    Raises:
        AssertionError: If `v` and `gradient` differ in length.
    """
    assert len(v) == len(gradient)
    return [v_i + step_size * g_i for v_i, g_i in zip(v, gradient)]


Steps¶

In [12]:
# NOTE(review): intercept and coefficient are drawn and printed below, but they
# are not used to build ys - the x and y samples are independent of each other.
intercept = random.randint(-30, 30)
coefficient = random.uniform(-1, 1)
n = 30  # number of sample points

# Draw n integer coordinates. np.random.randint excludes its upper bound,
# hence the "+ 1" to make 10 and 50 attainable. The sample size now comes
# from n instead of a repeated literal 30.
xs = np.random.randint(-50, 10 + 1, n)
ys = np.random.randint(-20, 50 + 1, n)
df = pd.DataFrame({'x': xs, 'y': ys})

print(intercept, coefficient)

# Scatter plot of the raw sample; the chart is the cell's displayed value.
alt.Chart(df).mark_point().encode(
    alt.X('x:Q'), alt.Y('y:Q'), alt.Tooltip(['x', 'y'])
)

-10 0.9679420748641416

Out[12]:

De-meaning¶

In [13]:
def de_mean(data: List[Vector]) -> List[Vector]:
    """Shift each vector in `data` so that its own mean becomes zero.

    Every vector is handled independently: its mean (via ``np.mean``) is
    subtracted from the vector itself.

    Args:
        data: List of vectors to centre.

    Returns:
        A list with one centred vector per input vector, in the same order.
    """
    centred = []
    for series in data:
        centred.append(series - np.mean(series))
    return centred

In [14]:
# Centre both coordinate series on their own mean, then plot the result.
xs_demean, ys_demean = de_mean([xs, ys])

df = pd.DataFrame({'x': xs_demean, 'y': ys_demean})
(
    alt.Chart(df)
    .mark_point()
    .encode(alt.X('x:Q'), alt.Y('y:Q'), alt.Tooltip(['x', 'y']))
)

Out[14]:

Direction¶

In [15]:
def direction(w: Vector) -> Vector:
    """Rescale `w` by dividing every component by ``magnitude(w)``.

    Args:
        w: Vector to rescale.

    Returns:
        A new list whose i-th entry is ``w[i] / magnitude(w)``.
    """
    norm = magnitude(w)
    rescaled = []
    for component in w:
        rescaled.append(component / norm)
    return rescaled

# Example: rescale the raw (not de-meaned) xs by their magnitude.
direction(xs)

Out[15]:
[-0.22863117335525085,
-0.11431558667762542,
-0.07396890902669881,
-0.24208006590555972,
-0.21518228080494198,
-0.02017333882546331,
-0.1277644792279343,
0.04034667765092662,
-0.07396890902669881,
-0.24208006590555972,
-0.22863117335525085,
-0.2353556196304053,
-0.10759114040247099,
-0.06724446275154437,
-0.053795570201235494,
-0.04707112392608106,
-0.31604897493225853,
0.04707112392608106,
-0.10759114040247099,
-0.1815600494291698,
-0.29587563610679524,
-0.2017333882546331,
-0.1613867106037065,
-0.006724446275154437,
-0.19500894197947868,
-0.18828449570432423,
-0.21518228080494198,
-0.32949786748256743,
-0.27570229728133194,
-0.08741780157700768]
In [16]:
# Rescale each de-meaned series by its own magnitude, then plot the result.
xs_dir, ys_dir = map(direction, (xs_demean, ys_demean))

df = pd.DataFrame({'x': xs_dir, 'y': ys_dir})
(
    alt.Chart(df)
    .mark_point()
    .encode(alt.X('x:Q'), alt.Y('y:Q'), alt.Tooltip(['x', 'y']))
)

Out[16]: