Linear Regression from scratch

In [1]:
# Install plotly, the plotting library imported below and used for the charts in this notebook
!pip3 install plotly
Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (4.5.0)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from plotly) (1.14.0)
Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly) (1.3.3)
In [2]:
import re
from pathlib import Path
from typing import Union, List
from plotly import express as px
from plotly import graph_objects as go
In [3]:
# Ensure that we have a `data` directory we use to store downloaded data.
# The directory is created through `pathlib` instead of a shell `mkdir -p`, so this cell
# does not depend on a Unix shell; `exist_ok=True` keeps re-running the cell harmless.
data_dir: Path = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)
In [4]:
# Download the "Auto Insurance in Sweden" data set into the `data` directory.
# `-nc` (no-clobber) skips the download when the file is already there, so re-running the cell is cheap;
# `-P data` sets the directory the file is saved into.
!wget -nc -P data https://www.math.muni.cz/~kolacek/docs/frvs/M7222/data/AutoInsurSweden.txt
File ‘data/AutoInsurSweden.txt’ already there; not retrieving.

In [5]:
# Peek at the first 20 lines of the raw file: a free-text description comes first, followed by
# tab-separated `X`/`Y` columns whose decimal values use a comma (e.g. `392,5`)
!head -n 20 data/AutoInsurSweden.txt
Auto Insurance in Sweden

In the following data
X = number of claims
Y = total payment for all the claims in thousands of Swedish Kronor
for geographical zones in Sweden
Reference: Swedish Committee on Analysis of Risk Premium in Motor Insurance
http://college.hmco.com/mathematics/brase/understandable_statistics/7e/students/datasets/
       slr/frames/frame.html

X	Y
108	392,5
19	46,2
13	15,7
124	422,2
40	119,4
57	170,9
23	56,9
14	77,5
45	214
In [6]:
# Location of the downloaded `AutoInsurSweden.txt` file inside the data directory
insurance_data_path: Path = data_dir.joinpath('AutoInsurSweden.txt')
In [7]:
# Parse `AutoInsurSweden.txt`: the regex picks out every tab-separated "x<TAB>y" pair, and the
# decimal commas used in the file are swapped for dots so the values can be stored as floats
content: str = insurance_data_path.read_text()
pairs = re.findall(r'([\d,]+)\t([\d,]+)', content)

xs: List[float] = [float(claims.replace(',', '.')) for claims, _payment in pairs]
ys: List[float] = [float(payment.replace(',', '.')) for _claims, payment in pairs]
In [8]:
# A convenience function which creates a scatter plot with an optional line
def plot(xs: List[float], ys: List[float], ys_pred: Union[List[float], None] = None) -> None:
    """Show a scatter plot of the data, optionally overlaid with predicted values.

    Args:
        xs: x-values of the data points (axis labelled 'Number of claims').
        ys: y-values of the data points (axis labelled 'Total payment').
        ys_pred: Optional predicted y-values, plotted against `xs` as an extra
            trace named 'Guess'. `None` or an empty sequence adds no trace.

    Returns:
        None. The figure is displayed via `fig.show()`.
    """
    fig = px.scatter(x=xs, y=ys, labels={'x': 'Number of claims', 'y': 'Total payment'})
    # Check presence explicitly instead of relying on truthiness: `if ys_pred:` raises a
    # ValueError for array-likes with ambiguous truth value (e.g. multi-element NumPy arrays)
    if ys_pred is not None and len(ys_pred) > 0:
        fig.add_trace(go.Scatter(x=xs, y=ys_pred, name='Guess'))
    fig.show()
In [9]:
# Scatter plot of the raw data only; no predictions are passed, so no 'Guess' trace is drawn
plot(xs, ys)