In [1]:

from datetime import datetime
# Print a timestamp showing when the notebook was last executed
print(f'Päivitetty {datetime.now()}')

Päivitetty 2022-09-11 13:41:41.389289

Auton hinnan ennustaminen

Olen lainannut ideoita lähteestä https://www.kaggle.com/mohaiminul101/car-price-prediction

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# train_test_split osaa jakaa datan opetusdataan ja testidataan
from sklearn.model_selection import train_test_split

# Käytettävät mallit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

Funktio mallin sovittamiseen ja tarkasteluun

Jos samaa koodia käytetään toistuvasti, niin siitä kannattaa tehdä funktio.

In [3]:

def mallinna(malli, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit a model, print its scores and plot two diagnostics.

    The model is fitted on the training data, the value of ``malli.score``
    (printed as the coefficient of determination) is reported for both the
    training and the test data, and a two-panel figure is drawn: the
    distribution of the training residuals and a scatter plot of actual
    versus predicted values for the test data.

    Parameters
    ----------
    malli : estimator
        Object providing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
        It is fitted in place.
    X_train, y_train, X_test, y_test : optional
        Training and test data. Any argument left as ``None`` is read from
        the notebook-level variable of the same name, so the original
        one-argument call ``mallinna(malli)`` keeps working.

    Raises
    ------
    NameError
        If a data set is neither passed in nor defined at notebook level.
    """
    # Resolve the data sets: explicit arguments win, otherwise fall back to
    # the notebook-level variables the function has always relied on.
    data = {'X_train': X_train, 'y_train': y_train,
            'X_test': X_test, 'y_test': y_test}
    for nimi, arvo in data.items():
        if arvo is None:
            if nimi not in globals():
                raise NameError(f"name '{nimi}' is not defined")
            data[nimi] = globals()[nimi]
    X_train, y_train = data['X_train'], data['y_train']
    X_test, y_test = data['X_test'], data['y_test']

    # Fit the model to the training data
    malli.fit(X_train, y_train)

    # Predictions and score for the training data
    y_pred_train = malli.predict(X_train)
    R2_train_malli = malli.score(X_train, y_train)

    # Predictions and score for the test data
    y_pred_test = malli.predict(X_test)
    R2_test_malli = malli.score(X_test, y_test)

    # Print both scores
    print(f'Opetusdatan selityskerroin {R2_train_malli:.3f}')
    print(f'Testidatan selityskerroin {R2_test_malli:.3f}')

    # Left panel: distribution of the training residuals
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    ax[0].set_title('Ennustevirheiden jakauma opetusdatassa')
    sns.histplot((y_train-y_pred_train), kde=True, ax=ax[0])
    ax[0].set_xlabel('y_train - y_pred_train')

    # Right panel: actual versus predicted values for the test data
    ax[1].set_title('Toteutuneet ja ennustetut testidatassa')
    ax[1].scatter(x=y_test, y=y_pred_test)
    ax[1].set_xlabel('toteutunut')
    ax[1].set_ylabel('ennuste')

Datan tarkastelua

In [4]:

# Load the car data CSV from taanila.fi and preview the first rows
df = pd.read_csv('https://taanila.fi/car_data.csv')
df.head()

Out[4]:

	Car_Name	Year	Selling_Price	Present_Price	Kms_Driven	Fuel_Type	Seller_Type	Transmission
0	ritz	2014	3.35	5.59	27000	Petrol	Dealer	Manual
1	sx4	2013	4.75	9.54	43000	Diesel	Dealer	Manual
2	ciaz	2017	7.25	9.85	6900	Petrol	Dealer	Manual
3	wagon r	2011	2.85	4.15	5200	Petrol	Dealer	Manual
4	swift	2014	4.60	6.87	42450	Diesel	Dealer	Manual

In [5]:

# Column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB

In [6]:

# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

Out[6]:

	Year	Selling_Price	Present_Price	Kms_Driven	Owner
count	301.000000	301.000000	301.000000	301.000000	301.000000
mean	2013.627907	4.661296	7.628472	36947.205980	0.043189
std	2.891554	5.082812	8.644115	38886.883882	0.247915
min	2003.000000	0.100000	0.320000	500.000000	0.000000
25%	2012.000000	0.900000	1.200000	15000.000000	0.000000
50%	2014.000000	3.600000	6.400000	32000.000000	0.000000
75%	2016.000000	6.000000	9.900000	48767.000000	0.000000
max	2018.000000	35.000000	92.600000	500000.000000	3.000000

In [7]:

# Convert 'Year' into the car's age, assuming the data is from 2020.
# The guard keeps the cell re-runnable: once 'Year' has been dropped,
# running the cell again is a no-op instead of a KeyError.
if 'Year' in df.columns:
    df = df.assign(Age=2020 - df['Year']).drop(columns='Year')

In [8]:

# Drop 'Car_Name' because it is not needed.
# errors='ignore' makes the cell safe to re-run after the column is gone.
df = df.drop(columns='Car_Name', errors='ignore')

In [9]:

# Distributions of the categorical variables, drawn two per figure

cat_cols = ['Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']

# Pair the columns up as (1st, 2nd) and (3rd, 4th); each pair gets its own figure
for left_col, right_col in zip(cat_cols[0::2], cat_cols[1::2]):
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(10, 4))
    sns.countplot(x=left_col, data=df, ax=left_ax)
    sns.countplot(x=right_col, data=df, ax=right_ax)

In [10]:

# Distributions of the quantitative variables, drawn two per figure

num_cols = ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Age']

# Pair the columns up as (1st, 2nd) and (3rd, 4th); each pair gets its own figure
for left_col, right_col in zip(num_cols[0::2], num_cols[1::2]):
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(13, 3))
    sns.boxplot(x=left_col, data=df, ax=left_ax)
    sns.boxplot(x=right_col, data=df, ax=right_ax)

In [11]:

# Correlations between the numeric columns.
# The text columns (Fuel_Type, Seller_Type, Transmission) are still present at
# this point, so they are filtered out explicitly: DataFrame.corr() no longer
# skips non-numeric columns by default in pandas 2.x and would raise on them.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

Out[11]:

<AxesSubplot:>

In [12]:

# Preview the frame after replacing 'Year' with 'Age' and dropping 'Car_Name'
df.head()

Out[12]:

	Selling_Price	Present_Price	Kms_Driven	Fuel_Type	Seller_Type	Transmission	Age
0	3.35	5.59	27000	Petrol	Dealer	Manual	6
1	4.75	9.54	43000	Diesel	Dealer	Manual	7
2	7.25	9.85	6900	Petrol	Dealer	Manual	3
3	2.85	4.15	5200	Petrol	Dealer	Manual	9
4	4.60	6.87	42450	Diesel	Dealer	Manual	6

In [13]:

# Encode the categorical (text) columns as dummy variables.
# dtype=int keeps the indicators as 0/1 numbers on every pandas version;
# pandas 2.x would otherwise produce True/False columns.
df = pd.get_dummies(data=df, drop_first=True, dtype=int)

In [14]:

# Preview the frame after the dummy-variable encoding
df.head()

Out[14]:

	Selling_Price	Present_Price	Kms_Driven	Age	Fuel_Type_Diesel	Fuel_Type_Petrol	Transmission_Manual
0	3.35	5.59	27000	6	0	1	1
1	4.75	9.54	43000	7	1	0	1
2	7.25	9.85	6900	3	0	1	1
3	2.85	4.15	5200	9	0	1	1
4	4.60	6.87	42450	6	1	0	1

Mallien sovittaminen

In [15]:

# Target variable to be explained/predicted
y = df['Selling_Price']

# Explanatory variables: every column except the target
X = df.drop(columns='Selling_Price')

In [16]:

# Split into training and test data.
# random_state=2 is passed so the split can be reproduced between runs; no
# test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

Lineaarinen regressio

In [17]:

malli = LinearRegression()

# Use the mallinna function defined earlier to fit and evaluate the model
mallinna(malli)

Opetusdatan selityskerroin 0.888
Testidatan selityskerroin 0.842

Satunnaismetsä

In [18]:

# Seed the forest so its scores are reproducible between runs
# (2 is the same seed that is used for the train/test split)
malli = RandomForestRegressor(random_state=2)
mallinna(malli)

Opetusdatan selityskerroin 0.988
Testidatan selityskerroin 0.954

Gradient Boosting

In [19]:

# Seed the estimator so its scores are reproducible between runs
# (2 is the same seed that is used for the train/test split)
malli = GradientBoostingRegressor(random_state=2)
mallinna(malli)

Opetusdatan selityskerroin 0.997
Testidatan selityskerroin 0.957