#!/usr/bin/env python
# coding: utf-8

# # Iris Data Classification
# - Link Component Color Annotations
#     - Yellow : data load / preprocessing
#     - Green : EDA
#     - Violet : model train / predict

# ### Required Python Packages
# - `numpy`
# - `pandas`
# - `scikit-learn`
# - `seaborn`
# - `matplotlib`
#
# Run the following cell to install the packages.

# In[ ]:


# # Required Packages
# Run this cell to install required packages.
# get_ipython().run_line_magic('pip', 'install "matplotlib>=2.0" "numpy>=1.19" "pandas>=1.1" "scikit-learn>=0.22.2" "seaborn>=0.11"')


# ### 0. Global Parameters
# - global parameters of the link pipeline
#     - test_size : rate of valid-set when train-valid-split
#     - random_state : random_state

# ### 1. Load package,data

# In[ ]:


import builtins

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# In[ ]:


# `display` is only available as a builtin inside IPython/Jupyter; fall back
# to `print` so the script also runs under a plain Python interpreter.
display = getattr(builtins, "display", print)

# The global parameters are expected to be injected by the pipeline. When the
# script is run standalone they are undefined, so provide fallbacks.
# NOTE(review): 0.2 and 42 are assumed defaults, not values taken from the
# original pipeline configuration -- confirm before relying on them.
try:
    test_size
except NameError:
    test_size = 0.2

try:
    random_state
except NameError:
    random_state = 42


# In[ ]:


iris = load_iris()


# In[ ]:


df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df["target"] = iris.target
display(df)


# The `df` of green components and the `df` of purple components can be used
# independently of each other from the point of branching
#
# * Modeling -> `target` of `df` : **0,1,2**
# * EDA -> `target` of `df` : **setosa, versicolor, virginica**
#
# In this linear script the EDA branch works on its own copy (`eda_df`) so
# that the modeling branch still sees the integer-encoded `target`.

# ### 2. EDA

# In[ ]:


# Copy first: mapping the labels in place would leak the string labels into
# the modeling section below.
eda_df = df.copy()
eda_df["target"] = eda_df["target"].map({0: "setosa", 1: "versicolor", 2: "virginica"})
display(eda_df)


# In[ ]:


sns.pairplot(eda_df, x_vars=["sepal length (cm)"], y_vars=["sepal width (cm)"], hue="target", height=5)
sns.pairplot(eda_df, x_vars=["petal length (cm)"], y_vars=["petal width (cm)"], hue="target", height=5)
plt.show()


# In[ ]:


plt.figure(figsize=(7, 4))
# Draw a heatmap of the correlation matrix of the feature columns. The string
# `target` column is dropped explicitly because `DataFrame.corr()` raises on
# non-numeric columns in pandas >= 2.0 (older versions dropped it silently).
sns.heatmap(eda_df.drop(columns="target").corr(), annot=True, cmap="coolwarm")
plt.show()


# ### 3. Modeling

# In[ ]:


df_X = df.drop("target", axis=1)
df_y = df["target"]

# Stratify on the label so the class proportions are kept in both splits.
train_X, valid_X, train_y, valid_y = train_test_split(
    df_X, df_y, test_size=test_size, random_state=random_state, stratify=df_y
)


# In[ ]:


model = LogisticRegression(random_state=random_state)
model.fit(train_X, train_y)


# In[ ]:


pred = model.predict(valid_X)
print(f"Accuracy : {accuracy_score(valid_y, pred)}")


# In[ ]: