#!/usr/bin/env python
# coding: utf-8
# # Iris Data Classification
# - Link Component Color Annotations
# - Yellow : data load / preprocessing
# - Green : EDA
# - Violet : model train / predict
# ### Required Python Packages
# - `numpy`
# - `pandas`
# - `scikit-learn`
# - `seaborn`
# - `matplotlib`
#
# Run the following cell to install the packages.
# In[ ]:
#
# Required Packages
# Run this cell to install required packages.
#
# Minimum package versions the notebook was written against; each spec is
# wrapped in double quotes so the `>=` is passed through to pip literally.
_required_specs = (
    "matplotlib>=2.0",
    "numpy>=1.19",
    "pandas>=1.1",
    "scikit-learn>=0.22.2",
    "seaborn>=0.11",
)
_pip_arguments = "install " + " ".join(f'"{spec}"' for spec in _required_specs)
get_ipython().run_line_magic("pip", _pip_arguments)
# ### 0. Global Parameters
# - global parameters of the link pipeline
# - test_size : fraction of the data held out as the validation set in the train/validation split
# - random_state : random_state
# ### 1. Load package,data
# In[ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# In[ ]:
# Fetch the iris dataset bundled with scikit-learn.
iris = load_iris()
# In[ ]:
# One column per feature name, plus the integer class label as `target`.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
display(df)
# From the branching point onward, the `df` used by the green (EDA) components and the `df` used by the violet (modeling) components are independent of each other.
#
# * Modeling -> `target` of `df` : **0,1,2**
# * EDA -> `target` of `df` : **setosa, versicolor, virginica**
# ### 2. EDA
# In[ ]:
# Replace the integer class codes with readable species names for plotting.
species_by_code = {0: "setosa", 1: "versicolor", 2: "virginica"}
df["target"] = df["target"].map(species_by_code)
display(df)
# In[ ]:
# One scatter plot per (x, y) feature pair, coloured by species:
# sepal length vs. width first, then petal length vs. width.
feature_pairs = (
    ("sepal length (cm)", "sepal width (cm)"),
    ("petal length (cm)", "petal width (cm)"),
)
for x_col, y_col in feature_pairs:
    sns.pairplot(df, x_vars=[x_col], y_vars=[y_col], hue="target", height=5)
plt.show()
# In[ ]:
# Heatmap of the pairwise correlations between the numeric feature columns.
plt.figure(figsize=(7, 4))
# After the label-mapping cell, `target` holds species-name strings.
# pandas >= 2.0 raises on non-numeric columns in `DataFrame.corr()` instead of
# silently dropping them, so restrict the frame to numeric columns explicitly.
# `select_dtypes` works on every pandas version this notebook allows.
numeric_df = df.select_dtypes(include="number")
sns.heatmap(
    numeric_df.corr(),  # correlation matrix used as the heatmap input
    annot=True,  # print each coefficient inside its cell
    cmap="coolwarm",
)
plt.show()
# ### 3. Modeling
# In[ ]:
# Separate the feature columns from the label column.
df_X = df.drop(columns=["target"])
df_y = df.loc[:, "target"]

# NOTE(review): `test_size` and `random_state` are not assigned anywhere in the
# visible file; per the "Global Parameters" header they are expected to be
# provided by the link pipeline — confirm before running this as a plain script.
split_options = dict(
    test_size=test_size,
    random_state=random_state,
    stratify=df_y,  # keep the class proportions equal in both splits
)
train_X, valid_X, train_y, valid_y = train_test_split(df_X, df_y, **split_options)
# In[ ]:
# Train a logistic-regression classifier on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(random_state=random_state).fit(train_X, train_y)
# In[ ]:
# Predict the validation split and report the share of correct predictions.
pred = model.predict(valid_X)
valid_accuracy = accuracy_score(valid_y, pred)
print(f"Accuracy : {valid_accuracy}")
# In[ ]: