#!/usr/bin/env python
# coding: utf-8
# ## Name : ADVAIT GURUNATH CHAVAN
# ## Contact No : +91 70214 55852
# ## Mail ID : advaitchavan135@gmail.com
#
# ## Oasis Infobyte Data Science Internship
#
# ## Task 1 : Iris Flower Classification
#
#
# ### 1. Importing the necessary dependencies
# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from warnings import filterwarnings
filterwarnings(action='ignore')
# ### 2. Exploring the dataset
# In[2]:
iris_data = pd.read_csv('iris.csv')
iris_data
# In[3]:
iris_data.info()
# ### So, we have 150 rows and 6 columns, and there are no null values or blank entries in our dataset.
#
# ### Hence, there is no need to impute the dataset (fill missing entries with mean/median/mode values).
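# #### A quick sketch to verify that claim programmatically (it only assumes the columns shown by iris_data.info() above):
# In[ ]:
# Count missing values per column; every count should be zero for this dataset
print(iris_data.isnull().sum())
# Also confirm there are no fully duplicated rows
print('Duplicated rows:', iris_data.duplicated().sum())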
# In[4]:
iris_data['SepalLengthCm'].max(), iris_data['SepalLengthCm'].min()
# In[5]:
iris_data['SepalWidthCm'].max(), iris_data['SepalWidthCm'].min()
# In[6]:
iris_data['PetalLengthCm'].max(), iris_data['PetalLengthCm'].min()
# In[7]:
iris_data['PetalWidthCm'].max(), iris_data['PetalWidthCm'].min()
# In[8]:
iris_data['Species'].describe()
# In[9]:
iris_data['Species'].value_counts()
# In[10]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.axis('equal')
l = ['Versicolor', 'Setosa', 'Virginica']
s = [50,50,50]
ax.pie(s, labels = l,autopct='%1.2f%%')
plt.show()
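# #### The pie slices above are hard-coded; as a sketch, the same chart can be driven directly from the class counts so it stays consistent with the data:
# In[ ]:
species_counts = iris_data['Species'].value_counts()
fig, ax = plt.subplots()
ax.axis('equal')
ax.pie(species_counts.values, labels=species_counts.index, autopct='%1.2f%%')
plt.show()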
# In[11]:
plt.figure(1)
plt.boxplot([iris_data['PetalLengthCm']])
plt.title('Outliers if any in PetalLengthCm')
plt.figure(2)
plt.boxplot([iris_data['PetalWidthCm']])
plt.title('Outliers if any in PetalWidthCm')
plt.figure(3)
plt.boxplot([iris_data['SepalLengthCm']])
plt.title('Outliers if any in SepalLengthCm')
plt.figure(4)
plt.boxplot([iris_data['SepalWidthCm']])
plt.title('Outliers if any in SepalWidthCm')
plt.show()
# In[12]:
plt.figure(1)
iris_data['PetalLengthCm'].hist()
plt.title('Histogram distribution of PetalLengthCm')
plt.figure(2)
iris_data['PetalWidthCm'].hist()
plt.title('Histogram distribution of PetalWidthCm')
plt.figure(3)
iris_data['SepalLengthCm'].hist()
plt.title('Histogram distribution of SepalLengthCm')
plt.figure(4)
iris_data['SepalWidthCm'].hist()
plt.title('Histogram distribution of SepalWidthCm')
plt.show()
# In[13]:
plt.figure(1)
iris_data['PetalLengthCm'].plot(kind ='density')
plt.title('Density Plot of PetalLengthCm')
plt.figure(2)
iris_data['PetalWidthCm'].plot(kind ='density')
plt.title('Density Plot of PetalWidthCm')
plt.figure(3)
iris_data['SepalLengthCm'].plot(kind ='density')
plt.title('Density Plot of SepalLengthCm')
plt.figure(4)
iris_data['SepalWidthCm'].plot(kind ='density')
plt.title('Density Plot of SepalWidthCm')
plt.show()
# In[14]:
plt.figure(figsize=(15,10))
plt.subplot(2,2,1)
sns.violinplot(x='Species',y='PetalLengthCm',data=iris_data)
plt.title('Species vs PetalLengthCm')
plt.subplot(2,2,2)
sns.violinplot(x='Species',y='PetalWidthCm',data=iris_data)
plt.title('Species vs PetalWidthCm')
plt.subplot(2,2,3)
sns.violinplot(x='Species',y='SepalLengthCm',data=iris_data)
plt.title('Species vs SepalLengthCm')
plt.subplot(2,2,4)
sns.violinplot(x='Species',y='SepalWidthCm',data=iris_data)
plt.title('Species vs SepalWidthCm')
# In[15]:
iris_data.iloc[:,1:-1].corr()
# In[16]:
sns.heatmap(iris_data.iloc[:,1:-1].corr(), annot=True)
# #### From the above heatmap of correlations between the features that describe the flower, we can infer that PetalLengthCm has the most
# #### influence on determining the species of the flower.
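# #### One rough way to quantify that influence (a sketch, not part of the original analysis) is to label-encode the species and correlate each feature with the encoded target:
# In[ ]:
species_codes = iris_data['Species'].astype('category').cat.codes
feature_target_corr = iris_data.iloc[:, 1:-1].corrwith(species_codes)
print(feature_target_corr.sort_values(ascending=False))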
# In[17]:
sns.pairplot(iris_data, hue="Species")
# ### 3. Preparing the data for training and testing the model
# In[18]:
x = iris_data.iloc[:, 1:-1]
y = iris_data.Species
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
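# #### Note: without a fixed random_state the split, and hence every accuracy score below, changes on each run. A reproducible, class-stratified variant is sketched here with hypothetical variable names; it is not used in the cells that follow:
# In[ ]:
x_tr_s, x_te_s, y_tr_s, y_te_s = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y)
print(y_tr_s.value_counts())
print(y_te_s.value_counts())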
# ### 4. Training using Decision Tree Model
# In[19]:
model_1 = DecisionTreeClassifier()
model_1.fit(x_train, y_train)
# ### 5. Evaluating the performance of the Decision Tree Model
# In[20]:
y_pred = model_1.predict(x_test)
score = accuracy_score(y_test, y_pred)
score
# In[21]:
y_pred
# In[22]:
print(classification_report(y_test, y_pred))
# In[23]:
plt.figure(figsize=(15, 10))
tree.plot_tree(model_1, feature_names=list(x.columns), class_names=list(model_1.classes_), filled=True)
# In[24]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Decision Tree Model')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# #### From the above tree plot and the predicted-species scatter plot of the Decision Tree Model we can infer the following:
# #### If Petal Length <= 2.45 cm, then the species of the flower is Setosa.
# #### If Petal Length > 2.45 cm, then the species of the flower may be Versicolor or Virginica.
# #### For Petal Length > 3 cm and Petal Length < 5 cm, most of the species are Versicolor.
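# #### The thresholds quoted above can be read directly from the fitted tree; a sketch using sklearn's text export (the exact split values depend on the random train/test split):
# In[ ]:
print(tree.export_text(model_1, feature_names=list(x.columns)))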
# In[25]:
iris_setosa_dataset = iris_data[iris_data.PetalLengthCm <= 2.45]
iris_setosa_dataset.to_excel('iris_setosa_dataset.xlsx',index=False)
iris_setosa_dataset.to_csv('iris_setosa_dataset.csv',index=False)
pd.read_excel('iris_setosa_dataset.xlsx')
# In[26]:
iris_versicolor_dataset = iris_data[(iris_data['PetalLengthCm'] < 5) & (iris_data['PetalLengthCm'] > 3)]
iris_versicolor_dataset.to_excel('iris_versicolor_dataset.xlsx',index=False)
iris_versicolor_dataset.to_csv('iris_versicolor_dataset.csv',index=False)
pd.read_excel('iris_versicolor_dataset.xlsx')
# In[27]:
iris_virginica_dataset = iris_data[(iris_data['PetalLengthCm'] > 5)]
iris_virginica_dataset.to_excel('iris_virginica_dataset.xlsx',index=False)
iris_virginica_dataset.to_csv('iris_virginica_dataset.csv',index=False)
pd.read_excel('iris_virginica_dataset.xlsx')
# ### 6. Training using Logistic Regression Model
# In[28]:
model_2 = LogisticRegression()
# In[29]:
model_2.fit(x_train,y_train)
# ### 7. Evaluating the performance of the Logistic Regression Model
# In[30]:
y_pred_2 = model_2.predict(x_test)
y_pred_2
# In[31]:
confusion_matrix(y_test,y_pred_2)
# In[32]:
print(classification_report(y_test,y_pred_2))
# In[33]:
accuracy_score(y_test,y_pred_2)
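# #### A single train/test split can be optimistic or pessimistic; as a sketch, 5-fold cross-validation on the full data gives a more stable estimate (max_iter is raised here only to avoid convergence warnings):
# In[ ]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), x, y, cv=5)
print(cv_scores, cv_scores.mean())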
# In[34]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred_2]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Logistic Regression Model')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# ### 8. Training using Linear Support Vector Machine (Linear SVC)
# In[35]:
model_3 = LinearSVC()
model_3.fit(x_train, y_train)
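# #### LinearSVC may not converge on unscaled features (any warnings are suppressed above); one common remedy, sketched here under the assumption that standardization is acceptable, is to scale the inputs in a pipeline:
# In[ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
model_3_scaled = make_pipeline(StandardScaler(), LinearSVC(max_iter=10000))
model_3_scaled.fit(x_train, y_train)
print(accuracy_score(y_test, model_3_scaled.predict(x_test)))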
# ### 9. Evaluating the performance of the Linear Support Vector Machine (Linear SVC)
# In[36]:
y_pred_3 = model_3.predict(x_test)
y_pred_3
# In[37]:
print(confusion_matrix(y_test, y_pred_3))
# In[38]:
print(classification_report(y_test, y_pred_3))
# In[39]:
print(accuracy_score(y_test,y_pred_3))
# In[40]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred_3]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Linear Support Vector Machine Model')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# ### 10. Training using Random Forest Classifier
# In[41]:
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(x_train, y_train)
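# #### As a quick sketch of which measurements the forest relies on (this should broadly agree with the correlation heatmap earlier), inspect the fitted feature importances:
# In[ ]:
importances = pd.Series(classifier.feature_importances_, index=x.columns)
print(importances.sort_values(ascending=False))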
# In[42]:
y_pred_4 = classifier.predict(x_test)
y_pred_4
# ### 11. Evaluation using Random Forest Classifier
# In[43]:
accuracy = classifier.score(x_test, y_test)
print(f"Accuracy: {accuracy}")
# In[44]:
print(confusion_matrix(y_test, y_pred_4))
# In[45]:
print(classification_report(y_test, y_pred_4))
# In[46]:
print(accuracy_score(y_test,y_pred_4))
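# #### The confusion matrix is easier to read as a heatmap; a sketch reusing seaborn, with the class labels taken from the fitted classifier:
# In[ ]:
sns.heatmap(confusion_matrix(y_test, y_pred_4), annot=True, fmt='d',
            xticklabels=classifier.classes_, yticklabels=classifier.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()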
# In[47]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred_4]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Random Forest Classifier')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# ### 12. Final Conclusion
#
# #### Hence, we have classified the Iris Flower Dataset using four classification models, namely the Decision Tree Model, the Logistic Regression Model, the Linear SVC Model and the Random Forest Classifier.
#
# #### Using the Logistic Regression Model and the Random Forest Classifier we achieved an overall accuracy score of 0.98; with the Linear Support Vector Machine Model and the Decision Tree Model we achieved an accuracy of 0.96.
#
# #### From the multi-class classification plots we can conclude that:
# #### 1. Flowers with Petal Length < 2.45 cm belong to the Setosa species.
# #### 2. Flowers with Petal Length > 3 cm and Petal Length < 5 cm mostly belong to the Versicolor species.
# #### 3. Flowers with Petal Length > 5 cm belong to the Virginica species.
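# #### As a closing sketch (not part of the original analysis), the four test-set accuracies can be collected into one table for comparison; the exact values vary with the random train/test split:
# In[ ]:
model_summary = pd.DataFrame({
    'Model': ['Decision Tree', 'Logistic Regression', 'Linear SVC', 'Random Forest'],
    'Accuracy': [accuracy_score(y_test, y_pred),
                 accuracy_score(y_test, y_pred_2),
                 accuracy_score(y_test, y_pred_3),
                 accuracy_score(y_test, y_pred_4)]
})
print(model_summary)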
# In[ ]: