#!/usr/bin/env python
# coding: utf-8
# ## Name : ADVAIT GURUNATH CHAVAN
# ## Contact No : +91 70214 55852
# ## Mail ID : advaitchavan135@gmail.com
#
# ## Oasis Infobyte Data Science Internship
#
# ## Task 1 : Iris Flower Classification
#
#
# ### 1. Importing the necessary dependencies
# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from warnings import filterwarnings
filterwarnings(action='ignore')
# ### 2. Exploring the dataset
# In[2]:
iris_data = pd.read_csv('iris.csv')
iris_data
# In[3]:
iris_data.info()
# ### So, we have 150 rows and 6 columns, and there are no null values or blank entries in our dataset.
#
# ### Hence, there is no need to impute the dataset (fill missing entries with mean/median/mode values).
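# #### A quick sketch to verify that claim programmatically (it only assumes the columns shown by iris_data.info() above):
# In[ ]:
# Count missing values per column; every count should be zero for this dataset
print(iris_data.isnull().sum())
# Also confirm there are no fully duplicated rows
print('Duplicated rows:', iris_data.duplicated().sum())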
# In[4]:
iris_data['SepalLengthCm'].max(), iris_data['SepalLengthCm'].min()
# In[5]:
iris_data['SepalWidthCm'].max(), iris_data['SepalWidthCm'].min()
# In[6]:
iris_data['PetalLengthCm'].max(), iris_data['PetalLengthCm'].min()
# In[7]:
iris_data['PetalWidthCm'].max(), iris_data['PetalWidthCm'].min()
# In[8]:
iris_data['Species'].describe()
# In[9]:
iris_data['Species'].value_counts()
# In[10]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.axis('equal')
l = ['Versicolor', 'Setosa', 'Virginica']
s = [50,50,50]
ax.pie(s, labels = l,autopct='%1.2f%%')
plt.show()
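# #### The pie slices above are hard-coded; as a sketch, the same chart can be driven directly from the class counts so it stays consistent with the data:
# In[ ]:
species_counts = iris_data['Species'].value_counts()
fig, ax = plt.subplots()
ax.axis('equal')
ax.pie(species_counts.values, labels=species_counts.index, autopct='%1.2f%%')
plt.show()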
# In[11]:
plt.figure(1)
plt.boxplot([iris_data['PetalLengthCm']])
plt.title('Outliers if any in PetalLengthCm')
plt.figure(2)
plt.boxplot([iris_data['PetalWidthCm']])
plt.title('Outliers if any in PetalWidthCm')
plt.figure(3)
plt.boxplot([iris_data['SepalLengthCm']])
plt.title('Outliers if any in SepalLengthCm')
plt.figure(4)
plt.boxplot([iris_data['SepalWidthCm']])
plt.title('Outliers if any in SepalWidthCm')
plt.show()
# In[12]:
plt.figure(1)
iris_data['PetalLengthCm'].hist()
plt.title('Histogram distribution of PetalLengthCm')
plt.figure(2)
iris_data['PetalWidthCm'].hist()
plt.title('Histogram distribution of PetalWidthCm')
plt.figure(3)
iris_data['SepalLengthCm'].hist()
plt.title('Histogram distribution of SepalLengthCm')
plt.figure(4)
iris_data['SepalWidthCm'].hist()
plt.title('Histogram distribution of SepalWidthCm')
plt.show()
# In[13]:
plt.figure(1)
iris_data['PetalLengthCm'].plot(kind ='density')
plt.title('Density Plot of PetalLengthCm')
plt.figure(2)
iris_data['PetalWidthCm'].plot(kind ='density')
plt.title('Density Plot of PetalWidthCm')
plt.figure(3)
iris_data['SepalLengthCm'].plot(kind ='density')
plt.title('Density Plot of SepalLengthCm')
plt.figure(4)
iris_data['SepalWidthCm'].plot(kind ='density')
plt.title('Density Plot of SepalWidthCm')
plt.show()
# In[14]:
plt.figure(figsize=(15,10))
plt.subplot(2,2,1)
sns.violinplot(x='Species',y='PetalLengthCm',data=iris_data)
plt.title('Species vs PetalLengthCm')
plt.subplot(2,2,2)
sns.violinplot(x='Species',y='PetalWidthCm',data=iris_data)
plt.title('Species vs PetalWidthCm')
plt.subplot(2,2,3)
sns.violinplot(x='Species',y='SepalLengthCm',data=iris_data)
plt.title('Species vs SepalLengthCm')
plt.subplot(2,2,4)
sns.violinplot(x='Species',y='SepalWidthCm',data=iris_data)
plt.title('Species vs SepalWidthCm')
# In[15]:
iris_data.iloc[:,1:-1].corr()
# In[16]:
sns.heatmap(iris_data.iloc[:,1:-1].corr(), annot=True)
# #### From the above heatmap of correlations between the features that describe the flower, we can infer that PetalLengthCm has the most
# #### influence on determining the species of the flower.
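# #### One rough way to quantify that influence (a sketch, not part of the original analysis) is to label-encode the species and correlate each feature with the encoded target:
# In[ ]:
species_codes = iris_data['Species'].astype('category').cat.codes
feature_target_corr = iris_data.iloc[:, 1:-1].corrwith(species_codes)
print(feature_target_corr.sort_values(ascending=False))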
# In[17]:
sns.pairplot(iris_data, hue="Species")
# ### 3. Preparing the data for training and testing the model
# In[18]:
x = iris_data.iloc[:, 1:-1]
y = iris_data.Species
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
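# #### Note: without a fixed random_state the split, and hence every accuracy score below, changes on each run. A reproducible, class-stratified variant is sketched here with hypothetical variable names; it is not used in the cells that follow:
# In[ ]:
x_tr_s, x_te_s, y_tr_s, y_te_s = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y)
print(y_tr_s.value_counts())
print(y_te_s.value_counts())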
# ### 4. Training using Decision Tree Model
# In[19]:
model_1 = DecisionTreeClassifier()
model_1.fit(x_train, y_train)
# ### 5. Evaluating the performance of the Decision Tree Model
# In[20]:
y_pred = model_1.predict(x_test)
score = accuracy_score(y_test, y_pred)
score
# In[21]:
y_pred
# In[22]:
print(classification_report(y_test, y_pred))
# In[23]:
plt.figure(figsize=(15, 10))
tree.plot_tree(model_1, feature_names=list(x.columns), class_names=list(model_1.classes_), filled=True)
# In[24]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Decision Tree Model')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# #### From the above tree plot and the predicted-species scatter plot of the Decision Tree Model we can infer the following:
# #### If Petal Length <= 2.45 cm, then the species of the flower is Setosa.
# #### If Petal Length > 2.45 cm, then the species of the flower may be Versicolor or Virginica.
# #### For Petal Length > 3 cm and Petal Length < 5 cm, most of the species are Versicolor.
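# #### The thresholds quoted above can be read directly from the fitted tree; a sketch using sklearn's text export (the exact split values depend on the random train/test split):
# In[ ]:
print(tree.export_text(model_1, feature_names=list(x.columns)))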
# In[25]:
iris_setosa_dataset = iris_data[iris_data.PetalLengthCm <= 2.45]
iris_setosa_dataset.to_excel('iris_setosa_dataset.xlsx',index=False)
iris_setosa_dataset.to_csv('iris_setosa_dataset.csv',index=False)
pd.read_excel('iris_setosa_dataset.xlsx')
# In[26]:
iris_versicolor_dataset = iris_data[(iris_data['PetalLengthCm'] < 5) & (iris_data['PetalLengthCm'] > 3)]
iris_versicolor_dataset.to_excel('iris_versicolor_dataset.xlsx',index=False)
iris_versicolor_dataset.to_csv('iris_versicolor_dataset.csv',index=False)
pd.read_excel('iris_versicolor_dataset.xlsx')
# In[27]:
iris_virginica_dataset = iris_data[(iris_data['PetalLengthCm'] > 5)]
iris_virginica_dataset.to_excel('iris_virginica_dataset.xlsx',index=False)
iris_virginica_dataset.to_csv('iris_virginica_dataset.csv',index=False)
pd.read_excel('iris_virginica_dataset.xlsx')
# ### 6. Training using Logistic Regression Model
# In[28]:
model_2 = LogisticRegression()
# In[29]:
model_2.fit(x_train,y_train)
# ### 7. Evaluating the performance of the Logistic Regression Model
# In[30]:
y_pred_2 = model_2.predict(x_test)
y_pred_2
# In[31]:
confusion_matrix(y_test,y_pred_2)
# In[32]:
print(classification_report(y_test,y_pred_2))
# In[33]:
accuracy_score(y_test,y_pred_2)
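# #### A single train/test split can be optimistic or pessimistic; as a sketch, 5-fold cross-validation on the full data gives a more stable estimate (max_iter is raised here only to avoid convergence warnings):
# In[ ]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), x, y, cv=5)
print(cv_scores, cv_scores.mean())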
# In[34]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred_2]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Logistic Regression Model')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# ### 8. Training using Linear Support Vector Machine (Linear SVC)
# In[35]:
model_3 = LinearSVC()
model_3.fit(x_train, y_train)
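# #### LinearSVC may not converge on unscaled features (any warnings are suppressed above); one common remedy, sketched here under the assumption that standardization is acceptable, is to scale the inputs in a pipeline:
# In[ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
model_3_scaled = make_pipeline(StandardScaler(), LinearSVC(max_iter=10000))
model_3_scaled.fit(x_train, y_train)
print(accuracy_score(y_test, model_3_scaled.predict(x_test)))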
# ### 9. Evaluating the performance of the Linear Support Vector Machine (Linear SVC)
# In[36]:
y_pred_3 = model_3.predict(x_test)
y_pred_3
# In[37]:
print(confusion_matrix(y_test, y_pred_3))
# In[38]:
print(classification_report(y_test, y_pred_3))
# In[39]:
print(accuracy_score(y_test,y_pred_3))
# In[40]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred_3]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Linear Support Vector Machine Model')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# ### 10. Training using Random Forest Classifier
# In[41]:
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(x_train, y_train)
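# #### As a quick sketch of which measurements the forest relies on (this should broadly agree with the correlation heatmap earlier), inspect the fitted feature importances:
# In[ ]:
importances = pd.Series(classifier.feature_importances_, index=x.columns)
print(importances.sort_values(ascending=False))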
# In[42]:
y_pred_4 = classifier.predict(x_test)
y_pred_4
# ### 11. Evaluation using Random Forest Classifier
# In[43]:
accuracy = classifier.score(x_test, y_test)
print(f"Accuracy: {accuracy}")
# In[44]:
print(confusion_matrix(y_test, y_pred_4))
# In[45]:
print(classification_report(y_test, y_pred_4))
# In[46]:
print(accuracy_score(y_test,y_pred_4))
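# #### The confusion matrix is easier to read as a heatmap; a sketch reusing seaborn, with the class labels taken from the fitted classifier:
# In[ ]:
sns.heatmap(confusion_matrix(y_test, y_pred_4), annot=True, fmt='d',
            xticklabels=classifier.classes_, yticklabels=classifier.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()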
# In[47]:
color_class = {
'Iris-setosa' : 'red',
'Iris-versicolor' : 'green',
'Iris-virginica' : 'blue'
}
colors = [color_class[label] for label in y_pred_4]
plt.scatter(x_test.SepalLengthCm, x_test.PetalLengthCm, c=colors)
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')
plt.title('Predicted Species using Random Forest Classifier')
legend_entries = [
plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
for label, color in color_class.items()
]
plt.legend(handles=legend_entries, title='Species')
plt.show()
# ### 12. Final Conclusion
#
# #### Hence, we have classified the Iris Flower Dataset using four classification models, namely the Decision Tree Model, the Logistic Regression Model, the Linear SVC Model and the Random Forest Classifier.
#
# #### Using the Logistic Regression Model and the Random Forest Classifier we achieved an overall accuracy score of 0.98; with the Linear Support Vector Machine Model and the Decision Tree Model we achieved an accuracy of 0.96.
#
# #### From the multi-class classification plots we can conclude that:
# #### 1. Flowers with Petal Length < 2.45 cm belong to the Setosa species.
# #### 2. Flowers with Petal Length > 3 cm and Petal Length < 5 cm mostly belong to the Versicolor species.
# #### 3. Flowers with Petal Length > 5 cm belong to the Virginica species.
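# #### As a closing sketch (not part of the original analysis), the four test-set accuracies can be collected into one table for comparison; the exact values vary with the random train/test split:
# In[ ]:
model_summary = pd.DataFrame({
    'Model': ['Decision Tree', 'Logistic Regression', 'Linear SVC', 'Random Forest'],
    'Accuracy': [accuracy_score(y_test, y_pred),
                 accuracy_score(y_test, y_pred_2),
                 accuracy_score(y_test, y_pred_3),
                 accuracy_score(y_test, y_pred_4)]
})
print(model_summary)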
# In[ ]: