"""Exploratory data analysis of the car data stored in ``data.csv``.

Steps: load the CSV, drop unused columns, rename the remaining ones, remove
duplicate and incomplete rows, filter IQR outliers on the numeric columns,
then plot the make counts, a correlation heatmap and an HP-vs-Price scatter.
"""

import pandas as pd
import numpy as np
import seaborn as sns  # visualisation
import matplotlib.pyplot as plt  # visualisation

# ``%matplotlib inline`` is IPython-only syntax. Invoke the magic through the
# IPython API when it is available so the file remains valid plain Python.
try:
    get_ipython().run_line_magic("matplotlib", "inline")  # noqa: F821
except NameError:
    pass  # Not running under IPython; keep matplotlib's default backend.

sns.set(color_codes=True)

# --- Load and inspect -------------------------------------------------------
df = pd.read_csv("data.csv")
print(df.head(5))  # To display the top 5 rows
print(df.tail(5))  # To display the bottom 5 rows
print(df.dtypes)

# --- Drop columns that are not used in the analysis -------------------------
df = df.drop(
    [
        'Engine Fuel Type',
        'Market Category',
        'Vehicle Style',
        'Popularity',
        'Number of Doors',
        'Vehicle Size',
    ],
    axis=1,
)
print(df.head(5))

# --- Shorter column names ---------------------------------------------------
df = df.rename(
    columns={
        "Engine HP": "HP",
        "Engine Cylinders": "Cylinders",
        "Transmission Type": "Transmission",
        "Driven_Wheels": "Drive Mode",
        "highway MPG": "MPG-H",
        "city mpg": "MPG-C",
        "MSRP": "Price",
    }
)
print(df.head(5))

# --- Remove duplicate rows --------------------------------------------------
print(df.shape)
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)
print(df.count())  # Used to count the number of rows
df = df.drop_duplicates()
print(df.head(5))
print(df.count())

# --- Remove rows with missing values ----------------------------------------
print(df.isnull().sum())
df = df.dropna()  # Dropping the missing values.
print(df.count())
print(df.isnull().sum())  # After dropping the values

# --- Box plots --------------------------------------------------------------
# Each box plot gets its own figure; without this, consecutive calls draw on
# the same axes when the code is not split into notebook cells.
plt.figure()
sns.boxplot(x=df['Price'])
plt.figure()
sns.boxplot(x=df['HP'])
plt.figure()
sns.boxplot(x=df['Cylinders'])

# --- IQR outlier filter -----------------------------------------------------
# Quantiles are only defined for numeric data. Selecting the numeric columns
# explicitly avoids the TypeError that DataFrame.quantile raises on string
# columns in pandas >= 2.0, and keeps the comparison below column-aligned.
numeric = df.select_dtypes(include=np.number)
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

# A row is an outlier if any numeric value lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for its column.
is_outlier = (
    (numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))
).any(axis=1)
df = df[~is_outlier]
print(df.shape)

# --- Number of cars per make (40 most frequent makes) -----------------------
# A fresh figure is created first because Series.plot reuses the current axes
# when one already exists.
plt.figure(figsize=(10, 5))
df.Make.value_counts().nlargest(40).plot(kind='bar')
plt.title("Number of cars by make")
plt.ylabel('Number of cars')
plt.xlabel('Make')

# --- Correlation heatmap ----------------------------------------------------
plt.figure(figsize=(10, 5))
# Correlation is computed on the numeric columns only (string columns such as
# Make would make DataFrame.corr raise in pandas >= 2.0).
c = df.select_dtypes(include=np.number).corr()
sns.heatmap(c, cmap="BrBG", annot=True)
print(c)

# --- HP vs. Price scatter plot ----------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['HP'], df['Price'])
ax.set_xlabel('HP')
ax.set_ylabel('Price')
plt.show()