The given datasaet contains about 50,000 entries of deatils of used cars put up for sales. We will do some analysis on this dataset.
import pandas as pd
import numpy as np
autos=pd.read_csv('autos.csv',encoding = 'Latin-1')
autos
autos.info()
autos.head(4)
Columns "gearbox", "model", "vehicleType", "fuelType" and "notRepairedDamage" have null values for some entries. 5 columns have int type entries. While rest of them are object.
def clean_string(char):
word=""
for a in char:
if a.isupper():
new_a="_"+ a.lower()
a=new_a
word+=a
word=word.strip("_")
return word
auto_copy=autos
print(auto_copy.columns)
label_list=[]
for name in auto_copy.columns:
if name=="yearOfRegistration":
name="registration_year"
elif name=="monthOfRegistration":
name="registration_month"
elif name=="notRepairedDamage":
name="unrepaired_damage"
elif name=="dateCreated":
name="ad_created"
else:
name=clean_string(name)
label_list.append(name)
auto_copy.columns=label_list
print(auto_copy.columns)
The column names have been made more readable.
print(auto_copy.describe())
1. "nr_of_pictures" seems to have no data. This column can be deleted.
2. "price" and "odometer" have been stored as a string. This needs to be converted into a numeric type data.
3. To calculate how old the car is, "registration year" could be converted into an datetime object. Also the "ad_created" column needs to be converted into a datetime object. With reference to date of ad creation, we can get an idea of how old the car is.
print(auto_copy.loc[:,"odometer"].head(2))
auto_copy["odometer"]=auto_copy["odometer"].str.replace(",","").str.replace("km","")
auto_copy["odometer"]=auto_copy["odometer"].astype(float)
auto_copy.rename({"odometer":"odometer_km"},axis=1,inplace=True)
print(auto_copy["price"].head(2))
auto_copy["price"]=auto_copy["price"].str.replace(",","").str.replace("$","")
auto_copy["price"]=auto_copy["price"].astype(float)
print(auto_copy["price"].describe())
print(auto_copy["price"].value_counts().sort_index().head(50))
auto_copy=auto_copy[auto_copy["price"].between(100,4000000)]
auto_copy["price"].value_counts().sort_index(ascending=False)
#date_crawled
a=auto_copy["date_crawled"].str[:10]
print(a.value_counts(normalize=True,dropna=False).sort_index(ascending=False))
#ad_created
b=auto_copy["ad_created"].str[:10]
print(b.value_counts(normalize=True,dropna=False).sort_index(ascending=False).head(50))
#date_crawled
c=auto_copy["last_seen"].str[:10]
print(c.value_counts(normalize=True,dropna=False).sort_index(ascending=False))
auto_copy["registration_year"].describe()
bool_reg=auto_copy["registration_year"].between (1950,2016)
corrected_auto_copy=auto_copy[bool_reg]
corrected_auto_copy["registration_year"].value_counts().sort_index()
auto_copy=corrected_auto_copy
print(auto_copy["brand"].value_counts(normalize=True))
brand_dictionary_price={}
brand_dictionary_miles={}
brand_dictionary_counts={}
brands=auto_copy["brand"].value_counts(normalize=True).head(17)
for brand in brands.index:
bool_brand=auto_copy["brand"]==brand
selected_rows= auto_copy[bool_brand]
mean_price=selected_rows["price"].mean()
brand_dictionary_price[brand]=mean_price
mean_miles=selected_rows["odometer_km"].mean()
brand_dictionary_miles[brand]=mean_miles
number_of_data=selected_rows["brand"].value_counts()
brand_dictionary_counts[brand]=number_of_data.iloc[0]
#sorted_brand=sorted(brand_dictionary.items(),key=lambda x:x[1],reverse=True)
#$sorted_brand_cnt=sorted(brand_dictionary_counts.items(),key=lambda x:x[1],reverse=True)
final_table=pd.Series(brand_dictionary_price)
final_table=pd.DataFrame(final_table,columns=["mean_price"])
series_miles=pd.Series(brand_dictionary_miles)
series_counts=pd.Series(brand_dictionary_counts)
final_table['mean_miles']=series_miles
final_table['data_counts']=series_counts
print(final_table)
auto_copy.head(2)
auto_copy["unrepaired_damage"].value_counts()
auto_copy["unrepaired_damage"]=auto_copy["unrepaired_damage"].str.replace("nein","no")
auto_copy["unrepaired_damage"]=auto_copy["unrepaired_damage"].str.replace("ja","yes")
print(auto_copy["unrepaired_damage"].unique())
print(auto_copy["model"].value_counts().describe())
print(auto_copy["brand"].value_counts().describe())
print(auto_copy["model"].value_counts().head(5))
print(auto_copy["brand"].value_counts().head(5))
There are 244 models and 40 brands. We will see most common models for first five brands.
brand_names=["volkswagen","bmw","opel","mercedes_benz","audi"]
for brand in brand_names:
bool_brand=auto_copy["brand"]==brand
selected_rows=auto_copy[bool_brand]
common_models=selected_rows["model"].value_counts()
common_model=common_models.index[0]
print("Common model for {} is {} .".format(brand,common_model))