#!/usr/bin/env python # coding: utf-8 # **Ebay Germany Car Sales dataset analysis** # In[2]: import pandas as pd import numpy as np print("imported") # In[3]: autos=pd.read_csv("autos.csv", encoding="Latin-1") print(autos.head(1)) # In[4]: autos # In[5]: autos.info() # In[6]: autos.columns # **Renaming columns - from camelCase to snake_case** # In[7]: autos.rename({"yearOfRegistration":"registration_year", "monthOfRegistration":"registration_month", "notRepairedDamage":"unrepaired_damage", "dateCreated":"ad_created", "dateCrawled":"date_crawled", "offerType":"offer_type", "vehicleType":"vehicle_type", "powerPS":"power_ps", "fuelType":"fuel_type", "nrOfPictures":"nr_of_pictures", "postalCode":"postal_code", "lastSeen":"last_seen"},axis=1,inplace=True) autos.head() # In[8]: autos.describe(include="all") #show all columns # **Formating some columns' type from st to int.** # In[9]: try: #rerunning of code will raise attrib error because it already applied to dataframe autos["price"]=autos["price"].str.replace("$","").str.replace(",","").astype(int) autos["odometer"]=autos["odometer"].str.replace("km","").str.replace(",","").astype(int) except: pass autos.rename({"odometer":"odometer_km"},axis=1,inplace=True) autos["price"].head() autos["odometer_km"].head() # In[10]: autos["price"].head() # In[11]: print(autos["price"].unique().shape) print(autos["odometer_km"].unique().shape) print(autos["price"].describe()) autos["odometer_km"].describe() # In[12]: print(autos.isnull().sum()) print(autos["price"].isnull().sum()) # In[13]: print(autos["price"].value_counts().sort_index().head(10)) print(autos["price"].value_counts().sort_index(ascending=False).head(10)) print(autos["odometer_km"].value_counts().sort_index().head()) # In[14]: print(autos[autos["price"]<100].head()) # **Lets remove outliers with selecting only appropriate columns** # In[15]: autos=autos[(autos["price"]>100) & (autos["price"]<400000)] print(autos.describe()) # **Let's check dates and their distribution. Normlaize for percentage expression** # In[16]: autos.loc[:,["date_crawled","last_seen","ad_created", "registration_month","registration_year"]].head() # In[17]: print(autos['date_crawled'].str[:10].value_counts( normalize=True, dropna=False).sort_index(ascending=False)[0:5]) #or head() print(autos['ad_created'].str[:10].value_counts( normalize=True, dropna=False).sort_index()[0:5]) print(autos['last_seen'].str[:10].value_counts( normalize=True, dropna=False).sort_index()[0:5]) # In[ ]: # In[18]: autos["registration_year"].describe() # **Oldest registration year is year 1000 and neĆ¼est car is made in 9999 which is odd..Lets check what percent is between normal values** # In[19]: autos["registration_year"].between(1900,2020).value_counts(normalize=True) # **It seem 99.9% is normal dates, so we can remove odd date rows # In[23]: autos=autos[autos["registration_year"].between(1900,2020)] # In[25]: autos["registration_year"].value_counts(normalize=True).sort_index() # In[22]: 48071/autos.shape[0] # **Discover brand columns** # In[62]: print(autos["brand"].isnull().sum()) print(autos["brand"].value_counts()) print(autos["brand"].value_counts(normalize=True)) #with percentage print(autos["brand"].unique()) unique_brands=autos["brand"].unique() # **Check price mean for unique brands(Excel Pivot can also do..)** # In[75]: brands_dict={} for brand in unique_brands: print("\033[1m+Mean price of {}:".format(brand),autos[autos["brand"]==brand]["price"] .mean().astype(int)) #int yaziram ki float-u round off edim cevirsin. brands_dict[brand]=autos[autos["brand"]==brand]["price"].mean().astype(int) print(brands_dict) # **Below is another way of getting the same result as above (value_counts.index method)** # In[80]: brand_counts=autos["brand"].value_counts() brands_dict={} for brand in brand_counts.index: brands_dict[brand]=brand_counts[brand] print(brands_dict) # **Filter column itself according desired indicators** # In[113]: brand_counts=autos["brand"].value_counts(normalize=True) common_brands=brand_counts[brand_counts>0.05].index print(common_brands) c_brands_mean_prices={} for brand in common_brands: mean_prices=autos[autos["brand"]==brand]["price"].mean() print(mean_prices) c_brands_mean_prices[brand]=int(mean_prices) c_brands_mean_prices # **Above: of the top 6 brands most expensive are: audi,mercedes and bmw in sequence.** # In[100]: d={} a=autos["brand"].value_counts() print(type(a)) '''for i in a: d[a.index]=3''' print(a[a>5000].values) # **Series and dataframe constructos** # In[108]: bmp_series=pd.Series(c_brands_mean_prices) #print(bmp_series) sample_series=pd.Series(autos["brand"].value_counts(normalize=True)) print('indexes\n',sample_series.index) #print(sample_series.array) print('values\n',sample_series.values) print('shape\n',sample_series.shape) # In[109]: sample_series=pd.DataFrame(autos["brand"].value_counts(normalize=True), columns=['mean_price']) # **Calculate mean mileages for common brands** # In[114]: c_brands_mean_mileages={} for brand in common_brands: mean_mileage=autos[autos["brand"]==brand]["odometer_km"].mean() print(mean_mileage) c_brands_mean_mileages[brand]=int(mean_mileage) c_brands_mean_mileages # In[115]: print(c_brands_mean_mileages) print(c_brands_mean_prices) # **Lets convert mean dicts to pd.DataFrames with merging series method** # In[119]: bmm_series=pd.Series(c_brands_mean_mileages) bmp_series=pd.Series(c_brands_mean_prices) bmm_df=pd.DataFrame(bmm_series,columns=['mean_mileage']) #bmp_df=pd.DataFrame(bmp_series, columns=['mean_price']) bmm_df['mean_price']=bmp_series print(bmm_df) # **Lets find most frequent models of common brands and collect to dict** # In[145]: c_models_dict={} for brand in common_brands: freq_models=autos[autos["brand"]==brand]["model"].value_counts() print(brand.upper(),'\n',freq_models[:3],'\n') c_models_dict[brand]={freq_models.index[0],freq_models.values[0]} c_models_dict # In[149]: autos["odometer_km"].value_counts() # **Compare mean price of damaged and undamaged cars** # In[160]: not_damaged_price=autos[autos['unrepaired_damage']=="nein"]["price"].mean() damaged_price=autos[autos['unrepaired_damage']=="ja"]["price"].mean() print(damaged_price,not_damaged_price) print("Price difference is:",int(not_damaged_price-damaged_price),"$") # In[ ]: