#!/usr/bin/env python # coding: utf-8 # # Project: Exploring Ebay Car Sales Data # > This poject is aimed at gaining insight on used cars from eBay site # > The dataset was originally scraped and uploaded to [Kaggle](https://www.kaggle.com/orgesleka/used-cars-database/data). Dataquest has made a few modifications from the original dataset that was uploaded to Kaggle. # # A few notes describing the data columns can be found [here](https://www.kaggle.com/orgesleka/used-cars-database/data) # # # * dateCrawled : when this ad was first crawled, all field-values are taken from this date # * name : "name" of the car # * seller : private or dealer # * price : the price on the ad to sell the car # * yearOfRegistration : at which year the car was first registered # * powerPS : power of the car in PS # * lastSeenOnline : when the crawler saw this ad last online # # In[1]: #importing the numpy and pandas library import numpy as np import pandas as pd autos = pd.read_csv("autos.csv", encoding="Latin-1") #reading the datafile # In[2]: #chcking for missing data and wrong data types using info() autos.info() # In[3]: #printing first few rows of data using .head() autos.head() # # In[4]: #using DataFrame.columns to print out existing columns autos.columns # In[5]: #editing the column names autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest', 'vehicle_type', 'registration_year', 'gear_box', 'power_ps', 'model', 'odometer', 'registration_month', 'fuel_type', 'brand', 'unrepaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code', 'last_seen'] # In[6]: #confirming changes to column names autos.columns # In[7]: #using dataFrame.head() to look at the current state of autos dataframe autos.head() # The camelCase columns have been changed to snake_case, some column names have been edited to make more meanings from the data dictionary. # In[8]: #looking at the descriptive statistics for all columns autos.describe(include='all') # There are about 50000 records in the dataset with 20 columns. 'vehicleType','model','fuelType','notRepairedDamage' all contain missing values. Columns casing are not regular. Price and Odometer contain unit symbols. dateCrawled,dateCreated dtype should be datetime and price,odometer should be int dtypes. seller, offerType, abtest, gearbox are about 2 unique values each. # # The mean, std, median and percentiles for price and odometer are NaN which indicates a wrong data type. These requires more investigation # In[9]: #renaming the price and odometer columns autos.rename(columns={'price':'price_usd','odometer':'odometer_km'},inplace=True) # In[10]: #confirming change in names autos.columns # In[11]: #removing the '$' and ',' chars in the price values autos['price_usd'] = autos['price_usd'].str.replace('$','').str.replace(',','').astype('int64') # In[12]: #viewing changes to price column autos.head() # In[13]: #confirming change to price dtype autos.info() # In[14]: #removing the km characters in odometer autos['odometer_km'] = autos['odometer_km'].str.replace('km','').str.replace(',','') # In[15]: #chaniging odometer_km to int type autos['odometer_km'] = autos['odometer_km'].astype('int64') # In[16]: autos.head() # In[17]: #understanding the shape of the price column autos['price_usd'].unique().shape # In[18]: #summary statistics for price column autos['price_usd'].describe() #.sort_index(ascending=False) # In[19]: #Series.value_counts() returns a series, better to use Series.sort_index() #with ascending= True or False, Second look at max outlier autos['price_usd'].value_counts().sort_index(ascending=False).head() # In[20]: #Second look at Min outlier autos['price_usd'].value_counts().sort_index(ascending=False).tail() # In[21]: #removing outliers autos = autos[autos["price_usd"].between(1,27322222)] # In[22]: autos['price_usd'].describe() # After removing the outliers, it is observed that average sales is around $8,000 dollars # In[23]: #understanding the shape of the odometer column autos['odometer_km'].unique().shape # In[24]: #summary statistics of the odometer column autos['odometer_km'].describe() # In[25]: #checking for high outliers for odometer autos['odometer_km'].value_counts().sort_index(ascending=False).head() # In[26]: #checking for low outliers for odometer autos['odometer_km'].value_counts().sort_index(ascending=False).tail() # The odometer column appears to not have outliers. Next stop is to look at the dates columns of 'date_crawled', 'ad_created', 'last_seen' which are in string formats # In[27]: #understanding the date columns autos[['date_crawled','ad_created','last_seen']][0:5] # In[28]: #calculating the distribution for date_crawled #To rank by date in ascending order (earliest to latest), chain the #Series.sort_index() method autos_date_crawled = autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index() # In[29]: autos_date_crawled # In[30]: #calculating the distribution of 'ad_created' values including missing values #using Series.value_counts(normalize=True, dropna=False) autos_ad_created = autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False) # In[51]: #viewing the values autos_ad_created.sort_index().head(10) # In[52]: #viewing the values autos_ad_created.sort_index().tail(10) # In[32]: #calculating the distribution of 'last_seen' values including missing values #using Series.value_counts(normalize=True, dropna=False) autos_last_seen = autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False) # In[49]: #viewing 'last_seen' autos_last_seen.sort_index().head(10) # In[34]: #understanding the distribution of registration year autos['registration_year'].describe() # The earliest year is 1000 and the latest year is 9999. Weird dates with regards to when cars were produced and the future projection.In 1903, [Henry Ford](https://en.wikipedia.org/wiki/History_of_Ford_Motor_Company) invented a modernized car and the year appears to most suitable while 2016 will be taken as the highest year since the last seen ads were in 2016. # In[35]: #removing upper and lower bounds in the registration_year autos = autos[autos["registration_year"].between(1902,2017)] # In[48]: autos['registration_year'].value_counts(normalize=True).sort_index().head(10) # There appears to be more vehicle listings between 1990s and 2000s. A closer look at the brand of cars would be neccessary # In[37]: #exploring the unique brand values autos['brand'].unique() # In[38]: #confirming brand index autos.info() # In[39]: #selecting the top 20 brands top_5_brands = autos['brand'].value_counts().sort_values(ascending=False).head(5).index top_5_brands # In[40]: #aggregating prices for top 5 brands top_brands_m_prices = {} #empty dictionary for top 5 brand mean prices for brand in top_5_brands: #looping over top brands selected_brand = autos[autos['brand'] == brand] #selecting top brands autos dataset mean_price = selected_brand['price_usd'].mean()#calculating mean price for top brand top_brands_m_prices[brand] = int(mean_price) #assigning to dictionary print(top_brands_m_prices) #printing out results # In[41]: sorted(top_brands_m_prices.items(), key=lambda x: x[1]) # From the results of our aggregation, audi, mercedesbenz and bmw are most expensive brands, followed by volkswagen and opel is least amongst top 5. # In[42]: #aggregating mileage for top 5 brands top_brands_m_mileage = {} #empty dictionary for top 5 brand mean mileage for brand in top_5_brands: #looping over top brands selected_brand = autos[autos['brand'] == brand] #selecting top brands autos dataset mean_mileage = selected_brand['odometer_km'].mean()#calculating mean mileage for top brand top_brands_m_mileage[brand] = int(mean_mileage) #assigning to dictionary print(top_brands_m_mileage) #printing out results # In[43]: #using pandas series constructor pd.Series to construct a series #for top brand mean prices top_bmp_series = pd.Series(top_brands_m_prices) print(top_bmp_series) # In[44]: # using the pandas DataFrame constructor to construct a dataframe #for mean prices df_brands = pd.DataFrame(top_bmp_series, columns=['mean_price']) df_brands # In[45]: #brands mean mileage top_bmm_series = pd.Series(top_brands_m_mileage) print(top_bmm_series) # In[46]: #adding brands mean mileage to the dataframe df_brands.insert(1, 'top_brands_m_mileage', top_bmm_series) # In[47]: #viewing new dataframe df_brands # Conclusion: # The cost of top brand vehicles do not neccessarily determing the cars mileage. # ### Resources # > * [Looping over series](https://community.dataquest.io/t/looping-over-the-correct-series/432463/10) # # > * [Sorting the dictionary](https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value)