#!/usr/bin/env python
# coding: utf-8

# # INTRODUCTION
#
# Using a dataset of *eBay Kleinanzeigen* (the classifieds section of the German eBay website) consisting of 50K data points available on [kaggle](https://www.kaggle.com/orgesleka/used-cars-database/data), we want to (1) **CLEAN THE DATA** and (2) **ANALYZE THE DATA**.
#
# This dataset contains the following information:
#
# | **Variable**            | **Description**                                          |
# |-------------------------|----------------------------------------------------------|
# | dateCrawled             | When the ad was first crawled                            |
# | name                    | Name of the car                                          |
# | seller                  | Whether the seller is private or a dealer                |
# | offerType               | Type of listing                                          |
# | price                   | Price on the car ad                                      |
# | abtest                  | Whether the listing is included in an A/B test           |
# | vehicleType             | Type of vehicle                                          |
# | yearOfRegistration      | Year the car was first registered                        |
# | gearbox                 | Transmission type                                        |
# | powerPS                 | Horsepower of the car                                    |
# | model                   | Model of the car                                         |
# | kilometer               | Number of kilometers driven                              |
# | monthOfRegistration     | Month the car was first registered                       |
# | fuelType                | Type of fuel the car uses                                |
# | brand                   | Brand of the car                                         |
# | notRepairedDamage       | Whether the car has damage that has not yet been repaired |
# | dateCreated             | Date the eBay listing was created                        |
# | nrOfPictures            | Number of pictures of the car in the ad                  |
# | postalCode              | Postal code where the car is located                     |
# | lastSeen                | When the crawler last saw this ad online                 |

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

autos = pd.read_csv("autos.csv", encoding = "Latin-1")
autos.head()

# In[2]:

autos.info()

# There are several things that need to change:
#
# (1) lower-case everything
#
# (2) deal with missing values in "vehicleType", "gearbox", "model", "fuelType", and "notRepairedDamage"
#
# (3) convert column names to snake_case
#
# (4) convert "price" and "odometer" to numeric types

# # Clean Column Names

# In[3]:

autos.rename(str.lower, axis = 1, inplace = True)
autos.rename({"yearofregistration": "registration_year",
              "monthofregistration": "registration_month",
              "notrepaireddamage": "unrepaired_damage",
              "datecreated": "ad_created",
              "datecrawled": "date_crawled",
              "offertype": "offer_type",
              "vehicletype": "vehicle_type",
              "gearbox": "gear_box",
              "fueltype": "fuel_type",
              "nrofpictures": "num_pictures",
              "postalcode": "postal_code",
              "lastseen": "last_seen"}, axis = 1, inplace = True)
autos.head(5)

# # Initial Exploration + Cleaning

# In[4]:

autos.describe(include = "all")

# In[5]:

print(autos["offer_type"].value_counts())
print("\n")
print(autos['seller'].value_counts())
print("\n")
print(autos["num_pictures"].value_counts())
print("\n")
print(autos['gear_box'].value_counts())
print("\n")
print(autos['abtest'].value_counts())

# 1) Remove "seller", "offer_type", and "num_pictures" from the dataframe
#
# 2) "powerps" has unrealistic values (> 1,000 HP) and needs further analysis
#
# 3) Similarly, "price" also needs to be examined, since a $2 million+ listing is not plausible
#
# 4) "registration_year" has an unrealistic range, particularly at the extremes, so those rows need to be removed
#
# 5) "odometer" and "price" need to be converted back to numeric types
# In[6]:

# Fix price
autos['price'].value_counts()
# The values contain "$" and ",", and the column is an object rather than an integer/float
autos['price'] = autos['price'].str.replace("$", "").str.replace(",", "")

# In[7]:

autos['price'] = autos['price'].astype(float)
autos['price'].describe()

# In[8]:

# Fix odometer
autos['odometer'].value_counts()
autos['odometer'].unique()
# Need to deal with "," and "km", then convert to int
autos['odometer'] = autos['odometer'].str.replace(",", "")
autos['odometer'] = autos['odometer'].str.replace("km", "")
autos['odometer'] = autos['odometer'].astype(int)

# In[9]:

autos.rename({"odometer": "odometer_km"}, axis = 1, inplace = True)

# In[10]:

autos.columns

# Removing unnecessary columns

# In[11]:

cols = ['date_crawled', 'name', 'price', 'abtest', 'vehicle_type',
        'registration_year', 'gear_box', 'powerps', 'model', 'odometer_km',
        'registration_month', 'fuel_type', 'brand', 'unrepaired_damage',
        'ad_created', 'postal_code', 'last_seen']
autos = autos[cols]

# In[12]:

autos.head()
autos.describe(include = 'all')

# ## Examining powerps
#
# There is no way that horsepower can be 17,700, so we need to find out what's going on.

# In[13]:

autos['powerps'].value_counts().sort_index(ascending = False).head(50)

# In[14]:

autos['powerps'].value_counts().sort_index(ascending = True).head(50)

# As the Bugatti Chiron was the highest-horsepower production car at approx. 1,600 HP, anything above that is not valid, so we will use this as the top-end cap. Conversely, the lowest-horsepower production car is the Renault Twizy at about 17 HP, so we will use this as the bottom-end cap.

# In[15]:

autos = autos[autos['powerps'].between(16, 1600)]
autos.describe()

# In[16]:

autos['powerps'].value_counts().sort_index(ascending = False)

# ## Examining Price

# In[17]:

autos['price'].shape[0]

# In[18]:

autos['price'].describe()

# In[19]:

autos['price'].value_counts().sort_index(ascending = False).head(10)

# In[20]:

autos['price'].value_counts().sort_index(ascending = False).tail(10)

# In[21]:

print(len(autos[autos['price'] < 100]))
print(len(autos[autos['price'] > 500000]))
print(len(autos))

# Of the 50K data points in this dataset, 1,762 list the price of the car as less than **100 dollars** whilst 11 list it as greater than **500,000 dollars**. Considering both the unlikelihood of purchasing a car at these extremes and the fact that these rows make up less than 5% of the data (i.e. they can be removed without affecting the integrity of the dataset), we will use these values as our limits.

# In[22]:

autos = autos[autos['price'].between(100, 500000)]
autos['price'].value_counts().sort_index()

# ## Examining Odometer_km

# In[23]:

autos['odometer_km'].unique()

# In[24]:

autos['odometer_km'].describe()

# In[25]:

autos['odometer_km'].value_counts().sort_index(ascending = False)

# Considering that these are used cars, which are likely to have high mileage, it is expected that there would be a large number of vehicles with very high mileage and few vehicles with low mileage. As such, we will not be removing any rows based on the odometer.
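# As a quick visual check of that skew, here is a minimal plotting sketch (an addition using the matplotlib import at the top of the notebook, not part of the original analysis):

# In[ ]:

# Bar chart of listing counts per odometer_km value; the high-mileage bins noted above should dominate
autos['odometer_km'].value_counts().sort_index().plot(kind = 'bar')
plt.xlabel("odometer_km")
plt.ylabel("number of listings")
plt.show()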
# ## Exploring Date Columns

# In[26]:

# date_crawled + ad_created + last_seen
autos[['date_crawled', 'ad_created', 'last_seen']].head()
# The format is YYYY-MM-DD followed by a timestamp, so the date is the first 10 characters of each string

# In[27]:

autos['date_crawled'] = autos['date_crawled'].str[:10]

# In[28]:

autos['ad_created'] = autos['ad_created'].str[:10]
autos['last_seen'] = autos['last_seen'].str[:10]

# In[29]:

# Counting date_crawled
autos['date_crawled'].value_counts(normalize = True, dropna = False).sort_index(ascending = False).round(2)

# Examining the distribution of date_crawled, which ranges from March 5th, 2016 to April 7th, 2016, the crawls are roughly evenly distributed, at approximately 3% per date in this range.

# In[30]:

# Counting last_seen
autos['last_seen'].value_counts(normalize = True, dropna = False).sort_index(ascending = False).round(2)

# For "last_seen", which also falls within March 5th, 2016 to April 7th, 2016, there is a noticeable uptick in the share of ads last seen near April 7th. This could be explained by: (1) it corresponding to the end of the collection period when the dataset was being compiled, or (2) it being a prime period for car sales since it is the end of the first quarter.

# In[31]:

autos['ad_created'].value_counts(normalize = True, dropna = False).sort_index(ascending = False).round(2)

# As for "ad_created", which ranges from June 11th, 2015 to April 7th, 2016, the majority of the ads were created in March 2016.

# # Dealing with Incorrect Registration Years

# In[32]:

autos['registration_year'].value_counts().sort_index()

# Seeing as commercial car production only dates back to around the turn of the 20th century, registration years before 1917 are clearly invalid, so we will use this as the minimum. Furthermore, as the listings were crawled in early 2016, registration years after 2017 do not make sense either. So the registration-year range we are looking for is between 1917 and 2017.

# In[33]:

min_bool = autos['registration_year'] > 1917
max_bool = autos['registration_year'] < 2018
combined = min_bool & max_bool
autos = autos[combined]
autos.describe()

# ## Exploring Price by Brand

# In[34]:

autos['brand'].unique()

# In[35]:

top_10_brands = autos['brand'].value_counts()[:10]

# In[36]:

bottom_10_brands = autos['brand'].value_counts()[-10:]

# In[37]:

top_10_brands

# In[38]:

bottom_10_brands

# In[39]:

print(10157+5201+5155+4586+4118+3331+2272+1413+1232+888)
print(len(autos))
print((10157+5201+5155+4586+4118+3331+2272+1413+1232+888) / len(autos))

# Looking at the data, the majority of the used cars are German brands (i.e. Volkswagen, BMW, Audi, Mercedes-Benz), with Ford being the major foreign brand. In other words, the most common used-car brands are domestic. The 10 least common brands, on the other hand, are mainly foreign brands such as Subaru, Land Rover, and Daewoo.
#
# The decision to focus on the top 10 brands is that they make up 89.3% of the entire dataset, which is a good representation of the marketplace.
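# The 89.3% figure above is hand-summed from the value counts; as a sanity check, the same share can be computed directly (a small sketch added here, not part of the original analysis):

# In[ ]:

# Proportion of listings accounted for by the 10 most common brands (should come out near 0.893)
autos['brand'].value_counts(normalize = True).head(10).sum()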
# In[40]:

volkswagen_bool = autos['brand'] == "volkswagen"
bmw_bool = autos['brand'] == 'bmw'
opel_bool = autos['brand'] == "opel"
mercedes_bool = autos['brand'] == 'mercedes_benz'
audi_bool = autos['brand'] == "audi"
ford_bool = autos['brand'] == 'ford'
renault_bool = autos['brand'] == "renault"
peugeot_bool = autos['brand'] == 'peugeot'
fiat_bool = autos['brand'] == 'fiat'
seat_bool = autos['brand'] == 'seat'

# In[41]:

volkswagen_only = autos[volkswagen_bool]
bmw_only = autos[bmw_bool]
opel_only = autos[opel_bool]
mercedes_only = autos[mercedes_bool]
audi_only = autos[audi_bool]
ford_only = autos[ford_bool]
renault_only = autos[renault_bool]
peugeot_only = autos[peugeot_bool]
fiat_only = autos[fiat_bool]
seat_only = autos[seat_bool]

# In[42]:

top_brands_mean_price = {}
top_10 = autos['brand'].value_counts(ascending = False)[:10].index

for maker in top_10:
    selected_rows = autos[autos['brand'] == maker]
    mean_price = selected_rows['price'].mean()
    mean_price = mean_price.round(2)
    top_brands_mean_price[maker] = mean_price

top_brands_mean_price

# The average price for each of the top 10 car brands is:
#
# | Brand           | Avg. Price (in dollars) |
# |-----------------|-------------------------|
# | Audi            | 9659.37                 |
# | BMW             | 8577.61                 |
# | Fiat            | 2968.12                 |
# | Ford            | 3933.09                 |
# | Mercedes-Benz   | 8936.97                 |
# | Opel            | 3151.16                 |
# | Peugeot         | 3249.60                 |
# | Renault         | 2635.15                 |
# | Seat            | 4675.29                 |
# | Volkswagen      | 5660.48                 |
#
# Looking at the data, the more luxurious brands (i.e. Audi, BMW, Mercedes-Benz) have the highest price points, whilst the foreign brands (i.e. Ford, Peugeot, Fiat) sit at the lower end of the price spectrum. As for the most common brand (Volkswagen), its price point is moderate.

# ## Storing Aggregate Data
#
# Working with the top 10 brands, we want to see if mean mileage (i.e. odometer_km) and mean price are related to one another.

# In[43]:

top_brands_mean_mileage = {}
top_10 = autos['brand'].value_counts(ascending = False)[:10].index

for maker in top_10:
    selected_rows = autos[autos['brand'] == maker]
    mean_km = selected_rows['odometer_km'].mean().round(2)
    top_brands_mean_mileage[maker] = mean_km

top_brands_mean_mileage

# Looking at the average mileage of the top 10 brands, the range lies between roughly 116,000 km and 133,000 km. The luxury brands (BMW and Mercedes-Benz) appear to have higher mileage than the foreign brands.

# In[44]:

top_10_price = pd.Series(top_brands_mean_price)
top_10_mileage = pd.Series(top_brands_mean_mileage)
top_10_makers = pd.DataFrame(top_10_price, columns = ['mean_price'])
top_10_makers['mean_mileage'] = top_10_mileage

# In[45]:

top_10_makers

# Mileage does not appear to be strongly related to price: the more luxurious and expensive brands do have somewhat higher mean mileage than the more economical brands, but the differences in mileage are small compared to the differences in price.
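# To quantify how weak that relationship is, a minimal sketch (an addition to the original analysis) computing the correlation between the two aggregate columns:

# In[ ]:

# Pearson correlation between mean price and mean mileage across the top 10 brands;
# a value close to 0 would support the "not strongly related" reading above
top_10_makers['mean_price'].corr(top_10_makers['mean_mileage'])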
# # EXTRA ANALYSIS: Common Brand + Model

# In[46]:

col = ['brand', 'model']
Brand_Model = autos[col]
Brand_Model = Brand_Model.dropna(axis = 0)

# In[47]:

Brand_Model['brand'].value_counts().head()

# In[48]:

Brand_Model['model'].value_counts().head()

# In[49]:

top_model_by_brand = {}
brands = Brand_Model['brand'].unique()

for b in brands:
    selected_rows = Brand_Model[Brand_Model['brand'] == b]
    # mode() gives the most common value in each column; take the model from its first row
    mode_rows = selected_rows.mode()
    top_model = mode_rows.iloc[0]['model']
    top_model_by_brand[b] = top_model

top_model_by_brand

# # EXTRA ANALYSIS: Split Odometer x Price

# In[50]:

autos['odometer_km'].value_counts().sort_index()

# In[51]:

len(autos)

# Normally, the best approach would be to split the data into equally sized groups. However, considering how heavily the listings cluster at 150,000 km and above, we will instead use the following buckets:
#
# 1) 0 - 30,000
# 2) 30,001 - 60,000
# 3) 60,001 - 90,000
# 4) 90,001 - 125,000
# 5) 125,000+

# In[52]:

odometer_km_splits = []

for km in autos['odometer_km']:
    if km < 30001:
        odometer_km_splits.append("0 - 30,000")
    elif km < 60001:
        odometer_km_splits.append("30,001 - 60,000")
    elif km < 90001:
        odometer_km_splits.append("60,001 - 90,000")
    elif km < 125001:
        odometer_km_splits.append("90,001 - 125,000")
    else:
        odometer_km_splits.append("125,000+")

# In[53]:

autos["odometer_km_splits"] = odometer_km_splits

# In[54]:

autos['odometer_km_splits'].value_counts()

# In[55]:

price_by_mileage = {}
odometer_splits = autos["odometer_km_splits"].unique()

for km in odometer_splits:
    selected_rows = autos[autos["odometer_km_splits"] == km]
    avg_price = selected_rows['price'].mean().round(2)
    price_by_mileage[km] = avg_price

price_by_mileage

# Looking at the breakdown by odometer bucket, there is a negative relationship between mean price and mileage: the greater the mileage, the lower the price of the vehicle.

# # EXTRA ANALYSIS: Price x Damage

# In[58]:

col = ['price', 'unrepaired_damage']
price_by_damage = autos[col].dropna(axis = 0)
price_by_damage

# In[60]:

price_by_status = {}
status = price_by_damage['unrepaired_damage'].unique()

for s in status:
    rows = price_by_damage[price_by_damage['unrepaired_damage'] == s]
    avg_price = rows['price'].mean().round(2)
    price_by_status[s] = avg_price

price_by_status

# Comparing listings with and without unrepaired damage, vehicles without any unrepaired damage have a noticeably higher average price than those with it.
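# As a cross-check, the same comparison can be written as a one-line groupby (a sketch equivalent to the loop above; groupby drops rows with a missing unrepaired_damage value by default, matching the earlier dropna step):

# In[ ]:

# Mean price by unrepaired_damage status (in this dataset the values are German: "ja" = has unrepaired damage, "nein" = none)
autos.groupby('unrepaired_damage')['price'].mean().round(2)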