#!/usr/bin/env python
# coding: utf-8

# # Introduction
# This is the first guided project in DataQuest. We'll be working with a
# dataset of used cars from a classified section of eBay. Data originally from
# Kaggle, but there have been a few modifications to the data to make it more
# useful to practice cleaning the data.
# - We sampled 50,000 data points from the full dataset.
# - Dirtied the dataset to resemble actual scraped data.
#
# #### Purpose:
# To clean the data and analyze the included car listings.

# In[2]:

import pandas as pd
import numpy as np

# In[3]:

# Latin-1 encoding is used because the file is not valid UTF-8.
autos = pd.read_csv('autos.csv', encoding='Latin-1')

# In[4]:

autos.info()

# As we can see from the .info method, we have a few columns with some null
# values: vehicleType, gearbox, model, fuelType, and notRepairedDamage. We must
# inspect these columns to see how we can deal with these null values. Also, we
# might want to change the price column to a float dtype so it is easier to
# work with.

# In[5]:

autos.head()

# From the .head method, we can see that there is a lot of data cleaning that
# we need to do:
# - Clean name column
# - Convert price column to int/float dtype
# - Inspect how many vehicleTypes. Clean accordingly
# - Correct misspellings in gearbox column
# - Inspect model
# - Convert odometer dtype
# - Inspect notRepairedDamage column (seems like majority null)
# - Maybe we need to convert to datetime objects?

# In[6]:

autos.columns

# In[7]:

# Map the original camelCase column names to snake_case equivalents.
new_columns = {
    'yearOfRegistration': 'registration_year',
    'monthOfRegistration': 'registration_month',
    'notRepairedDamage': 'unrepaired_damage',
    'dateCreated': 'ad_created',
    'dateCrawled': 'date_crawled',
    'offerType': 'offer_type',
    'vehicleType': 'vehicle_type',
    'powerPS': 'power_ps',
    'fuelType': 'fuel_type',
    'nrOfPictures': 'num_pictures',
    'postalCode': 'postal_code',
    'lastSeen': 'last_seen',
}
autos.rename(new_columns, axis=1, inplace=True)
autos.head()

# Camelcase to snakecase basically means changing the words to lower case with
# underscores instead of spaces.
# So, I changed all the columns to snakecase just for it to be more readable.
# I'm not sure what date_crawled means so I just left it as is. One specific
# change I made was nrOfPictures to num_pictures.

# In[8]:

autos.describe(include='all')

# Price and odometer are stored as text, so we need to change these to numeric.
# We can say this because their descriptive statistics result in all null
# values.
#
# We can also see that columns seller, offer_type, abtest, gearbox,
# unrepaired_damage all have only 2 values. We may consider dropping these
# columns because they won't be useful in our analyses.

# In[9]:

# Strip the currency symbol and thousands separators, then convert to float.
# regex=False: treat '$' as a literal character, not as a regex end-anchor.
autos['price'] = (autos['price']
                  .str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False)
                  .astype(float))

# In[10]:

# Strip the 'km' unit and thousands separators, then convert to float and
# record the unit in the column name instead.
autos['odometer'] = (autos['odometer']
                     .str.lower()
                     .str.replace('km', '', regex=False)
                     .str.replace(',', '', regex=False)
                     .astype(float))
autos.rename({'odometer': 'odometer_km'}, axis=1, inplace=True)

# ### Let's explore the price and odometer_km column

# In[11]:

autos['price'].unique().shape
# there are 2357 unique values

# In[12]:

print('Max: {}\nMin: {}'.format(autos['price'].max(), autos['price'].min()))
# What car costs 99,999,999 and 0? This isn't right
autos['price'].describe()

# In[13]:

(autos['price'] > 500000).value_counts()
# 14 cars that are more than $500,000

# In[14]:

# Occurrences of each price value, in descending order of price.
autos['price'].value_counts().sort_index(ascending=False).tail(200)

# In[15]:

# Flag prices outside [q25 - 2*IQR, q75 + 2*IQR] as outliers.
price_q25, price_q75 = np.percentile(autos['price'].values, [25, 75])
price_iqr = price_q75 - price_q25
price_cutoff = price_iqr * 2
price_upper, price_lower = price_q75 + price_cutoff, price_q25 - price_cutoff
# Fixed: the original condition was garbled ("if xprice_upper"); an outlier is
# a value below the lower bound or above the upper bound.
price_outliers = [x for x in autos['price']
                  if x < price_lower or x > price_upper]
len(price_outliers)

# In[16]:

autos.info()

# In[17]:

# .copy() so the later column assignments modify an independent frame rather
# than a view of `autos` (avoids SettingWithCopyWarning).
new_autos = autos[autos['price'].between(price_lower, price_upper)].copy()
new_autos.info()
# 2644 outliers removed

# To remove the outliers, I used the interquartile range method because it
# focuses solely on the data given.
# I used a multiplier of 2 for the cutoff because I noticed that some of the
# lower values were reasonable prices and I wanted to keep the majority of
# them.

# In[18]:

price_outliers.sort()

# In[19]:

price_outliers

# ### Let's change columns to quantifiable dates
# - date_crawled, ad_created, last_seen are string values
# - registration_year, registration_month are numeric values

# In[20]:

new_autos.info()

# In[21]:

autos[['date_crawled', 'ad_created', 'last_seen']].head()

# In[22]:

# The first 10 characters of the timestamp give the date (YYYY-MM-DD).
# normalize=True reports shares instead of raw counts; dropna=False keeps the
# null values; sort_index() orders from the earliest to the latest date.
new_autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# With the date_crawled column and its distribution, we can see that most
# occurrences were between the middle of March and very early April.

# In[23]:

new_autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# Similarly to the date_crawled distribution, we can see that most of the ads
# were created between mid March and very early April. One thing to note is
# that some ads were created in 2015 while the date_crawled column did not
# have any. This may be something we would want to explore later.

# In[24]:

new_autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# The distribution of the last_seen column is as expected because you tend to
# see what is new rather than what is old in the public. So most of the
# occurrences lie within early April.

# In[25]:

new_autos['registration_year'].describe()

# Using the describe method, we can see that there are definitely some unusual
# values, such as the max being 9999 and the min being 1000. Given the column
# tells us the year the car was first registered, these values (and others)
# may be removed.
# ### Let's explore the registration_year column

# In[26]:

(new_autos[new_autos['registration_year'] < 1900])
# 6 cars that have a registration_year under 1900

# In[27]:

(new_autos['registration_year'] > 2018).value_counts()

# Looking at these specific years, I would say keeping the values between 1900
# and 2018 is the most ideal. It's common to see car listings for cars in the
# future (1 or 2 years) so I decided to keep a few rows.

# In[28]:

# .copy() keeps the filtered frame independent of its parent so the later
# in-place cleaning does not trigger SettingWithCopyWarning.
new_autos = new_autos[new_autos['registration_year'].between(1900, 2018)].copy()

# In[29]:

new_autos['registration_year'].value_counts(normalize=True).head(20)

# The distribution of the remaining values of the registration year seems to
# be slightly skewed towards the early 2000's. Then we see some 2010's and
# 1990's.

# ### Let's explore the brand column through aggregation

# In[30]:

new_autos['brand'].unique()

# In[55]:

# Explore the top 10 brands by their share of the listings. Compute the
# distribution once and reuse it (the original computed value_counts twice).
brand_shares = new_autos['brand'].value_counts(normalize=True).head(10)
selected_brands = brand_shares.index
brand_shares

# I have chosen to aggregate the top 10 brands based on their respective
# percentage in the dataset. We see mostly German car manufacturers so I think
# it would be interesting to find any insights within their listings.

# In[32]:

# Mean listing price per selected brand.
brand_dict = {}
for brand in selected_brands:
    rows = new_autos[new_autos['brand'] == brand]
    brand_dict[brand] = rows['price'].mean()
# The key parameter sorts by the value (mean price) instead of the key;
# reverse=True puts the most expensive brand first.
sorted_dict = dict(sorted(brand_dict.items(), key=lambda item: item[1],
                          reverse=True))
sorted_dict

# From the selected brands, we see that the luxury brands have the highest
# mean price, as expected. The lesser-known brands are much cheaper (to my
# knowledge). The means range from (2245, 6090).
# ### Let's link the top brands to the mileage
# (note: 10 brands are aggregated here, not 6 as the original heading said)
# - It's difficult to compare more than two aggregate series objects if we
#   want to extend to more columns
# - We can't compare more than a few rows from each series object
# - We can only sort by the index (brand name) of both series objects so we
#   can easily make visual comparisons
#
# So let's make a single dataframe with mean mileage and mean price.

# In[33]:

# A dict passed to pd.Series already supplies the index, so the explicit
# index argument the original passed was redundant.
price_ser = pd.Series(data=sorted_dict)
price_df = pd.DataFrame(price_ser, columns=['mean_price'])

# In[34]:

# Mean mileage per brand (renamed from list_top6_brand: it holds 10 brands).
top_brands = list(sorted_dict.keys())
mile_dict = {}
for brand in top_brands:
    selected = new_autos[new_autos['brand'] == brand]
    mile_dict[brand] = selected['odometer_km'].mean()
mile_dict

# In[35]:

# Wrap in a Series so the values are aligned to price_df's index by brand name
# instead of relying on dict insertion order matching the row order.
price_df['mean_mileage'] = pd.Series(mile_dict)

# In[36]:

price_df.round(2)

# With both the mean price and mean mileage, we don't see any correlation
# between the two.

# In[37]:

new_autos

# ### Now, let's clean the data to make it more presentable.
# - name: Replace underscores with spaces
# - seller: correct spellings
# - offer_type: translate
# - vehicle_type: translate
# - gearbox: correct spellings
# - fuel_type: translate
# - unrepaired_damage: translate
# - date_crawled, last_seen: change to numeric values instead of date

# In[38]:

new_autos['name'].str.replace('_', ' ', regex=False)

# In[39]:

new_autos['seller'].unique()
# privat -> private, gewerblich -> commercial

# In[40]:

# Translate by exact value, consistent with the .loc translations used for the
# other columns. This is also safe to re-run: the original substring
# replacement would turn the already-translated 'private' into 'privatee' on a
# second execution of the cell.
new_autos.loc[new_autos['seller'] == 'privat', 'seller'] = 'private'
new_autos.loc[new_autos['seller'] == 'gewerblich', 'seller'] = 'commercial'

# In[41]:

new_autos['offer_type'].unique()
# Angebot = offer, Gesuch = request

# In[42]:

new_autos.loc[new_autos['offer_type'] == 'Angebot', 'offer_type'] = 'offer'
new_autos.loc[new_autos['offer_type'] == 'Gesuch', 'offer_type'] = 'request'

# In[43]:

new_autos['vehicle_type'].unique()

# In[44]:

# kleinwagen - supermini
# kombi - NOTE(review): 'Kombi' is German for an estate/station wagon, not a
#         VW Type 2 — confirm before relying on the translated label
# cabrio - convertible
# andere - other
new_autos['vehicle_type'].value_counts(dropna=False)

# In[45]:
# Translate German vehicle_type values by exact match.
# Fixed: 'kombi' is German for an estate/station wagon; the original notebook
# labelled it 'volkswagen type 2', which is a mistranslation.
new_autos.loc[new_autos['vehicle_type'] == 'kleinwagen', 'vehicle_type'] = 'supermini'
new_autos.loc[new_autos['vehicle_type'] == 'kombi', 'vehicle_type'] = 'station wagon'
new_autos.loc[new_autos['vehicle_type'] == 'cabrio', 'vehicle_type'] = 'convertible'
new_autos.loc[new_autos['vehicle_type'] == 'andere', 'vehicle_type'] = 'other'

# In[46]:

new_autos['gearbox'].unique()

# In[47]:

new_autos.loc[new_autos['gearbox'] == 'manuell', 'gearbox'] = 'manual'
new_autos.loc[new_autos['gearbox'] == 'automatik', 'gearbox'] = 'auto'

# In[48]:

new_autos['fuel_type'].unique()

# In[49]:

# benzin = petrol
# lpg = liquefied petroleum gas (already an English acronym; kept as-is)
# cng = compressed natural gas (already an English acronym; kept as-is)
# elektro = electric
# andere = other
new_autos.loc[new_autos['fuel_type'] == 'benzin', 'fuel_type'] = 'petrol'
new_autos.loc[new_autos['fuel_type'] == 'elektro', 'fuel_type'] = 'electric'
new_autos.loc[new_autos['fuel_type'] == 'andere', 'fuel_type'] = 'other'

# In[50]:

new_autos['unrepaired_damage'].unique()
# nein = no
# ja = yes

# In[51]:

new_autos.loc[new_autos['unrepaired_damage'] == 'nein', 'unrepaired_damage'] = 'no'
new_autos.loc[new_autos['unrepaired_damage'] == 'ja', 'unrepaired_damage'] = 'yes'

# In[52]:

# Encode the crawl date as an integer YYYYMMDD: take the date part of the
# timestamp and strip the dashes. Slicing the first 10 characters is
# equivalent to (and cheaper than) splitting the whole frame on spaces with
# expand=True as the original did.
new_autos['date'] = (new_autos['date_crawled']
                     .str[:10]
                     .str.replace('-', '', regex=False)
                     .astype(int))

# In[53]:

new_autos

# # Conclusion
#
# #### Findings:
# - Top brands by listings:
#     - volkswagen (21.67%)
#     - opel (11.44%)
#     - bmw (10.43%)
#     - mercedes (8.89%)
#     - audi (7.94%)
#     - ford (7.2%)
#     - renault (5.06%)
#     - peugeot (3.07%)
#     - fiat (2.75%)
#     - seat (1.95%)
# - German car brands make up more than 50% of the listings.
# - There is no apparent correlation between price and mileage. Further
#   analysis would have to be done to determine whether or not that is true.
#
# #### Suggestions:
# - Find out the interactions with the listings (impressions, views, etc.).
# - Supply for each car model with respect to its demand.
# - Somehow simplify the name of the model.

# In[ ]: