#!/usr/bin/env python
# coding: utf-8

# # Third Project - Data cleansing with Pandas and NumPy
#
# We will be cleaning the data provided in the file **autos.csv** with the
# following description, using the Pandas and NumPy libraries in Python.
# Let's have fun.
#
# The dataset was originally scraped and uploaded to Kaggle. A few
# modifications were made to the original Kaggle upload:
#
# - 50,000 data points were sampled from the full dataset, so the code runs
#   quickly in the hosted environment.
# - The dataset was dirtied a bit to more closely resemble what you would
#   expect from a scraped dataset (the version uploaded to Kaggle was cleaned
#   to be easier to work with).
#
# ## Here is a quick brief about the data columns
#
# | Data | Description |
# |------|------|
# | dateCrawled | When this ad was first crawled. All field-values are taken from this date. |
# | name | Name of the car. |
# | seller | Whether the seller is private or a dealer. |
# | offerType | The type of listing. |
# | price | The price on the ad to sell the car. |
# | abtest | Whether the listing is included in an A/B test. |
# | vehicleType | The vehicle type. |
# | yearOfRegistration | The year in which the car was first registered. |
# | gearbox | The transmission type. |
# | powerPS | The power of the car in PS. |
# | model | The car model name. |
# | kilometer | How many kilometers the car has driven. |
# | monthOfRegistration | The month in which the car was first registered. |
# | fuelType | What type of fuel the car uses. |
# | brand | The brand of the car. |
# | notRepairedDamage | If the car has a damage which is not yet repaired. |
# | dateCreated | The date on which the eBay listing was created. |
# | nrOfPictures | The number of pictures in the ad. |
# | postalCode | The postal code for the location of the vehicle. |
# | lastSeenOnline | When the crawler saw this ad last online. |

# In[1]:

import pandas as pd

# The scraped file is not UTF-8; Windows-1252 decodes it without errors.
autos = pd.read_csv('autos.csv', encoding='Windows-1252')
# Note: DataFrame.info() prints its report and returns None, so these
# print() wrappers additionally emit "None" — kept for identical output.
print(autos.info())
print(autos.head())
# In[2]:

# Display the frame (a no-op outside a notebook cell).
autos

# As we can observe, the data is not groomed well; the column titles and the
# representation seem unprecedented. Let us see how we can cleanse the data
# and get the best out of it.
#
# The dataset contains 50,000 rows and 20 columns, most of which are strings.
# Some columns have null values, but none have more than ~20% null values.
# The column names use camelCase instead of Python's preferred snake_case,
# which means we can't just replace spaces with underscores.

# In[3]:

# Map each camelCase column to its snake_case replacement.
camel_to_snake = {
    'dateCrawled': 'date_crawled',
    'offerType': 'offer_type',
    'vehicleType': 'vehicle_type',
    'yearOfRegistration': 'registration_year',
    'gearbox': 'gear_box',
    'powerPS': 'power_ps',
    'monthOfRegistration': 'registration_month',
    'fuelType': 'fuel_type',
    'notRepairedDamage': 'unrepaired_damage',
    'dateCreated': 'ad_created',
    'nrOfPictures': 'nr_of_pictures',
    'postalCode': 'postal_code',
    'lastSeen': 'last_seen',
}
autos.rename(columns=camel_to_snake, inplace=True)

# As you can see, we renamed columns as instructed using the pandas
# **.rename** function. Let's go further. Be aware that we are not going to
# run this cell again; for safety, in the notebook this cell can be left as a
# markdown cell.
# In[4]:

autos.info()

# As we observed further, the **date_crawled** and **last_seen** columns
# should be datetime types, and the **price** / **odometer** columns are
# object (string) types which can be retyped as numbers.

# In[5]:

autos.rename(columns={'odometer': 'odometer_km'}, inplace=True)

# In[6]:

# Strip the unit suffix and thousands separators so the column can be cast.
# regex=False is passed explicitly: on pandas < 1.4 the default was
# regex=True, under which "$" below is an end-of-string anchor and strips
# nothing, making the later float cast fail on values like "$5,000".
autos['odometer_km'] = (
    autos['odometer_km']
    .str.replace('km', '', regex=False)
    .str.replace(',', '', regex=False)
)

# In[7]:

autos['odometer_km'] = autos['odometer_km'].astype(int)
autos['price'] = (
    autos['price']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

# In[8]:

autos

# In[9]:

# Quick sanity checks on the two freshly numeric columns.
print(autos['odometer_km'].unique().shape)
print(autos['price'].unique().shape)
print(autos['odometer_km'].describe())
print(autos['price'].describe())
print(autos['odometer_km'].value_counts(dropna=False))
print(autos['price'].value_counts(dropna=False))

# There are 1421 records for cars that were never driven. We should inspect
# this further. The following columns were added by the crawler that scraped
# the data, or come from the ad itself; we can further see if the data is
# relevant.
# - `date_crawled`: added by the crawler
# - `last_seen`: added by the crawler
# - `ad_created`: from the website
# - `registration_month`: from the website
# - `registration_year`: from the website
#
# **We will remove the columns date_crawled, last_seen and ad_created, as
# these dates may not provide the output we are aiming at.**
#
# **value_counts(normalize=True, dropna=False)** — `normalize` is used to
# report the percentage of each value instead of raw counts.
#
# ## Registration years far outside a plausible range will be discarded —
# mass-produced cars only came into existence in the 1900s, which makes
# records before the years that are most relevant non-reasonable.

# In[10]:

# Keep only listings first registered between 2000 and 2019 (inclusive).
# NOTE(review): the prose above says "above 2018" but the code has always
# kept 2019 as well — confirm which boundary is intended.
outside_range = ~autos['registration_year'].between(2000, 2019)
autos.drop(autos[outside_range].index, inplace=True)

# In[11]:

autos.shape

# In[12]:

autos['brand'].value_counts(normalize=True)

# As we observed, most values are listed under brands with a little less or
# more than 1% share. Discarding the others will not affect the analysis, as
# the dataset is not at stake.
# As these contribute to more than 93% of the data.
#
# We rebuild `autos` with reset indexes so that we can iterate over it
# cleanly after the row drops above.

# In[84]:

# Drop the stale index left behind by the row filtering.
autos = pd.DataFrame(autos).reset_index(drop=True)

# The 20 most common brands, by share of listings.
brands = autos['brand'].value_counts(normalize=True).head(20)

# Per-brand mean price and mean mileage, computed in one pass per brand.
mean_price = {}
mean_mileage = {}
for brand in brands.index:
    is_brand = autos['brand'] == brand
    mean_price[brand] = autos.loc[is_brand, 'price'].mean()
    mean_mileage[brand] = autos.loc[is_brand, 'odometer_km'].mean()

# Fixed: the original iterated an undefined `mean_values` dict here, which
# raised NameError; `mean_price` is the dict actually populated above.
for brand in sorted(mean_price):
    print(brand, ':', mean_price[brand])
    print(brand, ':', mean_mileage[brand], '\n')

# We can observe that the cars that own the major proportion are cheaper than
# 10,000 USD compared to the ones that are fewer in number. This indicates
# that price is a clear driver of customer selection, while the major
# proportion of the top 5 brands is dominated by brand value as well — BMW,
# Audi, Citroën and Mercedes-Benz.
#
# As a retailer I would go with cars with better brand value and moderate
# cost rather than the low-budget cars. You can observe that this data is
# from the skimmed rows.
#
# **Now let us store the values back into a DataFrame.**

# In[141]:

best_brands = pd.DataFrame.from_dict(mean_price, orient='index')
best_brands.rename(columns={0: 'mean_price'}, inplace=True)
# Assign via a Series so values align by brand label rather than relying on
# the two dicts sharing insertion order.
best_brands['mean_mileage'] = pd.Series(mean_mileage)
best_brands

# ## So, here goes our final result, where we can precisely see a
# relationship between mileage and price. The mileage of costlier cars is
# substantially high. This establishes a strong user bias towards brand
# value over any other factor.