import pandas as pd
import numpy as np
import missingno as msno
import datetime as dt
from stringcase import snakecase
#with np.printoptions(threshold=np.inf):
# create a new class which makes it possible to format text in colour or bold etc.
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
autos = pd.read_csv("autos.csv",encoding='Latin-1')
This is an analysis of used car listings on eBay Kleinanzeigen, a classifieds section of the German eBay website.
During this analysis data will be cleaned and explored.\ After cleaning and exploration the top ten car brands will be analysed in more detail (mean price and mean mileage).\ Lastly the effect of mileage on price will be investigated.
The dataset was originally scraped and uploaded to Kaggle.\ The version of the dataset that is used for this analysis is a sample of 50,000 data points that was prepared by Dataquest including simulating a less-cleaned version of the data.
autos.head(5)
autos.info()
Dataframe of 50,000 rows and 20 columns.
Column description can be seen below:
Column | Description |
---|---|
dateCrawled | When this ad was first crawled, all field-values are taken from this date |
name | Name of the car |
seller | Whether the seller is private or a dealer |
offerType | The type of listing |
price | The price on the ad to sell the car |
abtest | Whether the listing is included in an A/B test |
vehicleType | The vehicle Type |
yearOfRegistration | The year in which the car was first registered |
gearbox | The transmission type |
powerPS | The power of the car in PS |
model | The car model name |
odometer | How many kilometers the car has driven |
monthOfRegistration | The month in which the car was first registered |
fuelType | What type of fuel the car uses |
brand | The brand of the car |
notRepairedDamage | If the car has a damage which is not yet repaired |
dateCreated | The date on which the eBay listing was created |
nrOfPictures | The number of pictures in the ad |
postalCode | The postal code for the location of the vehicle |
lastSeenOnline | When the crawler saw this ad last online |
The following columns seem to contain numeric data but are currently stored as object data:
There are five columns that have null values:
None of these columns have more than 20% null values.
msno.bar(autos)
The column names use camelcase instead of Python's preferred snakecase.\ In below chunk of code column names will be converted from camelcase to snakecase.\ Furthermore, some of the column names will be renamed based on the data dictionary to be more descriptive.
#turning camelcase columns into snakecase columns
snakecase_col = []
for col in autos.columns:
col = snakecase(col)
snakecase_col.append(col)
autos.columns = snakecase_col
#renaming some columns by using a for loop with old and new values
rename_columns = [
['year_of_registration','registration_year'],
['month_of_registration','registration_month'],
['not_repaired_damage','unrepaired_damage'],
['date_created','ad_created'],
['power_p_s','power_PS']
]
for old_col, new_col in rename_columns:
autos.rename({old_col: new_col},axis=1,inplace=True)
autos.describe(include='all')
What can be seen is that columns "seller" and "offer_type" consist of almost all the same values.\ Therefore, these columns are deemed unuseful for further analysis and will be dropped from the dataframe.
#remove columns that provide unuseful data
autos.drop(['seller','offer_type'],axis=1,inplace=True)
As mentioned before, there are two columns which contain numeric data that need to be reformatted.\ This will be done below. Afterwards they will be investigated for the presence of outliers.
#price
autos['price'] = (autos['price']
.str.replace("$","")
.str.replace(",","")
.astype(float)
#.apply('{:,}'.format) # I thought to use this to format a 1000 separator,
# but this changes the float into an object at the same time.
# Someone knows how to prevent this?
)
#odometer
autos['odometer'] = (autos['odometer'].
str.replace("km","").
str.replace(",","").
astype(int)
)
autos.rename({'odometer': 'odometer_km'},axis=1,inplace=True)
The columns 'price' and 'odometer_km' have now succesfully been transformed to integer values.\ They will now be further analysed in order to check for outliers that might need to be removed.
def explore_series(dataframe, column, value_counts=False): #define a function to give detailed info about a column
# length of .unique()
template = (color.BOLD + "Number of unique values in column "
+ color.RED + column + color.END
+ color.BOLD + " is "
+ color.RED + "{:,} " + color.END)
print(template.format(len(dataframe[column].unique())))
print('\n')
# .describe()
print(color.BOLD + "Descriptive statistics of column "
+ color.RED + column + color.END)
print(dataframe[column].describe())
print('\n')
# .value_counts (put into a dataframe to calculate cumulative percentages as well)
if value_counts:
print(color.BOLD + "Unique values with their respective counts:" + color.END)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
df = dataframe[column].value_counts().sort_index(ascending=True).to_frame()
df.columns = ['count'] #rename column to 'count'
df.index.name = column #rename index to column thats being explored
df['cum_sum'] = df['count'].cumsum() #calculating cumulative sum
df['cum_perc'] = round(100*df.cum_sum/df['count'].sum(),2) #calculating cumulative perc.
print(df)
explore_series(autos,"price", value_counts=True)
A mean of nearly ten thousand euro's for used cars of all types seems realistic.\ The mean is only a fraction of the standard deviation (which is 481,000 euro's).\ This suggests that there are outliers in the dataset.
The lowest price is zero, which has 1421 occurences in the dataset.\ Price represents the starting value of an action. Since it is only the starting price it does not mean that a car will be sold for that price.\ Therefore anything below 500 euro's (which seems like a realistic lower bound for used car prices) is not removed from the dataset.
There are also outliers at the top. When investigating the unique values with their respective counts the starting price increases slowly until 350,000.\ Onwards the price increases rapidly. Occurences with a starting price higher than 350,000 will be investigated further.
price_above_350000 = autos[autos['price']>350000]
template = (color.BOLD + "There are "
+ color.RED + "{}" + color.END
+ color.BOLD + " instances with a starting price of 350,000 or higher" + color.END)
print(template.format(len(price_above_350000)))
price_above_350000
There are 14 occurences with a price higher than 350,000.\ Since most of these cars are normal family cars (with brands like Ford, BMW, Fiat, Volkswagen) these prices are unrealistic.\ The 14 instances will be removed from the dataset.
autos = autos[autos['price']<=350000]
explore_series(autos,'price')
After removal there are 49,986 instances left in the dataset.\ Removing the outliers at the top has had its impact on the mean starting price.\ This has decreased from 9,840 to 5,721.\ The standard deviation has decreased drastically as it is now less than 9,000.
The "odometer_km" will be explored next.
explore_series(autos, 'odometer_km', value_counts=True)
There are only 13 different values of mileages in the dataset. This suggests that people have to use a preset of values when creating an auction on eBay.
The lowest value is 5,000 and the highest is 150,000 which are both plausible values for mileage on a used car.\ Therefore there will be no correction on outliers for this column.
The mean is more than 120,000 km which suggests that most cars sold through eBay are likely to be a couple years old.\ 65% of the dataset has a mileage of 150,000 (or more).
This concludes the exploration of the columns "price" and "odometer_km".
There are five columns containing date information.
Column | Description |
---|---|
date_crawled | When this ad was first crawled, all field-values are taken from this date |
registration_year | The year in which the car was first registered |
registration_month | The month in which the car was first registered |
ad_created | The date on which the eBay listing was created |
last_seen | When the crawler saw this ad last online |
In order to explore the date columns more easily date_crawled, ad_created and last_seen have to be reformatted into a date type.\ Once this has been done all columns will be explored shortly.
#creating a function to reformat the columns with string format into date format.
def formatDate(dataset,column):
newformat = []
for element in dataset[column]:
element = dt.datetime.strptime(element, "%Y-%m-%d %H:%M:%S")
element = element.date()
newformat.append(element)
dataset[column] = newformat
formatDate(autos,'date_crawled')
formatDate(autos,'ad_created')
formatDate(autos,'last_seen')
explore_series(autos,'date_crawled', value_counts=True)
Looking at the data above it seems like the period over which the data has been crawled covers roughly one month (March-April 2016).\ The distribution is more or less uniform.
explore_series(autos, 'ad_created', value_counts=True)
The dates ads were created range from June 2015 until April of 2016.\ The majority (+- 97%) of ads in the dataset were created after the date on which data was crawled for the first time.\ This make sense as most auctions are only 'live' for a short period of time.
explore_series(autos, 'last_seen', value_counts=True)
The last three days contain a disproportionate amount of 'last seen' values.\ Given that these are 6-10x the values from the previous days, it's unlikely that there was a massive spike in sales.\ It's more likely that these values are to do with the crawling period ending and don't indicate car sales.
explore_series(autos, 'registration_year', value_counts=True)
Both the minimum as maximum value of registration year seems strange.
The lowest registration year is 1,000 which must be incorrect as cars only started appearing in the late 1800's.\ Due to this all occurences with a registration year before 1885 (first patented practical automobile) will be removed.
All registration years after 2016 must be incorrect as ads were created in 2015 & 2016.\ These will be removed from the dataset as well.
autos = autos[autos['registration_year'].between(1885,2016)]
explore_series(autos, 'registration_year')
There is still a large variety in registration years of the cars.\ The mean of 2002 with a small standard deviation indicate that most cars are approximately between 7 and 21 years old.
This concludes the exploration of date columns.
When working with data on cars, it's natural to explore variations across different car brands.\ Aggregation can be used to understand the brand column.
explore_series(autos, 'brand', value_counts=True)
There is a large variety of car brands in the dataset and data is not uniformly distributed.\ For further analysis the dataset will be limited to the top 10 brands in terms of number of ads.
top_10_brands = (autos['brand']
.value_counts(normalize=True,dropna=False)
.head(10)
.index)
autos = autos[autos.brand.isin(top_10_brands)]
explore_series(autos, 'brand', value_counts=True)
There are still more then 38,000 entries left after limiting the dataset to the top 10 brands.\ All brands left are European, except for Ford. Out of those 10, 5 are German which can be explained by the dataset being from the German section of eBay.
Now that the dataset contains only the top 10 brands it might be interesting to check whether there are differences in mean price between those brands.
mean_price_per_brand = {}
for brand in top_10_brands:
mean_price = autos.loc[autos['brand'] == brand,'price'].mean()
mean_price_per_brand[brand] = int(round(mean_price,0))
mean_price_per_brand = pd.Series(mean_price_per_brand, name='mean_price')
mean_price_per_brand.sort_values(ascending=False)
Brands can be divided into three price categories:
Normally a car will become less valuable when the mileage of the car becomes higher.\ It is interesting to see whether this principle can be found in the dataset.\ For that the mean mileage per brand needs to be calculated.
mean_mileage_per_brand = {}
for brand in top_10_brands:
mean_mileage = autos.loc[autos['brand'] == brand,'odometer_km'].mean()
mean_mileage_per_brand[brand] = int(round(mean_mileage,0))
mean_mileage_per_brand = pd.Series(mean_mileage_per_brand,name='mean_mileage')
mean_mileage_per_brand
price_mileage_per_brand = pd.concat([mean_price_per_brand,mean_mileage_per_brand],axis=1)
price_mileage_per_brand
Above both mean price and mean mileage can be seen. It is however not possible to conclude whether higher mileage is affecting the price.\ This is due to the fact that within a brand there are a lot of other variables affecting price (such as car type, engine type, registration year etc).\ In order to confirm whether mileage affects the price a slice of the dataset is necessary where all those variables are kept the same as much as possible.
unique_brands = autos['brand'].unique()
brand_model = {}
brand_model_count = {}
for ub in unique_brands:
temp = autos.loc[autos['brand']==ub,'model'].value_counts().index[0]
temp2 = autos.loc[autos['brand']==ub,'model'].value_counts()[0]
brand_model[ub] = temp
brand_model_count[ub] = temp2
brand_model_series = pd.Series(brand_model,name = 'model')
brand_model_count_series = pd.Series(brand_model_count,name = 'count')
df = pd.concat([brand_model_series, brand_model_count_series], axis=1)
df.sort_values(by='count', ascending=False)
Volkswagen Golf is the most occuring combination of brand and model.
#creating new dataset with only brand = volkswagen & model = golf
vw_golf = (autos.loc[(
autos['brand']=='volkswagen')&
(autos['model']=='golf')]
)
vw_golf['registration_year'].value_counts().head(5)
The top three registration years for Volkswagen Golf's in the dataset are 1998, 1999 and 2000.
#creating new data set only containing registration year 1998 - 2000
vw_golf_ry = (vw_golf[
vw_golf['registration_year'].isin([1998,1999,2000])])
vw_golf_ry['fuel_type'].value_counts().head(5)
Benzine is by far the most occuring fuel type for Volkswagen Golf's with a registration year between 1998 and 2000.
#creating new data set only containing fuel type benzine
vw_golf_ry_benzine = (
vw_golf_ry[
vw_golf_ry['fuel_type']=='benzin'])
vw_golf_ry_benzine['gearbox'].value_counts().head(5)
Almost all cars left in the dataset have a manual transmission
#creating new data set only containing manual gearboxes
vw_golf_ry_benzine_manual = (
vw_golf_ry_benzine[
vw_golf_ry_benzine['gearbox']=='manuell'])
vw_golf_ry_benzine_manual['unrepaired_damage'].value_counts()
There are almost no cars with damage in the remaining dataset
#creating new data set only containing cars with no unrepaired damage
vw_golf_ry_benzine_manual_nd = (
vw_golf_ry_benzine_manual[
vw_golf_ry_benzine_manual['unrepaired_damage']=='nein'])
vw_golf_ry_benzine_manual_nd['power_PS'].value_counts().head(5)
The amount of horsepowers for cars in the remainig dataset is more granular.\ However, cars with 75 and 101 horsepower still cover almost 75% of the dataset.\ Since the amount of horsepower is not so different between 75 and 101 both will be kept for further analysis.
#creating new data set only containing cars with 75 or 101 horsepower
vw_golf_ry_benzine_manual_nd_ps = (
vw_golf_ry_benzine_manual_nd[
vw_golf_ry_benzine_manual_nd['power_PS'].isin([75,101])])
#checking how distribution of odometer_km is within the remaining dataset
vw_golf_ry_benzine_manual_nd_ps['odometer_km'].value_counts()
After filtering the dataset on several variables the distribution of odometer_km is skewered heavily towards 150,000.\ There are so few occurences left for other values of odometer_km that it is not possible anymore to make conclusions about the effect of mileage on price.
Furthermore, price in this dataset is the starting price of an auction and not the actual selling price.\ Some people might create an ad with an unrealistic starting price assuming that this will attract people and drive up the selling price.\ It would therefore be better to investigate this effect further with a dataset that contains the selling price instead of the starting price of an auction.
#just because I went through all the trouble already I will calculate anyway. xD
df = vw_golf_ry_benzine_manual_nd_ps
odometer_km_unique = vw_golf_ry_benzine_manual_nd_ps['odometer_km'].unique()
price_odo = {}
for odo in odometer_km_unique:
temp = df.loc[df['odometer_km']==odo,'price'].mean()
price_odo[odo] = round(temp,0)
price_odo = pd.Series(price_odo, name='price')
price_odo.sort_index()