#import the pandas and numpy packages
import pandas as pd
import numpy as np
#read the csv file; call it autos
autos = pd.read_csv("autos.csv", encoding = "Latin-1")
#first review of the data: show the variable types and the size of the dataset
autos.info()
autos.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   dateCrawled          50000 non-null  object
 1   name                 50000 non-null  object
 2   seller               50000 non-null  object
 3   offerType            50000 non-null  object
 4   price                50000 non-null  object
 5   abtest               50000 non-null  object
 6   vehicleType          44905 non-null  object
 7   yearOfRegistration   50000 non-null  int64
 8   gearbox              47320 non-null  object
 9   powerPS              50000 non-null  int64
 10  model                47242 non-null  object
 11  odometer             50000 non-null  object
 12  monthOfRegistration  50000 non-null  int64
 13  fuelType             45518 non-null  object
 14  brand                50000 non-null  object
 15  notRepairedDamage    40171 non-null  object
 16  dateCreated          50000 non-null  object
 17  nrOfPictures         50000 non-null  int64
 18  postalCode           50000 non-null  int64
 19  lastSeen             50000 non-null  object
dtypes: int64(5), object(15)
memory usage: 7.6+ MB
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
#show the current column names
autos.columns
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'odometer', 'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen'], dtype='object')
Above, we can see that there are 20 columns in this dataset, most of which are stored as strings. The column names are written in camelCase rather than Python's preferred snake_case, so there are no spaces or underscores to work with. The next step is to convert the column names from camelCase to snake_case and to rename some of them, based on the data dictionary, so they are more descriptive.
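As an aside, the mechanical camelCase-to-snake_case part of this conversion could be done with a regular expression; a minimal sketch (it only changes the casing style, so columns like yearOfRegistration would still need the manual, data-dictionary-based renames below):
import re
def camel_to_snake(name):
    #insert an underscore before each uppercase letter, then lowercase the result
    return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
camel_to_snake('dateCrawled') #returns 'date_crawled'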
#rename the columns
autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest', 'vehicle_type',
'registration_year', 'gearbox', 'power_ps', 'model',
'odometer', 'registration_month', 'fuel_type', 'brand', 'unrepaired_damage',
'ad_created', 'nr_of_pictures', 'postal_code', 'last_seen']
autos.columns
#preview the renamed columns to confirm they are in snake_case and look right
autos.head()
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_damage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
There. Now that the column names are cleaned up, we can move forward.
Looking at the autos dataframe, we can figure out which other data cleaning tasks need to be done.
I should be on the lookout for text columns where all or almost all of the values are the same. Such a column should be dropped, since it carries no useful information for analysis.
I should also look for numeric data stored as text, which can be cleaned and converted.
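A quick way to shortlist candidates, as a sketch: print the first value of every string (object) column and eyeball which ones look numeric.
#peek at the first value of each object (string) column to spot
#numeric data stored as text (e.g. price, odometer)
for col in autos.select_dtypes(include='object').columns:
    print(col, ':', autos[col].iloc[0])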
autos.describe(include='all') #keep exploring the dataframe: summary statistics for every column
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_damage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000 | 50000 | 50000 | 50000 | 50000 | 50000 | 44905 | 50000.000000 | 47320 | 50000.000000 | 47242 | 50000 | 50000.000000 | 45518 | 50000 | 40171 | 50000 | 50000.0 | 50000.000000 | 50000 |
unique | 48213 | 38754 | 2 | 2 | 2357 | 2 | 8 | NaN | 2 | NaN | 245 | 13 | NaN | 7 | 40 | 2 | 76 | NaN | NaN | 39481 |
top | 2016-03-19 17:36:18 | Ford_Fiesta | privat | Angebot | $0 | test | limousine | NaN | manuell | NaN | golf | 150,000km | NaN | benzin | volkswagen | nein | 2016-04-03 00:00:00 | NaN | NaN | 2016-04-07 06:17:27 |
freq | 3 | 78 | 49999 | 49999 | 1421 | 25756 | 12859 | NaN | 36993 | NaN | 4024 | 32424 | NaN | 30107 | 10687 | 35232 | 1946 | NaN | NaN | 8 |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2005.073280 | NaN | 116.355920 | NaN | NaN | 5.723360 | NaN | NaN | NaN | NaN | 0.0 | 50813.627300 | NaN |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 105.712813 | NaN | 209.216627 | NaN | NaN | 3.711984 | NaN | NaN | NaN | NaN | 0.0 | 25779.747957 | NaN |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | 1067.000000 | NaN |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1999.000000 | NaN | 70.000000 | NaN | NaN | 3.000000 | NaN | NaN | NaN | NaN | 0.0 | 30451.000000 | NaN |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2003.000000 | NaN | 105.000000 | NaN | NaN | 6.000000 | NaN | NaN | NaN | NaN | 0.0 | 49577.000000 | NaN |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2008.000000 | NaN | 150.000000 | NaN | NaN | 9.000000 | NaN | NaN | NaN | NaN | 0.0 | 71540.000000 | NaN |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9999.000000 | NaN | 17700.000000 | NaN | NaN | 12.000000 | NaN | NaN | NaN | NaN | 0.0 | 99998.000000 | NaN |
From the summary above, seller and offer_type each have only 2 unique values, and in both cases a single value accounts for 49,999 of the 50,000 rows, so these columns are likely unusable.
The nr_of_pictures column is numeric, but its mean and standard deviation are both 0, so it deserves a closer look.
The price and odometer columns contain text values, so I will convert them to numeric.
#review the nr_of_pictures column
autos['nr_of_pictures'].value_counts()
0    50000
Name: nr_of_pictures, dtype: int64
The nr_of_pictures column contains only the value 0, so it makes sense to drop it along with the seller and offer_type columns.
#delete these columns
autos.drop(columns = ["nr_of_pictures", "seller", "offer_type"], inplace = True)
autos.head()
date_crawled | name | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_damage | ad_created | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 39218 | 2016-04-01 14:38:50 |
Next is cleaning the price column: we have to remove the ',' and the '$' before converting it to a numeric type.
#convert the values in the price column from text to numeric
autos['price'] = (autos['price'].str.replace('$', '', regex=False)
                                .str.replace(',', '', regex=False)
                                .str.replace(' ', '', regex=False)
                                .astype(int)
                 )
#review the data
autos['price'].head()
autos['price'].describe()
count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64
#rename the column so the units (dollars) are explicit
autos.rename({'price' : 'price_dollar'}, axis=1, inplace=True)
autos.head()
date_crawled | name | price_dollar | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_damage | ad_created | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | 5000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | 8500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | 8990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | 4350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | 1350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 39218 | 2016-04-01 14:38:50 |
#check the values in the price_dollar column
autos['price_dollar'].unique().shape
(2357,)
autos['price_dollar'].describe()
count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price_dollar, dtype: float64
The non-numeric characters have been removed, so the 'price_dollar' column looks set.
Next up is the 'odometer' column: convert it to numeric and rename it. The first step is to figure out which non-numeric characters it currently contains.
#figure out which non-numeric characters are in the column
autos['odometer'].head()
0    150,000km
1    150,000km
2     70,000km
3     70,000km
4    150,000km
Name: odometer, dtype: object
#Remove the non-numeric values (',' and the km) from the column and convert it to a numeric dtype.
autos['odometer'] = autos['odometer'].str.replace(',', '', regex=False).str.replace('km', '', regex=False).astype('int64')
autos['odometer'].head()
0    150000
1    150000
2     70000
3     70000
4    150000
Name: odometer, dtype: int64
That looks better: simple, clean values for the odometer column. It makes sense to specify the units, so I will rename the column to odometer_km.
#Change the column name to odometer_km
autos.rename({'odometer':'odometer_km'}, axis = 1, inplace = True)
#double check to make sure that the column was successfully renamed
autos.columns
Index(['date_crawled', 'name', 'price_dollar', 'abtest', 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model', 'odometer_km', 'registration_month', 'fuel_type', 'brand', 'unrepaired_damage', 'ad_created', 'postal_code', 'last_seen'], dtype='object')
With the price and odometer columns cleaned up, I want to continue learning about the other numeric columns. It is also important to check that each column has more than one unique value; a column holding a single constant value carries no information and should be dropped, as nr_of_pictures was.
#Calculate the unique values for the other columns
print('Unique Values for this dataset:')
print('registration_year:', *autos['registration_year'].unique().shape)
print('registration_month:', *autos['registration_month'].unique().shape)
print('postal_code:', *autos['postal_code'].unique().shape)
print('power_ps:', *autos['power_ps'].unique().shape)
Unique Values for this dataset:
registration_year: 97
registration_month: 13
postal_code: 7014
power_ps: 448
From the output above, all four columns have plenty of unique values, so they carry real information and should stay in the dataset.
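For reference, a sketch of the same check in a single call using .nunique():
#equivalent one-liner: number of unique values per column
autos[['registration_year', 'registration_month', 'postal_code', 'power_ps']].nunique()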
outliers_table = autos[['odometer_km', 'price_dollar']].describe()
outliers_table
odometer_km | price_dollar | |
---|---|---|
count | 50000.000000 | 5.000000e+04 |
mean | 125732.700000 | 9.840044e+03 |
std | 40042.211706 | 4.811044e+05 |
min | 5000.000000 | 0.000000e+00 |
25% | 125000.000000 | 1.100000e+03 |
50% | 150000.000000 | 2.950000e+03 |
75% | 150000.000000 | 7.200000e+03 |
max | 150000.000000 | 1.000000e+08 |
To remove the outliers, I first need to figure out whether there are any. I can use the '1.5xIQR rule': any value more than 1.5 times the interquartile range below the first quartile, or above the third quartile, is flagged as a potential outlier.
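Since the rule will be applied to two columns, it could be wrapped in a small helper; a minimal sketch of the same logic used inline below:
#keep only rows whose value in `column` lies within
#[Q1 - 1.5*IQR, Q3 + 1.5*IQR], i.e. the 1.5xIQR rule
def iqr_filter(df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    return df[df[column].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
#e.g. odometer_no_outliers = iqr_filter(autos, 'odometer_km')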
#remove the outliers from the odometer_km column using the 1.5xIQR rule
q1 = autos['odometer_km'].quantile(0.25)
q3 = autos['odometer_km'].quantile(0.75)
iqr = q3 - q1
odometer_no_outliers = autos[autos['odometer_km'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
#compare the row counts with and without the potential outliers
print('odometer_km with potential outliers:', *autos['odometer_km'].shape)
print('odometer_km without potential outliers:', *odometer_no_outliers['odometer_km'].shape)
odometer_km with potential outliers: 50000
odometer_km without potential outliers: 41520
So there were outliers in the 'odometer_km' column, because the filtered count of 41,520 is lower than the original 50,000. It makes sense to replace the 'autos' dataframe with 'odometer_no_outliers', which is the same dataframe minus the rows whose 'odometer_km' values were outliers.
#Remove the rows with the outliers in the 'odometer_km' column.
autos = odometer_no_outliers
#double-check that the row count of the new autos dataframe reflects the removed outliers
autos.shape[0]
41520
Now let's run the same code for the 'price_dollar' column and see what we can remove. Again, I will use the 1.5xIQR rule.
#remove the outliers from the price column using the same rule
q1 = autos['price_dollar'].quantile(0.25)
q3 = autos['price_dollar'].quantile(0.75)
iqr = q3 - q1
price_no_outliers = autos[autos['price_dollar'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
#compare price_no_outliers to the original autos dataframe
print('autos with potential price outliers:', *autos['price_dollar'].shape)
print('autos without potential price outliers:', *price_no_outliers['price_dollar'].shape)
autos with potential price outliers: 41520
autos without potential price outliers: 38386
So there were price outliers in the autos dataframe that the 1.5xIQR rule removed. You can tell because the row count without the price outliers is 38,386, lower than the 41,520 rows we had before. As earlier, I can replace the 'autos' dataframe with 'price_no_outliers' without harming the rest of the data.
#update the 'autos' dataframe so that the likely price outliers have been removed.
autos = price_no_outliers
#double check that the number of rows in the new autos version looks better
autos.shape[0]
38386
Let's compare the summary statistics with the outliers included against the statistics without them, and see what changed.
print('Table with Outliers')
outliers_table
Table with Outliers
odometer_km | price_dollar | |
---|---|---|
count | 50000.000000 | 5.000000e+04 |
mean | 125732.700000 | 9.840044e+03 |
std | 40042.211706 | 4.811044e+05 |
min | 5000.000000 | 0.000000e+00 |
25% | 125000.000000 | 1.100000e+03 |
50% | 150000.000000 | 2.950000e+03 |
75% | 150000.000000 | 7.200000e+03 |
max | 150000.000000 | 1.000000e+08 |
print('Table w/o Outliers')
autos[['odometer_km','price_dollar']].describe()
Table w/o Outliers
odometer_km | price_dollar | |
---|---|---|
count | 38386.000000 | 38386.000000 |
mean | 142674.412546 | 3127.445136 |
std | 16147.477812 | 2872.206674 |
min | 90000.000000 | 0.000000 |
25% | 150000.000000 | 950.000000 |
50% | 150000.000000 | 2131.000000 |
75% | 150000.000000 | 4500.000000 |
max | 150000.000000 | 12250.000000 |
From looking at the two columns, 'odometer_km' went through a couple of changes: the standard deviation decreased (from 40,042 to 16,147) while the minimum value increased considerably (from 5,000 to 90,000).
The 'price_dollar' column changed significantly as well: its standard deviation dropped (from 481,104 to 2,872), and its maximum is now 12,250, which is far more realistic than the previous maximum of 100,000,000 (1e8).
To start looking carefully at the date-related columns, it will help if I first review how they are formatted. I specifically want to start with the 'date_crawled', 'ad_created', and 'last_seen' columns.
#review the formatting of the date-related columns
autos[['date_crawled', 'ad_created', 'last_seen']].head()
date_crawled | ad_created | last_seen | |
---|---|---|---|
0 | 2016-03-26 17:47:46 | 2016-03-26 00:00:00 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | 2016-04-04 00:00:00 | 2016-04-06 14:45:08 |
4 | 2016-04-01 14:38:50 | 2016-04-01 00:00:00 | 2016-04-01 14:38:50 |
5 | 2016-03-21 13:47:45 | 2016-03-21 00:00:00 | 2016-04-06 09:45:21 |
6 | 2016-03-20 17:55:21 | 2016-03-20 00:00:00 | 2016-03-23 02:48:59 |
The dates are formatted year-month-day, with times in 24-hour format. Next, I want to review the range of the 'date_crawled' column and see whether there are any big outliers that might impact the conclusions.
#review the date range in the 'date_crawled' column
autos['date_crawled'].str[:10].value_counts(normalize = True, dropna = False).sort_index()
2016-03-05    0.024697
2016-03-06    0.013911
2016-03-07    0.036628
2016-03-08    0.034414
2016-03-09    0.033736
2016-03-10    0.032642
2016-03-11    0.032017
2016-03-12    0.037540
2016-03-13    0.015058
2016-03-14    0.037383
2016-03-15    0.034127
2016-03-16    0.030037
2016-03-17    0.032121
2016-03-18    0.013234
2016-03-19    0.033398
2016-03-20    0.037514
2016-03-21    0.037488
2016-03-22    0.032564
2016-03-23    0.032538
2016-03-24    0.028630
2016-03-25    0.032199
2016-03-26    0.032069
2016-03-27    0.030819
2016-03-28    0.034674
2016-03-29    0.033867
2016-03-30    0.034127
2016-03-31    0.031991
2016-04-01    0.033267
2016-04-02    0.034492
2016-04-03    0.037800
2016-04-04    0.036862
2016-04-05    0.013442
2016-04-06    0.003569
2016-04-07    0.001146
Name: date_crawled, dtype: float64
The data looks fine: the crawl ran daily from March 5 to April 7, 2016, with a roughly even share of listings on each day, mostly in March with the first week of April.
Next, let's review the date range for 'ad_created'.
#review date range for 'ad_created'
autos['ad_created'].str[:10].value_counts(normalize = True, dropna = False).sort_index()
So the ads were created between March and April 2016, with a fairly even daily distribution that closely tracks the crawl dates. Good to know.
Next, let's review the distribution of the 'last_seen' column and see what that looks like.
#review the date range for 'last_seen' column
autos['last_seen'].str[:10].value_counts(normalize = True, dropna = False).sort_index()
2016-03-05    0.001094
2016-03-06    0.005002
2016-03-07    0.006174
2016-03-08    0.008701
2016-03-09    0.010707
2016-03-10    0.011384
2016-03-11    0.014120
2016-03-12    0.026051
2016-03-13    0.009821
2016-03-14    0.013755
2016-03-15    0.016985
2016-03-16    0.017871
2016-03-17    0.030141
2016-03-18    0.007685
2016-03-19    0.017090
2016-03-20    0.022300
2016-03-21    0.022091
2016-03-22    0.022638
2016-03-23    0.019434
2016-03-24    0.021049
2016-03-25    0.020320
2016-03-26    0.017845
2016-03-27    0.017064
2016-03-28    0.022326
2016-03-29    0.023368
2016-03-30    0.025712
2016-03-31    0.024879
2016-04-01    0.024488
2016-04-02    0.025817
2016-04-03    0.025634
2016-04-04    0.025426
2016-04-05    0.116944
2016-04-06    0.205309
2016-04-07    0.120773
Name: last_seen, dtype: float64
The date range matches 'date_crawled' (March 5 to April 7, 2016), but this distribution is not even: the last three days (April 5-7) account for roughly 45% of the 'last_seen' values. That is almost certainly an artifact of the crawl ending on those days rather than a genuine spike in sales.
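Rather than slicing date strings, these columns could also be parsed into real datetimes, which makes range checks one-liners; a sketch (not applied to the dataframe above):
#parse the string timestamps into datetime64 values and get the range directly
last_seen_dates = pd.to_datetime(autos['last_seen'])
print(last_seen_dates.min(), last_seen_dates.max())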
Next, I want to review the 'registration_year' column; there may be some outliers there, so let me check.
#review the 'registration_year' column to see what the distribution looks like
print(autos['registration_year'].describe())
print(autos['registration_year'].value_counts().sort_index().head())
count    38386.000000
mean      2002.499557
std         36.331672
min       1910.000000
25%       1999.000000
50%       2002.000000
75%       2006.000000
max       9000.000000
Name: registration_year, dtype: float64
1910    2
1934    1
1937    1
1950    1
1953    1
Name: registration_year, dtype: int64
This column needs some cleaning. A registration year of 9000 is obviously unrealistic: the data was scraped in 2016, so no registration year should be later than 2016. At the other end, cars only started appearing in society around the early 1900s, so 1900 is a reasonable lower bound. Let's remove the rows outside that range.
#remove the rows with unrealistic 'registration_year' values
autos = autos[autos['registration_year'].between(1900, 2016)]
#double check that the values that remain are realistic
autos["registration_year"].value_counts(normalize = True).sort_index()
1910    0.000054
1934    0.000027
1937    0.000027
1950    0.000027
1953    0.000027
          ...
2012    0.004194
2013    0.000654
2014    0.000218
2015    0.000136
2016    0.031071
Name: registration_year, Length: 66, dtype: float64
print(autos['registration_year'].describe())
count    36722.000000
mean      2001.640924
std          5.980791
min       1910.000000
25%       1998.000000
50%       2002.000000
75%       2005.000000
max       2016.000000
Name: registration_year, dtype: float64
That looks better. The middle half of the cars were registered between 1998 and 2005, and nothing exceeds our assigned maximum year of 2016.
#review the top 10 brands in the 'brand' column
autos['brand'].value_counts().head(10)
volkswagen       8161
opel             4469
bmw              3933
mercedes_benz    3307
audi             2928
ford             2684
renault          1958
peugeot          1195
fiat              948
seat              684
Name: brand, dtype: int64
In this dataset, the most listed brand was 'volkswagen', and among the top 10, 'seat' was the least listed. Next, I want to get a sense of the mean price of these cars, and figure out whether there is any strong relationship between a car's brand and its listed price.
#creating a dictionary for mean price
aggregate_price = {} #empty dictionary called aggregate_price
top_brands = autos['brand'].value_counts().head(10).index
for brand in top_brands:
sel_row = autos[autos['brand'] == brand]
mean_row = sel_row['price_dollar'].mean()
aggregate_price[brand] = mean_row
#show the aggregate (mean) price for each brand of car
aggregate_price
{'volkswagen': 3205.765960053915, 'opel': 2115.4443947191767, 'bmw': 4501.369692346809, 'mercedes_benz': 4178.984275778651, 'audi': 4362.219945355191, 'ford': 2242.295827123696, 'renault': 1728.5985699693565, 'peugeot': 2328.5171548117155, 'fiat': 1831.6582278481012, 'seat': 2713.6988304093566}
#sort the aggregate (mean) price from high to low
sorted(aggregate_price.items(), key = lambda x: x[1], reverse = True)
[('bmw', 4501.369692346809), ('audi', 4362.219945355191), ('mercedes_benz', 4178.984275778651), ('volkswagen', 3205.765960053915), ('seat', 2713.6988304093566), ('peugeot', 2328.5171548117155), ('ford', 2242.295827123696), ('opel', 2115.4443947191767), ('fiat', 1831.6582278481012), ('renault', 1728.5985699693565)]
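For reference, the same aggregation can be written with groupby; a sketch that should reproduce the loop's results for the top-10 brands:
#groupby equivalent of the loop above: mean price per brand for the
#10 most common brands, sorted from highest to lowest
(autos[autos['brand'].isin(top_brands)]
     .groupby('brand')['price_dollar']
     .mean()
     .sort_values(ascending=False))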
From this data, 'bmw' has the highest mean price among the top 10 brands and 'renault' the lowest. Now that we know this, let's analyze the mileage and see what kind of relationship exists between mileage, price, and brand.
#create a dictionary for mean mileage called 'aggregate_mileage'
aggregate_mileage = {}
for brand in autos['brand'].value_counts().head(10).index:
sel_row = autos[autos['brand'] == brand]
mean_mileage = sel_row['odometer_km'].mean()
aggregate_mileage[brand] = mean_mileage
aggregate_mileage
{'volkswagen': 144072.41759588287, 'opel': 142254.41933318417, 'bmw': 145766.59038901603, 'mercedes_benz': 145777.13940127005, 'audi': 146784.49453551913, 'ford': 141067.43666169894, 'renault': 141036.77221654748, 'peugeot': 140380.75313807532, 'fiat': 137420.88607594935, 'seat': 140445.90643274854}
From this data, 'audi' had the highest average mileage (146,784 km) and 'fiat' the lowest (137,421 km), so the differences between brands are small. It would be useful to put this data into a pandas Series and compare it with a Series of the aggregate (mean) prices.
#create a series for 'mean_mileage'
carsmm_series = pd.Series(aggregate_mileage)
carsmm_series
volkswagen       144072.417596
opel             142254.419333
bmw              145766.590389
mercedes_benz    145777.139401
audi             146784.494536
ford             141067.436662
renault          141036.772217
peugeot          140380.753138
fiat             137420.886076
seat             140445.906433
dtype: float64
#create a series for 'mean_price'
carsmp_series = pd.Series(aggregate_price)
carsmp_series
volkswagen       3205.765960
opel             2115.444395
bmw              4501.369692
mercedes_benz    4178.984276
audi             4362.219945
ford             2242.295827
renault          1728.598570
peugeot          2328.517155
fiat             1831.658228
seat             2713.698830
dtype: float64
#build a dataframe from the mean price series using the DataFrame constructor
df = pd.DataFrame(carsmp_series, columns=['mean_price'])
df
mean_price | |
---|---|
volkswagen | 3205.765960 |
opel | 2115.444395 |
bmw | 4501.369692 |
mercedes_benz | 4178.984276 |
audi | 4362.219945 |
ford | 2242.295827 |
renault | 1728.598570 |
peugeot | 2328.517155 |
fiat | 1831.658228 |
seat | 2713.698830 |
#insert the values for mean mileage as another column
df.insert(1, 'mean_mileage', carsmm_series)
df
mean_price | mean_mileage | |
---|---|---|
volkswagen | 3205.765960 | 144072.417596 |
opel | 2115.444395 | 142254.419333 |
bmw | 4501.369692 | 145766.590389 |
mercedes_benz | 4178.984276 | 145777.139401 |
audi | 4362.219945 | 146784.494536 |
ford | 2242.295827 | 141067.436662 |
renault | 1728.598570 | 141036.772217 |
peugeot | 2328.517155 | 140380.753138 |
fiat | 1831.658228 | 137420.886076 |
seat | 2713.698830 | 140445.906433 |
#sort the dataframe by mean price in descending order
df.sort_values('mean_price', ascending=False)
mean_price | mean_mileage | |
---|---|---|
bmw | 4501.369692 | 145766.590389 |
audi | 4362.219945 | 146784.494536 |
mercedes_benz | 4178.984276 | 145777.139401 |
volkswagen | 3205.765960 | 144072.417596 |
seat | 2713.698830 | 140445.906433 |
peugeot | 2328.517155 | 140380.753138 |
ford | 2242.295827 | 141067.436662 |
opel | 2115.444395 | 142254.419333 |
fiat | 1831.658228 | 137420.886076 |
renault | 1728.598570 | 141036.772217 |
#calculate the correlation between the 2 columns
df['mean_price'].corr(df['mean_mileage'])
0.8992195082897082
The correlation between brand-level mean price and mean mileage comes out to about 0.90, which is a strong positive correlation. Keep in mind, though, that it is computed over only ten brand-level averages, so it says little about the relationship across individual listings.
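For comparison, a sketch of the listing-level check (output not shown), which would indicate whether the relationship holds across individual cars rather than brand averages:
#correlation between price and mileage across all individual listings
autos['price_dollar'].corr(autos['odometer_km'])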
-There were 3 uninformative columns in this dataset: 'seller', 'offer_type', and 'nr_of_pictures'. Each contained essentially a single value.
-The 1.5xIQR rule flagged 8,480 rows as 'odometer_km' outliers and a further 3,134 rows as 'price_dollar' outliers.
-All the ads were crawled between March 5 and April 7, 2016, with most ads created in that same timeframe.
-Mean price varies considerably by brand, and across the top 10 brands, mean price and mean mileage are strongly positively correlated (r of about 0.90).