Ebay Germany Car Sales dataset analysis

In [2]:
import pandas as pd
import numpy as np
print("imported")
imported
In [3]:
autos=pd.read_csv("autos.csv", encoding="Latin-1")
print(autos.head(1))
           dateCrawled                              name  seller offerType  \
0  2016-03-26 17:47:46  Peugeot_807_160_NAVTECH_ON_BOARD  privat   Angebot   

    price   abtest vehicleType  yearOfRegistration  gearbox  powerPS   model  \
0  $5,000  control         bus                2004  manuell      158  andere   

    odometer  monthOfRegistration fuelType    brand notRepairedDamage  \
0  150,000km                    3      lpg  peugeot              nein   

           dateCreated  nrOfPictures  postalCode             lastSeen  
0  2016-03-26 00:00:00             0       79588  2016-04-06 06:45:54  
In [4]:
autos
Out[4]:
dateCrawled name seller offerType price abtest vehicleType yearOfRegistration gearbox powerPS model odometer monthOfRegistration fuelType brand notRepairedDamage dateCreated nrOfPictures postalCode lastSeen
0 2016-03-26 17:47:46 Peugeot_807_160_NAVTECH_ON_BOARD privat Angebot $5,000 control bus 2004 manuell 158 andere 150,000km 3 lpg peugeot nein 2016-03-26 00:00:00 0 79588 2016-04-06 06:45:54
1 2016-04-04 13:38:56 BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik privat Angebot $8,500 control limousine 1997 automatik 286 7er 150,000km 6 benzin bmw nein 2016-04-04 00:00:00 0 71034 2016-04-06 14:45:08
2 2016-03-26 18:57:24 Volkswagen_Golf_1.6_United privat Angebot $8,990 test limousine 2009 manuell 102 golf 70,000km 7 benzin volkswagen nein 2016-03-26 00:00:00 0 35394 2016-04-06 20:15:37
3 2016-03-12 16:58:10 Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... privat Angebot $4,350 control kleinwagen 2007 automatik 71 fortwo 70,000km 6 benzin smart nein 2016-03-12 00:00:00 0 33729 2016-03-15 03:16:28
4 2016-04-01 14:38:50 Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... privat Angebot $1,350 test kombi 2003 manuell 0 focus 150,000km 7 benzin ford nein 2016-04-01 00:00:00 0 39218 2016-04-01 14:38:50
5 2016-03-21 13:47:45 Chrysler_Grand_Voyager_2.8_CRD_Aut.Limited_Sto... privat Angebot $7,900 test bus 2006 automatik 150 voyager 150,000km 4 diesel chrysler NaN 2016-03-21 00:00:00 0 22962 2016-04-06 09:45:21
6 2016-03-20 17:55:21 VW_Golf_III_GT_Special_Electronic_Green_Metall... privat Angebot $300 test limousine 1995 manuell 90 golf 150,000km 8 benzin volkswagen NaN 2016-03-20 00:00:00 0 31535 2016-03-23 02:48:59
7 2016-03-16 18:55:19 Golf_IV_1.9_TDI_90PS privat Angebot $1,990 control limousine 1998 manuell 90 golf 150,000km 12 diesel volkswagen nein 2016-03-16 00:00:00 0 53474 2016-04-07 03:17:32
8 2016-03-22 16:51:34 Seat_Arosa privat Angebot $250 test NaN 2000 manuell 0 arosa 150,000km 10 NaN seat nein 2016-03-22 00:00:00 0 7426 2016-03-26 18:18:10
9 2016-03-16 13:47:02 Renault_Megane_Scenic_1.6e_RT_Klimaanlage privat Angebot $590 control bus 1997 manuell 90 megane 150,000km 7 benzin renault nein 2016-03-16 00:00:00 0 15749 2016-04-06 10:46:35
10 2016-03-15 01:41:36 VW_Golf_Tuning_in_siber/grau privat Angebot $999 test NaN 2017 manuell 90 NaN 150,000km 4 benzin volkswagen nein 2016-03-14 00:00:00 0 86157 2016-04-07 03:16:21
11 2016-03-16 18:45:34 Mercedes_A140_Motorschaden privat Angebot $350 control NaN 2000 NaN 0 NaN 150,000km 0 benzin mercedes_benz NaN 2016-03-16 00:00:00 0 17498 2016-03-16 18:45:34
12 2016-03-31 19:48:22 Smart_smart_fortwo_coupe_softouch_pure_MHD_Pan... privat Angebot $5,299 control kleinwagen 2010 automatik 71 fortwo 50,000km 9 benzin smart nein 2016-03-31 00:00:00 0 34590 2016-04-06 14:17:52
13 2016-03-23 10:48:32 Audi_A3_1.6_tuning privat Angebot $1,350 control limousine 1999 manuell 101 a3 150,000km 11 benzin audi nein 2016-03-23 00:00:00 0 12043 2016-04-01 14:17:13
14 2016-03-23 11:50:46 Renault_Clio_3__Dynamique_1.2__16_V;_viele_Ver... privat Angebot $3,999 test kleinwagen 2007 manuell 75 clio 150,000km 9 benzin renault NaN 2016-03-23 00:00:00 0 81737 2016-04-01 15:46:47
15 2016-04-01 12:06:20 Corvette_C3_Coupe_T_Top_Crossfire_Injection privat Angebot $18,900 test coupe 1982 automatik 203 NaN 80,000km 6 benzin sonstige_autos nein 2016-04-01 00:00:00 0 61276 2016-04-02 21:10:48
16 2016-03-16 14:59:02 Opel_Vectra_B_Kombi privat Angebot $350 test kombi 1999 manuell 101 vectra 150,000km 5 benzin opel nein 2016-03-16 00:00:00 0 57299 2016-03-18 05:29:37
17 2016-03-29 11:46:22 Volkswagen_Scirocco_2_G60 privat Angebot $5,500 test coupe 1990 manuell 205 scirocco 150,000km 6 benzin volkswagen nein 2016-03-29 00:00:00 0 74821 2016-04-05 20:46:26
18 2016-03-26 19:57:44 Verkaufen_mein_bmw_e36_320_i_touring privat Angebot $300 control bus 1995 manuell 150 3er 150,000km 0 benzin bmw NaN 2016-03-26 00:00:00 0 54329 2016-04-02 12:16:41
19 2016-03-17 13:36:21 mazda_tribute_2.0_mit_gas_und_tuev_neu_2018 privat Angebot $4,150 control suv 2004 manuell 124 andere 150,000km 2 lpg mazda nein 2016-03-17 00:00:00 0 40878 2016-03-17 14:45:58
20 2016-03-05 19:57:31 Audi_A4_Avant_1.9_TDI_*6_Gang*AHK*Klimatronik*... privat Angebot $3,500 test kombi 2003 manuell 131 a4 150,000km 5 diesel audi NaN 2016-03-05 00:00:00 0 53913 2016-03-07 05:46:46
21 2016-03-06 19:07:10 Porsche_911_Carrera_4S_Cabrio privat Angebot $41,500 test cabrio 2004 manuell 320 911 150,000km 4 benzin porsche nein 2016-03-06 00:00:00 0 65428 2016-04-05 23:46:19
22 2016-03-28 20:50:54 MINI_Cooper_S_Cabrio privat Angebot $25,450 control cabrio 2015 manuell 184 cooper 10,000km 1 benzin mini nein 2016-03-28 00:00:00 0 44789 2016-04-01 06:45:30
23 2016-03-10 19:55:34 Peugeot_Boxer_2_2_HDi_120_Ps_9_Sitzer_inkl_Klima privat Angebot $7,999 control bus 2010 manuell 120 NaN 150,000km 2 diesel peugeot nein 2016-03-10 00:00:00 0 30900 2016-03-17 08:45:17
24 2016-04-03 11:57:02 BMW_535i_xDrive_Sport_Aut. privat Angebot $48,500 control limousine 2014 automatik 306 5er 30,000km 12 benzin bmw nein 2016-04-03 00:00:00 0 22547 2016-04-07 13:16:50
25 2016-03-21 21:56:18 Ford_escort_kombi_an_bastler_mit_ghia_ausstattung privat Angebot $90 control kombi 1996 manuell 116 NaN 150,000km 4 benzin ford ja 2016-03-21 00:00:00 0 27574 2016-04-01 05:16:49
26 2016-04-03 22:46:28 Volkswagen_Polo_Fox privat Angebot $777 control kleinwagen 1992 manuell 54 polo 125,000km 2 benzin volkswagen nein 2016-04-03 00:00:00 0 38110 2016-04-05 23:46:48
27 2016-03-27 18:45:01 Hat_einer_Ahnung_mit_Ford_Galaxy_HILFE privat Angebot $0 control NaN 2005 NaN 0 NaN 150,000km 0 NaN ford NaN 2016-03-27 00:00:00 0 66701 2016-03-27 18:45:01
28 2016-03-19 21:56:19 MINI_Cooper_D privat Angebot $5,250 control kleinwagen 2007 manuell 110 cooper 150,000km 7 diesel mini ja 2016-03-19 00:00:00 0 15745 2016-04-07 14:58:48
29 2016-04-02 12:45:44 Mercedes_Benz_E_320_T_CDI_Avantgarde_DPF7_Sitz... privat Angebot $4,999 test kombi 2004 automatik 204 e_klasse 150,000km 10 diesel mercedes_benz nein 2016-04-02 00:00:00 0 47638 2016-04-02 12:45:44
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
49970 2016-03-21 22:47:37 c4_Grand_Picasso_mit_Automatik_Leder_Navi_Temp... privat Angebot $15,800 control bus 2010 automatik 136 c4 60,000km 4 diesel citroen nein 2016-03-21 00:00:00 0 14947 2016-04-07 04:17:34
49971 2016-03-29 14:54:12 W.Lupo_1.0 privat Angebot $950 test kleinwagen 2001 manuell 50 lupo 150,000km 4 benzin volkswagen nein 2016-03-29 00:00:00 0 65197 2016-03-29 20:41:51
49972 2016-03-26 22:25:23 Mercedes_Benz_Vito_115_CDI_Extralang_Aut. privat Angebot $3,300 control bus 2004 automatik 150 vito 150,000km 10 diesel mercedes_benz ja 2016-03-26 00:00:00 0 65326 2016-03-28 11:28:18
49973 2016-03-27 05:32:39 Mercedes_Benz_SLK_200_Kompressor privat Angebot $6,000 control cabrio 2004 manuell 163 slk 150,000km 11 benzin mercedes_benz nein 2016-03-27 00:00:00 0 53567 2016-03-27 08:25:24
49974 2016-03-20 10:52:31 Golf_1_Cabrio_Tuev_Neu_viele_Extras_alles_eing... privat Angebot $0 control cabrio 1983 manuell 70 golf 150,000km 2 benzin volkswagen nein 2016-03-20 00:00:00 0 8209 2016-03-27 19:48:16
49975 2016-03-27 20:51:39 Honda_Jazz_1.3_DSi_i_VTEC_IMA_CVT_Comfort privat Angebot $9,700 control kleinwagen 2012 automatik 88 jazz 100,000km 11 hybrid honda nein 2016-03-27 00:00:00 0 84385 2016-04-05 19:45:34
49976 2016-03-19 18:56:05 Audi_80_Avant_2.6_E__Vollausstattung!!_Einziga... privat Angebot $5,900 test kombi 1992 automatik 150 80 150,000km 12 benzin audi nein 2016-03-19 00:00:00 0 36100 2016-04-07 06:16:44
49977 2016-03-31 18:37:18 Mercedes_Benz_C200_Cdi_W203 privat Angebot $5,500 control limousine 2003 manuell 116 c_klasse 150,000km 2 diesel mercedes_benz nein 2016-03-31 00:00:00 0 33739 2016-04-06 12:16:11
49978 2016-04-04 10:37:14 Mercedes_Benz_E_200_Classic privat Angebot $900 control limousine 1996 automatik 136 e_klasse 150,000km 9 benzin mercedes_benz ja 2016-04-04 00:00:00 0 24405 2016-04-06 12:44:20
49979 2016-03-20 18:38:40 Volkswagen_Polo_1.6_TDI_Style privat Angebot $11,000 test kleinwagen 2011 manuell 90 polo 70,000km 11 diesel volkswagen nein 2016-03-20 00:00:00 0 48455 2016-04-07 01:45:12
49980 2016-03-12 10:55:54 Ford_Escort_Turnier_16V privat Angebot $400 control kombi 1995 manuell 105 escort 125,000km 3 benzin ford NaN 2016-03-12 00:00:00 0 56218 2016-04-06 17:16:49
49981 2016-03-15 09:38:21 Opel_Astra_Kombi_mit_Anhaengerkupplung privat Angebot $2,000 control kombi 1998 manuell 115 astra 150,000km 12 benzin opel nein 2016-03-15 00:00:00 0 86859 2016-04-05 17:21:46
49982 2016-03-29 18:51:08 Skoda_Fabia_4_Tuerer_Bj:2004__85.000Tkm privat Angebot $1,950 control kleinwagen 2004 manuell 0 fabia 90,000km 7 benzin skoda NaN 2016-03-29 00:00:00 0 45884 2016-03-29 18:51:08
49983 2016-03-06 12:43:04 Ford_focus_99 privat Angebot $600 test kleinwagen 1999 manuell 101 focus 150,000km 4 benzin ford NaN 2016-03-06 00:00:00 0 52477 2016-03-09 06:16:08
49984 2016-03-31 22:48:48 Student_sucht_ein__Anfaengerauto___ab_2000_BJ_... privat Angebot $0 test NaN 2000 NaN 0 NaN 150,000km 0 NaN sonstige_autos NaN 2016-03-31 00:00:00 0 12103 2016-04-02 19:44:53
49985 2016-04-02 16:38:23 Verkaufe_meinen_vw_vento! privat Angebot $1,000 control NaN 1995 automatik 0 NaN 150,000km 0 benzin volkswagen NaN 2016-04-02 00:00:00 0 30900 2016-04-06 15:17:52
49986 2016-04-04 20:46:02 Chrysler_300C_3.0_CRD_DPF_Automatik_Voll_Ausst... privat Angebot $15,900 control limousine 2010 automatik 218 300c 125,000km 11 diesel chrysler nein 2016-04-04 00:00:00 0 73527 2016-04-06 23:16:00
49987 2016-03-22 20:47:27 Audi_A3_Limousine_2.0_TDI_DPF_Ambition__NAVI__... privat Angebot $21,990 control limousine 2013 manuell 150 a3 50,000km 11 diesel audi nein 2016-03-22 00:00:00 0 94362 2016-03-26 22:46:06
49988 2016-03-28 19:49:51 BMW_330_Ci privat Angebot $9,550 control coupe 2001 manuell 231 3er 150,000km 10 benzin bmw nein 2016-03-28 00:00:00 0 83646 2016-04-07 02:17:40
49989 2016-03-11 19:50:37 VW_Polo_zum_Ausschlachten_oder_Wiederaufbau privat Angebot $150 test kleinwagen 1997 manuell 0 polo 150,000km 5 benzin volkswagen ja 2016-03-11 00:00:00 0 21244 2016-03-12 10:17:55
49990 2016-03-21 19:54:19 Mercedes_Benz_A_200__BlueEFFICIENCY__Urban privat Angebot $17,500 test limousine 2012 manuell 156 a_klasse 30,000km 12 benzin mercedes_benz nein 2016-03-21 00:00:00 0 58239 2016-04-06 22:46:57
49991 2016-03-06 15:25:19 Kleinwagen privat Angebot $500 control NaN 2016 manuell 0 twingo 150,000km 0 benzin renault NaN 2016-03-06 00:00:00 0 61350 2016-03-06 18:24:19
49992 2016-03-10 19:37:38 Fiat_Grande_Punto_1.4_T_Jet_16V_Sport privat Angebot $4,800 control kleinwagen 2009 manuell 120 andere 125,000km 9 lpg fiat nein 2016-03-10 00:00:00 0 68642 2016-03-13 01:44:51
49993 2016-03-15 18:47:35 Audi_A3__1_8l__Silber;_schoenes_Fahrzeug privat Angebot $1,650 control kleinwagen 1997 manuell 0 NaN 150,000km 7 benzin audi NaN 2016-03-15 00:00:00 0 65203 2016-04-06 19:46:53
49994 2016-03-22 17:36:42 Audi_A6__S6__Avant_4.2_quattro_eventuell_Tausc... privat Angebot $5,000 control kombi 2001 automatik 299 a6 150,000km 1 benzin audi nein 2016-03-22 00:00:00 0 46537 2016-04-06 08:16:39
49995 2016-03-27 14:38:19 Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon privat Angebot $24,900 control limousine 2011 automatik 239 q5 100,000km 1 diesel audi nein 2016-03-27 00:00:00 0 82131 2016-04-01 13:47:40
49996 2016-03-28 10:50:25 Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+... privat Angebot $1,980 control cabrio 1996 manuell 75 astra 150,000km 5 benzin opel nein 2016-03-28 00:00:00 0 44807 2016-04-02 14:18:02
49997 2016-04-02 14:44:48 Fiat_500_C_1.2_Dualogic_Lounge privat Angebot $13,200 test cabrio 2014 automatik 69 500 5,000km 11 benzin fiat nein 2016-04-02 00:00:00 0 73430 2016-04-04 11:47:27
49998 2016-03-08 19:25:42 Audi_A3_2.0_TDI_Sportback_Ambition privat Angebot $22,900 control kombi 2013 manuell 150 a3 40,000km 11 diesel audi nein 2016-03-08 00:00:00 0 35683 2016-04-05 16:45:07
49999 2016-03-14 00:42:12 Opel_Vectra_1.6_16V privat Angebot $1,250 control limousine 1996 manuell 101 vectra 150,000km 1 benzin opel nein 2016-03-13 00:00:00 0 45897 2016-04-06 21:18:48

50000 rows × 20 columns

In [5]:
autos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
dateCrawled            50000 non-null object
name                   50000 non-null object
seller                 50000 non-null object
offerType              50000 non-null object
price                  50000 non-null object
abtest                 50000 non-null object
vehicleType            44905 non-null object
yearOfRegistration     50000 non-null int64
gearbox                47320 non-null object
powerPS                50000 non-null int64
model                  47242 non-null object
odometer               50000 non-null object
monthOfRegistration    50000 non-null int64
fuelType               45518 non-null object
brand                  50000 non-null object
notRepairedDamage      40171 non-null object
dateCreated            50000 non-null object
nrOfPictures           50000 non-null int64
postalCode             50000 non-null int64
lastSeen               50000 non-null object
dtypes: int64(5), object(15)
memory usage: 7.6+ MB
In [6]:
autos.columns
Out[6]:
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

Renaming columns - from camelCase to snake_case

In [7]:
autos.rename({"yearOfRegistration":"registration_year",
                           "monthOfRegistration":"registration_month",
                           "notRepairedDamage":"unrepaired_damage",
                           "dateCreated":"ad_created",
                            "dateCrawled":"date_crawled",
                             "offerType":"offer_type",
                             "vehicleType":"vehicle_type",
                             "powerPS":"power_ps",
                             "fuelType":"fuel_type",
                             "nrOfPictures":"nr_of_pictures",
                             "postalCode":"postal_code",
                             "lastSeen":"last_seen"},axis=1,inplace=True)
autos.head()
Out[7]:
date_crawled name seller offer_type price abtest vehicle_type registration_year gearbox power_ps model odometer registration_month fuel_type brand unrepaired_damage ad_created nr_of_pictures postal_code last_seen
0 2016-03-26 17:47:46 Peugeot_807_160_NAVTECH_ON_BOARD privat Angebot $5,000 control bus 2004 manuell 158 andere 150,000km 3 lpg peugeot nein 2016-03-26 00:00:00 0 79588 2016-04-06 06:45:54
1 2016-04-04 13:38:56 BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik privat Angebot $8,500 control limousine 1997 automatik 286 7er 150,000km 6 benzin bmw nein 2016-04-04 00:00:00 0 71034 2016-04-06 14:45:08
2 2016-03-26 18:57:24 Volkswagen_Golf_1.6_United privat Angebot $8,990 test limousine 2009 manuell 102 golf 70,000km 7 benzin volkswagen nein 2016-03-26 00:00:00 0 35394 2016-04-06 20:15:37
3 2016-03-12 16:58:10 Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... privat Angebot $4,350 control kleinwagen 2007 automatik 71 fortwo 70,000km 6 benzin smart nein 2016-03-12 00:00:00 0 33729 2016-03-15 03:16:28
4 2016-04-01 14:38:50 Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... privat Angebot $1,350 test kombi 2003 manuell 0 focus 150,000km 7 benzin ford nein 2016-04-01 00:00:00 0 39218 2016-04-01 14:38:50
In [8]:
autos.describe(include="all") #show all columns
Out[8]:
date_crawled name seller offer_type price abtest vehicle_type registration_year gearbox power_ps model odometer registration_month fuel_type brand unrepaired_damage ad_created nr_of_pictures postal_code last_seen
count 50000 50000 50000 50000 50000 50000 44905 50000.000000 47320 50000.000000 47242 50000 50000.000000 45518 50000 40171 50000 50000.0 50000.000000 50000
unique 48213 38754 2 2 2357 2 8 NaN 2 NaN 245 13 NaN 7 40 2 76 NaN NaN 39481
top 2016-04-02 15:49:30 Ford_Fiesta privat Angebot $0 test limousine NaN manuell NaN golf 150,000km NaN benzin volkswagen nein 2016-04-03 00:00:00 NaN NaN 2016-04-07 06:17:27
freq 3 78 49999 49999 1421 25756 12859 NaN 36993 NaN 4024 32424 NaN 30107 10687 35232 1946 NaN NaN 8
mean NaN NaN NaN NaN NaN NaN NaN 2005.073280 NaN 116.355920 NaN NaN 5.723360 NaN NaN NaN NaN 0.0 50813.627300 NaN
std NaN NaN NaN NaN NaN NaN NaN 105.712813 NaN 209.216627 NaN NaN 3.711984 NaN NaN NaN NaN 0.0 25779.747957 NaN
min NaN NaN NaN NaN NaN NaN NaN 1000.000000 NaN 0.000000 NaN NaN 0.000000 NaN NaN NaN NaN 0.0 1067.000000 NaN
25% NaN NaN NaN NaN NaN NaN NaN 1999.000000 NaN 70.000000 NaN NaN 3.000000 NaN NaN NaN NaN 0.0 30451.000000 NaN
50% NaN NaN NaN NaN NaN NaN NaN 2003.000000 NaN 105.000000 NaN NaN 6.000000 NaN NaN NaN NaN 0.0 49577.000000 NaN
75% NaN NaN NaN NaN NaN NaN NaN 2008.000000 NaN 150.000000 NaN NaN 9.000000 NaN NaN NaN NaN 0.0 71540.000000 NaN
max NaN NaN NaN NaN NaN NaN NaN 9999.000000 NaN 17700.000000 NaN NaN 12.000000 NaN NaN NaN NaN 0.0 99998.000000 NaN

Formating some columns' type from st to int.

In [9]:
try: #rerunning of code will raise attrib error because it already applied to dataframe
    autos["price"]=autos["price"].str.replace("$","").str.replace(",","").astype(int)
    autos["odometer"]=autos["odometer"].str.replace("km","").str.replace(",","").astype(int)
except:
    pass
autos.rename({"odometer":"odometer_km"},axis=1,inplace=True)
autos["price"].head()
autos["odometer_km"].head()
Out[9]:
0    150000
1    150000
2     70000
3     70000
4    150000
Name: odometer_km, dtype: int64
In [10]:
autos["price"].head()
Out[10]:
0    5000
1    8500
2    8990
3    4350
4    1350
Name: price, dtype: int64
In [11]:
print(autos["price"].unique().shape)
print(autos["odometer_km"].unique().shape)
print(autos["price"].describe())
autos["odometer_km"].describe()
(2357,)
(13,)
count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64
Out[11]:
count     50000.000000
mean     125732.700000
std       40042.211706
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64
In [12]:
print(autos.isnull().sum())
print(autos["price"].isnull().sum())
date_crawled             0
name                     0
seller                   0
offer_type               0
price                    0
abtest                   0
vehicle_type          5095
registration_year        0
gearbox               2680
power_ps                 0
model                 2758
odometer_km              0
registration_month       0
fuel_type             4482
brand                    0
unrepaired_damage     9829
ad_created               0
nr_of_pictures           0
postal_code              0
last_seen                0
dtype: int64
0
In [13]:
print(autos["price"].value_counts().sort_index().head(10))
print(autos["price"].value_counts().sort_index(ascending=False).head(10))
print(autos["odometer_km"].value_counts().sort_index().head())
0     1421
1      156
2        3
3        1
5        2
8        1
9        1
10       7
11       2
12       3
Name: price, dtype: int64
99999999    1
27322222    1
12345678    3
11111111    2
10000000    1
3890000     1
1300000     1
1234566     1
999999      2
999990      1
Name: price, dtype: int64
5000     967
10000    264
20000    784
30000    789
40000    819
Name: odometer_km, dtype: int64
In [14]:
print(autos[autos["price"]<100].head())
           date_crawled                                               name  \
25  2016-03-21 21:56:18  Ford_escort_kombi_an_bastler_mit_ghia_ausstattung   
27  2016-03-27 18:45:01             Hat_einer_Ahnung_mit_Ford_Galaxy_HILFE   
30  2016-03-14 11:47:31                         Peugeot_206_Unfallfahrzeug   
55  2016-03-07 02:47:54                     Mercedes_E320_AMG_zu_Tauschen!   
64  2016-04-05 07:36:19                 Autotransport__Abschlepp_Schlepper   

    seller offer_type  price   abtest vehicle_type  registration_year  \
25  privat    Angebot     90  control        kombi               1996   
27  privat    Angebot      0  control          NaN               2005   
30  privat    Angebot     80     test   kleinwagen               2002   
55  privat    Angebot      1     test          NaN               2017   
64  privat    Angebot     40     test          NaN               2011   

      gearbox  power_ps     model  odometer_km  registration_month fuel_type  \
25    manuell       116       NaN       150000                   4    benzin   
27        NaN         0       NaN       150000                   0       NaN   
30    manuell        60   2_reihe       150000                   6    benzin   
55  automatik       224  e_klasse       125000                   7    benzin   
64        NaN         0       5er       150000                   5       NaN   

            brand unrepaired_damage           ad_created  nr_of_pictures  \
25           ford                ja  2016-03-21 00:00:00               0   
27           ford               NaN  2016-03-27 00:00:00               0   
30        peugeot                ja  2016-03-14 00:00:00               0   
55  mercedes_benz              nein  2016-03-06 00:00:00               0   
64            bmw               NaN  2016-04-05 00:00:00               0   

    postal_code            last_seen  
25        27574  2016-04-01 05:16:49  
27        66701  2016-03-27 18:45:01  
30        57076  2016-03-14 11:47:31  
55        22111  2016-03-08 05:45:44  
64        40591  2016-04-07 12:16:01  

Lets remove outliers with selecting only appropriate columns

In [15]:
autos=autos[(autos["price"]>100) & (autos["price"]<400000)]
print(autos.describe())
               price  registration_year      power_ps    odometer_km  \
count   48090.000000       48090.000000  48090.000000   48090.000000   
mean     5946.617426        2004.749927    117.796881  125921.501352   
std      9085.786730          88.018197    200.711913   39519.217705   
min       110.000000        1000.000000      0.000000    5000.000000   
25%      1250.000000        1999.000000     75.000000  125000.000000   
50%      3099.000000        2004.000000    108.000000  150000.000000   
75%      7500.000000        2008.000000    150.000000  150000.000000   
max    350000.000000        9999.000000  17700.000000  150000.000000   

       registration_month  nr_of_pictures   postal_code  
count        48090.000000         48090.0  48090.000000  
mean             5.806259             0.0  50995.636203  
std              3.673874             0.0  25728.598110  
min              0.000000             0.0   1067.000000  
25%              3.000000             0.0  30823.000000  
50%              6.000000             0.0  49733.000000  
75%              9.000000             0.0  71665.000000  
max             12.000000             0.0  99998.000000  

Let's check dates and their distribution. Normlaize for percentage expression

In [16]:
autos.loc[:,["date_crawled","last_seen","ad_created",
             "registration_month","registration_year"]].head()
Out[16]:
date_crawled last_seen ad_created registration_month registration_year
0 2016-03-26 17:47:46 2016-04-06 06:45:54 2016-03-26 00:00:00 3 2004
1 2016-04-04 13:38:56 2016-04-06 14:45:08 2016-04-04 00:00:00 6 1997
2 2016-03-26 18:57:24 2016-04-06 20:15:37 2016-03-26 00:00:00 7 2009
3 2016-03-12 16:58:10 2016-03-15 03:16:28 2016-03-12 00:00:00 6 2007
4 2016-04-01 14:38:50 2016-04-01 14:38:50 2016-04-01 00:00:00 7 2003
In [17]:
print(autos['date_crawled'].str[:10].value_counts(
                    normalize=True, dropna=False).sort_index(ascending=False)[0:5]) #or head()
print(autos['ad_created'].str[:10].value_counts(
                    normalize=True, dropna=False).sort_index()[0:5])
print(autos['last_seen'].str[:10].value_counts(
                    normalize=True, dropna=False).sort_index()[0:5])
2016-04-07    0.001393
2016-04-06    0.003161
2016-04-05    0.013100
2016-04-04    0.036556
2016-04-03    0.038615
Name: date_crawled, dtype: float64
2015-06-11    0.000021
2015-08-10    0.000021
2015-09-09    0.000021
2015-11-10    0.000021
2015-12-05    0.000021
Name: ad_created, dtype: float64
2016-03-05    0.001081
2016-03-06    0.004304
2016-03-07    0.005386
2016-03-08    0.007299
2016-03-09    0.009565
Name: last_seen, dtype: float64
In [ ]:
 
In [18]:
autos["registration_year"].describe()
Out[18]:
count    48090.000000
mean      2004.749927
std         88.018197
min       1000.000000
25%       1999.000000
50%       2004.000000
75%       2008.000000
max       9999.000000
Name: registration_year, dtype: float64

Oldest registration year is year 1000 and neüest car is made in 9999 which is odd..Lets check what percent is between normal values

In [19]:
autos["registration_year"].between(1900,2020).value_counts(normalize=True)
Out[19]:
True     0.999605
False    0.000395
Name: registration_year, dtype: float64

**It seem 99.9% is normal dates, so we can remove odd date rows

In [23]:
autos=autos[autos["registration_year"].between(1900,2020)]
In [25]:
autos["registration_year"].value_counts(normalize=True).sort_index()
Out[25]:
1910    0.000042
1927    0.000021
1929    0.000021
1931    0.000021
1934    0.000042
1937    0.000083
1938    0.000021
1939    0.000021
1941    0.000042
1943    0.000021
1948    0.000021
1950    0.000021
1951    0.000042
1952    0.000021
1953    0.000021
1954    0.000042
1955    0.000042
1956    0.000083
1957    0.000042
1958    0.000083
1959    0.000125
1960    0.000437
1961    0.000125
1962    0.000083
1963    0.000166
1964    0.000250
1965    0.000354
1966    0.000458
1967    0.000541
1968    0.000541
          ...   
1990    0.006823
1991    0.006969
1992    0.007593
1993    0.008654
1994    0.012814
1995    0.024485
1996    0.028021
1997    0.039691
1998    0.048324
1999    0.059745
2000    0.064051
2001    0.054648
2002    0.051486
2003    0.056021
2004    0.056146
2005    0.060515
2006    0.055501
2007    0.047263
2008    0.045953
2009    0.043269
2010    0.033014
2011    0.033638
2012    0.027210
2013    0.016642
2014    0.013771
2015    0.007905
2016    0.024880
2017    0.028749
2018    0.009736
2019    0.000021
Name: registration_year, Length: 81, dtype: float64
In [22]:
48071/autos.shape[0]
Out[22]:
1.0

Discover brand columns

In [62]:
print(autos["brand"].isnull().sum())
print(autos["brand"].value_counts())
print(autos["brand"].value_counts(normalize=True)) #with percentage
print(autos["brand"].unique())
unique_brands=autos["brand"].unique()
0
volkswagen        10237
bmw                5241
opel               5196
mercedes_benz      4625
audi               4143
ford               3334
renault            2294
peugeot            1416
fiat               1244
seat                910
skoda               774
nissan              737
mazda               732
smart               690
citroen             679
toyota              611
hyundai             479
sonstige_autos      450
volvo               434
mini                416
mitsubishi          390
honda               384
kia                 343
alfa_romeo          320
suzuki              281
porsche             280
chevrolet           271
chrysler            168
dacia               129
daihatsu            120
jeep                107
subaru              100
land_rover           99
saab                 79
daewoo               74
jaguar               72
trabant              65
rover                64
lancia               54
lada                 29
Name: brand, dtype: int64
volkswagen        0.212956
bmw               0.109026
opel              0.108090
mercedes_benz     0.096212
audi              0.086185
ford              0.069356
renault           0.047721
peugeot           0.029456
fiat              0.025878
seat              0.018930
skoda             0.016101
nissan            0.015331
mazda             0.015227
smart             0.014354
citroen           0.014125
toyota            0.012710
hyundai           0.009964
sonstige_autos    0.009361
volvo             0.009028
mini              0.008654
mitsubishi        0.008113
honda             0.007988
kia               0.007135
alfa_romeo        0.006657
suzuki            0.005846
porsche           0.005825
chevrolet         0.005637
chrysler          0.003495
dacia             0.002684
daihatsu          0.002496
jeep              0.002226
subaru            0.002080
land_rover        0.002059
saab              0.001643
daewoo            0.001539
jaguar            0.001498
trabant           0.001352
rover             0.001331
lancia            0.001123
lada              0.000603
Name: brand, dtype: float64
['peugeot' 'bmw' 'volkswagen' 'smart' 'ford' 'chrysler' 'seat' 'renault'
 'mercedes_benz' 'audi' 'sonstige_autos' 'opel' 'mazda' 'porsche' 'mini'
 'toyota' 'dacia' 'nissan' 'jeep' 'saab' 'volvo' 'mitsubishi' 'jaguar'
 'fiat' 'skoda' 'subaru' 'kia' 'citroen' 'chevrolet' 'hyundai' 'honda'
 'daewoo' 'suzuki' 'trabant' 'land_rover' 'alfa_romeo' 'lada' 'rover'
 'daihatsu' 'lancia']

Check price mean for unique brands(Excel Pivot can also do..)

In [75]:
brands_dict={}
for brand in unique_brands:
    print("\033[1m+Mean price of {}:".format(brand),autos[autos["brand"]==brand]["price"]
          .mean().astype(int)) #int yaziram ki float-u round off edim cevirsin.
    brands_dict[brand]=autos[autos["brand"]==brand]["price"].mean().astype(int)
print(brands_dict)  
+Mean price of peugeot: 3095
+Mean price of bmw: 8313
+Mean price of volkswagen: 5381
+Mean price of smart: 3538
+Mean price of ford: 3781
+Mean price of chrysler: 3539
+Mean price of seat: 4362
+Mean price of renault: 2463
+Mean price of mercedes_benz: 8572
+Mean price of audi: 9268
+Mean price of sonstige_autos: 12695
+Mean price of opel: 2984
+Mean price of mazda: 4097
+Mean price of porsche: 46764
+Mean price of mini: 10591
+Mean price of toyota: 5148
+Mean price of dacia: 5897
+Mean price of nissan: 4694
+Mean price of jeep: 11590
+Mean price of saab: 3183
+Mean price of volvo: 4922
+Mean price of mitsubishi: 3446
+Mean price of jaguar: 11844
+Mean price of fiat: 2833
+Mean price of skoda: 6402
+Mean price of subaru: 4019
+Mean price of kia: 5923
+Mean price of citroen: 3788
+Mean price of chevrolet: 6716
+Mean price of hyundai: 5416
+Mean price of honda: 4041
+Mean price of daewoo: 1107
+Mean price of suzuki: 4210
+Mean price of trabant: 1843
+Mean price of land_rover: 18934
+Mean price of alfa_romeo: 4054
+Mean price of lada: 2647
+Mean price of rover: 1609
+Mean price of daihatsu: 1654
+Mean price of lancia: 3240
{'mazda': 4097, 'mitsubishi': 3446, 'jaguar': 11844, 'volkswagen': 5381, 'daihatsu': 1654, 'renault': 2463, 'skoda': 6402, 'kia': 5923, 'smart': 3538, 'opel': 2984, 'peugeot': 3095, 'nissan': 4694, 'porsche': 46764, 'lada': 2647, 'seat': 4362, 'ford': 3781, 'mini': 10591, 'bmw': 8313, 'mercedes_benz': 8572, 'subaru': 4019, 'land_rover': 18934, 'toyota': 5148, 'alfa_romeo': 4054, 'suzuki': 4210, 'jeep': 11590, 'honda': 4041, 'volvo': 4922, 'saab': 3183, 'daewoo': 1107, 'chevrolet': 6716, 'trabant': 1843, 'chrysler': 3539, 'rover': 1609, 'fiat': 2833, 'sonstige_autos': 12695, 'audi': 9268, 'dacia': 5897, 'lancia': 3240, 'hyundai': 5416, 'citroen': 3788}

Below is another way of getting the same result as above (value_counts.index method)

In [80]:
brand_counts=autos["brand"].value_counts()
brands_dict={}
for brand in brand_counts.index:
    brands_dict[brand]=brand_counts[brand]
print(brands_dict)  
{'mazda': 732, 'suzuki': 281, 'jaguar': 72, 'saab': 79, 'daihatsu': 120, 'renault': 2294, 'smart': 690, 'alfa_romeo': 320, 'kia': 343, 'skoda': 774, 'daewoo': 74, 'ford': 3334, 'opel': 5196, 'porsche': 280, 'dacia': 129, 'peugeot': 1416, 'land_rover': 99, 'bmw': 5241, 'mercedes_benz': 4625, 'subaru': 100, 'mini': 416, 'toyota': 611, 'fiat': 1244, 'trabant': 65, 'volkswagen': 10237, 'jeep': 107, 'citroen': 679, 'mitsubishi': 390, 'nissan': 737, 'volvo': 434, 'chevrolet': 271, 'chrysler': 168, 'rover': 64, 'lada': 29, 'sonstige_autos': 450, 'audi': 4143, 'seat': 910, 'lancia': 54, 'hyundai': 479, 'honda': 384}

Filter column itself according desired indicators

In [113]:
brand_counts=autos["brand"].value_counts(normalize=True)
common_brands=brand_counts[brand_counts>0.05].index
print(common_brands)  
c_brands_mean_prices={}
for brand in common_brands:
    mean_prices=autos[autos["brand"]==brand]["price"].mean()
    print(mean_prices)
    c_brands_mean_prices[brand]=int(mean_prices)
c_brands_mean_prices    
Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford'], dtype='object')
5381.686431571749
8313.271513070025
2984.8720169361045
8572.600216216217
9268.353608496258
3781.123275344931
Out[113]:
{'audi': 9268,
 'bmw': 8313,
 'ford': 3781,
 'mercedes_benz': 8572,
 'opel': 2984,
 'volkswagen': 5381}

Above: of the top 6 brands most expensive are: audi,mercedes and bmw in sequence.

In [100]:
d={}
a=autos["brand"].value_counts()
print(type(a))

'''for i in a:
    d[a.index]=3'''

print(a[a>5000].values)
<class 'pandas.core.series.Series'>
[10237  5241  5196]

Series and dataframe constructos

In [108]:
bmp_series=pd.Series(c_brands_mean_prices)
#print(bmp_series)
sample_series=pd.Series(autos["brand"].value_counts(normalize=True))
print('indexes\n',sample_series.index)
#print(sample_series.array)
print('values\n',sample_series.values)
print('shape\n',sample_series.shape)
indexes
 Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford', 'renault',
       'peugeot', 'fiat', 'seat', 'skoda', 'nissan', 'mazda', 'smart',
       'citroen', 'toyota', 'hyundai', 'sonstige_autos', 'volvo', 'mini',
       'mitsubishi', 'honda', 'kia', 'alfa_romeo', 'suzuki', 'porsche',
       'chevrolet', 'chrysler', 'dacia', 'daihatsu', 'jeep', 'subaru',
       'land_rover', 'saab', 'daewoo', 'jaguar', 'trabant', 'rover', 'lancia',
       'lada'],
      dtype='object')
values
 [0.21295584 0.10902623 0.10809012 0.09621185 0.08618502 0.06935574
 0.04772108 0.02945643 0.02587839 0.01893033 0.01610118 0.01533149
 0.01522748 0.01435377 0.01412494 0.01271037 0.00996443 0.00936115
 0.00902831 0.00865387 0.008113   0.00798818 0.00713528 0.00665682
 0.00584552 0.00582472 0.00563749 0.00349483 0.00268353 0.00249631
 0.00222587 0.00208026 0.00205945 0.0016434  0.00153939 0.00149778
 0.00135217 0.00133136 0.00112334 0.00060327]
shape]
 (40,)
In [109]:
sample_series=pd.DataFrame(autos["brand"].value_counts(normalize=True),
                       columns=['mean_price'])

Calculate mean mileages for common brands

In [114]:
c_brands_mean_mileages={} 
for brand in common_brands:
    mean_mileage=autos[autos["brand"]==brand]["odometer_km"].mean()
    print(mean_mileage)
    c_brands_mean_mileages[brand]=int(mean_mileage)
c_brands_mean_mileages   
129039.26931718277
132845.8309482923
129508.27559661277
131107.02702702704
129619.84069514845
124334.13317336532
Out[114]:
{'audi': 129619,
 'bmw': 132845,
 'ford': 124334,
 'mercedes_benz': 131107,
 'opel': 129508,
 'volkswagen': 129039}
In [115]:
print(c_brands_mean_mileages)
print(c_brands_mean_prices)
{'audi': 129619, 'volkswagen': 129039, 'bmw': 132845, 'mercedes_benz': 131107, 'opel': 129508, 'ford': 124334}
{'audi': 9268, 'volkswagen': 5381, 'bmw': 8313, 'mercedes_benz': 8572, 'opel': 2984, 'ford': 3781}

Lets convert mean dicts to pd.DataFrames with merging series method

In [119]:
bmm_series=pd.Series(c_brands_mean_mileages)
bmp_series=pd.Series(c_brands_mean_prices)
bmm_df=pd.DataFrame(bmm_series,columns=['mean_mileage'])
#bmp_df=pd.DataFrame(bmp_series, columns=['mean_price'])

bmm_df['mean_price']=bmp_series
print(bmm_df)
               mean_mileage  mean_price
audi                 129619        9268
bmw                  132845        8313
ford                 124334        3781
mercedes_benz        131107        8572
opel                 129508        2984
volkswagen           129039        5381

Lets find most frequent models of common brands and collect to dict

In [145]:
c_models_dict={}
for brand in common_brands:
    freq_models=autos[autos["brand"]==brand]["model"].value_counts()
    print(brand.upper(),'\n',freq_models[:3],'\n')
    c_models_dict[brand]={freq_models.index[0],freq_models.values[0]}
c_models_dict
VOLKSWAGEN 
 golf      3863
polo      1658
passat    1379
Name: model, dtype: int64 

BMW 
 3er    2672
5er    1140
1er     534
Name: model, dtype: int64 

OPEL 
 corsa     1644
astra     1394
vectra     549
Name: model, dtype: int64 

MERCEDES_BENZ 
 c_klasse    1160
e_klasse     971
a_klasse     582
Name: model, dtype: int64 

AUDI 
 a4    1249
a3     864
a6     816
Name: model, dtype: int64 

FORD 
 focus     786
fiesta    742
mondeo    485
Name: model, dtype: int64 

Out[145]:
{'audi': {1249, 'a4'},
 'bmw': {2672, '3er'},
 'ford': {'focus', 786},
 'mercedes_benz': {1160, 'c_klasse'},
 'opel': {'corsa', 1644},
 'volkswagen': {'golf', 3863}}
In [149]:
autos["odometer_km"].value_counts()
Out[149]:
150000    31117
125000     5026
100000     2097
90000      1731
80000      1410
70000      1212
60000      1151
50000      1009
40000       814
30000       776
20000       755
5000        735
10000       238
Name: odometer_km, dtype: int64

Compare mean price of damaged and undamaged cars

In [160]:
not_damaged_price=autos[autos['unrepaired_damage']=="nein"]["price"].mean()
damaged_price=autos[autos['unrepaired_damage']=="ja"]["price"].mean()
print(damaged_price,not_damaged_price)
print("Price difference is:",int(not_damaged_price-damaged_price),"$")
2273.1384279475983 7103.562663975782
Price difference is: 4830 $
In [ ]: