we'll work with a dataset of used cars from eBay Kleinanzeigen, a classifieds section of the German eBay website. The aim of this project is to clean the data and analyze the included used car listings.
#we import the NumPy and Pandas library and read the data set
import numpy as np
import pandas as pd
autos = pd.read_csv("autos.csv", encoding = "Windows-1252")
autos
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
5 | 2016-03-21 13:47:45 | Chrysler_Grand_Voyager_2.8_CRD_Aut.Limited_Sto... | privat | Angebot | $7,900 | test | bus | 2006 | automatik | 150 | voyager | 150,000km | 4 | diesel | chrysler | NaN | 2016-03-21 00:00:00 | 0 | 22962 | 2016-04-06 09:45:21 |
6 | 2016-03-20 17:55:21 | VW_Golf_III_GT_Special_Electronic_Green_Metall... | privat | Angebot | $300 | test | limousine | 1995 | manuell | 90 | golf | 150,000km | 8 | benzin | volkswagen | NaN | 2016-03-20 00:00:00 | 0 | 31535 | 2016-03-23 02:48:59 |
7 | 2016-03-16 18:55:19 | Golf_IV_1.9_TDI_90PS | privat | Angebot | $1,990 | control | limousine | 1998 | manuell | 90 | golf | 150,000km | 12 | diesel | volkswagen | nein | 2016-03-16 00:00:00 | 0 | 53474 | 2016-04-07 03:17:32 |
8 | 2016-03-22 16:51:34 | Seat_Arosa | privat | Angebot | $250 | test | NaN | 2000 | manuell | 0 | arosa | 150,000km | 10 | NaN | seat | nein | 2016-03-22 00:00:00 | 0 | 7426 | 2016-03-26 18:18:10 |
9 | 2016-03-16 13:47:02 | Renault_Megane_Scenic_1.6e_RT_Klimaanlage | privat | Angebot | $590 | control | bus | 1997 | manuell | 90 | megane | 150,000km | 7 | benzin | renault | nein | 2016-03-16 00:00:00 | 0 | 15749 | 2016-04-06 10:46:35 |
10 | 2016-03-15 01:41:36 | VW_Golf_Tuning_in_siber/grau | privat | Angebot | $999 | test | NaN | 2017 | manuell | 90 | NaN | 150,000km | 4 | benzin | volkswagen | nein | 2016-03-14 00:00:00 | 0 | 86157 | 2016-04-07 03:16:21 |
11 | 2016-03-16 18:45:34 | Mercedes_A140_Motorschaden | privat | Angebot | $350 | control | NaN | 2000 | NaN | 0 | NaN | 150,000km | 0 | benzin | mercedes_benz | NaN | 2016-03-16 00:00:00 | 0 | 17498 | 2016-03-16 18:45:34 |
12 | 2016-03-31 19:48:22 | Smart_smart_fortwo_coupe_softouch_pure_MHD_Pan... | privat | Angebot | $5,299 | control | kleinwagen | 2010 | automatik | 71 | fortwo | 50,000km | 9 | benzin | smart | nein | 2016-03-31 00:00:00 | 0 | 34590 | 2016-04-06 14:17:52 |
13 | 2016-03-23 10:48:32 | Audi_A3_1.6_tuning | privat | Angebot | $1,350 | control | limousine | 1999 | manuell | 101 | a3 | 150,000km | 11 | benzin | audi | nein | 2016-03-23 00:00:00 | 0 | 12043 | 2016-04-01 14:17:13 |
14 | 2016-03-23 11:50:46 | Renault_Clio_3__Dynamique_1.2__16_V;_viele_Ver... | privat | Angebot | $3,999 | test | kleinwagen | 2007 | manuell | 75 | clio | 150,000km | 9 | benzin | renault | NaN | 2016-03-23 00:00:00 | 0 | 81737 | 2016-04-01 15:46:47 |
15 | 2016-04-01 12:06:20 | Corvette_C3_Coupe_T_Top_Crossfire_Injection | privat | Angebot | $18,900 | test | coupe | 1982 | automatik | 203 | NaN | 80,000km | 6 | benzin | sonstige_autos | nein | 2016-04-01 00:00:00 | 0 | 61276 | 2016-04-02 21:10:48 |
16 | 2016-03-16 14:59:02 | Opel_Vectra_B_Kombi | privat | Angebot | $350 | test | kombi | 1999 | manuell | 101 | vectra | 150,000km | 5 | benzin | opel | nein | 2016-03-16 00:00:00 | 0 | 57299 | 2016-03-18 05:29:37 |
17 | 2016-03-29 11:46:22 | Volkswagen_Scirocco_2_G60 | privat | Angebot | $5,500 | test | coupe | 1990 | manuell | 205 | scirocco | 150,000km | 6 | benzin | volkswagen | nein | 2016-03-29 00:00:00 | 0 | 74821 | 2016-04-05 20:46:26 |
18 | 2016-03-26 19:57:44 | Verkaufen_mein_bmw_e36_320_i_touring | privat | Angebot | $300 | control | bus | 1995 | manuell | 150 | 3er | 150,000km | 0 | benzin | bmw | NaN | 2016-03-26 00:00:00 | 0 | 54329 | 2016-04-02 12:16:41 |
19 | 2016-03-17 13:36:21 | mazda_tribute_2.0_mit_gas_und_tuev_neu_2018 | privat | Angebot | $4,150 | control | suv | 2004 | manuell | 124 | andere | 150,000km | 2 | lpg | mazda | nein | 2016-03-17 00:00:00 | 0 | 40878 | 2016-03-17 14:45:58 |
20 | 2016-03-05 19:57:31 | Audi_A4_Avant_1.9_TDI_*6_Gang*AHK*Klimatronik*... | privat | Angebot | $3,500 | test | kombi | 2003 | manuell | 131 | a4 | 150,000km | 5 | diesel | audi | NaN | 2016-03-05 00:00:00 | 0 | 53913 | 2016-03-07 05:46:46 |
21 | 2016-03-06 19:07:10 | Porsche_911_Carrera_4S_Cabrio | privat | Angebot | $41,500 | test | cabrio | 2004 | manuell | 320 | 911 | 150,000km | 4 | benzin | porsche | nein | 2016-03-06 00:00:00 | 0 | 65428 | 2016-04-05 23:46:19 |
22 | 2016-03-28 20:50:54 | MINI_Cooper_S_Cabrio | privat | Angebot | $25,450 | control | cabrio | 2015 | manuell | 184 | cooper | 10,000km | 1 | benzin | mini | nein | 2016-03-28 00:00:00 | 0 | 44789 | 2016-04-01 06:45:30 |
23 | 2016-03-10 19:55:34 | Peugeot_Boxer_2_2_HDi_120_Ps_9_Sitzer_inkl_Klima | privat | Angebot | $7,999 | control | bus | 2010 | manuell | 120 | NaN | 150,000km | 2 | diesel | peugeot | nein | 2016-03-10 00:00:00 | 0 | 30900 | 2016-03-17 08:45:17 |
24 | 2016-04-03 11:57:02 | BMW_535i_xDrive_Sport_Aut. | privat | Angebot | $48,500 | control | limousine | 2014 | automatik | 306 | 5er | 30,000km | 12 | benzin | bmw | nein | 2016-04-03 00:00:00 | 0 | 22547 | 2016-04-07 13:16:50 |
25 | 2016-03-21 21:56:18 | Ford_escort_kombi_an_bastler_mit_ghia_ausstattung | privat | Angebot | $90 | control | kombi | 1996 | manuell | 116 | NaN | 150,000km | 4 | benzin | ford | ja | 2016-03-21 00:00:00 | 0 | 27574 | 2016-04-01 05:16:49 |
26 | 2016-04-03 22:46:28 | Volkswagen_Polo_Fox | privat | Angebot | $777 | control | kleinwagen | 1992 | manuell | 54 | polo | 125,000km | 2 | benzin | volkswagen | nein | 2016-04-03 00:00:00 | 0 | 38110 | 2016-04-05 23:46:48 |
27 | 2016-03-27 18:45:01 | Hat_einer_Ahnung_mit_Ford_Galaxy_HILFE | privat | Angebot | $0 | control | NaN | 2005 | NaN | 0 | NaN | 150,000km | 0 | NaN | ford | NaN | 2016-03-27 00:00:00 | 0 | 66701 | 2016-03-27 18:45:01 |
28 | 2016-03-19 21:56:19 | MINI_Cooper_D | privat | Angebot | $5,250 | control | kleinwagen | 2007 | manuell | 110 | cooper | 150,000km | 7 | diesel | mini | ja | 2016-03-19 00:00:00 | 0 | 15745 | 2016-04-07 14:58:48 |
29 | 2016-04-02 12:45:44 | Mercedes_Benz_E_320_T_CDI_Avantgarde_DPF7_Sitz... | privat | Angebot | $4,999 | test | kombi | 2004 | automatik | 204 | e_klasse | 150,000km | 10 | diesel | mercedes_benz | nein | 2016-04-02 00:00:00 | 0 | 47638 | 2016-04-02 12:45:44 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49970 | 2016-03-21 22:47:37 | c4_Grand_Picasso_mit_Automatik_Leder_Navi_Temp... | privat | Angebot | $15,800 | control | bus | 2010 | automatik | 136 | c4 | 60,000km | 4 | diesel | citroen | nein | 2016-03-21 00:00:00 | 0 | 14947 | 2016-04-07 04:17:34 |
49971 | 2016-03-29 14:54:12 | W.Lupo_1.0 | privat | Angebot | $950 | test | kleinwagen | 2001 | manuell | 50 | lupo | 150,000km | 4 | benzin | volkswagen | nein | 2016-03-29 00:00:00 | 0 | 65197 | 2016-03-29 20:41:51 |
49972 | 2016-03-26 22:25:23 | Mercedes_Benz_Vito_115_CDI_Extralang_Aut. | privat | Angebot | $3,300 | control | bus | 2004 | automatik | 150 | vito | 150,000km | 10 | diesel | mercedes_benz | ja | 2016-03-26 00:00:00 | 0 | 65326 | 2016-03-28 11:28:18 |
49973 | 2016-03-27 05:32:39 | Mercedes_Benz_SLK_200_Kompressor | privat | Angebot | $6,000 | control | cabrio | 2004 | manuell | 163 | slk | 150,000km | 11 | benzin | mercedes_benz | nein | 2016-03-27 00:00:00 | 0 | 53567 | 2016-03-27 08:25:24 |
49974 | 2016-03-20 10:52:31 | Golf_1_Cabrio_Tuev_Neu_viele_Extras_alles_eing... | privat | Angebot | $0 | control | cabrio | 1983 | manuell | 70 | golf | 150,000km | 2 | benzin | volkswagen | nein | 2016-03-20 00:00:00 | 0 | 8209 | 2016-03-27 19:48:16 |
49975 | 2016-03-27 20:51:39 | Honda_Jazz_1.3_DSi_i_VTEC_IMA_CVT_Comfort | privat | Angebot | $9,700 | control | kleinwagen | 2012 | automatik | 88 | jazz | 100,000km | 11 | hybrid | honda | nein | 2016-03-27 00:00:00 | 0 | 84385 | 2016-04-05 19:45:34 |
49976 | 2016-03-19 18:56:05 | Audi_80_Avant_2.6_E__Vollausstattung!!_Einziga... | privat | Angebot | $5,900 | test | kombi | 1992 | automatik | 150 | 80 | 150,000km | 12 | benzin | audi | nein | 2016-03-19 00:00:00 | 0 | 36100 | 2016-04-07 06:16:44 |
49977 | 2016-03-31 18:37:18 | Mercedes_Benz_C200_Cdi_W203 | privat | Angebot | $5,500 | control | limousine | 2003 | manuell | 116 | c_klasse | 150,000km | 2 | diesel | mercedes_benz | nein | 2016-03-31 00:00:00 | 0 | 33739 | 2016-04-06 12:16:11 |
49978 | 2016-04-04 10:37:14 | Mercedes_Benz_E_200_Classic | privat | Angebot | $900 | control | limousine | 1996 | automatik | 136 | e_klasse | 150,000km | 9 | benzin | mercedes_benz | ja | 2016-04-04 00:00:00 | 0 | 24405 | 2016-04-06 12:44:20 |
49979 | 2016-03-20 18:38:40 | Volkswagen_Polo_1.6_TDI_Style | privat | Angebot | $11,000 | test | kleinwagen | 2011 | manuell | 90 | polo | 70,000km | 11 | diesel | volkswagen | nein | 2016-03-20 00:00:00 | 0 | 48455 | 2016-04-07 01:45:12 |
49980 | 2016-03-12 10:55:54 | Ford_Escort_Turnier_16V | privat | Angebot | $400 | control | kombi | 1995 | manuell | 105 | escort | 125,000km | 3 | benzin | ford | NaN | 2016-03-12 00:00:00 | 0 | 56218 | 2016-04-06 17:16:49 |
49981 | 2016-03-15 09:38:21 | Opel_Astra_Kombi_mit_Anhaengerkupplung | privat | Angebot | $2,000 | control | kombi | 1998 | manuell | 115 | astra | 150,000km | 12 | benzin | opel | nein | 2016-03-15 00:00:00 | 0 | 86859 | 2016-04-05 17:21:46 |
49982 | 2016-03-29 18:51:08 | Skoda_Fabia_4_Tuerer_Bj:2004__85.000Tkm | privat | Angebot | $1,950 | control | kleinwagen | 2004 | manuell | 0 | fabia | 90,000km | 7 | benzin | skoda | NaN | 2016-03-29 00:00:00 | 0 | 45884 | 2016-03-29 18:51:08 |
49983 | 2016-03-06 12:43:04 | Ford_focus_99 | privat | Angebot | $600 | test | kleinwagen | 1999 | manuell | 101 | focus | 150,000km | 4 | benzin | ford | NaN | 2016-03-06 00:00:00 | 0 | 52477 | 2016-03-09 06:16:08 |
49984 | 2016-03-31 22:48:48 | Student_sucht_ein__Anfaengerauto___ab_2000_BJ_... | privat | Angebot | $0 | test | NaN | 2000 | NaN | 0 | NaN | 150,000km | 0 | NaN | sonstige_autos | NaN | 2016-03-31 00:00:00 | 0 | 12103 | 2016-04-02 19:44:53 |
49985 | 2016-04-02 16:38:23 | Verkaufe_meinen_vw_vento! | privat | Angebot | $1,000 | control | NaN | 1995 | automatik | 0 | NaN | 150,000km | 0 | benzin | volkswagen | NaN | 2016-04-02 00:00:00 | 0 | 30900 | 2016-04-06 15:17:52 |
49986 | 2016-04-04 20:46:02 | Chrysler_300C_3.0_CRD_DPF_Automatik_Voll_Ausst... | privat | Angebot | $15,900 | control | limousine | 2010 | automatik | 218 | 300c | 125,000km | 11 | diesel | chrysler | nein | 2016-04-04 00:00:00 | 0 | 73527 | 2016-04-06 23:16:00 |
49987 | 2016-03-22 20:47:27 | Audi_A3_Limousine_2.0_TDI_DPF_Ambition__NAVI__... | privat | Angebot | $21,990 | control | limousine | 2013 | manuell | 150 | a3 | 50,000km | 11 | diesel | audi | nein | 2016-03-22 00:00:00 | 0 | 94362 | 2016-03-26 22:46:06 |
49988 | 2016-03-28 19:49:51 | BMW_330_Ci | privat | Angebot | $9,550 | control | coupe | 2001 | manuell | 231 | 3er | 150,000km | 10 | benzin | bmw | nein | 2016-03-28 00:00:00 | 0 | 83646 | 2016-04-07 02:17:40 |
49989 | 2016-03-11 19:50:37 | VW_Polo_zum_Ausschlachten_oder_Wiederaufbau | privat | Angebot | $150 | test | kleinwagen | 1997 | manuell | 0 | polo | 150,000km | 5 | benzin | volkswagen | ja | 2016-03-11 00:00:00 | 0 | 21244 | 2016-03-12 10:17:55 |
49990 | 2016-03-21 19:54:19 | Mercedes_Benz_A_200__BlueEFFICIENCY__Urban | privat | Angebot | $17,500 | test | limousine | 2012 | manuell | 156 | a_klasse | 30,000km | 12 | benzin | mercedes_benz | nein | 2016-03-21 00:00:00 | 0 | 58239 | 2016-04-06 22:46:57 |
49991 | 2016-03-06 15:25:19 | Kleinwagen | privat | Angebot | $500 | control | NaN | 2016 | manuell | 0 | twingo | 150,000km | 0 | benzin | renault | NaN | 2016-03-06 00:00:00 | 0 | 61350 | 2016-03-06 18:24:19 |
49992 | 2016-03-10 19:37:38 | Fiat_Grande_Punto_1.4_T_Jet_16V_Sport | privat | Angebot | $4,800 | control | kleinwagen | 2009 | manuell | 120 | andere | 125,000km | 9 | lpg | fiat | nein | 2016-03-10 00:00:00 | 0 | 68642 | 2016-03-13 01:44:51 |
49993 | 2016-03-15 18:47:35 | Audi_A3__1_8l__Silber;_schoenes_Fahrzeug | privat | Angebot | $1,650 | control | kleinwagen | 1997 | manuell | 0 | NaN | 150,000km | 7 | benzin | audi | NaN | 2016-03-15 00:00:00 | 0 | 65203 | 2016-04-06 19:46:53 |
49994 | 2016-03-22 17:36:42 | Audi_A6__S6__Avant_4.2_quattro_eventuell_Tausc... | privat | Angebot | $5,000 | control | kombi | 2001 | automatik | 299 | a6 | 150,000km | 1 | benzin | audi | nein | 2016-03-22 00:00:00 | 0 | 46537 | 2016-04-06 08:16:39 |
49995 | 2016-03-27 14:38:19 | Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon | privat | Angebot | $24,900 | control | limousine | 2011 | automatik | 239 | q5 | 100,000km | 1 | diesel | audi | nein | 2016-03-27 00:00:00 | 0 | 82131 | 2016-04-01 13:47:40 |
49996 | 2016-03-28 10:50:25 | Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+... | privat | Angebot | $1,980 | control | cabrio | 1996 | manuell | 75 | astra | 150,000km | 5 | benzin | opel | nein | 2016-03-28 00:00:00 | 0 | 44807 | 2016-04-02 14:18:02 |
49997 | 2016-04-02 14:44:48 | Fiat_500_C_1.2_Dualogic_Lounge | privat | Angebot | $13,200 | test | cabrio | 2014 | automatik | 69 | 500 | 5,000km | 11 | benzin | fiat | nein | 2016-04-02 00:00:00 | 0 | 73430 | 2016-04-04 11:47:27 |
49998 | 2016-03-08 19:25:42 | Audi_A3_2.0_TDI_Sportback_Ambition | privat | Angebot | $22,900 | control | kombi | 2013 | manuell | 150 | a3 | 40,000km | 11 | diesel | audi | nein | 2016-03-08 00:00:00 | 0 | 35683 | 2016-04-05 16:45:07 |
49999 | 2016-03-14 00:42:12 | Opel_Vectra_1.6_16V | privat | Angebot | $1,250 | control | limousine | 1996 | manuell | 101 | vectra | 150,000km | 1 | benzin | opel | nein | 2016-03-13 00:00:00 | 0 | 45897 | 2016-04-06 21:18:48 |
50000 rows × 20 columns
autos.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 20 columns): dateCrawled 50000 non-null object name 50000 non-null object seller 50000 non-null object offerType 50000 non-null object price 50000 non-null object abtest 50000 non-null object vehicleType 44905 non-null object yearOfRegistration 50000 non-null int64 gearbox 47320 non-null object powerPS 50000 non-null int64 model 47242 non-null object odometer 50000 non-null object monthOfRegistration 50000 non-null int64 fuelType 45518 non-null object brand 50000 non-null object notRepairedDamage 40171 non-null object dateCreated 50000 non-null object nrOfPictures 50000 non-null int64 postalCode 50000 non-null int64 lastSeen 50000 non-null object dtypes: int64(5), object(15) memory usage: 7.6+ MB
autos.head()
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
Quick observations:
- The dataset contains 20 columns, most of which are strings.
- The columns 'vehicleType', 'gearbox', 'gearbox', 'model', and 'notRepairedDamage' have some null objects. But none of them have more than ~20% null values.
- The data in the rows is in German. We will have to translate it.
- The columns 'price' and 'odometer' contain non-numeric values. We will clean that
- The column names use camelcase instead of Python's preferred snakecase, which means we can't just replace spaces with underscores.
print(autos.columns)
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'odometer', 'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen'], dtype='object')
# rename the columns
autos.rename({"yearOfRegistration":"registration_year"}, axis=1, inplace=True)
autos.rename({"registration_year":"registration_year"}, axis=1, inplace=True)
autos.rename({"notRepairedDamage":"unrepaired_damage"}, axis=1, inplace = True)
autos.rename({"monthOfRegistration":"registration_month"}, axis=1, inplace=True)
autos.rename({"dateCreated":"ad_created"}, axis=1, inplace=True)
autos.rename({'dateCrawled':"date_crawled"}, axis=1, inplace=True)
autos.rename({"offerType":"offer_type"}, axis=1, inplace=True)
autos.rename({"vehicleType":"vehicle_type"}, axis=1, inplace=True)
autos.rename({"powerPS":"power_ps"}, axis=1, inplace=True)
autos.rename({"fuelType":"fuel_type"}, axis=1, inplace=True)
autos.rename({"nrOfPictures":"number_of_pictures"}, axis=1, inplace=True)
autos.rename({"postalCode":"postal_code"}, axis=1, inplace=True)
autos.rename({"lastSeen":"last_seen"}, axis=1, inplace=True)
autos.head()
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_damage | ad_created | number_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
We renamed the columns names using snakecase instead of camelcase to make them more readable.
Now we'll explore the columns to see if we need to clean data.
autos.describe(include='all')
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_damage | ad_created | number_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000 | 50000 | 50000 | 50000 | 50000 | 50000 | 44905 | 50000.000000 | 47320 | 50000.000000 | 47242 | 50000 | 50000.000000 | 45518 | 50000 | 40171 | 50000 | 50000.0 | 50000.000000 | 50000 |
unique | 48213 | 38754 | 2 | 2 | 2357 | 2 | 8 | NaN | 2 | NaN | 245 | 13 | NaN | 7 | 40 | 2 | 76 | NaN | NaN | 39481 |
top | 2016-03-23 19:38:20 | Ford_Fiesta | privat | Angebot | $0 | test | limousine | NaN | manuell | NaN | golf | 150,000km | NaN | benzin | volkswagen | nein | 2016-04-03 00:00:00 | NaN | NaN | 2016-04-07 06:17:27 |
freq | 3 | 78 | 49999 | 49999 | 1421 | 25756 | 12859 | NaN | 36993 | NaN | 4024 | 32424 | NaN | 30107 | 10687 | 35232 | 1946 | NaN | NaN | 8 |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2005.073280 | NaN | 116.355920 | NaN | NaN | 5.723360 | NaN | NaN | NaN | NaN | 0.0 | 50813.627300 | NaN |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 105.712813 | NaN | 209.216627 | NaN | NaN | 3.711984 | NaN | NaN | NaN | NaN | 0.0 | 25779.747957 | NaN |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | 1067.000000 | NaN |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1999.000000 | NaN | 70.000000 | NaN | NaN | 3.000000 | NaN | NaN | NaN | NaN | 0.0 | 30451.000000 | NaN |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2003.000000 | NaN | 105.000000 | NaN | NaN | 6.000000 | NaN | NaN | NaN | NaN | 0.0 | 49577.000000 | NaN |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2008.000000 | NaN | 150.000000 | NaN | NaN | 9.000000 | NaN | NaN | NaN | NaN | 0.0 | 71540.000000 | NaN |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9999.000000 | NaN | 17700.000000 | NaN | NaN | 12.000000 | NaN | NaN | NaN | NaN | 0.0 | 99998.000000 | NaN |
autos["seller"].value_counts()
privat 49999 gewerblich 1 Name: seller, dtype: int64
autos["offer_type"].value_counts()
Angebot 49999 Gesuch 1 Name: offer_type, dtype: int64
autos['number_of_pictures'].value_counts()
0 50000 Name: number_of_pictures, dtype: int64
We have discovered, that:
We can see that almost all values in the columns 'seller' and "offer_type" are the same. The column 'number_of_pictures' have only one value, which is '0'. Hence we are going to drop them.
What is more, the columns 'price' and 'odometer' cointain numeric values stored as text. And the column 'data_crawled' containd time and date.
Columns 'vehicle_type', 'gearbox', 'model', 'fuel_type', 'unrepaired_damage' have missing values.
We'll start by converting the columns 'price' and 'odometer' to a numeric type.
autos["odometer"].head()
autos["odometer"].value_counts()
150,000km 32424 125,000km 5170 100,000km 2169 90,000km 1757 80,000km 1436 70,000km 1230 60,000km 1164 50,000km 1027 5,000km 967 40,000km 819 30,000km 789 20,000km 784 10,000km 264 Name: odometer, dtype: int64
# # removing non-numeric characters
autos['odometer'] = autos['odometer'].astype(str)
autos["odometer"] = autos['odometer'].str.replace("km",'').str.replace(',','').astype(int)
autos.rename(columns={'odometer':'odometer_km'}, inplace=True)
autos['price'] = autos['price'].astype(str)
autos['price'] = autos['price'].str.replace('$','').str.replace(',','').astype(int)
autos.rename(columns={'price':"price_dollars"}, inplace=True)
# analysing the 'odometer_km' column
autos['odometer_km'].unique()
array([150000, 70000, 50000, 80000, 10000, 30000, 125000, 90000, 20000, 60000, 5000, 100000, 40000])
autos['odometer_km'].describe()
count 50000.000000 mean 125732.700000 std 40042.211706 min 5000.000000 25% 125000.000000 50% 150000.000000 75% 150000.000000 max 150000.000000 Name: odometer_km, dtype: float64
autos['odometer_km'].value_counts()
150000 32424 125000 5170 100000 2169 90000 1757 80000 1436 70000 1230 60000 1164 50000 1027 5000 967 40000 819 30000 789 20000 784 10000 264 Name: odometer_km, dtype: int64
autos["price_dollars"].unique().shape
(2357,)
autos["price_dollars"].describe()
count 5.000000e+04 mean 9.840044e+03 std 4.811044e+05 min 0.000000e+00 25% 1.100000e+03 50% 2.950000e+03 75% 7.200000e+03 max 1.000000e+08 Name: price_dollars, dtype: float64
autos["price_dollars"].value_counts().head()
0 1421 500 781 1500 734 2500 643 1000 639 Name: price_dollars, dtype: int64
autos["price_dollars"].value_counts().sort_index(ascending=False).head(10)
99999999 1 27322222 1 12345678 3 11111111 2 10000000 1 3890000 1 1300000 1 1234566 1 999999 2 999990 1 Name: price_dollars, dtype: int64
autos["price_dollars"].value_counts().sort_index(ascending=True).head(10)
0 1421 1 156 2 3 3 1 5 2 8 1 9 1 10 7 11 2 12 3 Name: price_dollars, dtype: int64
autos['price_dollars'].value_counts().sort_index(ascending=True).tail(20)
197000 1 198000 1 220000 1 250000 1 259000 1 265000 1 295000 1 299000 1 345000 1 350000 1 999990 1 999999 2 1234566 1 1300000 1 3890000 1 10000000 1 11111111 2 12345678 3 27322222 1 99999999 1 Name: price_dollars, dtype: int64
We can see, that the price rises drastically from 350,000 to 999,990. We are going to eliminate the rows where the price equals to '0' or is higher them 350,000 as this price doesn't seem reasonable. However, we'll keep the 1$ price as it may indicate the startinf point for bidding.
#removing outliers
autos = autos[autos['price_dollars'].between(1,350000)]
autos['price_dollars'].value_counts().sort_index(ascending=True).tail(20)
151990 1 155000 1 163500 1 163991 1 169000 1 169999 1 175000 1 180000 1 190000 1 194000 1 197000 1 198000 1 220000 1 250000 1 259000 1 265000 1 295000 1 299000 1 345000 1 350000 1 Name: price_dollars, dtype: int64
autos['price_dollars'].value_counts().sort_index(ascending=True).head(20)
1 156 2 3 3 1 5 2 8 1 9 1 10 7 11 2 12 3 13 2 14 1 15 2 17 3 18 1 20 4 25 5 29 1 30 7 35 1 40 6 Name: price_dollars, dtype: int64
Now the rows with outliners are gone and the price range seem reasonable.
There are 5 columns that should represent date values. Some of these columns were created by the crawler, some came from the website itself:
date_crawled
: added by the crawlerlast_seen
: added by the crawlerad_created
: from the websiteregistration_month
: from the websiteregistration_year
: from the websiteRight now, the date_crawled, last_seen, and ad_created columns are all identified as string values by pandas. Because these three columns are represented as strings, we need to convert the data into a numerical representation so we can understand it quantitatively.
Let's first understand how the values in the three string columns are formatted. These columns all represent full timestamp values, like so:
autos[['date_crawled','ad_created','last_seen']][0:5]
date_crawled | ad_created | last_seen | |
---|---|---|---|
0 | 2016-03-26 17:47:46 | 2016-03-26 00:00:00 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | 2016-04-04 00:00:00 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | 2016-03-26 00:00:00 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | 2016-03-12 00:00:00 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | 2016-04-01 00:00:00 | 2016-04-01 14:38:50 |
To understand the date range, we can extract just the date values, generate a distribution, and then sort by the index.
print(autos['date_crawled'].str[:10].head())
0 2016-03-26 1 2016-04-04 2 2016-03-26 3 2016-03-12 4 2016-04-01 Name: date_crawled, dtype: object
** Exploring date_crawled **
autos["date_crawled"].str[:10].value_counts(normalize=True, dropna=False).sort_index()
2016-03-05 0.025327 2016-03-06 0.014043 2016-03-07 0.036014 2016-03-08 0.033296 2016-03-09 0.033090 2016-03-10 0.032184 2016-03-11 0.032575 2016-03-12 0.036920 2016-03-13 0.015670 2016-03-14 0.036549 2016-03-15 0.034284 2016-03-16 0.029610 2016-03-17 0.031628 2016-03-18 0.012911 2016-03-19 0.034778 2016-03-20 0.037887 2016-03-21 0.037373 2016-03-22 0.032987 2016-03-23 0.032225 2016-03-24 0.029342 2016-03-25 0.031607 2016-03-26 0.032204 2016-03-27 0.031092 2016-03-28 0.034860 2016-03-29 0.034099 2016-03-30 0.033687 2016-03-31 0.031834 2016-04-01 0.033687 2016-04-02 0.035478 2016-04-03 0.038608 2016-04-04 0.036487 2016-04-05 0.013096 2016-04-06 0.003171 2016-04-07 0.001400 Name: date_crawled, dtype: float64
We can see that the data was crawled since the 3d of March,2016 to the 7th of April,2016.
** Exploring the ad_created column**
autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()
2015-06-11 0.000021 2015-08-10 0.000021 2015-09-09 0.000021 2015-11-10 0.000021 2015-12-05 0.000021 2015-12-30 0.000021 2016-01-03 0.000021 2016-01-07 0.000021 2016-01-10 0.000041 2016-01-13 0.000021 2016-01-14 0.000021 2016-01-16 0.000021 2016-01-22 0.000021 2016-01-27 0.000062 2016-01-29 0.000021 2016-02-01 0.000021 2016-02-02 0.000041 2016-02-05 0.000041 2016-02-07 0.000021 2016-02-08 0.000021 2016-02-09 0.000021 2016-02-11 0.000021 2016-02-12 0.000041 2016-02-14 0.000041 2016-02-16 0.000021 2016-02-17 0.000021 2016-02-18 0.000041 2016-02-19 0.000062 2016-02-20 0.000041 2016-02-21 0.000062 ... 2016-03-09 0.033151 2016-03-10 0.031895 2016-03-11 0.032904 2016-03-12 0.036755 2016-03-13 0.017008 2016-03-14 0.035190 2016-03-15 0.034016 2016-03-16 0.030125 2016-03-17 0.031278 2016-03-18 0.013590 2016-03-19 0.033687 2016-03-20 0.037949 2016-03-21 0.037579 2016-03-22 0.032801 2016-03-23 0.032060 2016-03-24 0.029280 2016-03-25 0.031751 2016-03-26 0.032266 2016-03-27 0.030989 2016-03-28 0.034984 2016-03-29 0.034037 2016-03-30 0.033501 2016-03-31 0.031875 2016-04-01 0.033687 2016-04-02 0.035149 2016-04-03 0.038855 2016-04-04 0.036858 2016-04-05 0.011819 2016-04-06 0.003253 2016-04-07 0.001256 Name: ad_created, Length: 76, dtype: float64
autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)
2016-03-05 0.001071 2016-03-06 0.004324 2016-03-07 0.005395 2016-03-08 0.007413 2016-03-09 0.009595 2016-03-10 0.010666 2016-03-11 0.012375 2016-03-12 0.023783 2016-03-13 0.008895 2016-03-14 0.012602 2016-03-15 0.015876 2016-03-16 0.016452 2016-03-17 0.028086 2016-03-18 0.007351 2016-03-19 0.015834 2016-03-20 0.020653 2016-03-21 0.020632 2016-03-22 0.021373 2016-03-23 0.018532 2016-03-24 0.019767 2016-03-25 0.019211 2016-03-26 0.016802 2016-03-27 0.015649 2016-03-28 0.020859 2016-03-29 0.022341 2016-03-30 0.024771 2016-03-31 0.023783 2016-04-01 0.022794 2016-04-02 0.024915 2016-04-03 0.025203 2016-04-04 0.024483 2016-04-05 0.124761 2016-04-06 0.221806 2016-04-07 0.131947 Name: last_seen, dtype: float64
After examining these three columns, we can say that this website ads work well, car might has been sold because the ads are no longer on site.
autos['registration_year'].describe()
count 48565.000000 mean 2004.755421 std 88.643887 min 1000.000000 25% 1999.000000 50% 2004.000000 75% 2008.000000 max 9999.000000 Name: registration_year, dtype: float64
This column clearly has outliers, because th registration year cannot be higher than 2016 and less than 1950. We will only count the rows with registration_year values between 1950 and 2016, because we don't know for sure whether the cars registered earlier are antique or there is just a mistake. What is more, we the date was crowled in 2016, so the year of registration cannot be higher.
autos['registration_year'].value_counts().sort_index(ascending=True).head(20)
1000 1 1001 1 1111 1 1800 2 1910 5 1927 1 1929 1 1931 1 1934 2 1937 4 1938 1 1939 1 1941 2 1943 1 1948 1 1950 3 1951 2 1952 1 1953 1 1954 2 Name: registration_year, dtype: int64
autos['registration_year'].value_counts().sort_index(ascending=False).head(20)
9999 3 9000 1 8888 1 6200 1 5911 1 5000 4 4800 1 4500 1 4100 1 2800 1 2019 2 2018 470 2017 1392 2016 1220 2015 392 2014 663 2013 803 2012 1310 2011 1623 2010 1589 Name: registration_year, dtype: int64
#removing outliers
autos = autos[autos['registration_year'].between(1950,2016)]
autos['registration_year'].value_counts(normalize=True)
2000 0.067637 2005 0.062922 1999 0.062086 2004 0.057928 2003 0.057843 2006 0.057221 2001 0.056493 2002 0.053278 1998 0.050642 2007 0.048799 2008 0.047470 2009 0.044684 1997 0.041812 2011 0.034783 2010 0.034054 1996 0.029425 2012 0.028075 1995 0.026296 2016 0.026146 2013 0.017209 2014 0.014209 1994 0.013480 1993 0.009108 2015 0.008401 1992 0.007930 1990 0.007437 1991 0.007265 1989 0.003729 1988 0.002893 1985 0.002036 ... 1970 0.000814 1979 0.000729 1972 0.000707 1981 0.000600 1968 0.000557 1967 0.000557 1971 0.000557 1974 0.000514 1973 0.000493 1960 0.000493 1977 0.000471 1966 0.000471 1976 0.000450 1969 0.000407 1975 0.000386 1965 0.000364 1964 0.000257 1963 0.000171 1959 0.000129 1961 0.000129 1958 0.000086 1956 0.000086 1962 0.000086 1950 0.000064 1957 0.000043 1955 0.000043 1954 0.000043 1951 0.000043 1953 0.000021 1952 0.000021 Name: registration_year, Length: 67, dtype: float64
We have removed the rows with the outliers. Now we'll have a closer look at the 'brand' columns.
autos['brand'].unique()
array(['peugeot', 'bmw', 'volkswagen', 'smart', 'ford', 'chrysler', 'seat', 'renault', 'mercedes_benz', 'audi', 'sonstige_autos', 'opel', 'mazda', 'porsche', 'mini', 'toyota', 'dacia', 'nissan', 'jeep', 'saab', 'volvo', 'mitsubishi', 'jaguar', 'fiat', 'skoda', 'subaru', 'kia', 'citroen', 'chevrolet', 'hyundai', 'honda', 'daewoo', 'suzuki', 'trabant', 'land_rover', 'alfa_romeo', 'lada', 'rover', 'daihatsu', 'lancia'], dtype=object)
top_autos = autos["brand"].value_counts().sort_values(ascending=False).head(20).index
top_brands = {}
for c in top_autos:
selected_rows = autos[autos['brand'] == c]
mean_price = selected_rows['price_dollars'].mean()
top_brands[c] = mean_price
top_brands_df = pd.DataFrame.from_dict(top_brands, orient='index')
top_brands_df.sort_values(by=0, ascending=False)
0 | |
---|---|
sonstige_autos | 12336.588889 |
mini | 10613.459658 |
audi | 9336.687454 |
mercedes_benz | 8625.241502 |
bmw | 8332.203855 |
skoda | 6368.000000 |
volkswagen | 5398.709056 |
hyundai | 5365.254274 |
toyota | 5167.091062 |
volvo | 4946.501171 |
nissan | 4743.402525 |
seat | 4397.230950 |
mazda | 4112.596615 |
citroen | 3779.139144 |
ford | 3737.275767 |
smart | 3580.223903 |
peugeot | 3094.017229 |
opel | 2976.247260 |
fiat | 2813.748538 |
renault | 2475.717273 |
The brands with the highest mean price:
The brands with the lowest mean price:
Now we'll calculate the mean mileage for these brands and for a couple of in-between ones.
mean_mileage = {}
for c in top_autos:
selected_rows = autos[autos['brand'] == c]
mileage = selected_rows['odometer_km'].mean()
mean_mileage[c] = mileage
tb_series = pd.Series(top_brands)
mm_series = pd.Series(mean_mileage)
mean_data = pd.DataFrame(tb_series, columns=["mean_price"])
mean_data['mean_mileage'] = mm_series
mean_data.sort_values(by=["mean_price"], ascending=False)
mean_price | mean_mileage | |
---|---|---|
sonstige_autos | 12336.588889 | 90655.555556 |
mini | 10613.459658 | 88105.134474 |
audi | 9336.687454 | 129157.386785 |
mercedes_benz | 8625.241502 | 130832.037325 |
bmw | 8332.203855 | 132597.352025 |
skoda | 6368.000000 | 110848.563969 |
volkswagen | 5398.709056 | 128710.069973 |
hyundai | 5365.254274 | 106442.307692 |
toyota | 5167.091062 | 115944.350759 |
volvo | 4946.501171 | 138067.915691 |
nissan | 4743.402525 | 118330.995792 |
seat | 4397.230950 | 121131.301290 |
mazda | 4112.596615 | 124464.033850 |
citroen | 3779.139144 | 119694.189602 |
ford | 3737.275767 | 124374.233129 |
smart | 3580.223903 | 99326.777610 |
peugeot | 3094.017229 | 127153.625269 |
opel | 2976.247260 | 129336.521219 |
fiat | 2813.748538 | 117121.971596 |
renault | 2475.717273 | 128127.272727 |
Data cleaning next steps:
Analysis next steps:
autos.head(2)
date_crawled | name | seller | offer_type | price_dollars | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer_km | registration_month | fuel_type | brand | unrepaired_damage | ad_created | number_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | 5000 | control | bus | 2004 | manuell | 158 | andere | 150000 | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | 8500 | control | limousine | 1997 | automatik | 286 | 7er | 150000 | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
Now we'll translate the next columns:
autos['seller'].value_counts()
privat 46660 gewerblich 1 Name: seller, dtype: int64
autos['seller'] = autos['seller'].str.replace('privat', "private").str.replace('gewerblich','commercial')
autos['seller'].value_counts()
private 46660 commercial 1 Name: seller, dtype: int64
autos['offer_type'].value_counts()
Angebot 46661 Name: offer_type, dtype: int64
autos['offer_type'] = autos['offer_type'].str.replace('Angebot','offer')
autos['offer_type'].value_counts()
offer 46661 Name: offer_type, dtype: int64
autos['vehicle_type'].value_counts()
limousine 12596 kleinwagen 10584 kombi 8929 bus 4031 cabrio 3009 coupe 2461 suv 1964 andere 387 Name: vehicle_type, dtype: int64
autos['vehicle_type'] = autos['vehicle_type'].str.replace(
'limousine','sedan').str.replace('kleinwagen','small car').str.replace('kombi', 'station wagon').str.replace(
'cabrio','convertible').str.replace('andere','other')
autos['vehicle_type'].value_counts()
sedan 12596 small car 10584 station wagon 8929 bus 4031 convertible 3009 coupe 2461 suv 1964 other 387 Name: vehicle_type, dtype: int64
autos['gearbox'].value_counts()
manuell 34703 automatik 9856 Name: gearbox, dtype: int64
autos['gearbox'] = autos['gearbox'].str.replace('manuell','manual').str.replace('automatik','automatic')
autos['gearbox'].value_counts()
manual 34703 automatic 9856 Name: gearbox, dtype: int64
autos['model'].value_counts()
golf 3707 andere 3365 3er 2615 polo 1609 corsa 1591 passat 1349 astra 1348 a4 1231 c_klasse 1136 5er 1132 e_klasse 958 a3 825 a6 797 focus 762 fiesta 722 transporter 674 twingo 615 2_reihe 600 fortwo 550 vectra 544 a_klasse 539 1er 521 3_reihe 486 mondeo 479 clio 473 touran 433 punto 415 zafira 394 ka 349 megane 335 ... sirion 11 lanos 10 300c 10 terios 10 mii 10 gl 10 lybra 10 move 9 r19 9 range_rover 9 90 8 nubira 8 amarok 6 crossfire 6 exeo 6 range_rover_evoque 5 delta 5 9000 5 lodgy 5 145 4 croma 4 materia 4 charade 3 samara 3 v60 3 kappa 2 200 1 rangerover 1 b_max 1 i3 1 Name: model, Length: 244, dtype: int64
autos['model'] = autos['model'].str.replace('andere', 'other')
autos['fuel_type'].value_counts()
benzin 28527 diesel 14032 lpg 649 cng 71 hybrid 37 elektro 19 andere 15 Name: fuel_type, dtype: int64
autos['fuel_type'] = autos['fuel_type'].str.replace('benzin','gasoline').str.replace(
'elektro','electric').str.replace('andere','other')
autos['fuel_type'].value_counts()
gasoline 28527 diesel 14032 lpg 649 cng 71 hybrid 37 electric 19 other 15 Name: fuel_type, dtype: int64
autos['unrepaired_damage'].value_counts()
nein 33828 ja 4534 Name: unrepaired_damage, dtype: int64
autos['unrepaired_damage'] = autos['unrepaired_damage'].str.replace('nein', 'no').str.replace('ja', 'yes')
autos['unrepaired_damage'].value_counts()
no 33828 yes 4534 Name: unrepaired_damage, dtype: int64
autos.head(2)
date_crawled | name | seller | offer_type | price_dollars | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer_km | registration_month | fuel_type | brand | unrepaired_damage | ad_created | number_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | private | offer | 5000 | control | bus | 2004 | manual | 158 | other | 150000 | 3 | lpg | peugeot | no | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | private | offer | 8500 | control | sedan | 1997 | automatic | 286 | 7er | 150000 | 6 | gasoline | bmw | no | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
We have successfully translated all German data into English. Now we will convert the dates to the uniform numeric data.
The columns containing dates are:
We'll convert the dates to be uniform numeric data, so "2016-03-21" becomes the integer 20160321.
autos['date_crawled'] = autos['date_crawled'].str[:10]
autos['date_crawled'] = autos['date_crawled'].str.replace('-','')
autos['date_crawled'] = autos['date_crawled'].astype(float)
autos['date_crawled'].value_counts().head()
20160403.0 1810 20160320.0 1774 20160321.0 1741 20160312.0 1718 20160404.0 1708 Name: date_crawled, dtype: int64
autos['ad_created'] = autos['ad_created'].str[:10]
autos['ad_created'] = autos['ad_created'].str.replace('-','')
autos['ad_created'] = autos['ad_created'].astype(int)
autos['ad_created'].value_counts().head()
20160403 1821 20160320 1776 20160321 1751 20160404 1724 20160312 1710 Name: ad_created, dtype: int64
autos['last_seen'] = autos['last_seen'].str[:10]
autos['last_seen'] = autos['last_seen'].str.replace('-','')
autos['last_seen'] = autos['last_seen'].astype(int)
autos['last_seen'].value_counts().head()
20160406 10420 20160407 6194 20160405 5852 20160317 1311 20160403 1173 Name: last_seen, dtype: int64
Now when we converted text and numeric values, the dataset is clen and looks like this:
autos.head(5)
date_crawled | name | seller | offer_type | price_dollars | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer_km | registration_month | fuel_type | brand | unrepaired_damage | ad_created | number_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20160326.0 | Peugeot_807_160_NAVTECH_ON_BOARD | private | offer | 5000 | control | bus | 2004 | manual | 158 | other | 150000 | 3 | lpg | peugeot | no | 20160326 | 0 | 79588 | 20160406 |
1 | 20160404.0 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | private | offer | 8500 | control | sedan | 1997 | automatic | 286 | 7er | 150000 | 6 | gasoline | bmw | no | 20160404 | 0 | 71034 | 20160406 |
2 | 20160326.0 | Volkswagen_Golf_1.6_United | private | offer | 8990 | test | sedan | 2009 | manual | 102 | golf | 70000 | 7 | gasoline | volkswagen | no | 20160326 | 0 | 35394 | 20160406 |
3 | 20160312.0 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | private | offer | 4350 | control | small car | 2007 | automatic | 71 | fortwo | 70000 | 6 | gasoline | smart | no | 20160312 | 0 | 33729 | 20160315 |
4 | 20160401.0 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | private | offer | 1350 | test | station wagon | 2003 | manual | 0 | focus | 150000 | 7 | gasoline | ford | no | 20160401 | 0 | 39218 | 20160401 |
First we will find the most common brand/model combinations.
brand_counts = autos['brand'].value_counts(normalize=True)*100
top_brand = brand_counts[brand_counts > 5].index
top_brand
Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford'], dtype='object')
brand_model = {}
for c in top_brand:
a_brand = autos[autos['brand'] == c]
a_model = a_brand['model'].describe()['top']
brand_model[c] = a_model
brand_model
{'audi': 'a4', 'bmw': '3er', 'ford': 'focus', 'mercedes_benz': 'c_klasse', 'opel': 'corsa', 'volkswagen': 'golf'}
Now we'll see if average prices follows any patterns based on the milage.
# we split the 'odometer_km' values in five bins (ranges)
ranges = autos['odometer_km'].value_counts(bins=5).index
ranges
IntervalIndex([(121000.0, 150000.0], (63000.0, 92000.0], (34000.0, 63000.0], (4854.999, 34000.0], (92000.0, 121000.0]] closed='right', dtype='interval[float64]')
ranges_tuple = ranges.to_tuples()
km_price = {}
#we calculated the mean price for each odometer bin
for r in ranges_tuple:
by_km = autos[autos['odometer_km'].between(r[0], r[1])]
price = by_km['price_dollars'].mean()
km_price[r] = int(price)
km_price
{(4854.999, 34000.0): 15146, (34000.0, 63000.0): 13712, (63000.0, 92000.0): 9564, (92000.0, 121000.0): 8120, (121000.0, 150000.0): 4107}
We can conclude, that the the higher the mileage is, the cheaper the price gets.
Now we'll check How much cheaper cars are with damage than their non-damaged counterparts.
damage = autos['unrepaired_damage'].unique()
damaged_price = {}
for d in damage:
by_d = autos[autos['unrepaired_damage'] == d]
price = by_d['price_dollars'].mean()
damaged_price[d] = price
damaged_price
{nan: nan, 'no': 7162.054984036892, 'yes': 2222.3098808998675}
We can see that the price for cars with unrepaired damage is 30% lower than for the undamaged cars.