#!/usr/bin/env python # coding: utf-8 # # Exploring Ebay Used Vehicles Data # This project will analyze used vehicles data from eBay Kleinanzeigen (eBay Germany). The dataset was originally scraped (March, 5, 2016 through April, 7, 2016) and uploaded to Kaggle. The data contains several variables describing features about the vehicle in the ad (e.g., model, gas type, kilometers driven, etc), and variables about the website (e.g., type of seller, type of listing, number of pictures in the ad, etc). Here is the data dictionary associated with the data: # # - __dateCrawled:__ When this ad was first crawled. All field-values are taken from this date. # - __name:__ Name of the car. # - __seller:__ Whether the seller is private or a dealer. # - __offerType :__ The type of listing # - __price:__ The price on the ad to sell the car. # - __abtest:__ Whether the listing is included in an A/B test. # - __vehicleType:__ The vehicle Type. # - __yearOfRegistration:__ The year in which the car was first registered. # - __gearbox:__ The transmission type. # - __powerPS:__ The power of the car in PS. # - __model:__ The car model name. # - __kilometer:__ How many kilometers the car has driven. # - __monthOfRegistration:__ The month in which the car was first registered. # - __fuelType:__ What type of fuel the car uses. # - __brand:__ The brand of the car. # - __notRepairedDamage:__ If the car has a damage which is not yet repaired. # - __dateCreated:__ The date on which the eBay listing was created. # - __nrOfPictures:__ The number of pictures in the ad. # - __postalCode:__ The postal code for the location of the vehicle. # - __lastSeenOnline:__ When the crawler saw this ad last online. # # # Objective # The aim of this project is to clean the data and analyze the included used car listings. 
# In[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas
import folium

# Silence SettingWithCopyWarning for the exploratory cells below (default='warn').
pd.options.mode.chained_assignment = None

# In[2]:
# The file is not valid UTF-8; Latin-1 decodes every byte.
autos = pd.read_csv("autos.csv", encoding="Latin-1")

# In[3]:
# There shouldn't be duplicates given each row represents one unique ad.
autos[autos.duplicated()]

# In[4]:
autos.info()

# The dataset has __50,000__ entries and is formed by __20__ columns (Types: int64 and object)

# In[6]:
autos.head()

# In[7]:
autos["notRepairedDamage"].value_counts()

# - Columns like dateCrawled, dateCreated and lastSeen are __object__ type, but should be
#   __datetime__ type, to make them easier to work with.
# - Columns like vehicleType, gearbox, model or notRepairedDamage have missing values for some entries.
# - Columns like price or odometer have special characters ($, km)
# - Columns like notRepairedDamage contain German words (nein - no, ja - yes)

# ## Cleaning the Data

# In[8]:
autos.columns

# In[9]:
# snake_case replacements for the original camelCase column names, in column order.
columns_names = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
                 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model',
                 'odometer', 'registration_month', 'fuel_type', 'brand',
                 'unrepaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code',
                 'last_seen']

# In[10]:
autos.columns = columns_names
autos.head()

# To make the dataset more understandable some columns needed a name change.
#
# And all the columns were changed from camelCase to snake_case following the Python standards

# In[11]:
autos.describe(include='all')

# In[12]:
autos.describe()

# In[13]:
autos["nr_of_pictures"].value_counts()

# In[14]:
autos.drop(["nr_of_pictures"], axis=1, inplace=True)

# The column __nr_of_pictures__ is dropped because it contains only one value, __0__,
# which doesn't give any information.

# In[15]:
autos[["price", "odometer"]]

# The columns __seller__ and __offer_type__ have the same values.
# The columns __price__ and __odometer__ can be converted to numeric types.
# In[16]:
autos["price"].head(5)

# In[16]:
# Strip the currency formatting. regex=False makes the "$" a literal character
# instead of a regex end-of-string anchor (which would remove nothing and warn).
autos["price"] = autos["price"].str.replace("$", "", regex=False)
autos["price"] = autos["price"].str.replace(",", "", regex=False).astype(int)

# In[17]:
autos["odometer"].head(5)

# In[18]:
autos["odometer"] = autos["odometer"].str.replace("km", "", regex=False)
autos["odometer"] = autos["odometer"].str.replace(",", "", regex=False)
# Bug fix: the original called .astype(int) without assigning the result back,
# so the column silently stayed as strings. Assign it so the column is numeric.
autos["odometer"] = autos["odometer"].astype(int)
autos = autos.rename(columns={'odometer': 'odometer_km'})

# In[19]:
autos["price"].unique().shape

# In[20]:
autos["odometer_km"].unique().shape

# In[21]:
autos["price"].describe()

# In[22]:
autos["odometer_km"].describe()

# In[23]:
autos["price"].value_counts().sort_index(ascending=False).head()

# In[24]:
autos["odometer_km"].value_counts().sort_index(ascending=False)

# Knowing that eBay is an auction website, it is possible to have these low values
# in the price column. Several of these entries appear to be illegitimate
# (e.g., 1234566, 12345678, 1111111, 99999999). In addition, it is unlikely that
# these prices represent the value of the vehicles given that this is a used
# vehicle classified site. Therefore, anything below __$1__ and above __$350,000__
# will be removed as these could have been errors or illegitimate entries.
#
# For the odometer column, the distribution of kilometers is skewed towards the
# high end. Perhaps this suggests that many of these vehicles are classics, or
# that people want to sell their vehicles for used parts.
# In[25]:
# Keep only plausible listing prices: $1 up to and including $350,000.
autos = autos[(autos.loc[:, "price"] > 0) & (autos.loc[:, "price"] <= 350000)]
autos["price"].value_counts().head(5)

# In[26]:
autos["odometer_km"].describe()

# In[27]:
autos["price"].describe()

# In[28]:
autos["date_crawled"].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# The __date_crawled__ column range goes from 2016-03-05 to 2016-04-07
# (matching the scraping window described in the introduction).

# In[29]:
autos["ad_created"].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# The __ad_created__ range goes from 2015-06-11 to 2016-04-07

# In[30]:
autos["last_seen"].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# The __last_seen__ range goes from 2016-03-05 to 2016-04-07

# In[31]:
autos["registration_year"].describe()

# In[32]:
autos["registration_year"].head(5)

# The highest registration year acceptable: 2016 (the year the dataset was created).
# The lowest registration year acceptable: 1940 — earlier years are almost certainly
# data-entry errors, so the filter below drops anything outside [1940, 2016].

# In[34]:
autos.drop(autos[(autos["registration_year"] < 1940) |
                 (autos["registration_year"] > 2016)].index, inplace=True)

# In[35]:
autos["registration_year"].describe()

# In[36]:
autos["registration_year"].value_counts(normalize=True)

# In[37]:
autos.isnull().sum()

# The column entries with null values: __vehicle_type (2700)__, __gearbox (2103)__,
# __model (2185)__, __fuel_type (3312)__, __unrepaired_damage (8299)__.

# In[38]:
autos["brand"].nunique()

# In[39]:
autos["brand"].unique()

# In[40]:
autos["vehicle_type"].nunique()

# In[41]:
autos["vehicle_type"].unique()

# There are __8__ vehicle types (nan value included) and __40__ vehicle brands.
# In[42]:
autos["name"].nunique()

# In[43]:
autos["name"].unique()

# In[44]:
# German -> English translations for the categorical columns.
words_dictionary = {"privat": "private", "gewerblich": "commercial",
                    "Angebot": "bid", "Gesuch": "application",
                    "kleinwagen": "super mini", "kombi": "station wagon",
                    "cabrio": "convertible", "limousine": "sedan",
                    "andere": "other", "manuell": "manual",
                    "automatik": "automatic", "benzin": "gas",
                    "elektro": "electric", "sonstige_auto": "other",
                    "sonstige_autos": "other", "nein": "no", "ja": "yes"}


def translate(value):
    """Return the English translation of a German value, or the value unchanged."""
    return words_dictionary.get(value, value)


columns_to_change = ["seller", "offer_type", "vehicle_type", "gearbox", "model",
                     "fuel_type", "brand", "unrepaired_damage"]
autos[columns_to_change] = autos[columns_to_change].applymap(translate)

# After translating German names to English, the column __name__ appears to
# contain __vehicle_type__ values.

# In[47]:
# Fill null vehicle types based on keyword matches in the name column.
# The patterns contain no capture groups, so pandas emits no
# "pattern has match groups" warning and no warnings-filter hack is needed.
autos.loc[autos["name"].str.contains("kombi", case=False), "vehicle_type"] = "station wagon"
autos.loc[autos["name"].str.contains("cabrio", case=False), "vehicle_type"] = "convertible"
autos.loc[autos["name"].str.contains("klein", case=False), "vehicle_type"] = "small car"
autos.loc[autos["name"].str.contains("limo[^n]", case=False), "vehicle_type"] = "sedan"
autos.loc[autos["name"].str.contains("coupe", case=False), "vehicle_type"] = "coupe"
autos.loc[autos["name"].str.contains("bus", case=False), "vehicle_type"] = "bus"
autos.loc[autos["name"].str.contains("caravan", case=False), "vehicle_type"] = "van"

# Multiple instances where there could be multiple vehicle type keywords in a
# string. Thus, "smart" is matched last to reverse some of these
# cleaning-induced errors.
autos.loc[autos["name"].str.contains("smart", case=False), "vehicle_type"] = "small car"

# In[49]:
autos.isnull().sum()

# The __null__ values in the __vehicle_type__ column were reduced from __2700__ to __2493__.

# In[50]:
autos.columns

# In[51]:
autos["brand"].value_counts().head(20).index

# After seeing the value counts of the brand column,
# I decided to aggregate the first __20 brands__ of the value counts index,
# which are the ones with the largest share of the total.

# In[52]:
date_columns = ['date_crawled', 'ad_created', 'last_seen']
for column in date_columns:
    # The timestamps are ISO formatted ("2016-04-05 16:57:05"), as the
    # str[:10] slices above show, so let pandas infer the format instead of
    # forcing the mismatched '%Y%m%d %H:%M:%S.%f' pattern (which would fail).
    autos[column] = pd.to_datetime(autos[column])
    autos[column] = autos[column].dt.strftime("%Y%m%d").astype(int)
autos[date_columns]

# ## Data Analysis

# In[53]:
# Brands list
brands = ['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford',
          'renault', 'peugeot', 'fiat', 'seat', 'skoda', 'nissan', 'mazda',
          'smart', 'citroen', 'toyota', 'hyundai', 'sonstige_autos', 'volvo',
          'mini']

# In[54]:
# Mean price for a single brand.
autos[autos["brand"] == 'volkswagen']["price"].mean()

# In[55]:
# Import packages for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# In[56]:
# Brands dict: brand -> mean price.
# NOTE(review): 'sonstige_autos' was translated to 'other' above, so its
# filtered mean is NaN here — kept for fidelity with the original output.
brands_dict = {}
for brand in brands:
    brands_dict[brand] = autos[autos["brand"] == brand]["price"].mean()
brands_dict

# In[57]:
lists = sorted(brands_dict.items())
x, y = zip(*lists)
df = pd.DataFrame(index=x)
df["mean_price"] = y
df.plot.bar(figsize=(20, 10))

# In[58]:
brand_values_count = autos["brand"].value_counts().head(20)
brand_values_count

# In[59]:
brand_values_count.plot(figsize=(20, 10), kind='bar',
                        title='Num of appearence of top 20 brands')

# As we can see in the previous list the brand with the __highest mean price__
# is __Audi__.
#
# __Volkswagen__ is the car brand with more appearances and has a lower price than Audi.
# That may be an indicator that Volkswagen combines quality & price.

# In[60]:
# Only top 6 brands
brands = ["audi", "bmw", "ford", "mercedes_benz", "opel", "volkswagen"]
autos[autos["brand"].isin(brands)]

# In[61]:
# Mean price per brand. A single groupby replaces the original per-brand
# filtering loop; the brand list is alphabetical, so the keys come out in the
# same order as before.
price_dict = autos[autos["brand"].isin(brands)].groupby("brand")["price"].mean().to_dict()
price_dict

# In[62]:
autos["odometer_km"] = autos["odometer_km"].astype(float)
# Mean mileage per brand, same groupby idiom.
mileage_dict = autos[autos["brand"].isin(brands)].groupby("brand")["odometer_km"].mean().to_dict()
mileage_dict

# In[63]:
autos["odometer_km"]

# In[64]:
price_series = pd.Series(price_dict)
price_series

# In[65]:
mileage_series = pd.Series(mileage_dict)
mileage_series

# In[66]:
df = pd.DataFrame(price_series, columns=['mean_price'])
df

# In[67]:
df["mileage"] = mileage_series
df

# In[68]:
fig = plt.figure(figsize=(20, 10))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

bar_plot = sns.barplot(x=df.index, y=df['mean_price'], dodge=False, ax=ax1,
                       saturation=1, palette="Blues_d")
# Annotate each bar with its rounded mean price.
for pos, brand in enumerate(df.index):
    bar_plot.text(pos, df.loc[brand, 'mean_price'],
                  round(df.loc[brand, 'mean_price'], 2),
                  color='black', ha='center')
ax1.set_title("Average price by brand", fontsize=18, pad=15)
ax1.tick_params(axis='x', labelrotation=90)
ax1.set_xlabel("Brand")
ax1.set_ylabel("Mean Price $")

sns.barplot(x=df.index, y=df['mileage'], dodge=False, ax=ax2,
            saturation=1, palette="Blues_d")
ax2.set_title("Mileage by Brand", fontsize=18, pad=15)
ax2.tick_params(axis='x', labelrotation=90)
ax2.set_xlabel("Brand")
ax2.set_ylabel("Mileage in km")

# So, as we see in the charts above, __Audi__ is the brand with the biggest
# average price, but it is only second in mean mileage. That means that users
# don't spend a bigger amount on a vehicle because of its durability over
# time. This suggests user behavior driven more by brand value than by any
# other factor.
# As we can see, foreign car brands like __Ford__ have high mileage and a lower
# price, but don't have the same reputation as the national brands.

# ### Model and brand analysis

# In[69]:
autos.model.value_counts()

# In[70]:
autos.model.nunique()

# There are __244__ car models.

# In[71]:
top_models_by_brand = autos.groupby(["brand", "model"]).size().reset_index().rename(columns=({0: 'model_count'})).sort_values(by='model_count', ascending=False)
top_models_by_brand.head(10)

# The top __10 models__ pertain to the top __5 national brand__ cars.
# The most auctioned model is __Volkswagen Golf__.

# In[72]:
# Add percentage of the model overall
size = len(autos)
top_models_by_brand["overall_percentage"] = ((top_models_by_brand["model_count"] / size) * 100).astype(int)
top_models_by_brand

# In[73]:
# Percentage of each model within its brand. A groupby transform computes every
# brand total in one pass; the original nested loops recomputed the same brand
# total once per row (O(n^2)) with identical results.
brand_totals = top_models_by_brand.groupby("brand")["model_count"].transform("sum")
top_models_by_brand["brand_percentage"] = top_models_by_brand["model_count"] / brand_totals * 100
top_models_by_brand

# In[74]:
# .copy() so the .loc column assignments below write to an independent frame
# instead of a slice of top_models_by_brand.
top_ten_models = top_models_by_brand.head(10).copy()
top_ten_models

# In the previous list we can see the top __10__ models, their
# __brand_percentage__ and their __overall_percentage__

# In[75]:
# Per-model mean prices, split by damage status and gearbox type.
for model in top_ten_models['model']:
    top_ten_models.loc[top_ten_models['model'] == model, 'price_mean'] = autos.loc[autos['model'] == model, 'price'].mean()
    top_ten_models.loc[top_ten_models['model'] == model, 'non_damaged_price_mean'] = autos.loc[(autos['model'] == model) & (autos['unrepaired_damage'] == 'no'), 'price'].mean()
    top_ten_models.loc[top_ten_models['model'] == model, 'damaged_price_mean'] = autos.loc[(autos['model'] == model) & (autos['unrepaired_damage'] == 'yes'), 'price'].mean()
    top_ten_models.loc[top_ten_models['model'] == model, 'odometer_mean'] = autos.loc[autos['model'] == model, 'odometer_km'].mean()
    top_ten_models.loc[top_ten_models['model'] == model, 'automatic_price_mean'] = autos.loc[(autos['model'] == model) & (autos['gearbox'] == 'automatic'), 'price'].mean()
    top_ten_models.loc[top_ten_models['model'] == model, 'manual_price_mean'] = autos.loc[(autos['model'] == model) & (autos['gearbox'] == 'manual'), 'price'].mean()
top_ten_models

# In the table above the top ten models are shown, with their price mean, as
# well as the price mean depending on whether they are damaged or not.
# Some conclusions can be extracted:
# - The first models have a __high__ odometer_mean and a __normal/low__ price mean.
# - Damaged cars are cheaper than the non-damaged ones.
# - Cars with an automatic __gearbox__ tend to have a higher price (nearly
#   double) than __manuals__.

# In[76]:
top_ten_models[["model", "non_damaged_price_mean", "damaged_price_mean"]].plot(x='model', kind='bar', figsize=(20, 10))

# The previous chart shows how much a damaged car's price is reduced compared
# with the non-damaged ones of each model in the __Top 10 models__ list.
# In[77]:
# Per-model damage/gearbox shares. Column names are normalized to a consistent
# "_percentage" spelling (the original mixed "pecentage" and "percenage").
for model in top_ten_models['model']:
    # Scalar count for this model (the original divided by a 1-row Series and
    # relied on index alignment; .iloc[0] makes the intent explicit).
    model_count = top_ten_models.loc[top_ten_models['model'] == model, 'model_count'].iloc[0]
    damaged_cars = len(autos.loc[(autos['model'] == model) & (autos['unrepaired_damage'] == 'yes')])
    damaged_cars_with_manual_gearbox = len(autos.loc[(autos['model'] == model) & (autos['unrepaired_damage'] == 'yes') & (autos['gearbox'] == 'manual')])
    damaged_cars_with_automatic_gearbox = len(autos.loc[(autos['model'] == model) & (autos['unrepaired_damage'] == 'yes') & (autos['gearbox'] == 'automatic')])
    automatic_gearbox = len(autos.loc[(autos['model'] == model) & (autos['gearbox'] == 'automatic')])
    manual_gearbox = len(autos.loc[(autos['model'] == model) & (autos['gearbox'] == 'manual')])
    top_ten_models.loc[top_ten_models['model'] == model, 'manual_gearbox_percentage'] = (manual_gearbox / model_count) * 100
    top_ten_models.loc[top_ten_models['model'] == model, 'automatic_gearbox_percentage'] = (automatic_gearbox / model_count) * 100
    top_ten_models.loc[top_ten_models['model'] == model, 'damaged_percentage'] = (damaged_cars / model_count) * 100
    top_ten_models.loc[top_ten_models['model'] == model, 'manual_gearbox_damaged_percentage'] = (damaged_cars_with_manual_gearbox / manual_gearbox) * 100
    top_ten_models.loc[top_ten_models['model'] == model, 'automatic_gearbox_damaged_percentage'] = (damaged_cars_with_automatic_gearbox / automatic_gearbox) * 100
top_ten_models

# In[78]:
fig = plt.figure(figsize=(20, 10))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

bar_plot = sns.barplot(x='model',
                       y=(top_ten_models['manual_gearbox_damaged_percentage'] / top_ten_models['manual_gearbox_percentage']),
                       data=top_ten_models, dodge=False, ax=ax1,
                       saturation=1, palette="Blues_r")
ax1.set_title("Damaged percentage with manual gearbox by brand", fontsize=18, pad=15)
ax1.tick_params(axis='x', labelrotation=90, size=18)
ax1.set_xlabel("Brand")
ax1.set_ylabel("Damaged Percentage")

bar_plot = sns.barplot(x='model',
                       y=(top_ten_models['automatic_gearbox_damaged_percentage'] / top_ten_models['automatic_gearbox_percentage']),
                       data=top_ten_models, dodge=False, ax=ax2,
                       saturation=1, palette="Blues_r")
ax2.set_title("Damaged percentage with automatic gearbox by brand", fontsize=18, pad=15)
ax2.tick_params(axis='x', labelrotation=90, size=18)
ax2.set_xlabel("Brand")
ax2.set_ylabel("Damaged Percentage")

# The previous plots take into consideration the total damaged percentage of
# each model in the __Top 10 models list__, distinguishing whether the model
# has a manual or an automatic gearbox.
#
# As shown, the total percentage of damaged cars with an automatic gearbox is
# higher than the total percentage of damaged cars with a manual gearbox.
#
# For example, __Volkswagen Golf__ is the model with the most value counts.
# - __Volkswagen Golf__ has 84% of the models with a manual gearbox.
# - __Volkswagen Golf__ has 12% of the models with an automatic gearbox.
# - The other 4% were null values of the column gearbox.
# - __Volkswagen Golf__ has __3707__ value counts. Only __8.3%__ of these cars were damaged.
# - From the __84%__ of the models with a manual gearbox only __8.5%__ were damaged.
# - From the __12%__ of the models with an automatic gearbox, __7.4%__ were damaged.
#
# Models with an __automatic gearbox__ tend to double the price compared to a
# manual one, but the __risk__ of being damaged is considerably higher.

# ## Geographical analysis

# The objective of this part is to make a geographical analysis from the postal
# code of each car of the top 10 most common models.
#
# The postal code system in Germany, from 1 July 1993, is a 5-digit system.
# From the list in the previous link, all the postal codes can be related to
# the map area.
# In[79]:
# Gets all related data of the 10 most common models.
models = top_ten_models["model"].tolist()
# .copy() so the new 'region' column below is added to an independent frame,
# not to a slice of autos (avoids SettingWithCopy).
top_ten_models_data = autos[autos["model"].isin(models)].copy()
top_ten_models_data

# In[80]:
# New column "region": first 2 digits of the zero-padded 5-digit postal code.
top_ten_models_data['region'] = top_ten_models_data['postal_code'].astype("str").str.zfill(5).str[0:2]
top_ten_models_data

# In[81]:
top_ten_models_data['region'].nunique()

# In[82]:
german_regions = pd.read_csv("GermanRegions.csv")
german_regions

# In[84]:
# Cleans the dataframe columns.
german_regions["Region"] = german_regions["Region"].apply(str)
german_regions["Region"] = german_regions["Region"].str.strip()
german_regions["Area"] = german_regions["Area"].str.split(",").str[0]
german_regions["Area"] = german_regions["Area"].str.strip()
# Zero-pad the single-digit region codes (rows 0-8) so they match the
# two-character 'region' merge key built above.
german_regions.loc[0:8, 'Region'] = '0' + german_regions.loc[0:8, 'Region']
german_regions

# In[85]:
top_ten_models_data = pd.merge(left=top_ten_models_data, right=german_regions,
                               left_on='region', right_on='Region',
                               how='left').drop("Region", axis=1)
top_ten_models_data

# In[86]:
# Check if any row in the Area column is null.
top_ten_models_data.isnull().sum()

# In[87]:
top_ten_models_data['Area'].value_counts()

# The previous list shows the __areas__ with the __most auctions__ of the __top 10__ models

# In[88]:
top_ten_models_data = top_ten_models_data.groupby(["model", "region", "Area"]).size().reset_index().rename(columns=({0: "count", "region": "plz"}))
top_ten_models_data

# Auctions are grouped by model and region

# In[89]:
models

# In[90]:
top_ten_models_data[top_ten_models_data.model == 'polo']

# In[91]:
# Read the GeoJSON into a geopandas dataframe; keep only the 2-digit plz prefix
# so it joins against the aggregated counts.
germany_geopandas = geopandas.read_file("Germany.geojson")
germany_geopandas["plz"] = germany_geopandas["plz"].str[0:2]
# Adds counts and area name into the geopandas dataframe.
geo_germany_merge = germany_geopandas.merge(top_ten_models_data, on="plz", how="left")

# In[92]:
# Creates separate geopandas dataframes
# ...one per model.
# This function can be used to add all top 10 brand models into the folium map.
def add_models_count_to_map_regions():
    """Build a folium map of Germany with one choropleth layer per top-10 model.

    Only the first model's layer is shown initially; the others can be toggled
    through the layer control. Returns the folium.Map object.
    """
    # Center-of-Germany coordinates are 51.1657 N, 10.4515 E.
    germany_center_coord = [51.1657, 10.4515]
    m = folium.Map(location=germany_center_coord, zoom_start=5)
    for index, model in enumerate(models):
        current_model = geo_germany_merge[geo_germany_merge['model'] == model]
        current_model_geo = folium.Choropleth(geo_data=current_model,
                                              name=model,
                                              data=current_model,
                                              columns=["plz", "count"],
                                              key_on='feature.properties.plz',
                                              highlight=True,
                                              nan_fill_color='white',
                                              fill_opacity=0.7,
                                              line_opacity=0.3,
                                              # only the first layer visible at start
                                              show=(index == 0)).add_to(m)
        current_model_geo.geojson.add_child(folium.features.GeoJsonTooltip(["count", "Area"]))
    # Adds layer control on selected position.
    folium.LayerControl(position='topright', collapsed=False).add_to(m)
    return m


# To see the map uncomment the following lines:

# In[1]:
#germany_center_coord = [51.1657, 10.4515]
#m = folium.Map(location=germany_center_coord, zoom_start=5)
#golf_model = geo_germany_merge[geo_germany_merge['model'] == "golf"]
#golf_model_geo = folium.Choropleth(geo_data = golf_model,
#                                   name="Golf",
#                                   data = golf_model,
#                                   columns=["plz", "count"],
#                                   key_on='feature.properties.plz',
#                                   highlight = True,
#                                   nan_fill_color='white',
#                                   fill_color="BuGn",
#                                   fill_opacity=0.7,
#                                   line_opacity=0.3,
#                                   show=True).add_to(m)
#golf_model_geo.geojson.add_child(folium.features.GeoJsonTooltip(["count", "Area"]))
#polo_model = geo_germany_merge[geo_germany_merge['model'] == "polo"]
#polo_model_geo = folium.Choropleth(geo_data = polo_model,
#                                   name='Polo',
#                                   data = polo_model,
#                                   columns=["plz", "count"],
#                                   key_on='feature.properties.plz',
#                                   highlight = True,
#                                   fill_color="PuBu",
#                                   nan_fill_color='white',
#                                   fill_opacity=0.7,
#                                   line_opacity=0.3,
#                                   show=False).add_to(m)
#polo_model_geo.geojson.add_child(folium.features.GeoJsonTooltip(["count", "Area"]))
#folium.LayerControl(position='topright', collapsed=False).add_to(m)
#m

# On the map below are shown the locations where the __Golf__ and __Polo__
# models of __Volkswagen__ are auctioned most. Both of them have an immense
# number of auctions in the Braunschweig area. The __Braunschweig__ region
# includes __Wolfsburg__, where the main Volkswagen factory is. After the
# analysis, I conclude that the major number of car auctions on __eBay__
# occurs near the main factories of each brand in Germany.

# # Conclusion
# The brands with the highest mean price are:
# - audi
# - bmw
# - ford
# - mercedes_benz
# - opel
# - volkswagen
#
# The top ten models are associated with these brands, except __ford__
# - volkswagen golf
# - bmw 3er
# - volkswagen polo
# - opel corsa 1591
# - volkswagen passat
# - audi a4
# - mercedes_benz c_klasse
# - bmw 5er
# - mercedes_benz e_klasse
#
# Germany is the country with the most car factories in Europe; from the top 10
# models we can see that foreign models aren't sold much. It could be
# considered that German people trust national auto brands more.
#
# After comparing the mean price of the top 10 models damaged or not, the
# conclusion reached is that damaged cars, independently of the model or brand,
# tend to cost half.
#
# The majority of the cars auctioned on eBay have a manual gearbox. Comparing
# the percentage of the auctioned cars that suffered damage by gearbox type,
# I can conclude that cars with an automatic gearbox have a higher risk of
# getting damaged.
#
# From the geographical analysis the following conclusions are extracted:
# - The major number of auctions of each model tends to be near the main
#   factory of the model's brand.
# - Following from the previous conclusion, one reason could be that the
#   probability of auctioning the newest or best-conditioned second-hand cars
#   is bigger near the main factories than further away.