#!/usr/bin/env python
# coding: utf-8

# # Project: Exploring Ebay Car Sales Data
# >  This poject is aimed at gaining insight on used cars from eBay site
# >  The dataset was originally scraped and uploaded to [Kaggle](https://www.kaggle.com/orgesleka/used-cars-database/data). Dataquest has made a few modifications from the original dataset that was uploaded to Kaggle.
# 
# A few notes describing the data columns can be found [here](https://www.kaggle.com/orgesleka/used-cars-database/data) 
# 
# 
# * dateCrawled : when this ad was first crawled, all field-values are taken from this date
# * name : "name" of the car
# * seller : private or dealer
# * price : the price on the ad to sell the car
# * yearOfRegistration : at which year the car was first registered
# * powerPS : power of the car in PS
# * lastSeenOnline : when the crawler saw this ad last online
# 

# In[1]:


#importing the numpy and pandas library
import numpy as np
import pandas as pd

autos = pd.read_csv("autos.csv", encoding="Latin-1") #reading the datafile


# In[2]:


#chcking for missing data and wrong data types using info()
autos.info()


# In[3]:


#printing first few rows of data using .head()
autos.head() 


# 

# In[4]:


#using DataFrame.columns to print out existing columns
autos.columns


# In[5]:


#editing the column names
autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
       'vehicle_type', 'registration_year', 'gear_box', 'power_ps', 'model',
       'odometer', 'registration_month', 'fuel_type', 'brand',
       'unrepaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code',
       'last_seen']


# In[6]:


#confirming changes to column names
autos.columns


# In[7]:


#using dataFrame.head() to look at the current state of autos dataframe
autos.head()


# The camelCase columns have been changed to snake_case, some column names have been edited to make more meanings from the data dictionary.

# In[8]:


#looking at the descriptive statistics for all columns
autos.describe(include='all')


# There are about 50000 records in the dataset with 20 columns. 'vehicleType','model','fuelType','notRepairedDamage' all contain missing values. Columns casing are not regular. Price and Odometer contain unit symbols. dateCrawled,dateCreated dtype should be datetime and  price,odometer should be int dtypes. seller, offerType, abtest, gearbox are about 2 unique values each.
# 
# The mean, std, median and percentiles for price and odometer are NaN which indicates a wrong data type. These requires more investigation

# In[9]:


#renaming the price and odometer columns
autos.rename(columns={'price':'price_usd','odometer':'odometer_km'},inplace=True)


# In[10]:


#confirming change in names
autos.columns


# In[11]:


#removing the '$' and ',' chars in the price values
autos['price_usd'] = autos['price_usd'].str.replace('$','').str.replace(',','').astype('int64')


# In[12]:


#viewing changes to price column
autos.head()


# In[13]:


#confirming change to price dtype
autos.info()


# In[14]:


#removing the km characters in odometer
autos['odometer_km'] = autos['odometer_km'].str.replace('km','').str.replace(',','')


# In[15]:


#chaniging odometer_km to int type
autos['odometer_km'] = autos['odometer_km'].astype('int64')


# In[16]:


autos.head()


# In[17]:


#understanding the shape of the price column
autos['price_usd'].unique().shape


# In[18]:


#summary statistics for price column
autos['price_usd'].describe() #.sort_index(ascending=False)


# In[19]:


#Series.value_counts() returns a series, better to use Series.sort_index()
#with ascending= True or False, Second look at max outlier
autos['price_usd'].value_counts().sort_index(ascending=False).head()


# In[20]:


#Second look at Min outlier
autos['price_usd'].value_counts().sort_index(ascending=False).tail()


# In[21]:


#removing outliers
autos = autos[autos["price_usd"].between(1,27322222)]


# In[22]:


autos['price_usd'].describe()


# After removing the outliers, it is observed that average sales is around $8,000 dollars

# In[23]:


#understanding the shape of the odometer column
autos['odometer_km'].unique().shape


# In[24]:


#summary statistics of the odometer column
autos['odometer_km'].describe()


# In[25]:


#checking for high outliers for odometer
autos['odometer_km'].value_counts().sort_index(ascending=False).head()


# In[26]:


#checking for low outliers for odometer
autos['odometer_km'].value_counts().sort_index(ascending=False).tail()


# The odometer column appears to not have outliers. Next stop is to look at the dates columns of 'date_crawled', 'ad_created', 'last_seen' which are in string formats

# In[27]:


#understanding the date columns
autos[['date_crawled','ad_created','last_seen']][0:5]


# In[28]:


#calculating the distribution for date_crawled
#To rank by date in ascending order (earliest to latest), chain the 
#Series.sort_index() method 
autos_date_crawled = autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()


# In[29]:


autos_date_crawled


# In[30]:


#calculating the distribution of 'ad_created' values including missing values
#using Series.value_counts(normalize=True, dropna=False)
autos_ad_created = autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False)


# In[51]:


#viewing the values
autos_ad_created.sort_index().head(10)


# In[52]:


#viewing the values
autos_ad_created.sort_index().tail(10)


# In[32]:


#calculating the distribution of 'last_seen' values including missing values
#using Series.value_counts(normalize=True, dropna=False)
autos_last_seen = autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False)


# In[49]:


#viewing 'last_seen'
autos_last_seen.sort_index().head(10)


# In[34]:


#understanding the distribution of registration year
autos['registration_year'].describe()


# The earliest year is 1000 and the latest year is 9999. Weird dates with regards to when cars were produced and the future projection.In 1903, [Henry Ford](https://en.wikipedia.org/wiki/History_of_Ford_Motor_Company) invented a modernized car and the year appears to most suitable while 2016 will be taken as the highest year since the last seen ads were in 2016.

# In[35]:


#removing upper and lower bounds in the registration_year
autos = autos[autos["registration_year"].between(1902,2017)]


# In[48]:


autos['registration_year'].value_counts(normalize=True).sort_index().head(10)


# There appears to be more vehicle listings between 1990s and 2000s. A closer look at the brand of cars would be neccessary

# In[37]:


#exploring the unique brand values
autos['brand'].unique()


# In[38]:


#confirming brand index
autos.info()


# In[39]:


#selecting the top 20 brands
top_5_brands = autos['brand'].value_counts().sort_values(ascending=False).head(5).index
top_5_brands


# In[40]:


#aggregating prices for top 5 brands
top_brands_m_prices = {} #empty dictionary for top 5 brand mean prices

for brand in top_5_brands: #looping over top brands
    selected_brand = autos[autos['brand'] == brand] #selecting top brands autos dataset
    mean_price = selected_brand['price_usd'].mean()#calculating mean price for top brand
    top_brands_m_prices[brand] = int(mean_price) #assigning to dictionary
print(top_brands_m_prices) #printing out results 


# In[41]:


sorted(top_brands_m_prices.items(), key=lambda x: x[1])


# From the results of our aggregation, audi, mercedesbenz and bmw are most expensive brands, followed by volkswagen and opel is least amongst top 5. 

# In[42]:


#aggregating mileage for top 5 brands
top_brands_m_mileage = {} #empty dictionary for top 5 brand mean mileage

for brand in top_5_brands: #looping over top brands
    selected_brand = autos[autos['brand'] == brand] #selecting top brands autos dataset
    mean_mileage = selected_brand['odometer_km'].mean()#calculating mean mileage for top brand
    top_brands_m_mileage[brand] = int(mean_mileage) #assigning to dictionary
print(top_brands_m_mileage) #printing out results 


# In[43]:


#using pandas series constructor pd.Series to construct a series 
#for top brand mean prices
top_bmp_series = pd.Series(top_brands_m_prices)
print(top_bmp_series)


# In[44]:


# using the pandas DataFrame constructor to construct a dataframe
#for mean prices
df_brands = pd.DataFrame(top_bmp_series, columns=['mean_price'])
df_brands


# In[45]:


#brands mean mileage
top_bmm_series = pd.Series(top_brands_m_mileage)
print(top_bmm_series)


# In[46]:


#adding brands mean mileage to the dataframe
df_brands.insert(1, 'top_brands_m_mileage', top_bmm_series)


# In[47]:


#viewing new dataframe
df_brands


# Conclusion:
# The cost of top brand vehicles do not neccessarily determing the cars mileage.

# ### Resources
# >  * [Looping over series](https://community.dataquest.io/t/looping-over-the-correct-series/432463/10)
# 
# >  * [Sorting the dictionary](https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value)