#!/usr/bin/env python
# coding: utf-8

# **Ebay Germany Car Sales dataset analysis**

# In[2]:


import pandas as pd
import numpy as np
print("imported")


# In[3]:


autos=pd.read_csv("autos.csv", encoding="Latin-1")
print(autos.head(1))


# In[4]:


autos


# In[5]:


autos.info()


# In[6]:


autos.columns


# **Renaming columns - from camelCase to snake_case**

# In[7]:


autos.rename({"yearOfRegistration":"registration_year",
                           "monthOfRegistration":"registration_month",
                           "notRepairedDamage":"unrepaired_damage",
                           "dateCreated":"ad_created",
                            "dateCrawled":"date_crawled",
                             "offerType":"offer_type",
                             "vehicleType":"vehicle_type",
                             "powerPS":"power_ps",
                             "fuelType":"fuel_type",
                             "nrOfPictures":"nr_of_pictures",
                             "postalCode":"postal_code",
                             "lastSeen":"last_seen"},axis=1,inplace=True)
autos.head()


# In[8]:


autos.describe(include="all") #show all columns


# **Formating some columns' type from st to int.**

# In[9]:


try: #rerunning of code will raise attrib error because it already applied to dataframe
    autos["price"]=autos["price"].str.replace("$","").str.replace(",","").astype(int)
    autos["odometer"]=autos["odometer"].str.replace("km","").str.replace(",","").astype(int)
except:
    pass
autos.rename({"odometer":"odometer_km"},axis=1,inplace=True)
autos["price"].head()
autos["odometer_km"].head()


# In[10]:


autos["price"].head()


# In[11]:


print(autos["price"].unique().shape)
print(autos["odometer_km"].unique().shape)
print(autos["price"].describe())
autos["odometer_km"].describe()


# In[12]:


print(autos.isnull().sum())
print(autos["price"].isnull().sum())


# In[13]:


print(autos["price"].value_counts().sort_index().head(10))
print(autos["price"].value_counts().sort_index(ascending=False).head(10))
print(autos["odometer_km"].value_counts().sort_index().head())


# In[14]:


print(autos[autos["price"]<100].head())


# **Lets remove outliers with selecting only appropriate columns**

# In[15]:


autos=autos[(autos["price"]>100) & (autos["price"]<400000)]
print(autos.describe())


# **Let's check dates and their distribution. Normlaize for percentage expression**

# In[16]:


autos.loc[:,["date_crawled","last_seen","ad_created",
             "registration_month","registration_year"]].head()


# In[17]:


print(autos['date_crawled'].str[:10].value_counts(
                    normalize=True, dropna=False).sort_index(ascending=False)[0:5]) #or head()
print(autos['ad_created'].str[:10].value_counts(
                    normalize=True, dropna=False).sort_index()[0:5])
print(autos['last_seen'].str[:10].value_counts(
                    normalize=True, dropna=False).sort_index()[0:5])


# In[ ]:


# In[18]:


autos["registration_year"].describe()


# **Oldest registration year is year 1000 and neüest car is made in 9999 which is odd..Lets check what percent is between normal values**

# In[19]:


autos["registration_year"].between(1900,2020).value_counts(normalize=True)


# **It seem 99.9% is normal dates, so we can remove odd date rows

# In[23]:


autos=autos[autos["registration_year"].between(1900,2020)]


# In[25]:


autos["registration_year"].value_counts(normalize=True).sort_index()


# In[22]:


48071/autos.shape[0]


# **Discover brand columns**

# In[62]:


print(autos["brand"].isnull().sum())
print(autos["brand"].value_counts())
print(autos["brand"].value_counts(normalize=True)) #with percentage
print(autos["brand"].unique())
unique_brands=autos["brand"].unique()


# **Check price mean for unique brands(Excel Pivot can also do..)**

# In[75]:


brands_dict={}
for brand in unique_brands:
    print("\033[1m+Mean price of {}:".format(brand),autos[autos["brand"]==brand]["price"]
          .mean().astype(int)) #int yaziram ki float-u round off edim cevirsin.
    brands_dict[brand]=autos[autos["brand"]==brand]["price"].mean().astype(int)
print(brands_dict)  


# **Below is another way of getting the same result as above (value_counts.index method)**

# In[80]:


brand_counts=autos["brand"].value_counts()
brands_dict={}
for brand in brand_counts.index:
    brands_dict[brand]=brand_counts[brand]
print(brands_dict)  


# **Filter column itself according desired indicators**

# In[113]:


brand_counts=autos["brand"].value_counts(normalize=True)
common_brands=brand_counts[brand_counts>0.05].index
print(common_brands)  
c_brands_mean_prices={}
for brand in common_brands:
    mean_prices=autos[autos["brand"]==brand]["price"].mean()
    print(mean_prices)
    c_brands_mean_prices[brand]=int(mean_prices)
c_brands_mean_prices    


# **Above: of the top 6 brands most expensive are: audi,mercedes and bmw in sequence.**

# In[100]:


d={}
a=autos["brand"].value_counts()
print(type(a))

'''for i in a:
    d[a.index]=3'''

print(a[a>5000].values)


# **Series and dataframe constructos**

# In[108]:


bmp_series=pd.Series(c_brands_mean_prices)
#print(bmp_series)
sample_series=pd.Series(autos["brand"].value_counts(normalize=True))
print('indexes\n',sample_series.index)
#print(sample_series.array)
print('values\n',sample_series.values)
print('shape\n',sample_series.shape)


# In[109]:


sample_series=pd.DataFrame(autos["brand"].value_counts(normalize=True),
                       columns=['mean_price'])


# **Calculate mean mileages for common brands**

# In[114]:


c_brands_mean_mileages={} 
for brand in common_brands:
    mean_mileage=autos[autos["brand"]==brand]["odometer_km"].mean()
    print(mean_mileage)
    c_brands_mean_mileages[brand]=int(mean_mileage)
c_brands_mean_mileages   


# In[115]:


print(c_brands_mean_mileages)
print(c_brands_mean_prices)


# **Lets convert mean dicts to pd.DataFrames with merging series method**

# In[119]:


bmm_series=pd.Series(c_brands_mean_mileages)
bmp_series=pd.Series(c_brands_mean_prices)
bmm_df=pd.DataFrame(bmm_series,columns=['mean_mileage'])
#bmp_df=pd.DataFrame(bmp_series, columns=['mean_price'])

bmm_df['mean_price']=bmp_series
print(bmm_df)


# **Lets find most frequent models of common brands and collect to dict**

# In[145]:


c_models_dict={}
for brand in common_brands:
    freq_models=autos[autos["brand"]==brand]["model"].value_counts()
    print(brand.upper(),'\n',freq_models[:3],'\n')
    c_models_dict[brand]={freq_models.index[0],freq_models.values[0]}
c_models_dict


# In[149]:


autos["odometer_km"].value_counts()


# **Compare mean price of damaged and undamaged cars**

# In[160]:


not_damaged_price=autos[autos['unrepaired_damage']=="nein"]["price"].mean()
damaged_price=autos[autos['unrepaired_damage']=="ja"]["price"].mean()
print(damaged_price,not_damaged_price)
print("Price difference is:",int(not_damaged_price-damaged_price),"$")


# In[ ]: