#!/usr/bin/env python
# coding: utf-8

# ## Imports

# In[1]:


import pandas as pd
import os

# Report the installed pandas version. print is written as a call so the
# line runs on Python 3 as well as Python 2 (with a single argument the
# output is identical on both).
print(pd.__version__)


# # Removing Columns/Rows (Vid-6)

# In[2]:


# Directory holding the datasets; the relative path is resolved against the
# current working directory by os.path.abspath below.
DATA_DIR = '../data'
# Load the iris data. The file is comma-separated, which is the delimiter
# read_csv uses by default.
# header=0 treats the first line of the file as the header row, and passing
# names= together with it replaces those header labels with our own names.
df = pd.read_csv(
    os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),
    header=0,
    names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'],
)
df.head(5)


# In[3]:


# Inspect the dimensions of the dataset as a (rows, columns) tuple.
# The original note expects 150 rows and 5 columns for the iris data.
df.shape


# ## Column Drop

# In[4]:


# Remove the 'class' column from the frame.
# labels= takes the list of names to remove; axis=1 says they are column names.
# inplace=True modifies df itself, so the result does not need to be
# assigned to another variable.
df.drop(labels=['class'], axis=1, inplace=True)
df.head(5)


# ## Row Drop

# In[5]:


# Remove the rows labelled 0 and 1 from the frame.
# labels= takes the list of row labels; axis=0 means rows and is also the
# default axis for drop(), but it is spelled out here for clarity.
# inplace=True modifies df itself, so the result does not need to be
# assigned to another variable.
df.drop(labels=[0, 1], axis=0, inplace=True)
df.head(5)


# ## Takeaways
# 
# 1. Make it a habit to always pass the 'axis' parameter explicitly to drop() and similar methods; it makes the intent clearer.

# # -----------------------

# # Sorting (Vid-7)

# In[6]:


# Download the IMDB ratings dataset for the sorting examples. The file is
# comma-separated, which is the delimiter read_csv uses by default.
df = pd.read_csv('http://bit.ly/imdbratings')
df.head(5)


# In[7]:


# sort_values() sorts in ascending order by default.
# It also accepts inplace=True/False to choose between sorting the existing
# object and returning a sorted copy.
# print is written as a call so the cell runs on Python 3 as well as
# Python 2 (with a single argument the output is identical on both).
print(df['star_rating'].sort_values().head(5))
print(df['star_rating'].sort_values().tail(5))


# In[8]:


# The ascending=True/False parameter of sort_values() selects the sort
# order; ascending=False puts the highest ratings first.
# print is written as a call so the cell runs on Python 3 as well as
# Python 2 (with a single argument the output is identical on both).
print(df['star_rating'].sort_values(ascending=False).head(5))
print(df['star_rating'].sort_values(ascending=False).tail(5))


# In[9]:


# Preferred approach: sort the whole DataFrame and name the column(s) to
# order by through by=.
# To sort on several fields, list more column names in by=.
df.sort_values(by=['duration'], ascending=True).head(5)


# # Takeaways
# 
# 1. A pandas DataFrame is a table made up of rows and columns.
# 2. A pandas Series is a single column of a DataFrame.
# 3. sort_values() sorts in ascending order by default.

# # -----------------------
# 
# # Single Filter (Vid-8)

# In[10]:


# Download the IMDB ratings dataset for the single-filter examples. The file
# is comma-separated, which is the delimiter read_csv uses by default.
df = pd.read_csv('http://bit.ly/imdbratings')
df.head(5)


# ## Experiment 1

# In[11]:


# Goal: movies rated above 8.5.
# map() applies the comparison to each rating and yields a boolean Series,
# which is then used as a row mask on df.
df_rating_bools = df['star_rating'].map(lambda rating: rating > 8.5)
df[df_rating_bools].tail(5)


# ## Experiment 2

# In[12]:


# Goal: movies rated above 8.5, building the mask by hand.
# Evaluate the comparison for every rating and collect the results in a list.
boolean = [rating > 8.5 for rating in df['star_rating']]

# A plain list carries no row labels, so wrap it in a Series that reuses
# df's own index. A bare pd.Series(boolean) would get a fresh 0..n-1 index
# and only line up with df while df still has its default index.
df_rating_bools = pd.Series(boolean, index=df.index)
df[df_rating_bools].tail(5)


# ## Experiment 3

# In[13]:


# df['star_rating'] > 8.5 compares every row at once and yields a boolean
# Series; .loc then keeps only the rows where that Series is True.
df.loc[df['star_rating'] > 8.5].tail(5)


# # Takeaways
# 
# 1. Prefer the approach from Experiment 3 when writing code.

# # -----------------------
# 
# # Multiple Filter (Vid-9)

# In[14]:


# Download the IMDB ratings dataset for the multiple-filter examples. The
# file is comma-separated, which is the delimiter read_csv uses by default.
df = pd.read_csv('http://bit.ly/imdbratings')
df.head(5)


# ## Experiment 1

# In[15]:


# Goal: movies with a rating above 8.5 AND a duration above 200 minutes.
# Each comparison is wrapped in parentheses so it is evaluated before &
# combines the two boolean Series; .loc keeps the rows where both hold.
df.loc[(df['star_rating'] > 8.5) & (df['duration'] > 200)].head(5)


# ## Experiment 2

# In[16]:


# Same filter as Experiment 1, but each boolean mask is built separately
# with map() and the two masks are then combined with &.
df_rating_bools = df['star_rating'].map(lambda rating: rating > 8.5)
df_duration_bools = df['duration'].map(lambda minutes: minutes > 200)
df[df_rating_bools & df_duration_bools].tail(5)


# ## Experiment 3

# In[17]:


# This approach is inspired by Python's `if x in [1, 2]` membership test:
# isin() marks every row whose genre is one of the listed values.
bools = df['genre'].isin(['Drama', 'Action'])
df.loc[bools].head(5)


# # Takeaways
# 
# 1. Use & to combine filters with AND
# 2. Use | to combine filters with OR
# 3. Remember to wrap each condition in parentheses as shown in [Vid-9 Experiment 1]; & and | bind more tightly than comparison operators, so the parentheses make each comparison evaluate first
# 4. When you need several | conditions on the same column, try isin() as in Vid-9 Experiment 3