#!/usr/bin/env python
# coding: utf-8

# Notebook export: pandas basics -- dropping rows/columns, sorting, and
# filtering. Bare expressions such as `df.head(5)` are notebook cell outputs;
# they display a result in Jupyter and are no-ops when run as a plain script.

# ## Imports

# In[1]:

import os

import pandas as pd

# Single-argument print(...) prints the same thing on Python 2 and Python 3.
print(pd.__version__)


# # Removing Columns/Rows (Vid-6)

# In[2]:

DATA_DIR = '../data'

# Read the comma-separated iris file.
# header=0 treats the first row of the file as a header row, and names=
# replaces that header with the column names listed here.
df = pd.read_csv(
    os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),
    header=0,
    names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'],
)
df.head(5)


# In[3]:

# See the dimensions of the dataset as (rows, columns).
# For the iris data: 150 rows, 5 columns.
df.shape


# ## Column Drop

# In[4]:

# drop() takes the column labels as a list.
# axis=1 refers to columns.
# inplace=True modifies df directly, so the result does not have to be
# assigned to another variable.
df.drop(['class'], axis=1, inplace=True)
df.head(5)


# ## Row Drop

# In[5]:

# drop() takes the row labels as a list.
# axis=0 refers to rows; it is the default axis for drop().
# inplace=True modifies df directly, so the result does not have to be
# assigned to another variable.
df.drop([0, 1], axis=0, inplace=True)
df.head(5)


# ## Takeaways
#
# 1. Make a habit of always passing the 'axis' parameter to drop() (and to
#    other methods that accept it) so the intent is explicit.
#
# -----------------------

# # Sorting (Vid-7)

# In[6]:

# The IMDB ratings dataset used by every remaining section.
IMDB_URL = 'http://bit.ly/imdbratings'

df = pd.read_csv(IMDB_URL)
df.head(5)


# In[7]:

# sort_values() sorts in ascending order by default.
# sort_values() also accepts inplace=True/False to sort in place.
print(df['star_rating'].sort_values().head(5))
print(df['star_rating'].sort_values().tail(5))


# In[8]:

# The ascending=True/False parameter of sort_values() sets the sort order.
print(df['star_rating'].sort_values(ascending=False).head(5))
print(df['star_rating'].sort_values(ascending=False).tail(5))


# In[9]:

# A relatively better way is to sort the whole DataFrame by a column.
# To sort by multiple fields, list them all in the sort_values() argument.
df.sort_values(['duration'], ascending=True).head(5)


# # Takeaways
#
# 1. A pandas DataFrame is a table with rows and columns.
# 2. A pandas Series is just one column of a DataFrame.
# 3. sort_values() sorts in ascending order by default.
#
# -----------------------

# # Single Filter (Vid-8)

# In[10]:

df = pd.read_csv(IMDB_URL)
df.head(5)


# ## Experiment 1

# In[11]:

# We need movies rated above 8.5: build a boolean mask with map().
df_rating_bools = df['star_rating'].map(lambda rating: rating > 8.5)
df[df_rating_bools].tail(5)


# ## Experiment 2

# In[12]:

# We need movies rated above 8.5: build the mask by iterating manually.
boolean = [rating > 8.5 for rating in df['star_rating']]

# `boolean` is a plain list, while a pandas column is a Series, so convert it.
# Reusing df.index keeps the mask aligned with df's rows even when df does not
# have the default 0..n-1 index.
df_rating_bools = pd.Series(boolean, index=df.index)
df[df_rating_bools].tail(5)


# ## Experiment 3

# In[13]:

# df['star_rating'] > 8.5 compares every row at once and yields the boolean
# mask directly, with no explicit iteration.
df[df['star_rating'] > 8.5].tail(5)


# # Takeaways
#
# 1. Prefer the Experiment 3 style when coding.
#
# -----------------------

# # Multiple Filter (Vid-9)

# In[14]:

df = pd.read_csv(IMDB_URL)
df.head(5)


# ## Experiment 1

# In[15]:

# We want movies rated above 8.5 AND longer than 200 minutes.
df[(df['star_rating'] > 8.5) & (df['duration'] > 200)].head(5)


# ## Experiment 2

# In[16]:

df_rating_bools = df['star_rating'].map(lambda rating: rating > 8.5)
df_duration_bools = df['duration'].map(lambda minutes: minutes > 200)
df[df_rating_bools & df_duration_bools].tail(5)


# ## Experiment 3

# In[17]:

# This approach mirrors Python's `if x in [1, 2]` membership test.
bools = df['genre'].isin(['Drama', 'Action'])
df[bools].head(5)


# # Takeaways
#
# 1. Use & to combine filters with AND.
# 2. Use | to combine filters with OR.
# 3. Wrap each condition in parentheses as in [Vid-9 Experiment 1]: & and |
#    bind more tightly than comparison operators such as >, so without the
#    parentheses the expression is evaluated in the wrong order.
# 4. When several | conditions test the same column, prefer isin() as in
#    [Vid-9 Experiment 3].