# To install this library in Jupyter notebook
#import sys
#!{sys.executable} -m pip install pandas
import pandas as pd
pd.__version__ , pd.__path__
map() Method
df.replace() Method
df.apply() Method
df.applymap() Method
import pandas as pd
df = pd.read_csv('datasets/groupdata.csv')
df.head()
# The `shape` attribute of a dataframe returns a two-element tuple: (number of rows, number of columns).
# Note: the row count does not include the column labels, and the column count does not include the row index.
df.shape
# The `index` attribute of a dataframe returns the row index (the row labels) along with its datatype
df.index
# The `columns` attribute of a dataframe returns the column labels along with their datatype
df.columns
# The `dtypes` attribute of a dataframe returns the data type of each column in the dataframe
df.dtypes
! cat datasets/groupdatawithoutcollables.csv
names argument of pd.read_csv() method
import pandas as pd
df = pd.read_csv('datasets/groupdatawithoutcollables.csv', names = ['roll no', 'name', 'age', 'address', 'session',
'group', 'gender','subj1', 'subj2', 'scholarship'])
df.head(3)
columns attribute of dataframe
df = pd.read_csv('datasets/groupdatawithoutcollables.csv', header = None)
df.head(3)
df.columns = ['roll no', 'name', 'age', 'address', 'session', 'group', 'gender', 'subj1', 'subj2', 'scholarship']
df.head(3)
- Suppose we have a dataframe in which there are certain column labels having spaces in between the names.
- We want to rename all such columns by replacing the space character with an underscore
- One way to do this is to call the replace() method of the String class on all the column names of the dataframe
df.columns
df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace(' ', '_')
df.columns
df.head()
- Suppose we have a dataframe in which there are column labels having names in different cases.
- We want to rename all such columns such that the names are all lower or all upper case.
- One way to do this is to generate a new list as per the requirement using List comprehension.
list1 = [x.upper() for x in df.columns]
list1
df.columns = list1
df.head(3)
df.rename() method
Use the df.rename() method to change one or more column names to new ones
df.rename(mapper, axis=None, inplace=False)
mapper: can be a dictionary of comma-separated key:value pairs, where the key is the old column name and the value is the new column name
axis: if you want to change the column names, use axis = 1 (the column axis, which moves from left to right)
inplace: if you want this change to occur in place, set this argument to True, in which case the method will return None
df = pd.read_csv('datasets/groupdata.csv')
df.head(3)
#Since the inplace argument is by default False, so the rename() method will return a new dataframe
df.rename(mapper={'roll no': 'rollno', 'name':'fname'}, axis=1)
df.columns
#Since the inplace argument is now set to True, so the rename() method will return None
#however, the `df` will be changed
df.rename(mapper={ 'roll no': 'rollno'}, axis=1, inplace=True)
df.columns
df.set_index()
and df.reset_index()
, to handle this issue.
df = pd.read_csv('datasets/groupdata.csv')
df.head(3)
Let us suppose we want to change the subj1 and subj2 marks of Shaista
# Returns a Series object
df.loc[2,:]
# Returns a Dataframe object
df.loc[df.name=='Shaista', :]
# Any of the following two LOC will work
df.loc[2,:] = ['MS03', 'Shaista', 35, 'Karachi', 'AFTERNOON', 'group B', 'Female', 99, 99, 8500.0]
df.loc[df.name=='Shaista', :] = ['MS03', 'Shaista', 35, 'Karachi', 'AFTERNOON', 'group B', 'Female', 99, 99, 8500.0]
df.head(3)
# Returns a series
df.loc[2, ['subj1', 'subj2']]
# Returns a dataframe
df.loc[df.name=='Shaista', ['subj1', 'subj2']]
df.loc[2, ['subj1', 'subj2']] = [100, 100]
df.loc[df.name=='Shaista', ['subj1', 'subj2']] = [100, 100]
df.head(3)
Note: You can also use the df.iloc[] indexer instead of df.loc[] to change one or more values of a row. Besides these two, you may also try the df.at[] accessor to change a single value of a row.
df.loc[filter, 'column(s)'] = 'value(s)'
map()
df.replace()
df.apply()
df.applymap()
map() Method
The map(aFunction, *iterables) function simply returns a map object after applying aFunction() to all the elements of the iterable(s).
import pandas as pd
df = pd.read_csv('datasets/groupdata.csv')
df.head(3)
Example: Using built-in function with map()
# Passing a Series object (a column of dataframe) to map() as argument
# The Python built-in `len()` function is applied to all the values of name column and return a map object
map(len, df['name'])
# Type cast the map object to Series
pd.Series(map(len, df['name']))
# Another way is to call the map() method by a Series object using dot notation
df['name'].map(len)
# Third way is to access the column name as well using dot notation
df.name.map(len)
Example: Using a user-defined function with map()
df = pd.read_csv('datasets/groupdata.csv')
df.head(3)
# Let us pass a user-defined function
def myfunc(x):
    """Classify a single age value as "Young" or "Old".

    Args:
        x: One age value (a single element of the column being mapped).

    Returns:
        "Young" if ``x`` is 50 or less, otherwise "Old".
    """
    # The cut-off is inclusive: an age of exactly 50 still counts as "Young".
    if x <= 50:
        return "Young"
    return "Old"
df['age'].map(myfunc)
# If you want to save this as a new column in the dataframe you can do that
df['newcol'] = df['age'].map(myfunc)
df.head()
Example: Using a Lambda function with map()
df['age'].map(lambda x: "Young" if x<=50 else "Old")
Example: Using a Lambda Function with map()
# You cannot pass the bare name `upper` to map() the way we passed `len`,
# because upper() is not a built-in function; it is a method of the string class
#df['name'].map(upper)
df['name'].map(lambda x: x.upper())
Example: Passing a Dictionary {oldval:newval} to map()
for changing selected values of a categorical column
df = pd.read_csv('datasets/groupdata.csv')
df.head()
df['session'].map({'MORNING':'M', 'AFTERNOON':'A'})
Limitations of map() Method
- If there are values for which there is no match, those values are replaced with NaN. The solution is to use the df.replace() method.
- You can use it only on an iterable or a Series object, not on an entire dataframe. The solution is to use df.apply() and df.applymap().
df.replace() Method
The df.replace() method is used to replace the values given in to_replace with value.
This differs from updating with .loc or .iloc, which require you to specify a location to update with some value.
df.replace(to_replace, value, inplace=False)
df = pd.read_csv('datasets/groupdata.csv')
df.head()
df['session'].replace({'MORNING':'M', 'AFTERNOON':'A'})
- Note that now there are no NaN values; the values that do not have a match remain unchanged
- Another important point is that the replace() method works equally well with an entire dataframe
# Calling replace on entire dataframe
df.replace({'MORNING':'M', 'AFTERNOON':'A', 'group A':'GROUP-A'})
# Above operation is not inplace
df
df.apply() Method
The df.apply() method is used to run a function along the mentioned axis of the dataframe.
When called on a Series, the apply() method runs the function on every element of that series of the dataframe.
df.apply(func, axis=0, args)
func: It can be a built-in, user-defined or lambda function that is applied to every series of the dataframe as per the axis argument. (The objects passed to func are Series objects.)
axis: The default value of the axis argument is zero, so func is applied to each column. If you want to apply func to the values of a row, set axis to one.
args: If you want to pass additional arguments to func, in addition to the element of the series, you can pass them as a tuple.
import pandas as pd
df = pd.read_csv('datasets/groupdata.csv')
df.head(3)
# Let us pass the built-in function `len()` and compute the length of each name under the name column of df
# So now the len() method is applied to all the values of a single column and return a series object
df['name'].apply(len)
# Let us pass a user-defined function, with an additional argument as well. This was not possible with map() method
def myfunc(x, age):
    """Classify a value as "Young" or "Old" relative to a threshold.

    Args:
        x: The value to classify (a single element of the series).
        age: The inclusive upper bound for "Young"; supplied as an
            extra positional argument, e.g. through ``args=(50,)``.

    Returns:
        "Young" if ``x`` is less than or equal to ``age``, otherwise "Old".
    """
    # The comparison is inclusive: a value equal to the threshold is "Young".
    if x <= age:
        return "Young"
    return "Old"
df['age'].apply(myfunc, args = (50,))
# Let us use Lambda function to convert each name under the name column of df to upper case
df['name'].apply(lambda x : x.upper())
def myfunc(x, age):
    """Return "Young" when ``x`` does not exceed ``age``, else "Old".

    Args:
        x: The value to classify (a single element of the series).
        age: The inclusive threshold separating "Young" from "Old".

    Returns:
        The string "Young" or "Old".
    """
    # A value equal to the threshold is treated as "Young".
    if x <= age:
        return "Young"
    return "Old"
# If you are satisfied with the result, you may assign it to the specific column
df['name'] = df['name'].apply(lambda x : x.upper())
# Verify
df.head(3)
# Can anyone guess what this LOC will do?
df['subj1'] = df['subj1'].apply(lambda x : x+5)
df.head(3)
Until now we have applied the df.apply() method on a specific column of a dataframe. Let us apply it on a row of the dataframe
# Since each row holds values of different dtypes, let us create a dataframe having numeric columns only
df = pd.read_csv('datasets/groupdata.csv')
df_numeric = df.loc[:,['age','subj1','subj2','scholarship']]
df_string = df.loc[:,['roll no','name','address','session', 'group', 'gender']]
df_numeric.head()
# Although not very meaningful, let us add a number to each value of the row labelled 0
df_numeric.loc[0].apply(lambda x : x+5)
# If you want to commit this change to the dataframe, you can assign the result back to the row
df_numeric.loc[0] = df_numeric.loc[0].apply(lambda x : x+5)
df_numeric.head()
Let us use the df.apply() method on an entire dataframe
df_numeric.apply(lambda x: x+5).head()
df.apply(min)
min(df['subj1'])
The min() function has been applied to each column of the dataframe; for each column the minimum value has been computed, and the df.apply() method has returned a Series object
df.applymap() Method
The df.applymap() method (renamed to df.map() in pandas 2.1 and later) applies a function to a dataframe element-wise.
df.applymap(func, na_action=None)
func: A function that is passed a single value and returns a single value.
Note: A Series object does not have an applymap() method, so you cannot call it on a Series object
df = pd.read_csv('datasets/groupdata.csv')
df_string = df.loc[:,['roll no','name','address','session', 'group', 'gender']]
df_numeric = df.loc[:,['age','subj1','subj2','scholarship']]
df_string.head()
df_numeric.head()
df_string.head()
df_string.applymap(str.upper).head()
df_numeric.head(5)
# The applymap() method applies the lambda (which adds 5) to each element of the dataframe
df_numeric.applymap(lambda x : x+5).head(5)