#!/usr/bin/env python # coding: utf-8 # 1. Replace Not available with NaN value in pandas? (Example.csv) # In[1]: import pandas as pd df=pd.read_csv('Example.csv',na_values=['not available']) df.head() # 2. To get the first 3 rows of a given DataFrame. # Sample Python dictionary data and list labels: # exam_data = {'name': ['Anastasia', 'Dima', 'Katherine', 'James', 'Emily', 'Michael', 'Matthew', 'Laura', 'Kevin', 'Jonas'], # 'score': [12.5, 9, 16.5, np.nan, 9, 20, 14.5, np.nan, 8, 19], # 'attempts': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], # 'qualify': ['yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes']} # labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] # # In[8]: dict_data = {'name': list(df.loc[:2,'name']),'class': list(df.loc[:2,'class'])} dict_data # In[2]: import numpy as np import pandas as pd dictx= {'name': ['Anastasia', 'Dima', 'Katherine', 'James', 'Emily', 'Michael', 'Matthew', 'Laura', 'Kevin', 'Jonas'], 'score': [12.5, 9, 16.5, np.nan, 9, 20, 14.5, np.nan, 8, 19], 'attempts': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], 'qualify': ['yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes']} df2=pd.DataFrame(dictx,index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']) df2 # In[4]: df2.iloc[:3] # 3. To select the 'name' and 'score' columns from frame # In[10]: df2.loc[:,['name','score']] # 4. To select the specified columns and rows from a given data frame. # # Select 'name' and 'score' columns in rows 1, 3, 5, 6 from the following data frame. # # In[11]: df2.index =[0,1,2,3,4,5,6,7,8,9] # In[12]: df2.loc[[1,3,5,6],['name','score']] # In[13]: df2.loc[:,['name','score']] # In[7]: df2.groupby('name').aggregate({'attempts':max}).reset_index() # In[3]: df2.groupby('attempts').max().reset_index().sort_values(by='attempts',ascending=False).head(1) # 5. To select the rows where the number of attempts in the examination is greater than 2. # In[12]: df2[df2['attempts']>2]['name'] # In[5]: df2.loc[df2['attempts']>2] # In[10]: df2.loc[df2['attempts']>2,['attempts','name']] # 6. To count the number of rows and columns of a DataFrame. # In[16]: df2.shape[0] # 7. To select the rows where the score is missing, i.e. is NaN. # In[15]: df2.loc[df2['score'].isna()==True] # In[22]: df2['score'].value_counts() # In[29]: df2['score'].isna().sum() # In[18]: df2[df2['score'].isna()] # 8. To select the rows the score is between 15 and 20 (inclusive). # In[19]: df2[df2['score'].between(15,20)] # 9. To select the rows where number of attempts in the examination is less than 2 and score greater than 15. # In[20]: df2.loc[(df2['attempts']<2) & (df2['score']>15)] # 10. To change the score in row 'd' to 11.5. # In[21]: df2.loc[4,'score']=11.5 df2 # 11. To calculate the sum of the examination attempts by the students. # In[22]: df2['attempts'].aggregate(sum) # 12. To calculate the mean score for each different student in DataFrame. # In[16]: df2.groupby('name').aggregate({'score':np.mean}) # 13. To append a new row 'k' to data frame with given values for each column. Now delete the new row and return the original DataFrame. # In[18]: df2.loc['k'] = ['smesh',15,1,'yes'] print(df2) # In[19]: df2.loc['k'] df2.drop('k') # In[26]: df2['extra']=np.arange(11) df2 # In[27]: df2.drop('extra',axis=1,inplace=True) # 14. To Sort the data frame first by 'name' in descending order, then by 'score' in ascending order: # In[28]: df2.sort_values(by=['name','score'],ascending=[True,False]) # 15. To replace the 'qualify' column contains the values 'yes' and 'no' with True and False. # In[29]: df3=df2.replace({'yes':True,'no':False}) df3 # 16. To change the name 'James' to 'Suresh' in name column of the DataFrame. # In[30]: df3.loc[df3['name']=='James','name']='Suresh' # In[31]: df2[df2['name']=='James']['name'] ='Suresh' # In[32]: df3 # 17. To delete the 'attempts' column from the DataFrame. # In[33]: df3=df3.drop('attempts',axis=1) # 18. To insert a new column in existing DataFrame. # In[34]: df3['color']=np.nan # Bonus # In[35]: df3.loc[[True,False,False,False,False,False,False,False,False,False,False]] # Iterating through DataFrame # In[36]: for key,value in df.iteritems(): print('key',key) print('value',value) # In[37]: for key,value in df.iterrows(): print('key',key) print('value',value) # In[38]: for value in df.itertuples(): print('value',value) # In[39]: import seaborn as sns import matplotlib.pyplot as plt sns.heatmap(df.corr(),cmap='GnBu',annot=np.array([['',''],list(df.corr()['total marks'])]),fmt = 's',linewidths=10,linecolor='r') plt.show() # In[40]: df.corr() # In[41]: df.corr()['total marks'] # In[42]: df.corr().loc['class',['total marks']] # In[ ]: