In [2]:
import numpy as np
import seaborn as sns
import os
import pandas as pd

# Start from a DataFrame that has no rows and no columns.
df = pd.DataFrame(data=None)
print(df)
Empty DataFrame
Columns: []
Index: []
In [3]:
import pandas as pd
# A flat list becomes a single column labelled 0, one row per element.
data = list(range(1, 6))
df = pd.DataFrame(data=data)
print(df)
   0
0  1
1  2
2  3
3  4
4  5
In [4]:
import pandas as pd
# Row-oriented input: each inner list is one record, named via `columns`.
data = [
    ['Mouse', 10],
    ['KBD', 12],
    ['Monitor', 13],
]
df = pd.DataFrame(data=data, columns=['Item', 'Quantity'])
print(df)
      Item  Quantity
0    Mouse        10
1      KBD        12
2  Monitor        13
In [5]:
import pandas as pd
# Row-oriented records: [name, quantity].
data = [['Mouse', 10], ['KBD', 12], ['Monitor', 13]]

# Build the frame first, then cast only the numeric column to float.
# Passing dtype=float to the constructor asks pandas to cast *every* column;
# the string 'Name' column cannot be cast, which older pandas silently
# ignored but pandas >= 2.0 rejects with a ValueError.
df = pd.DataFrame(data, columns=['Name', 'Quantity']).astype({'Quantity': float})
print(df)
      Name  Quantity
0    Mouse      10.0
1      KBD      12.0
2  Monitor      13.0
In [6]:
import pandas as pd
# Column-oriented input: each dict key is a column, each list its values.
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df = pd.DataFrame(data=data)
print(df)
    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42
In [7]:
import pandas as pd
# Same column-oriented data, but with explicit row labels instead of 0..n-1.
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df = pd.DataFrame(
    data=data,
    index=['rank1', 'rank2', 'rank3', 'rank4'],
)
print(df)
        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42
In [8]:
# List the entries of the current working directory.
print(os.listdir('.'))
['.ipynb_checkpoints', 'box office for teaching.ipynb', 'cancer diagnosis for teaching.ipynb', 'data2.csv', 'Day 8', 'DPA - Day 1.ipynb', 'DPA-Class 2-13-09-2020.ipynb', 'gapminder.ipynb', 'temp.csv', 'test.csv', 'train.csv']
In [9]:
# Load temp.csv with pandas' default missing-value handling and display it.
df = pd.read_csv(filepath_or_buffer="temp.csv")
df
Out[9]:
A B
0 1 1.0
1 NaN NaN
2 3 31.0
3 2 22.0
4 3 33.0
5 1 11.0
6 2 21.0
7 NaN 24.0
8 1 12.0
9 na 32.0
In [10]:
# Boolean mask: True wherever a cell is missing (isna is the same check as isnull).
df.isna()
Out[10]:
A B
0 False False
1 True True
2 False False
3 False False
4 False False
5 False False
6 False False
7 True False
8 False False
9 False False
In [11]:
# Count missing cells per column.
df.isna().sum()
Out[11]:
A    2
B    1
dtype: int64
In [12]:
# Extra tokens to treat as missing, in addition to pandas' built-in markers.
missing_values = ["N/a", "na", np.nan]
df = pd.read_csv(
    "temp.csv",
    na_values=missing_values,
)
In [13]:
# Display `df` to inspect its current values; a bare trailing expression
# is rendered by the notebook as the cell output.
df
Out[13]:
A B
0 1.0 1.0
1 NaN NaN
2 3.0 31.0
3 2.0 22.0
4 3.0 33.0
5 1.0 11.0
6 2.0 21.0
7 NaN 24.0
8 1.0 12.0
9 NaN 32.0
In [14]:
# Boolean mask of missing cells for the current `df`.
df.isna()
Out[14]:
A B
0 False False
1 True True
2 False False
3 False False
4 False False
5 False False
6 False False
7 True False
8 False False
9 True False
In [15]:
# Number of missing cells in each column.
df.isna().sum()
Out[15]:
A    3
B    1
dtype: int64
In [16]:
# Per column: does it contain at least one missing cell?
df.isna().any()
Out[16]:
A    True
B    True
dtype: bool
In [17]:
# Visualise the missing-value mask as a heatmap, with row tick labels hidden.
sns.heatmap(
    data=df.isna(),
    yticklabels=False,
)
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x2086359aa00>
In [18]:
# Same missing-value heatmap, with each cell annotated with its value.
sns.heatmap(
    data=df.isna(),
    yticklabels=False,
    annot=True,
)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x20863d5cc40>
In [19]:
# Small two-column frame with NaNs, used by the cells that follow:
# row 1 is missing in both columns, row 3 is missing only in B.
df11 = pd.DataFrame(
    {
        "A": [1, np.nan, 1, 2, 3],
        "B": [2, np.nan, 4, np.nan, 22],
    }
)
df11
Out[19]:
A B
0 1.0 2.0
1 NaN NaN
2 1.0 4.0
3 2.0 NaN
4 3.0 22.0
In [20]:
# Drop every row that has at least one missing value (the default behaviour).
df11.dropna(how="any")
Out[20]:
A B
0 1.0 2.0
2 1.0 4.0
4 3.0 22.0
In [21]:
# Drop only the rows in which every column is missing.
df11.dropna(axis=0, how="all")
Out[21]:
A B
0 1.0 2.0
2 1.0 4.0
3 2.0 NaN
4 3.0 22.0
In [22]:
# Replace every missing value with 0; returns a new frame.
df11.fillna(value=0)
Out[22]:
A B
0 1.0 2.0
1 0.0 0.0
2 1.0 4.0
3 2.0 0.0
4 3.0 22.0
In [23]:
# Forward-fill: each NaN takes the last valid value above it in its column.
# `fillna(method='ffill')` is deprecated in recent pandas releases;
# `ffill()` is the supported spelling and likewise returns a new frame.
df11.ffill()
Out[23]:
A B
0 1.0 2.0
1 1.0 2.0
2 1.0 4.0
3 2.0 4.0
4 3.0 22.0
In [24]:
# Backward-fill: each NaN takes the next valid value below it in its column.
# `fillna(method='bfill')` is deprecated in recent pandas releases;
# `bfill()` is the supported spelling and likewise returns a new frame.
df11.bfill()
Out[24]:
A B
0 1.0 2.0
1 1.0 4.0
2 1.0 4.0
3 2.0 22.0
4 3.0 22.0
In [25]:
# Fill NaNs by linear interpolation between neighbouring values (the default method).
df11.interpolate(method="linear")
Out[25]:
A B
0 1.0 2.0
1 1.0 3.0
2 1.0 4.0
3 2.0 13.0
4 3.0 22.0
In [26]:
# Display `df11` again to inspect its current values.
df11
Out[26]:
A B
0 1.0 2.0
1 NaN NaN
2 1.0 4.0
3 2.0 NaN
4 3.0 22.0
In [27]:
# Keep the linearly interpolated copy of df11 under its own name.
df12 = df11.interpolate(method="linear")
In [28]:
# Display `df` to inspect its current values.
df
Out[28]:
A B
0 1.0 1.0
1 NaN NaN
2 3.0 31.0
3 2.0 22.0
4 3.0 33.0
5 1.0 11.0
6 2.0 21.0
7 NaN 24.0
8 1.0 12.0
9 NaN 32.0
In [29]:
# Display `df12`, the result of `df11.interpolate()`.
df12
Out[29]:
A B
0 1.0 2.0
1 1.0 3.0
2 1.0 4.0
3 2.0 13.0
4 3.0 22.0
In [30]:
# Per-column fill: only column A gets the sentinel 99999; column B is not
# listed in the mapping, so its NaNs are left as they are.
df.fillna(value={"A": 99999})
Out[30]:
A B
0 1.0 1.0
1 99999.0 NaN
2 3.0 31.0
3 2.0 22.0
4 3.0 33.0
5 1.0 11.0
6 2.0 21.0
7 99999.0 24.0
8 1.0 12.0
9 99999.0 32.0