import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Outlier-detection walkthrough on synthetic data.
# Fix the seed of NumPy's global random generator so the draws are repeatable.
np.random.seed(12345)
# Build a 1000-row, 4-column frame of standard-normal samples.
dframe = pd.DataFrame(data=np.random.randn(1000, 4))
# Preview the first five rows.
dframe.head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | -0.204708 | 0.478943 | -0.519439 | -0.555730 |
1 | 1.965781 | 1.393406 | 0.092908 | 0.281746 |
2 | 0.769023 | 1.246435 | 1.007189 | -1.296221 |
3 | 0.274992 | 0.228913 | 1.352917 | 0.886429 |
4 | -2.001637 | -0.371843 | 1.669025 | -0.438570 |
# Summarize each column: count, mean, std, min, quartiles and max
dframe.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.067684 | 0.067924 | 0.025598 | -0.002298 |
std | 0.998035 | 0.992106 | 1.006835 | 0.996794 |
min | -3.428254 | -3.548824 | -3.184377 | -3.745356 |
25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
max | 3.366626 | 2.653656 | 3.260383 | 3.927528 |
# Grab the first column (label 0) as a Series
col = dframe[0]
# Show the entries of that column whose absolute value exceeds 3
col[col.abs() > 3]
523   -3.428254
900    3.366626
Name: 0, dtype: float64
# From the result above: in column 0, rows 523 and 900 have absolute values > 3.
# To check every column at once, build a boolean frame and use "any" along
# axis=1 to keep each row that has at least one such value.
# axis is passed by keyword: pandas 2.0+ no longer accepts it positionally.
dframe[(np.abs(dframe) > 3).any(axis=1)]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
5 | -0.539741 | 0.476985 | 3.248944 | -1.021228 |
97 | -0.774363 | 0.552936 | 0.106061 | 3.927528 |
102 | -0.655054 | -0.565230 | 3.176873 | 0.959533 |
305 | -2.315555 | 0.457246 | -0.025907 | -3.399312 |
324 | 0.050188 | 1.951312 | 3.260383 | 0.963301 |
400 | 0.146326 | 0.508391 | -0.196713 | -3.745356 |
499 | -0.293333 | -0.242459 | -3.056990 | 1.918403 |
523 | -3.428254 | -0.296336 | -0.439938 | -0.867165 |
586 | 0.275144 | 1.179227 | -3.184377 | 1.369891 |
808 | -0.362528 | -3.548824 | 1.553205 | -2.186301 |
900 | 3.366626 | -2.372214 | 0.851010 | 1.332846 |
# Alternatively, cap the data to the interval [-3, 3]: every value whose
# magnitude exceeds 3 is overwritten with 3 carrying its original sign.
dframe[dframe.abs() > 3] = 3 * np.sign(dframe)
# Summarize again to inspect the capped frame
dframe.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.061623 | 0.074473 | 0.037153 | 0.009919 |
std | 0.995875 | 0.989820 | 1.003604 | 0.989688 |
min | -2.969411 | -2.989741 | -2.925113 | -2.881858 |
25% | -0.774132 | -0.588138 | -0.622310 | -0.636641 |
50% | -0.115171 | 0.102787 | 0.012889 | -0.010997 |
75% | 0.619779 | 0.787953 | 0.682401 | 0.659019 |
max | 3.000000 | 3.000000 | 3.000000 | 3.000000 |
# Next we'll learn about Permutation!