import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import pickle
from PIL import Image
%matplotlib inline
# load pickle data
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it was produced by a trusted step of this project.
with open('./data/df_img_100_100.pickle', 'rb') as pickle_file:
    # the context manager closes the file handle as soon as loading finishes
    df = pickle.load(pickle_file)
df.head()
gender | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 9990 | 9991 | 9992 | 9993 | 9994 | 9995 | 9996 | 9997 | 9998 | 9999 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | female | 126 | 116 | 104 | 97 | 98 | 95 | 85 | 79 | 81 | ... | 142 | 143 | 147 | 151 | 156 | 152 | 157 | 162 | 165 | 167 |
1 | female | 221 | 221 | 221 | 221 | 221 | 222 | 224 | 225 | 223 | ... | 160 | 37 | 27 | 31 | 14 | 59 | 159 | 106 | 51 | 150 |
3 | female | 35 | 35 | 37 | 41 | 41 | 40 | 44 | 43 | 27 | ... | 34 | 34 | 34 | 34 | 34 | 34 | 34 | 34 | 34 | 33 |
4 | female | 53 | 60 | 71 | 59 | 38 | 38 | 52 | 63 | 55 | ... | 84 | 84 | 84 | 84 | 84 | 84 | 84 | 84 | 84 | 85 |
5 | female | 18 | 19 | 21 | 21 | 19 | 17 | 18 | 25 | 34 | ... | 71 | 73 | 73 | 72 | 80 | 86 | 84 | 89 | 93 | 99 |
5 rows × 10001 columns
# summarise the frame: index range, column count, dtypes and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5460 entries, 0 to 6057 Columns: 10001 entries, gender to 9999 dtypes: object(1), uint8(10000) memory usage: 52.2+ MB
# checking for missing values: per-column null counts, summed into one grand total
df.isnull().sum().sum()
0
# remove rows with missing values, if there are any
# df.dropna(axis=0,inplace=True)
# df.isnull().sum()
# split the data into two parts:
#   y -> the first column (the label), X -> every remaining column
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()
X.shape
(5460, 10000)
$X_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}}$
# smallest and largest value in X (the bounds used for min-max scaling)
X.min(), X.max()
(0, 255)
# min-max scaling: shift by the global minimum, then divide by the value range
lowest, highest = X.min(), X.max()
value_range = highest - lowest
Xnorm = (X - lowest) / value_range
Xnorm.shape
(5460, 10000)
# encode the labels as integers: 'female' -> 1, every other value (male) -> 0
is_female = np.asarray(y == 'female')
y_norm = is_female.astype(int)
# write the scaled features and the encoded labels into one .npz archive;
# both arrays are passed positionally, features first and labels second
output_path = './data/data_10000_norm.npz'
np.savez(output_path, Xnorm, y_norm)