Частично использованы материалы
import pandas as pd
import numpy as np
def make_s(n_rows):
    """Build a demo frame whose prices are stored as text like ``'17$'``.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        DataFrame with two columns: ``price`` (a random integer in
        [0, 100) rendered as a string with a trailing ``'$'``) and
        ``feature`` (all zeros).
    """
    tmp = pd.DataFrame({
        'price': (100 * np.random.rand(n_rows)).astype(int),
        'feature': np.zeros(n_rows),
    })
    # Turn the integer price into text so the conversion back can be timed.
    tmp['price'] = tmp['price'].astype(str) + '$'
    return tmp
# Small sample used to show the price variants side by side.
data = make_s(5)
data
| | price | feature |
---|---|---|
0 | 17$ | 0.0 |
1 | 89$ | 0.0 |
2 | 39$ | 0.0 |
3 | 97$ | 0.0 |
4 | 23$ | 0.0 |
# Four ways to turn the '17$'-style strings into integers.
# v1: drop the last character and convert inside the per-element callback.
data['price($)_v1'] = data['price'].apply(lambda x: int(x[:-1]))
# v2: drop the last character per element, convert the whole column at once.
data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)
# v3: remove '$' with str.replace per element, then convert the column.
data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int)
# v4: vectorised string accessor. regex=False is explicit because '$' is a
# regex metacharacter and the default of `regex` differs between pandas versions.
data['price($)_v4'] = data['price'].str.replace('$', '', regex=False).astype(int)
data
| | price | feature | price($)_v1 | price($)_v2 | price($)_v3 | price($)_v4 |
---|---|---|---|---|---|---|
0 | 17$ | 0.0 | 17 | 17 | 17 | 17 |
1 | 89$ | 0.0 | 89 | 89 | 89 | 89 |
2 | 39$ | 0.0 | 39 | 39 | 39 | 39 |
3 | 97$ | 0.0 | 97 | 97 | 97 | 97 |
4 | 23$ | 0.0 | 23 | 23 | 23 | 23 |
data = make_s(10000000)
%%time
data['price($))_v1'] = data['price'].apply(lambda x: int(x[:-1]))
# 4.2-4.33
CPU times: user 4.26 s, sys: 60 ms, total: 4.32 s Wall time: 4.36 s
%%time
# v2: strip the trailing character per element, then one astype(int) on the column.
data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)
# Range seen across runs: 2.47 s - 2.52 s
CPU times: user 2.5 s, sys: 124 ms, total: 2.62 s Wall time: 2.65 s
%%time
# v3: per-element str.replace of '$', then one astype(int) on the column.
data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int)
# Range seen across runs: 3.14 - 3.31 (seconds, per the cell output)
CPU times: user 3.19 s, sys: 152 ms, total: 3.34 s Wall time: 3.35 s
%%time
data['price($)_v4'] = data['price'].str.replace('$', '').astype(int)
# 3.43 - 4
CPU times: user 3.43 s, sys: 164 ms, total: 3.6 s Wall time: 3.63 s
def make_t(n_rows):
    """Build a demo frame with a two-valued categorical column.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        DataFrame with ``type`` (``'A'`` where a uniform random draw is
        below 0.5, otherwise ``'B'``) and ``feature`` (all zeros).
    """
    tmp = pd.DataFrame({
        'type': np.where(np.random.rand(n_rows) < 0.5, 'A', 'B'),
        'feature': np.zeros(n_rows),
    })
    return tmp
# Small sample used to show the encoding variants side by side.
data = make_t(5)
data
| | type | feature |
---|---|---|
0 | A | 0.0 |
1 | B | 0.0 |
2 | A | 0.0 |
3 | A | 0.0 |
4 | A | 0.0 |
# Seven ways to encode the 'A'/'B' column as 1/0 (target: A -> 1, B -> 0).
# v1: per-element Python conditional.
data['type_v1'] = data['type'].apply(lambda x: 1 if x == "A" else 0)
# v2: vectorised comparison, boolean cast to int.
data['type_v2'] = (data['type']=='A').astype(int)
# v3: numpy selection on the comparison mask.
data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)
# v4: explicit dictionary mapping.
data['type_v4'] = data['type'].map({'A': 1, 'B': 0})
data['type_v5'] = data['type'].factorize()[0] # incorrect answer: the sample output shows A -> 0, B -> 1
data['type_v6'] = pd.get_dummies(data['type'])['A'] # author's note: dtype is uint8, not int (may differ by pandas version - verify)
from sklearn import preprocessing
data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # incorrect answer: the sample output shows A -> 0, B -> 1
data
| | type | feature | type_v1 | type_v2 | type_v3 | type_v4 | type_v5 | type_v6 | type_v7 |
---|---|---|---|---|---|---|---|---|---|
0 | A | 0.0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 |
1 | B | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
2 | A | 0.0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 |
3 | A | 0.0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 |
4 | A | 0.0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 |
# Large frame used for all encoding timings below.
data = make_t(10000000)
%%time
# v1: per-element Python conditional returning ints.
data['type_v1'] = data['type'].apply(lambda x: 1 if x == "A" else 0)
# Range seen across runs: 2.14 s - 2.18 s
CPU times: user 2.14 s, sys: 72 ms, total: 2.22 s Wall time: 2.25 s
%%time
# v1 variant: the callback returns "1"/"0" strings and the column is cast once.
# It writes to the same 'type_v1' column as the previous cell.
data['type_v1'] = data['type'].apply(lambda x: "1" if x == "A" else "0").astype(int)
# Range seen across runs: 1.82 s - 1.86 s
CPU times: user 1.82 s, sys: 60 ms, total: 1.88 s Wall time: 1.89 s
%%time
# v2: vectorised comparison, boolean cast to int.
data['type_v2'] = (data['type']=='A').astype(int)
# Range seen across runs: 348 - 364 (milliseconds, per the cell output)
CPU times: user 364 ms, sys: 20 ms, total: 384 ms Wall time: 386 ms
%%time
# v3: numpy selection on the comparison mask.
data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)
# Range seen across runs: 380-398 (milliseconds, per the cell output)
CPU times: user 376 ms, sys: 20 ms, total: 396 ms Wall time: 398 ms
%%time
# v4: explicit dictionary mapping.
data['type_v4'] = data['type'].map({'A': 1, 'B': 0})
# Range seen across runs: 400-424 (milliseconds, per the cell output)
CPU times: user 420 ms, sys: 16 ms, total: 436 ms Wall time: 443 ms
%%time
# v5: integer codes from factorize (marked as giving the incorrect answer in the small demo).
data['type_v5'] = data['type'].factorize()[0]
# Range seen across runs: 304-324 (milliseconds, per the cell output)
CPU times: user 304 ms, sys: 36 ms, total: 340 ms Wall time: 357 ms
%%time
# v6: one-hot encode and keep only the 'A' indicator column.
data['type_v6'] = pd.get_dummies(data['type'])['A']
# Range seen across runs: 360-392 (milliseconds, per the cell output)
CPU times: user 364 ms, sys: 28 ms, total: 392 ms Wall time: 395 ms
%%time
# v7: scikit-learn LabelEncoder; the import statement is inside the timed cell.
from sklearn import preprocessing
data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # incorrect answer (see the small demo output)
# Range seen across runs: 5.47 s - 5.57 s
CPU times: user 5.5 s, sys: 36 ms, total: 5.53 s Wall time: 5.52 s
def make_ab(n_rows):
    """Build a demo frame with two integers packed into one ``'A/B'`` string.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        DataFrame with columns ``feature`` (all zeros) and ``A/B``
        (text such as ``'23/60'``, each part a random integer in [0, 100)).
    """
    tmp = pd.DataFrame({
        'A': (100 * np.random.rand(n_rows)).astype(int),
        'B': (100 * np.random.rand(n_rows)).astype(int),
        'feature': np.zeros(n_rows),
    })
    tmp['A/B'] = tmp['A'].astype(str) + '/' + tmp['B'].astype(str)
    # Only the packed column is returned; the split demos must recover A and B.
    return tmp.drop(columns=['A', 'B'])
# Small sample used to show the split variants side by side.
data = make_ab(5)
data
| | feature | A/B |
---|---|---|
0 | 0.0 | 23/60 |
1 | 0.0 | 65/76 |
2 | 0.0 | 66/53 |
3 | 0.0 | 57/53 |
4 | 0.0 | 85/18 |
# Four ways to split the 'A/B' strings into two columns.
# v1: split once into lists, then pick each element with a callback.
tmp = data['A/B'].str.split('/')
data['A_v1'] = tmp.apply(lambda x: x[0])
data['B_v1'] = tmp.apply(lambda x: x[1])
# v2: split into lists and let the DataFrame constructor unpack them.
# `n` is passed by keyword: newer pandas versions reject it positionally.
data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', n=1).tolist())
# v3: let pandas expand the split parts into columns directly.
data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)
# v4: join every row into one string, split it once and reshape to two
# columns; this relies on each row containing exactly one '/'.
st = '/'.join(data['A/B'])
data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))
data
| | feature | A/B | A_v1 | B_v1 | A_v2 | B_v2 | A_v3 | B_v3 | A_v4 | B_v4 |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 23/60 | 23 | 60 | 23 | 60 | 23 | 60 | 23 | 60 |
1 | 0.0 | 65/76 | 65 | 76 | 65 | 76 | 65 | 76 | 65 | 76 |
2 | 0.0 | 66/53 | 66 | 53 | 66 | 53 | 66 | 53 | 66 | 53 |
3 | 0.0 | 57/53 | 57 | 53 | 57 | 53 | 57 | 53 | 57 | 53 |
4 | 0.0 | 85/18 | 85 | 18 | 85 | 18 | 85 | 18 | 85 | 18 |
# Large frame used for all split timings below.
data = make_ab(10000000)
%%time
# v1: split into lists, then pick each element with a callback.
tmp = data['A/B'].str.split('/')
data['A_v1'] = tmp.apply(lambda x: x[0])
data['B_v1'] = tmp.apply(lambda x: x[1])
# Range seen across runs: 12.5 s-13.4 s
CPU times: user 13.1 s, sys: 1.51 s, total: 14.6 s Wall time: 14.6 s
%%time
data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', 1).tolist())
# 10.2 s - 12.2 s
CPU times: user 9.94 s, sys: 176 ms, total: 10.1 s Wall time: 10.2 s
%%time
# v3: pandas expands the split parts into columns (slowest variant in the recorded timings).
data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)
# Range seen across runs: 26.1 s - 29.2 s
CPU times: user 26.1 s, sys: 368 ms, total: 26.5 s Wall time: 26.5 s
%%time
# v4: join all rows into one string, split once, reshape to two columns
# (fastest variant in the recorded timings).
st = '/'.join(data['A/B'])
data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))
# Range seen across runs: 3.65 s - 4.54 s
CPU times: user 3.65 s, sys: 168 ms, total: 3.82 s Wall time: 3.84 s
def make_t(n_rows):
    """Build a demo frame with a train/test flag and a gappy feature.

    This definition reuses the name of the earlier ``'A'``/``'B'``
    generator in this file and replaces it from here on.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        DataFrame with ``type`` (``'train'`` or ``'test'``), ``feature``
        (a random integer in [0, 100) or NaN, stored as float) and four
        identical copies ``feature_v1`` .. ``feature_v4``, one per
        imputation variant.
    """
    tmp = pd.DataFrame({
        'type': np.where(np.random.rand(n_rows) < 0.5, 'train', 'test'),
        'feature': np.where(np.random.rand(n_rows) < 0.5,
                            (100 * np.random.rand(n_rows)).astype(int),
                            np.nan),
    })
    # Each variant fills its own copy so results can be compared side by side.
    for suffix in ('v1', 'v2', 'v3', 'v4'):
        tmp['feature_' + suffix] = tmp['feature']
    return tmp
# Small sample used to show the group-wise fill variants side by side.
data = make_t(10)
data
| | type | feature | feature_v1 | feature_v2 | feature_v3 | feature_v4 |
---|---|---|---|---|---|---|
0 | test | NaN | NaN | NaN | NaN | NaN |
1 | test | 43.0 | 43.0 | 43.0 | 43.0 | 43.0 |
2 | test | NaN | NaN | NaN | NaN | NaN |
3 | train | 4.0 | 4.0 | 4.0 | 4.0 | 4.0 |
4 | train | 18.0 | 18.0 | 18.0 | 18.0 | 18.0 |
5 | train | NaN | NaN | NaN | NaN | NaN |
6 | train | NaN | NaN | NaN | NaN | NaN |
7 | train | 25.0 | 25.0 | 25.0 | 25.0 | 25.0 |
8 | train | NaN | NaN | NaN | NaN | NaN |
9 | train | NaN | NaN | NaN | NaN | NaN |
# Four ways to fill missing values with the mean of the row's own 'type' group.
# v1: handle each group by hand - select its rows, fill with that group's mean.
name = 'feature_v1'
data.loc[data['type'] == 'test', name] = \
data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())
data.loc[data['type'] == 'train', name] = \
data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())
# v2: groupby + transform with a per-group fillna callback.
name = 'feature_v2'
data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))
# v3: compute the per-row group mean once and write it only into the null cells.
name = 'feature_v3'
data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')
# v4: map each row's type to its group mean and select it where the value is null.
name = 'feature_v4'
data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])
# Disabled earlier attempt, kept for reference.
# NOTE(review): presumably dropped because it fills the per-group objects
# rather than `data` itself - confirm before reviving.
#name = 'feature_v4'
#gb = data.groupby('type')
#mn = gb.mean()
#for gn, x in gb:
# x[name].fillna(mn.loc[gn], inplace=True)
data
| | type | feature | feature_v1 | feature_v2 | feature_v3 | feature_v4 |
---|---|---|---|---|---|---|
0 | test | NaN | 43.000000 | 43.000000 | 43.000000 | 43.000000 |
1 | test | 43.0 | 43.000000 | 43.000000 | 43.000000 | 43.000000 |
2 | test | NaN | 43.000000 | 43.000000 | 43.000000 | 43.000000 |
3 | train | 4.0 | 4.000000 | 4.000000 | 4.000000 | 4.000000 |
4 | train | 18.0 | 18.000000 | 18.000000 | 18.000000 | 18.000000 |
5 | train | NaN | 15.666667 | 15.666667 | 15.666667 | 15.666667 |
6 | train | NaN | 15.666667 | 15.666667 | 15.666667 | 15.666667 |
7 | train | 25.0 | 25.000000 | 25.000000 | 25.000000 | 25.000000 |
8 | train | NaN | 15.666667 | 15.666667 | 15.666667 | 15.666667 |
9 | train | NaN | 15.666667 | 15.666667 | 15.666667 | 15.666667 |
# Large frame used for all group-wise fill timings below.
data = make_t(10000000)
%%time
# v1: per-group manual selection and fill (the masks are recomputed for each use).
name = 'feature_v1'
data.loc[data['type'] == 'test', name] = data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())
data.loc[data['type'] == 'train', name] = data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())
# Range seen across runs: 3.44 s - 3.84 s
CPU times: user 3.66 s, sys: 132 ms, total: 3.79 s Wall time: 3.81 s
%%time
# v2: groupby + transform with a per-group fillna callback.
name = 'feature_v2'
data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))
# Range seen across runs: 1.9 s - 2.04 s
CPU times: user 1.9 s, sys: 152 ms, total: 2.05 s Wall time: 2.06 s
%%time
# v3: per-row group mean via transform('mean'), written only into the null cells.
name = 'feature_v3'
data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')
# Range seen across runs: 1.17 - 1.18 s
CPU times: user 1.2 s, sys: 128 ms, total: 1.32 s Wall time: 1.35 s
%%time
# v4: map each row's type to its group mean and select it where the value is null.
name = 'feature_v4'
data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])
# Range seen across runs: 1.26 s - 1.38 (seconds, per the cell output)
CPU times: user 1.37 s, sys: 72 ms, total: 1.44 s Wall time: 1.45 s