import vaex
# from datetime import datetime
# Code for initially converting to hdf5 format
# d_parser = lambda x: datetime.strptime(x, '%Y-%m-%d')
# df= vaex.from_csv('train_data.csv', convert=True, chunk_size=100_000, parse_dates=['S_2'],
# date_parser=d_parser)
# Open the HDF5 version of the training data (presumably the file written by
# the one-off CSV conversion shown in the commented-out code above).
df=vaex.open('train_data.csv.hdf5')
df.shape # (number of rows, number of columns)
(5531451, 190)
# Preview the first two rows to see the available columns.
df.head(2)
# | customer_ID | S_2 | P_2 | D_39 | B_1 | B_2 | R_1 | S_3 | D_41 | B_3 | D_42 | D_43 | D_44 | B_4 | D_45 | B_5 | R_2 | D_46 | D_47 | D_48 | D_49 | B_6 | B_7 | B_8 | D_50 | D_51 | B_9 | R_3 | D_52 | P_3 | B_10 | D_53 | S_5 | B_11 | S_6 | D_54 | R_4 | S_7 | B_12 | S_8 | D_55 | D_56 | B_13 | R_5 | D_58 | S_9 | B_14 | D_59 | D_60 | D_61 | B_15 | S_11 | D_62 | D_63 | D_64 | D_65 | B_16 | B_17 | B_18 | B_19 | D_66 | B_20 | D_68 | S_12 | R_6 | S_13 | B_21 | D_69 | B_22 | D_70 | D_71 | D_72 | S_15 | B_23 | D_73 | P_4 | D_74 | D_75 | D_76 | B_24 | R_7 | D_77 | B_25 | B_26 | D_78 | D_79 | R_8 | R_9 | S_16 | D_80 | R_10 | R_11 | B_27 | D_81 | D_82 | S_17 | R_12 | B_28 | R_13 | D_83 | R_14 | R_15 | D_84 | R_16 | B_29 | B_30 | S_18 | D_86 | D_87 | R_17 | R_18 | D_88 | B_31 | S_19 | R_19 | B_32 | S_20 | R_20 | R_21 | B_33 | D_89 | R_22 | R_23 | D_91 | D_92 | D_93 | D_94 | R_24 | R_25 | D_96 | S_22 | S_23 | S_24 | S_25 | S_26 | D_102 | D_103 | D_104 | D_105 | D_106 | D_107 | B_36 | B_37 | R_26 | R_27 | B_38 | D_108 | D_109 | D_110 | D_111 | B_39 | D_112 | B_40 | S_27 | D_113 | D_114 | D_115 | D_116 | D_117 | D_118 | D_119 | D_120 | D_121 | D_122 | D_123 | D_124 | D_125 | D_126 | D_127 | D_128 | D_129 | B_41 | B_42 | D_130 | D_131 | D_132 | D_133 | R_28 | D_134 | D_135 | D_136 | D_137 | D_138 | D_139 | D_140 | D_141 | D_142 | D_143 | D_144 | D_145 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | '0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fb... | 2017-03-09 00:00:00.000000000 | 0.938469 | 0.00173334 | 0.00872445 | 1.00684 | 0.00922772 | 0.124035 | 0.00877113 | 0.00470924 | nan | nan | 0.000630135 | 0.0809863 | 0.708906 | 0.1706 | 0.00620403 | 0.358587 | 0.525351 | 0.255736 | nan | 0.0639022 | 0.0594157 | 0.00646558 | 0.148698 | 1.33586 | 0.00820674 | 0.0014225 | 0.207334 | 0.736463 | 0.0962188 | nan | 0.0233811 | 0.00276806 | 0.00832165 | 1.00152 | 0.00829844 | 0.161345 | 0.148266 | 0.922998 | 0.354596 | 0.152025 | 0.118075 | 0.00188179 | 0.158612 | 0.0657284 | 0.0183846 | 0.0636465 | 0.199617 | 0.308233 | 0.0163606 | 0.401619 | 0.091071 | CR | O | 0.00712616 | 0.00766527 | nan | 0.652984 | 0.00852044 | nan | 0.00472983 | 6 | 0.272008 | 0.00836254 | 0.515222 | 0.00264403 | 0.0090133 | 0.00480751 | 0.00834172 | 0.119403 | 0.0048019 | 0.108271 | 0.0508819 | nan | 0.00755443 | 0.0804216 | 0.0690668 | nan | 0.00432679 | 0.00756245 | nan | 0.00772865 | 0.000271828 | 0.00157574 | 0.00423936 | 0.00143399 | nan | 0.00227094 | 0.00406052 | 0.00712109 | 0.00245606 | 0.0023103 | 0.00353198 | 0.506612 | 0.00803302 | 1.00982 | 0.0846826 | 0.00381998 | 0.0070426 | 0.000437955 | 0.00645163 | 0.00082952 | 0.00505487 | nan | 0 | 0.00572042 | 0.00708447 | nan | 0.000198308 | 0.00890741 | nan | 1 | 0.00253721 | 0.00517736 | 0.00662618 | 0.00970514 | 0.00778159 | 0.00244996 | 1.0011 | 0.00266533 | 0.00747876 | 0.00689281 | 1.50367 | 1.00613 | 0.00356854 | 0.00887059 | 0.00394973 | 0.00364714 | 0.00495003 | 0.89409 | 0.135561 | 0.911191 | 0.974539 | 0.0012434 | 0.766688 | 1.00869 | 1.00459 | 0.893734 | nan | 0.670041 | 0.00996848 | 0.00457161 | nan | 1.00895 | 2 | nan | 0.00432553 | nan | nan | nan | 1.00734 | 0.21006 | 0.676922 | 0.00787114 | 1 | 0.23825 | 0 | 4 | 0.23212 | 0.236266 | 0 | 0.70228 | 0.434345 | 0.0030567 | 0.686516 | 0.00873972 | 1 | 1.00332 | 1.00782 | 1.00008 | 0.00680497 | nan | 0.00205169 | 0.00597188 | nan | 0.00434506 | 0.00153473 | nan | nan | nan 
| nan | nan | 0.00242704 | 0.00370627 | 0.00381782 | nan | 0.00056924 | 0.000609837 | 0.00267421 |
1 | '0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fb... | 2017-04-07 00:00:00.000000000 | 0.936665 | 0.00577544 | 0.00492335 | 1.00065 | 0.00615131 | 0.12675 | 0.000798359 | 0.00271358 | nan | nan | 0.00252627 | 0.0694192 | 0.712795 | 0.113239 | 0.00620567 | 0.35363 | 0.521311 | 0.223329 | nan | 0.0652611 | 0.0577438 | 0.00161401 | 0.149723 | 1.33979 | 0.00837324 | 0.00198443 | 0.202778 | 0.720886 | 0.099804 | nan | 0.0305986 | 0.00274936 | 0.00248207 | 1.00903 | 0.00513618 | 0.140951 | 0.14353 | 0.919414 | 0.326757 | 0.156201 | 0.118737 | 0.00160996 | 0.148459 | 0.0939354 | 0.0130348 | 0.0655014 | 0.151387 | 0.265026 | 0.0176879 | 0.406326 | 0.0868048 | CR | O | 0.00241324 | 0.00714816 | nan | 0.647093 | 0.00223779 | nan | 0.00387926 | 6 | 0.18897 | 0.00402957 | 0.509048 | 0.00419312 | 0.00784238 | 0.00128316 | 0.00652381 | 0.140611 | 9.36286e-05 | 0.101018 | 0.0404689 | nan | 0.00483217 | 0.0814132 | 0.0741664 | nan | 0.00420276 | 0.00530352 | nan | 0.00186413 | 0.000978889 | 0.00989584 | 0.00759728 | 0.000509316 | nan | 0.00981023 | 0.000126509 | 0.00596581 | 0.000395391 | 0.00132673 | 0.0077727 | 0.500855 | 0.000760442 | 1.00946 | 0.0818432 | 0.000346625 | 0.00778935 | 0.00431088 | 0.0023325 | 0.00946879 | 0.00375319 | nan | 0 | 0.00758434 | 0.0066773 | nan | 0.00114229 | 0.00590701 | nan | 1 | 0.0084272 | 0.00897916 | 0.00185411 | 0.00992378 | 0.00598744 | 0.00224682 | 1.00678 | 0.00250769 | 0.00682727 | 0.00283708 | 1.50358 | 1.00579 | 0.000570901 | 0.000390776 | 0.00835129 | 0.00884997 | 0.00318008 | 0.902135 | 0.136333 | 0.919876 | 0.975624 | 0.00456138 | 0.786007 | 1.00008 | 1.00412 | 0.906841 | nan | 0.668647 | 0.003921 | 0.00465385 | nan | 1.00321 | 2 | nan | 0.00870721 | nan | nan | nan | 1.00765 | 0.184093 | 0.822281 | 0.0034444 | 1 | 0.247217 | 0 | 4 | 0.243532 | 0.241885 | 0 | 0.707017 | 0.430501 | 0.00130585 | 0.686414 | 0.000755019 | 1 | 1.00839 | 1.00433 | 1.00834 | 0.00440716 | nan | 0.00103356 | 0.00483756 | nan | 0.00749478 | 0.00493136 | nan | 
nan | nan | nan | nan | 0.00395421 | 0.00316709 | 0.00503163 | nan | 0.00957648 | 0.00549205 | 0.00921683 |
# Report how many distinct customer IDs the dataset contains.
print(f"There are {df.customer_ID.nunique()} unique customers in the dataset")
There are 458913 unique customers in the dataset
# Display the S_2 column (its dtype and a sample of its values).
df.S_2
Expression = S_2 Length: 5,531,451 dtype: datetime64[ns] (column) ------------------------------------------------ 0 2017-03-09 00:00:00.000000000 1 2017-04-07 00:00:00.000000000 2 2017-05-28 00:00:00.000000000 3 2017-06-13 00:00:00.000000000 4 2017-07-16 00:00:00.000000000 ... 5531446 2017-11-05 00:00:00.000000000 5531447 2017-12-23 00:00:00.000000000 5531448 2018-01-06 00:00:00.000000000 5531449 2018-02-06 00:00:00.000000000 5531450 2018-03-14 00:00:00.000000000
# Rename the S_2 column to Date; the code below refers to it as df.Date.
df.rename('S_2','Date')
'Date'
# Count the missing entries (as reported by isna()) in every column, so that
# later steps can keep only the columns that are completely populated.
cols = list(df.columns)
nulls = {}
for col in cols:
    # The extra .sum() on the reduction result is kept from the original code.
    nulls[col] = df[col].isna().sum().sum()
# dict(sorted(nulls.items(), key=lambda item: item[1]))
# Earliest value in the Date column.
df.Date.min()
array('2017-03-01T00:00:00.000000000', dtype='datetime64[ns]')
# Latest value in the Date column.
df.Date.max()
array('2018-03-31T00:00:00.000000000', dtype='datetime64[ns]')
# Columns treated as categorical; they are left out of the numeric features.
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
            'D_63', 'D_64', 'D_66', 'D_68']

# Keep only the columns that have no missing values and are not categorical.
cols_use = [col for col in cols if nulls[col] == 0 and col not in cat_cols]

# The data holds several rows per customer; collapse them to one row per
# customer by averaging every selected column. The aggregated columns carry a
# "_mean" suffix (e.g. Date_mean).
df_new = df[cols_use].groupby(df.customer_ID, agg='mean')

# The averaged date is not used as a model feature. drop() returns the
# resulting DataFrame, so there is no need to combine inplace=True with a
# re-assignment.
df_new = df_new.drop('Date_mean')

# The model features are the aggregated columns minus the grouping key.
cols_use = list(df_new.columns)
cols_use.remove('customer_ID')
# target= vaex.from_csv('train_labels.csv', convert=True)
# Load the label table (columns: customer_ID, target) and attach the labels to
# the per-customer feature rows via the shared customer_ID key.
target = vaex.open('train_labels.csv.hdf5')
df_new = df_new.join(other=target, on='customer_ID')

# Random 70% / 30% train/validation split; the fixed seed keeps it repeatable.
df_train, df_valid = df_new.split_random(into=[0.7, 0.3], random_state=2)
import vaex.ml.tensorflow
from tensorflow.keras import layers
from tensorflow import keras
# Build batch generators for Keras. `target` must be the NAME of the label
# column ('target', added by the join with the labels table), not the labels
# DataFrame itself: passing the DataFrame makes vaex raise
# "ValueError: ... is not of string or Expression type" on the first batch.
gen_train = df_train.ml.tensorflow.to_keras_generator(
    features=cols_use, target='target', batch_size=512)
gen_valid = df_valid.ml.tensorflow.to_keras_generator(
    features=cols_use, target='target', batch_size=512)
Recommended "steps_per_epoch" arg: 628.0 Recommended "steps_per_epoch" arg: 269.0
# Stop training when the monitored quantity has not improved by at least
# 0.001 for 10 consecutive epochs, then restore the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)
# Small fully connected binary classifier: two hidden blocks of
# Dense -> BatchNormalization -> Dropout, followed by a sigmoid output unit.
# The input width is taken from the feature list so the model always matches
# what the generators feed it, instead of a hard-coded column count.
nn_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[len(cols_use)]),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.3),
    layers.Dense(8, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.3),
    layers.Dense(1, activation='sigmoid'),
])
nn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)
# Train for up to 100 epochs with early stopping. The step counts are the
# values the generators recommended for batch_size=512 (628 for training,
# 269 for validation).
nn_model.fit(
    x=gen_train,
    validation_data=gen_valid,
    epochs=100,
    steps_per_epoch=628,
    validation_steps=269,
    callbacks=[early_stopping],
)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_2148/1653863919.py in <module> ----> 1 nn_model.fit(x=gen_train, validation_data=gen_valid, epochs=100, steps_per_epoch=628, 2 validation_steps=269,callbacks=[early_stopping]) ~\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in _method_wrapper(self, *args, **kwargs) 106 def _method_wrapper(self, *args, **kwargs): 107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access --> 108 return method(self, *args, **kwargs) 109 110 # Running inside `run_distribute_coordinator` already. ~\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing) 1047 training_utils.RespectCompiledTrainableState(self): 1048 # Creates a `tf.data.Dataset` and handles batch and epoch iteration. -> 1049 data_handler = data_adapter.DataHandler( 1050 x=x, 1051 y=y, ~\anaconda3\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py in __init__(self, x, y, sample_weight, batch_size, steps_per_epoch, initial_epoch, epochs, shuffle, class_weight, max_queue_size, workers, use_multiprocessing, model, steps_per_execution) 1103 1104 adapter_cls = select_data_adapter(x, y) -> 1105 self._adapter = adapter_cls( 1106 x, 1107 y, ~\anaconda3\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py in __init__(self, x, y, sample_weights, workers, use_multiprocessing, max_queue_size, model, **kwargs) 784 # Since we have to know the dtype of the python generator when we build the 785 # dataset, we have to look at a batch to infer the structure. 
--> 786 peek, x = self._peek_and_restore(x) 787 peek = self._standardize_batch(peek) 788 peek = _process_tensorlike(peek) ~\anaconda3\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py in _peek_and_restore(x) 841 @staticmethod 842 def _peek_and_restore(x): --> 843 peek = next(x) 844 return peek, itertools.chain([peek], x) 845 ~\anaconda3\lib\site-packages\vaex\ml\tensorflow.py in _generator(features, target, chunk_size, parallel, shuffle, infinite) 79 if target is not None: 80 target = vaex.utils._ensure_list(target) ---> 81 target = vaex.utils._ensure_strings_from_expressions(target) 82 n_target_cols = len(target) 83 column_names = features + target ~\anaconda3\lib\site-packages\vaex\utils.py in _ensure_strings_from_expressions(expressions) 645 def _ensure_strings_from_expressions(expressions): 646 if _issequence(expressions): --> 647 return [_ensure_strings_from_expressions(k) for k in expressions] 648 else: 649 return _ensure_string_from_expression(expressions) ~\anaconda3\lib\site-packages\vaex\utils.py in <listcomp>(.0) 645 def _ensure_strings_from_expressions(expressions): 646 if _issequence(expressions): --> 647 return [_ensure_strings_from_expressions(k) for k in expressions] 648 else: 649 return _ensure_string_from_expression(expressions) ~\anaconda3\lib\site-packages\vaex\utils.py in _ensure_strings_from_expressions(expressions) 647 return [_ensure_strings_from_expressions(k) for k in expressions] 648 else: --> 649 return _ensure_string_from_expression(expressions) 650 651 ~\anaconda3\lib\site-packages\vaex\utils.py in _ensure_string_from_expression(expression) 640 return expression.expression 641 else: --> 642 raise ValueError('%r is not of string or Expression type, but %r' % (expression, type(expression))) 643 644 ValueError: # customer_ID target 0 '0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fb... 0 1 '00000fd6641609c6ece5454664794f0340ad84dddce9a26... 0 2 '00001b22f846c82c51f6e3958ccd81970162bae8b007e80... 
0 3 '000041bdba6ecadd89a52d11886e8eaaec9325906c97233... 0 4 '00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad... 0 ... ... ... 458,908 'ffff41c8a52833b56430603969b9ca48d208e7c192c6a40... 0 458,909 'ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fdd... 0 458,910 'ffff9984b999fccb2b6127635ed0736dda94e544e67e026... 0 458,911 'ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145... 1 458,912 'fffff1d38b785cef84adeace64f8f83db3a0c31e8d92eab... 0 is not of string or Expression type, but <class 'vaex.dataframe.DataFrameLocal'>