A VirtualDataset in ConX allows you to load just part of a dataset at a time so that you don't require all of the data to be in memory at once.
You can construct a VirtualDataset as follows:
cx.VirtualDataset(FUNCTION,
LENGTH,
INPUTS_SHAPES,
TARGET_SHAPES,
INPUT_RANGES,
TARGET_RANGES,
generator_ordered=True|False,
load_cache_direct=True|False,
cache_size=SIZE)
Where:
The SIZE determines how many input/target pairs are generated at a time. Usually this should match the cache size used when training the network.
import conx as cx
import numpy as np
import random
Using TensorFlow backend. ConX, version 3.7.5
cache_size = 8
def f(self, pos):
    """Return the (inputs, targets) pair for dataset position ``pos``.

    Both the two inputs and the single target are ``pos / 100``, so the
    pattern is trivially checkable by eye in the demo output.
    """
    # Trace each generated position so cache refills are visible in the demo.
    print("Generating position:", pos)
    value = pos / 100
    return ([value, value], [value])
f(None, 50)
Generating position: 50
([0.5, 0.5], [0.5])
dataset = cx.VirtualDataset(f, 100, [(2,)], [(1,)], [(0,1)], [(0,1)],
load_cache_direct=False,
cache_size=cache_size)
Generating position: 0 Generating position: 1 Generating position: 2 Generating position: 3 Generating position: 4 Generating position: 5 Generating position: 6 Generating position: 7
dataset.inputs[0]
[0.0, 0.0]
dataset.inputs[0]
[0.0, 0.0]
As you can see from the above, retrieving input/target patterns from the current batch does not regenerate the batch.
However, moving beyond the range does generate a new batch:
dataset.inputs[50]
Generating position: 48 Generating position: 49 Generating position: 50 Generating position: 51 Generating position: 52 Generating position: 53 Generating position: 54 Generating position: 55
[0.5, 0.5]
dataset.inputs[0]
Generating position: 0 Generating position: 1 Generating position: 2 Generating position: 3 Generating position: 4 Generating position: 5 Generating position: 6 Generating position: 7
[0.0, 0.0]
def f(self, batch):
    """Generate one whole cache of (inputs, targets) for batch index ``batch``.

    Relies on the module-level ``cache_size`` to decide how many consecutive
    positions belong to one batch.  Returns ``(inputs_banks, targets_banks)``
    where each element is a one-bank list holding a numpy array of
    ``cache_size`` rows, as expected with ``load_cache_direct=True``.
    """
    print("Generating batch:", batch)
    all_inputs = [[]]
    all_targets = [[]]
    # Positions covered: [batch * cache_size, (batch + 1) * cache_size).
    # NOTE: the original wrapped this in `while True:` which could never
    # loop — the body returned unconditionally on the first pass (leftover
    # from a generator variant); the dead loop has been removed.
    for i in range(batch * cache_size, (batch + 1) * cache_size):
        all_inputs[0].append([i/100, i/100])
        all_targets[0].append([i/100])
    return ([np.array(inputs) for inputs in all_inputs],
            [np.array(targets) for targets in all_targets])
dataset = cx.VirtualDataset(f, 100, [(2,)], [(1,)], [(0,1)], [(0,1)],
cache_size=cache_size)
Generating batch: 0
dataset.inputs[0]
[0.0, 0.0]
dataset.inputs[50]
Generating batch: 6
[0.5, 0.5]
dataset.inputs[0]
Generating batch: 0
[0.0, 0.0]
dataset.inputs[0]
[0.0, 0.0]
def f(self):
    """Yield (inputs, targets) pairs forever, in position order from 0.

    One pair per ``yield``; intended for ``generator_ordered=True`` with
    ``load_cache_direct=False``.
    """
    position = 0
    while True:
        print("Generating position:", position)
        value = position / 100
        yield ([value, value], [value])
        position = position + 1
dataset = cx.VirtualDataset(f, 100, [(2,)], [(1,)], [(0,1)], [(0,1)],
generator_ordered=True,
load_cache_direct=False,
cache_size=cache_size)
Generating position: 0 Generating position: 1 Generating position: 2 Generating position: 3 Generating position: 4 Generating position: 5 Generating position: 6 Generating position: 7
dataset.inputs[0]
[0.0, 0.0]
dataset.inputs[20]
Generating position: 0 Generating position: 1 Generating position: 2 Generating position: 3 Generating position: 4 Generating position: 5 Generating position: 6 Generating position: 7 Generating position: 8 Generating position: 9 Generating position: 10 Generating position: 11 Generating position: 12 Generating position: 13 Generating position: 14 Generating position: 15 Generating position: 16 Generating position: 17 Generating position: 18 Generating position: 19 Generating position: 20 Generating position: 21 Generating position: 22 Generating position: 23
[0.20000000298023224, 0.20000000298023224]
dataset.inputs[24]
Generating position: 24 Generating position: 25 Generating position: 26 Generating position: 27 Generating position: 28 Generating position: 29 Generating position: 30 Generating position: 31
[0.23999999463558197, 0.23999999463558197]
dataset.inputs[0]
Generating position: 0 Generating position: 1 Generating position: 2 Generating position: 3 Generating position: 4 Generating position: 5 Generating position: 6 Generating position: 7
[0.0, 0.0]
def f(self):
    """Yield whole caches of (inputs, targets), ``cache_size`` positions each.

    Uses the module-level ``cache_size``; each yielded value is a pair of
    one-bank lists, each bank a numpy array of ``cache_size`` rows — the
    shape expected with ``load_cache_direct=True``.
    """
    start = 0
    while True:
        print("Generating positions:", start, "-", start + cache_size)
        end = start + cache_size
        input_bank = []
        target_bank = []
        for pos in range(start, end):
            input_bank.append([pos/100, pos/100])
            target_bank.append([pos/100])
        start = end
        yield ([np.array(input_bank)], [np.array(target_bank)])
dataset = cx.VirtualDataset(f, 100, [(2,)], [(1,)], [(0,1)], [(0,1)],
generator_ordered=True,
load_cache_direct=True,
cache_size=cache_size)
Generating positions: 0 - 8
dataset.inputs[0]
[0.0, 0.0]
dataset.inputs[25]
Generating positions: 0 - 8 Generating positions: 8 - 16 Generating positions: 16 - 24 Generating positions: 24 - 32
[0.25, 0.25]
dataset.inputs[8]
Generating positions: 0 - 8 Generating positions: 8 - 16
[0.08, 0.08]
def f(self):
    """Yield (inputs, targets) pairs for positions 0 through 99, then stop.

    A finite ordered generator; the dataset restarts it when it needs to
    rewind to an earlier position.
    """
    position = 0
    while position < 100:
        print("generating position:", position)
        value = position / 100
        yield ([value, value], [value])
        position += 1
dataset = cx.VirtualDataset(f, 100, [(2,)], [(1,)], [(0,1)], [(0,1)],
generator_ordered=True,
load_cache_direct=False,
cache_size=cache_size)
generating position: 0 generating position: 1 generating position: 2 generating position: 3 generating position: 4 generating position: 5 generating position: 6 generating position: 7
dataset.inputs[0]
[0.0, 0.0]
dataset.inputs[25]
generating position: 0 generating position: 1 generating position: 2 generating position: 3 generating position: 4 generating position: 5 generating position: 6 generating position: 7 generating position: 8 generating position: 9 generating position: 10 generating position: 11 generating position: 12 generating position: 13 generating position: 14 generating position: 15 generating position: 16 generating position: 17 generating position: 18 generating position: 19 generating position: 20 generating position: 21 generating position: 22 generating position: 23 generating position: 24 generating position: 25 generating position: 26 generating position: 27 generating position: 28 generating position: 29 generating position: 30 generating position: 31
[0.25, 0.25]
dataset.inputs[0]
generating position: 0 generating position: 1 generating position: 2 generating position: 3 generating position: 4 generating position: 5 generating position: 6 generating position: 7
[0.0, 0.0]
def f(self):
    """Endlessly yield random (inputs, targets) pairs (unordered generator).

    Each pair shares one fresh ``random.random()`` sample between the two
    inputs and the target, so inputs and target always agree.
    """
    while True:
        print("Generating a position!")
        sample = random.random()
        inputs = [sample, sample]
        targets = [sample]
        yield (inputs, targets)
dataset = cx.VirtualDataset(f, 100, [(2,)], [(1,)], [(0,1)], [(0,1)],
generator_ordered=False,
load_cache_direct=False,
cache_size=cache_size)
Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position!
dataset.inputs[0]
[0.7236223220825195, 0.7236223220825195]
dataset.inputs[0]
[0.7236223220825195, 0.7236223220825195]
dataset.inputs[10]
Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position!
[0.661766767501831, 0.661766767501831]
dataset.inputs[0]
Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position! Generating a position!
[0.2500929534435272, 0.2500929534435272]
def f(self):
    """Endlessly yield random caches of ``cache_size`` (inputs, targets) rows.

    Each row shares one ``random.random()`` sample between its two inputs
    and its target.  Yields one-bank lists of numpy arrays, the shape
    expected with ``load_cache_direct=True``.
    """
    while True:
        print("Generating a batch!")
        input_bank = []
        target_bank = []
        for _ in range(cache_size):
            sample = random.random()
            input_bank.append([sample, sample])
            target_bank.append([sample])
        yield ([np.array(input_bank)], [np.array(target_bank)])
dataset = cx.VirtualDataset(f, 100, [(2,)], [(1,)], [(0,1)], [(0,1)],
generator_ordered=False,
load_cache_direct=True,
cache_size=cache_size)
Generating a batch!
dataset.inputs[0]
[0.5143756422352775, 0.5143756422352775]
dataset.inputs[50]
Generating a batch!
[0.14642449150160008, 0.14642449150160008]
dataset.inputs[0]
Generating a batch!
[0.5630570620543651, 0.5630570620543651]
dataset.inputs[99]
Generating a batch!
[0.5517772337207292, 0.5517772337207292]
%%file test0.dat
[[0/3], [0/3]], [0/3]
Overwriting test0.dat
%%file test1.dat
[[1/3], [1/3]], [1/3]
Overwriting test1.dat
%%file test2.dat
[[2/3], [2/3]], [2/3]
Overwriting test2.dat
%%file test3.dat
[[3/3], [3/3]], [3/3]
Overwriting test3.dat
import glob
filenames = sorted(glob.glob("./*.dat"))
def f(self, pos):
    """Load the (inputs, targets) pair stored in the ``pos``-th data file.

    Relies on the module-level ``filenames`` list; to get a specific order,
    the files should always be numbered from the beginning so positions map
    to a stable sequence.
    """
    print("Generating position:", pos)
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked it: open(...).read() with no close()).
    with open(filenames[pos]) as data_file:
        source = data_file.read()
    # NOTE(review): eval() executes arbitrary code — acceptable here only
    # because these .dat files are written locally by this demo; never do
    # this with untrusted input.
    return eval(source)
filenames
['./test0.dat', './test1.dat', './test2.dat', './test3.dat']
f(None, 0)
Generating position: 0
([[0.0], [0.0]], [0.0])
dataset = cx.VirtualDataset(f, len(filenames), [(2,)], [(1,)], [(0,1)], [(0,1)],
load_cache_direct=False,
cache_size=3)
Generating position: 0 Generating position: 1 Generating position: 2
dataset.inputs[0]
[[0.0], [0.0]]
dataset.inputs[0]
[[0.0], [0.0]]
dataset.inputs[3]
Generating position: 3
[[1.0], [1.0]]