Notebook

Three Ways of Storing and Accessing Lots of Images in Python¶

@ysbecca

First, loading the CIFAR-10 data into memory as numpy arrays.

In [1]:

import numpy as np
import pickle
from pathlib import Path

# Path to the unzipped CIFAR data
data_dir = Path("data/cifar-10-batches-py/")

# Unpickle function provided by the CIFAR hosts
def unpickle(file):
    """ Loads one pickled CIFAR-10 batch file and returns its contents.
        Parameters:
        ---------------
        file    path of the batch file; it is opened in binary mode

        Returns:
        ----------
        The unpickled object. It is loaded with encoding="bytes", which is
        why the loading code indexes it with bytes keys such as b"data".
    """
    # NOTE(review): pickle.load can execute arbitrary code, so only use this
    # on batch files obtained from a trusted source.
    with open(file, "rb") as fo:
        # Named `batch` rather than `dict` so the builtin is not shadowed.
        batch = pickle.load(fo, encoding="bytes")
    return batch


# Accumulate every training image and its label across all batch files.
images, labels = [], []
# Sort the batch paths: glob() yields files in arbitrary filesystem order,
# which would make the image/label ordering differ between machines.
for batch in sorted(data_dir.glob("data_batch_*")):
    batch_data = unpickle(batch)
    for flat_im, label in zip(batch_data[b"data"], batch_data[b"labels"]):
        # Each image is flattened, with channels in order of R, G, B:
        # three consecutive runs of 1024 values, each one 32x32 channel.
        im_channels = [
            flat_im[j * 1024 : (j + 1) * 1024].reshape((32, 32))
            for j in range(3)
        ]
        # Reconstruct the original image by stacking the channels depth-wise
        images.append(np.dstack(im_channels))
        # Save the label
        labels.append(label)

print("Loaded CIFAR-10 training set:")
print(" - np.shape(images)     ", np.shape(images))
print(" - np.shape(labels)     ", np.shape(labels))

Loaded CIFAR-10 training set:
 - np.shape(images)      (50000, 32, 32, 3)
 - np.shape(labels)      (50000,)

Specifying the directories for storage using the different methods.

In [2]:

from pathlib import Path

# One output directory per storage method.
disk_dir = Path("data/disk/")
lmdb_dir = Path("data/lmdb/")
hdf5_dir = Path("data/hdf5/")

# Create the directories up front so that the store functions do not fail
# on a fresh run where they do not exist yet.
for storage_dir in (disk_dir, lmdb_dir, hdf5_dir):
    storage_dir.mkdir(parents=True, exist_ok=True)

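A helper class that bundles an image with its label, so that the pair can be pickled and stored as a single LMDB value.

Imports required for the methods, followed by the functions that store a single image with each method.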

In [3]:

class CIFAR_Image:
    """ Container holding one image as raw bytes together with its label. """

    def __init__(self, image, label):
        shape = image.shape
        # Keep the geometry so the array can be rebuilt from the raw bytes.
        # Not strictly needed for this dataset, but other datasets may mix
        # images of different sizes.
        self.channels = shape[2]
        self.size = shape[:2]

        # Raw pixel buffer and the associated label
        self.image = image.tobytes()
        self.label = label

    def get_image(self):
        """ Rebuilds and returns the image as a numpy array of uint8. """
        flat_pixels = np.frombuffer(self.image, dtype=np.uint8)
        target_shape = (*self.size, self.channels)
        return flat_pixels.reshape(target_shape)

In [4]:

# For disk
from PIL import Image
import csv

# For lmdb
import lmdb
import pickle

# For HDF5
import h5py


def store_single_disk(image, image_id, label):
    """ Stores a single image as a .png file on disk, and its label in a
        .csv file named after the same ID.
        Parameters:
        ---------------
        image       image array, (32, 32, 3) to be stored
        image_id    integer unique ID for image
        label       image label
    """
    Image.fromarray(image).save(disk_dir / f"{image_id}.png")

    # newline="" is what the csv module asks for on files it writes, so that
    # the writer controls the line endings itself on every platform.
    with open(
        disk_dir / f"{image_id}.csv", "wt", newline=""
    ) as csvfile:
        writer = csv.writer(
            csvfile,
            delimiter=" ",
            quotechar="|",
            quoting=csv.QUOTE_MINIMAL,
        )
        writer.writerow([label])


def store_single_lmdb(image, image_id, label):
    """ Stores a single image to a LMDB.
        Parameters:
        ---------------
        image       image array, (32, 32, 3) to be stored
        image_id    integer unique ID for image
        label       image label
    """

    # Upper bound for the database size: ten times the raw image size
    map_size = image.nbytes * 10

    # Create a new LMDB environment
    env = lmdb.open(
        str(lmdb_dir / "single_lmdb"), map_size=map_size
    )
    try:
        # Start a new write transaction
        with env.begin(write=True) as txn:
            # Keys and values are passed as bytes: the key is the
            # zero-padded ID encoded as ASCII, the value is the pickled
            # CIFAR_Image wrapper.
            value = CIFAR_Image(image, label)
            key = f"{image_id:08}"
            txn.put(key.encode("ascii"), pickle.dumps(value))
    finally:
        # Close the environment even if the write fails
        env.close()


def store_single_hdf5(image, image_id, label):
    """ Stores a single image to an HDF5 file.
        Parameters:
        ---------------
        image       image array, (32, 32, 3) to be stored
        image_id    integer unique ID for image
        label       image label
    """

    # Create a new HDF5 file; the context manager closes it even if one of
    # the writes below raises.
    with h5py.File(hdf5_dir / f"{image_id}.h5", "w") as file:
        # One dataset for the pixels and one for the label, both stored
        # with the h5py.h5t.STD_U8BE type.
        file.create_dataset(
            "image",
            np.shape(image),
            h5py.h5t.STD_U8BE,
            data=image,
        )
        file.create_dataset(
            "meta",
            np.shape(label),
            h5py.h5t.STD_U8BE,
            data=label,
        )


# Lookup table: storage method name -> function that writes one image.
_store_single_funcs = {
    "disk": store_single_disk,
    "lmdb": store_single_lmdb,
    "hdf5": store_single_hdf5,
}

/Users/ysbecca/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

Run the write single image experiment

In [5]:

from timeit import timeit

# Seconds taken to write the first image once with each storage method.
store_single_timings = {}

for method in ["disk", "lmdb", "hdf5"]:
    # The statement is a string evaluated against the notebook globals, so
    # `method` is looked up at run time; `setup` runs outside the timed part.
    t = timeit(
        stmt="_store_single_funcs[method](image, 0, label)",
        setup="image=images[0]; label=labels[0]",
        number=1,
        globals=globals(),
    )
    store_single_timings[method] = t
    print(f"Method: {method}, Time usage: {t}")

Method: disk, Time usage: 0.02178083499893546
Method: lmdb, Time usage: 0.0024577950025559403
Method: hdf5, Time usage: 0.00864763500430854

In [6]:

def store_many_disk(images, labels):
    """ Stores an array of images to disk: one .png per image, named by its
        index, plus a single .csv holding all labels, named by the number
        of images.
        Parameters:
        ---------------
        images       images array, (N, 32, 32, 3) to be stored
        labels       labels array, (N,) to be stored
    """
    num_images = len(images)

    # Save all the images one by one
    for i, image in enumerate(images):
        Image.fromarray(image).save(disk_dir / f"{i}.png")

    # Save all the labels to the csv file.
    # newline="" is what the csv module asks for on files it writes, so that
    # the writer controls the line endings itself on every platform.
    with open(
        disk_dir / f"{num_images}.csv", "w", newline=""
    ) as csvfile:
        writer = csv.writer(
            csvfile,
            delimiter=" ",
            quotechar="|",
            quoting=csv.QUOTE_MINIMAL,
        )
        for label in labels:
            # Remember that this typically would be more than just one value per row
            writer.writerow([label])


def store_many_lmdb(images, labels):
    """ Stores an array of images to LMDB.
        Parameters:
        ---------------
        images       images array, (N, 32, 32, 3) to be stored
        labels       labels array, (N,) to be stored
    """
    num_images = len(images)

    # Upper bound for the database size: ten times the raw size of all
    # images, estimated from the first one
    map_size = num_images * images[0].nbytes * 10

    # Create a new LMDB DB for all the images
    env = lmdb.open(
        str(lmdb_dir / f"{num_images}_lmdb"),
        map_size=map_size,
    )
    try:
        # Same as for a single image, but write all the images in a single
        # transaction
        with env.begin(write=True) as txn:
            for i in range(num_images):
                # Keys and values are passed as bytes: the zero-padded
                # index encoded as ASCII, and the pickled CIFAR_Image
                value = CIFAR_Image(images[i], labels[i])
                key = f"{i:08}"
                txn.put(
                    key.encode("ascii"), pickle.dumps(value)
                )
    finally:
        # Close the environment even if a write fails
        env.close()


def store_many_hdf5(images, labels):
    """ Stores an array of images to HDF5.
        Parameters:
        ---------------
        images       images array, (N, 32, 32, 3) to be stored
        labels       labels array, (N,) to be stored
    """
    num_images = len(images)

    # Create a new HDF5 file; the context manager closes it even if one of
    # the writes below raises.
    with h5py.File(
        hdf5_dir / f"{num_images}_many.h5", "w"
    ) as file:
        # One dataset for all the pixels and one for all the labels, both
        # stored with the h5py.h5t.STD_U8BE type.
        file.create_dataset(
            "images",
            np.shape(images),
            h5py.h5t.STD_U8BE,
            data=images,
        )
        file.create_dataset(
            "meta",
            np.shape(labels),
            h5py.h5t.STD_U8BE,
            data=labels,
        )


# Lookup table: storage method name -> function that writes many images.
_store_many_funcs = {
    "disk": store_many_disk,
    "lmdb": store_many_lmdb,
    "hdf5": store_many_hdf5,
}

Run the multiple images experiment now.

In [7]:

cutoffs = [10, 100, 1000, 10000, 100000]

# Let's double our images so that we have 100,000.
# The guard keeps this cell idempotent: re-running it must not double the
# data a second time.
if len(images) < cutoffs[-1]:
    images = np.concatenate((images, images), axis=0)
    labels = np.concatenate((labels, labels), axis=0)

# Make sure both are numpy arrays even when no doubling was needed
images = np.asarray(images)
labels = np.asarray(labels)

print(np.shape(images))
print(np.shape(labels))

(100000, 32, 32, 3)
(100000,)

In [8]:

from timeit import timeit

# Seconds taken to write the first `cutoff` images, per storage method;
# each list has one entry per value in `cutoffs`, in the same order.
store_many_timings = {"disk": [], "lmdb": [], "hdf5": []}

for cutoff in cutoffs:
    for method in ("disk", "lmdb", "hdf5"):
        # The slicing happens in `setup`, so it is not part of the timing
        t = timeit(
            "_store_many_funcs[method](images_, labels_)",
            setup="images_=images[:cutoff]; labels_=labels[:cutoff]",
            number=1,
            globals=globals(),
        )
        store_many_timings[method].append(t)

        # Print out the method, cutoff, and elapsed time
        print(
            f"Method: {method}, No. images: {cutoff}, Time usage: {t}"
        )

Method: disk, Time usage: 0.011318083998048678
Method: lmdb, Time usage: 0.0015257080012816004
Method: hdf5, Time usage: 0.002065688000584487
Method: disk, Time usage: 0.1060425999967265
Method: lmdb, Time usage: 0.007775104997563176
Method: hdf5, Time usage: 0.00335644900042098
Method: disk, Time usage: 0.6783636820036918
Method: lmdb, Time usage: 0.03405023599771084
Method: hdf5, Time usage: 0.004012751996924635
Method: disk, Time usage: 8.831336139999621
Method: lmdb, Time usage: 0.48035822899691993
Method: hdf5, Time usage: 0.03763485200033756
Method: disk, Time usage: 80.57666425999923
Method: lmdb, Time usage: 4.341894056000456
Method: hdf5, Time usage: 0.28518653900391655

In [9]:

# Display the collected write timings (one list per method, ordered by cutoff)
store_many_timings

Out[9]:

{'disk': [0.011318083998048678,
  0.1060425999967265,
  0.6783636820036918,
  8.831336139999621,
  80.57666425999923],
 'hdf5': [0.002065688000584487,
  0.00335644900042098,
  0.004012751996924635,
  0.03763485200033756,
  0.28518653900391655],
 'lmdb': [0.0015257080012816004,
  0.007775104997563176,
  0.03405023599771084,
  0.48035822899691993,
  4.341894056000456]}

Let's visualise those results.

In [12]:

import matplotlib.pyplot as plt

plt.rcParams.update({'font.size': 20})

def plot_with_legend(
    x_range,
    y_data,
    legend_labels,
    x_label,
    y_label,
    title,
    log=False,
):
    """ Displays a single plot with multiple datasets and matching legends.
        Parameters:
        --------------
        x_range         list of x values, shared by every data set
        y_data          list of lists containing y values, one per data set
        legend_labels   list of string legend labels, one per data set
        x_label         x axis label
        y_label         y axis label
        title           plot title
        log             if True, plot with plt.loglog instead of plt.plot

        Raises:
        --------------
        TypeError       if y_data and legend_labels differ in length
    """
    # Validate first, so that a bad call does not leave an unused figure
    # behind.
    if len(y_data) != len(legend_labels):
        raise TypeError(
            "Error: the number of data sets does not match the number of labels provided."
        )

    # Newer matplotlib releases ship this style under a "seaborn-v0_8-"
    # prefix; use whichever of the two names is installed.
    for style_name in ("seaborn-whitegrid", "seaborn-v0_8-whitegrid"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    plt.figure(figsize=(10, 7))

    draw = plt.loglog if log else plt.plot
    all_plots = []
    for data, label in zip(y_data, legend_labels):
        # Both plotting functions return a list with one line handle here
        line, = draw(x_range, data, label=label)
        all_plots.append(line)

    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(handles=all_plots)
    # No second plt.figure() call here: it would only add an empty figure
    # that is displayed alongside the real one.
    plt.show()

In [13]:

# Write timings per method, one value per entry in `cutoffs`
disk_x = store_many_timings["disk"]
lmdb_x = store_many_timings["lmdb"]
hdf5_x = store_many_timings["hdf5"]

# Same data twice: first on linear axes, then on log-log axes
for plot_title, use_log in (
    ("Storage time", False),
    ("Log storage time", True),
):
    plot_with_legend(
        cutoffs,
        [disk_x, lmdb_x, hdf5_x],
        ["PNG files", "LMDB", "HDF5"],
        "Number of images",
        "Seconds to store",
        plot_title,
        log=use_log,
    )

<matplotlib.figure.Figure at 0x11aa400b8>

<matplotlib.figure.Figure at 0x11a6e7908>

Visualise how much disk space each storage method uses.

In [14]:

# Disk space used in KB, one value per entry in `cutoffs`
disk_mem = [24, 204, 2004, 20032, 200296]
lmdb_mem = [60, 420, 4000, 39000, 393000]
hdf5_mem = [36, 304, 2900, 29000, 293000]

X = [disk_mem, lmdb_mem, hdf5_mem]

ind = np.arange(3)
width = 0.35

plt.subplots(figsize=(8, 10))
# Stack the segments so that the top of each one sits exactly at the space
# used for that cutoff: the first starts at zero, and each later segment
# adds only the increase over the previous cutoff. Using the absolute value
# as the height would push the bar top above the real total.
plots = [plt.bar(ind, [row[0] for row in X], width)]
for i in range(1, len(cutoffs)):
    plots.append(
        plt.bar(
            ind,
            [row[i] - row[i - 1] for row in X],
            width,
            bottom=[row[i - 1] for row in X],
        )
    )

plt.ylabel("Memory in KB")
plt.title("Disk memory used by method")
plt.xticks(ind, ("PNG", "LMDB", "HDF5"))
plt.yticks(np.arange(0, 400000, 100000))

plt.legend(
    [plot[0] for plot in plots],
    ("10", "100", "1,000", "10,000", "100,000"),
)
plt.show()

Read out a single image.

In [15]:

def read_single_disk(image_id):
    """ Reads a single image and its label from disk.
        Parameters:
        ---------------
        image_id    integer unique ID for image

        Returns:
        ----------
        image       image array read from the .png file
        label       associated meta data, int label
    """
    # Convert to an array inside the `with` block, so the pixel data is
    # loaded before the image file is closed.
    with Image.open(disk_dir / f"{image_id}.png") as png:
        image = np.array(png)

    # newline="" is what the csv module asks for on files it reads
    with open(
        disk_dir / f"{image_id}.csv", "r", newline=""
    ) as csvfile:
        reader = csv.reader(
            csvfile,
            delimiter=" ",
            quotechar="|",
            quoting=csv.QUOTE_MINIMAL,
        )
        # The label is the first field of the first row
        label = int(next(reader)[0])

    return image, label


def read_single_lmdb(image_id):
    """ Reads a single image and its label from LMDB.
        Parameters:
        ---------------
        image_id    integer unique ID for image

        Returns:
        ----------
        image       image array rebuilt from the stored CIFAR_Image
        label       associated meta data, int label

        Raises:
        ----------
        KeyError    if no entry is stored under image_id
    """

    # Open the LMDB environment
    env = lmdb.open(
        str(lmdb_dir / "single_lmdb"), readonly=True
    )
    try:
        # Start a new read transaction
        with env.begin() as txn:
            # Encode the key the same way as we stored it
            data = txn.get(f"{image_id:08}".encode("ascii"))
            if data is None:
                # Fail with a clear message instead of letting
                # pickle.loads(None) raise an unrelated TypeError
                raise KeyError(f"No image stored with ID {image_id}")
            # NOTE(review): pickle.loads can execute arbitrary code; only
            # read databases written by the store functions of this notebook.
            # Remember that it's a CIFAR_Image object that we get back out
            cifar_image = pickle.loads(data)
            # Retrieve the relevant bits
            image = cifar_image.get_image()
            label = cifar_image.label
    finally:
        # Close the environment even if the read fails
        env.close()

    return image, label


def read_single_hdf5(image_id):
    """ Reads a single image and its label from an HDF5 file.
        Parameters:
        ---------------
        image_id    integer unique ID for image

        Returns:
        ----------
        image       image array, converted to uint8
        label       associated meta data, int label
    """

    # Open the HDF5 file read-only, since nothing is written here. The
    # context manager closes it again; previously the handle was left open.
    with h5py.File(hdf5_dir / f"{image_id}.h5", "r") as file:
        # Copy both datasets into memory before the file is closed
        image = np.array(file["/image"]).astype("uint8")
        label = int(np.array(file["/meta"]).astype("uint8"))

    return image, label


# Lookup table: storage method name -> function that reads one image.
_read_single_funcs = {
    "disk": read_single_disk,
    "lmdb": read_single_lmdb,
    "hdf5": read_single_hdf5,
}

In [16]:

from timeit import timeit

# Seconds taken to read image 0 back once with each storage method.
read_single_timings = dict()

for method in ("disk", "lmdb", "hdf5"):
    # No `setup` needed: the timed statement only uses the image ID. The
    # previous setup built `image` and `label` values that were never used.
    t = timeit(
        "_read_single_funcs[method](0)",
        number=1,
        globals=globals(),
    )
    read_single_timings[method] = t
    print(f"Method: {method}, Time usage: {t}")

Method: disk, Time usage: 0.0029513100016629323
Method: lmdb, Time usage: 0.0010519620045670308
Method: hdf5, Time usage: 0.0038483430034830235

In [17]:

# Display the collected single-image read timings (seconds per method)
read_single_timings

Out[17]:

{'disk': 0.0029513100016629323,
 'hdf5': 0.0038483430034830235,
 'lmdb': 0.0010519620045670308}

Reading in many images

In [18]:

def read_many_disk(num_images):
    """ Reads images and labels from disk.
        Parameters:
        ---------------
        num_images   number of images to read

        Returns:
        ----------
        images      list of num_images image arrays
        labels      list of int labels, one per row of the labels .csv
    """
    images, labels = [], []

    # Loop over all IDs and read each image in one by one
    for image_id in range(num_images):
        # Convert inside the `with` block, so the pixel data is loaded
        # before the image file is closed
        with Image.open(disk_dir / f"{image_id}.png") as png:
            images.append(np.array(png))

    # newline="" is what the csv module asks for on files it reads
    with open(
        disk_dir / f"{num_images}.csv", "r", newline=""
    ) as csvfile:
        reader = csv.reader(
            csvfile,
            delimiter=" ",
            quotechar="|",
            quoting=csv.QUOTE_MINIMAL,
        )
        for row in reader:
            # The label is the first field of each row
            labels.append(int(row[0]))
    return images, labels


def read_many_lmdb(num_images):
    """ Reads images and labels from LMDB.
        Parameters:
        ---------------
        num_images   number of images to read

        Returns:
        ----------
        images      list of num_images image arrays
        labels      list of num_images labels

        Raises:
        ----------
        KeyError    if one of the IDs 0..num_images-1 is not stored
    """
    images, labels = [], []
    env = lmdb.open(
        str(lmdb_dir / f"{num_images}_lmdb"), readonly=True
    )
    try:
        # Start a new read transaction
        with env.begin() as txn:
            # Read all images in one single transaction
            # We could split this up into multiple transactions if needed
            for image_id in range(num_images):
                data = txn.get(f"{image_id:08}".encode("ascii"))
                if data is None:
                    # Fail with a clear message instead of letting
                    # pickle.loads(None) raise an unrelated TypeError
                    raise KeyError(
                        f"No image stored with ID {image_id}"
                    )
                # NOTE(review): pickle.loads can execute arbitrary code;
                # only read databases written by this notebook.
                # Remember that it's a CIFAR_Image object that is stored as the value
                cifar_image = pickle.loads(data)
                # Retrieve the relevant bits
                images.append(cifar_image.get_image())
                labels.append(cifar_image.label)
    finally:
        # Close the environment even if a read fails
        env.close()
    return images, labels


def read_many_hdf5(num_images):
    """ Reads images and labels from HDF5.
        Parameters:
        ---------------
        num_images   number of images stored in the file to read

        Returns:
        ----------
        images      images array, converted to uint8
        labels      labels array, converted to uint8
    """
    # Open the HDF5 file read-only, since nothing is written here. The
    # context manager closes it again; previously the handle was left open.
    with h5py.File(
        hdf5_dir / f"{num_images}_many.h5", "r"
    ) as file:
        # Copy both datasets into memory before the file is closed
        images = np.array(file["/images"]).astype("uint8")
        labels = np.array(file["/meta"]).astype("uint8")

    return images, labels


# Lookup table: storage method name -> function that reads many images.
_read_many_funcs = {
    "disk": read_many_disk,
    "lmdb": read_many_lmdb,
    "hdf5": read_many_hdf5,
}

In [19]:

from timeit import timeit

# Seconds taken to read `cutoff` images back, per storage method; each list
# has one entry per value in `cutoffs`, in the same order.
read_many_timings = dict(disk=[], lmdb=[], hdf5=[])

for cutoff in cutoffs:
    for method in ["disk", "lmdb", "hdf5"]:
        # The statement is a string evaluated against the notebook globals;
        # `setup` binds the image count outside the timed part.
        t = timeit(
            stmt="_read_many_funcs[method](num_images)",
            setup="num_images=cutoff",
            number=1,
            globals=globals(),
        )
        read_many_timings[method].append(t)

        # Report the method, the number of images, and the elapsed time
        print(
            f"Method: {method}, No. images: {cutoff}, Time usage: {t}"
        )

Method: disk, No. images: 10, Time usage: 0.007798415004799608
Method: lmdb, No. images: 10, Time usage: 0.0014411589945666492
Method: hdf5, No. images: 10, Time usage: 0.0024644029981573112
Method: disk, No. images: 100, Time usage: 0.07457431899820222
Method: lmdb, No. images: 100, Time usage: 0.009639914002036676
Method: hdf5, No. images: 100, Time usage: 0.004666212997108232
Method: disk, No. images: 1000, Time usage: 0.5799051089998102
Method: lmdb, No. images: 1000, Time usage: 0.04127998500189278
Method: hdf5, No. images: 1000, Time usage: 0.014238975003536325
Method: disk, No. images: 10000, Time usage: 5.8348617760057095
Method: lmdb, No. images: 10000, Time usage: 0.31768411499797367
Method: hdf5, No. images: 10000, Time usage: 0.09621400500327582
Method: disk, No. images: 100000, Time usage: 62.479549773001054
Method: lmdb, No. images: 100000, Time usage: 3.457494147995021
Method: hdf5, No. images: 100000, Time usage: 1.3067588940029964

In [20]:

# Read timings per method, one value per entry in `cutoffs`
disk_x_r = read_many_timings["disk"]
lmdb_x_r = read_many_timings["lmdb"]
hdf5_x_r = read_many_timings["hdf5"]

# Same data twice: first on linear axes, then on log-log axes
for plot_title, use_log in (
    ("Read time", False),
    ("Log read time", True),
):
    plot_with_legend(
        cutoffs,
        [disk_x_r, lmdb_x_r, hdf5_x_r],
        ["PNG files", "LMDB", "HDF5"],
        "Number of images",
        "Seconds to read",
        plot_title,
        log=use_log,
    )

<matplotlib.figure.Figure at 0x11a92de80>

<matplotlib.figure.Figure at 0x118647860>

Compare with the write times.

In [22]:

# Read and write timings together: first on linear axes, then on log-log axes
for plot_title, use_log in (
    ("Store and Read Times", False),
    ("Log Store and Read Times", True),
):
    plot_with_legend(
        cutoffs,
        [disk_x_r, lmdb_x_r, hdf5_x_r, disk_x, lmdb_x, hdf5_x],
        [
            "Read PNG",
            "Read LMDB",
            "Read HDF5",
            "Write PNG",
            "Write LMDB",
            "Write HDF5",
        ],
        "Number of images",
        "Seconds",
        plot_title,
        log=use_log,
    )

<matplotlib.figure.Figure at 0x11aab70f0>

<matplotlib.figure.Figure at 0x11f134f60>

In [ ]: