#!/usr/bin/env python
# coding: utf-8

# Deep Learning Models -- A collection of various deep learning architectures, models, and tips for TensorFlow and PyTorch in Jupyter Notebooks.
# - Author: Sebastian Raschka
# - GitHub Repository: https://github.com/rasbt/deeplearning-models

# In[1]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -v -p torch")


# # Model Zoo -- Using PyTorch Dataset Loading Utilities for Custom Datasets (CSV files converted to HDF5)

# This notebook provides an example of how to load a dataset from an HDF5 file that was created from a CSV file, using PyTorch's data loading utilities. For a more in-depth discussion, please see the official
# 
# - [Data Loading and Processing Tutorial](http://pytorch.org/tutorials/beginner/data_loading_tutorial.html)
# - [torch.utils.data](http://pytorch.org/docs/master/data.html) API documentation
# 
# The Hierarchical Data Format (HDF) is a convenient format that allows quick access to individual data instances during minibatch learning if a dataset is too large to fit into memory. The approach outlined in this notebook uses the common [HDF5](https://support.hdfgroup.org/HDF5/) format and should be accessible to any programming language or tool with an HDF5 API.
# 
# **In this example, we are going to use the Iris dataset for illustrative purposes. Let's pretend it's our large training dataset that doesn't fit into memory.**

# ## Imports

# In[2]:


import pandas as pd
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


# ## Converting a CSV file to HDF5

# In this first step, we are going to process a CSV file (here, Iris) into an HDF5 database:

# In[3]:


# suppose this is a large CSV that does not
# fit into memory:
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Get the number of lines in the CSV file if it's on your hard drive
# (requires `import subprocess`):
# num_lines = subprocess.check_output(['wc', '-l', csv_path])
# num_lines = int(num_lines.split()[0])
num_lines = 150
num_features = 4

class_dict = {'Iris-setosa': 0,
              'Iris-versicolor': 1,
              'Iris-virginica': 2}

# use 10,000 or 100,000 or so for large files
chunksize = 10

# this is your HDF5 database:
with h5py.File('iris.h5', 'w') as h5f:

    # use num_lines-1 if the csv file has a column header
    dset1 = h5f.create_dataset('features',
                               shape=(num_lines, num_features),
                               compression=None,
                               dtype='float32')
    dset2 = h5f.create_dataset('labels',
                               shape=(num_lines,),
                               compression=None,
                               dtype='int32')

    # change the range start from 0 -> 1 if your csv file contains a column header
    for i in range(0, num_lines, chunksize):

        df = pd.read_csv(csv_path,
                         header=None,      # the csv file has no column header
                         nrows=chunksize,  # number of rows to read at each iteration
                         skiprows=i)       # skip rows that were already read

        # map the class name strings to integer labels
        df[4] = df[4].map(class_dict)

        features = df.values[:, :4]
        labels = df.values[:, -1]

        # use i-1 and i-1+chunksize if the csv file has a column header
        dset1[i:i+chunksize, :] = features
        dset2[i:i+chunksize] = labels


# After creating the database, let's double-check that everything works correctly:

# In[4]:


with h5py.File('iris.h5', 'r') as h5f:
    print(h5f['features'].shape)
    print(h5f['labels'].shape)


# In[5]:


with h5py.File('iris.h5', 'r') as h5f:
    print('Features of entry no. 99:', h5f['features'][99])
    print('Class label of entry no. 99:', h5f['labels'][99])
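
# Since HDF5 supports efficient random access, we can also read an arbitrary slice of rows without loading the full file into memory. The cell below is just a small illustrative sketch; the slice bounds are arbitrary:

# In[ ]:


with h5py.File('iris.h5', 'r') as h5f:
    # only rows 50-54 are read from disk; the rest of the file is not loaded
    print('Features of entries 50-54:\n', h5f['features'][50:55])
    print('Class labels of entries 50-54:', h5f['labels'][50:55])
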
# ## Implementing a Custom Dataset Class

# Now, we implement a custom `Dataset` class for reading the training examples. The `__getitem__` method will
# 
# 1. read a single training example from HDF5 based on an `index` (more on batching later)
# 2. return a single training example and its corresponding label
# 
# Note that we will keep an open connection to the database for efficiency via `self.h5f = h5py.File(h5_path, 'r')` -- you may want to close it when you are done (more on this later).

# In[6]:


class Hdf5Dataset(Dataset):
    """Custom Dataset for loading entries from HDF5 databases"""

    def __init__(self, h5_path, transform=None):
        self.h5f = h5py.File(h5_path, 'r')
        self.num_entries = self.h5f['labels'].shape[0]
        self.transform = transform

    def __getitem__(self, index):
        features = self.h5f['features'][index]
        label = self.h5f['labels'][index]
        if self.transform is not None:
            features = self.transform(features)
        return features, label

    def __len__(self):
        return self.num_entries


# Now that we have created our custom Dataset class, we can initialize a Dataset instance for the training examples using the 'iris.h5' database file. Then, we initialize a `DataLoader` that allows us to read from the dataset.

# In[7]:


train_dataset = Hdf5Dataset(h5_path='iris.h5',
                            transform=None)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=50,
                          shuffle=True,
                          num_workers=4)


# That's it! Now we can iterate over an epoch by using `train_loader` as an iterator and use the features and labels from the training dataset for model training, as shown in the next section.

# ## Iterating Through the Custom Dataset

# In[8]:


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)

num_epochs = 5
for epoch in range(num_epochs):

    for batch_idx, (x, y) in enumerate(train_loader):

        print('Epoch:', epoch+1, end='')
        print(' | Batch index:', batch_idx, end='')
        print(' | Batch size:', y.size()[0])

        x = x.to(device)
        y = y.to(device)

        # do model training on x and y here


# **Remember that we kept an open connection to the HDF5 database in the `Hdf5Dataset` (via `self.h5f = h5py.File(h5_path, 'r')`). Once we are done, we may want to close this connection:**

# In[9]:


train_dataset.h5f.close()


# In[10]:


get_ipython().run_line_magic('watermark', '-iv')
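
# **A possible variant (a sketch, not the approach used above):** keeping a single `h5py.File` handle open in `__init__` can become problematic when the `DataLoader` uses multiple worker processes, because an open HDF5 handle generally should not be shared across processes. One alternative is to store only the file path and open the file lazily on first access, so that each worker process opens its own handle. The class name `LazyHdf5Dataset` below is only an illustrative choice:

# In[ ]:


class LazyHdf5Dataset(Dataset):
    """Sketch of a Dataset variant that opens the HDF5 file lazily,
    i.e., separately in each DataLoader worker process."""

    def __init__(self, h5_path, transform=None):
        self.h5_path = h5_path
        self.transform = transform
        self.h5f = None
        # read the number of entries once, then close the handle again
        with h5py.File(h5_path, 'r') as h5f:
            self.num_entries = h5f['labels'].shape[0]

    def __getitem__(self, index):
        if self.h5f is None:
            # opened on first access, i.e., inside the worker process
            self.h5f = h5py.File(self.h5_path, 'r')
        features = self.h5f['features'][index]
        label = self.h5f['labels'][index]
        if self.transform is not None:
            features = self.transform(features)
        return features, label

    def __len__(self):
        return self.num_entries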