#!/usr/bin/env python
# coding: utf-8

# Deep Learning Models -- A collection of various deep learning architectures, models, and tips for TensorFlow and PyTorch in Jupyter Notebooks.
# - Author: Sebastian Raschka
# - GitHub Repository: https://github.com/rasbt/deeplearning-models

# In[1]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -v -p torch")


# # Model Zoo -- Using PyTorch Dataset Loading Utilities for Custom Datasets (CSV files converted to HDF5)

# This notebook provides an example of how to load a dataset from an HDF5 file that was created from a CSV file, using PyTorch's data loading utilities. For a more in-depth discussion, please see the official
# 
# - [Data Loading and Processing Tutorial](http://pytorch.org/tutorials/beginner/data_loading_tutorial.html)
# - [torch.utils.data](http://pytorch.org/docs/master/data.html) API documentation
# 
# The Hierarchical Data Format (HDF) is a convenient format that allows quick access to individual data instances during minibatch learning if a dataset is too large to fit into memory. The approach outlined in this notebook uses the common [HDF5](https://support.hdfgroup.org/HDF5/) format and should be accessible to any programming language or tool with an HDF5 API.
# 
# **In this example, we are going to use the Iris dataset for illustrative purposes. Let's pretend it's our large training dataset that doesn't fit into memory.**

# ## Imports

# In[2]:


import pandas as pd
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


# ## Converting a CSV file to HDF5

# In this first step, we are going to process a CSV file (here, Iris) into an HDF5 database:

# In[3]:


# suppose this is a large CSV that does not
# fit into memory:
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# Get the number of lines in the CSV file if it's on your hard drive
# (requires `import subprocess`):
# num_lines = subprocess.check_output(['wc', '-l', csv_path])
# num_lines = int(num_lines.split()[0])
num_lines = 150
num_features = 4

class_dict = {'Iris-setosa': 0,
              'Iris-versicolor': 1,
              'Iris-virginica': 2}

# use 10,000 or 100,000 or so for large files
chunksize = 10

# this is your HDF5 database:
with h5py.File('iris.h5', 'w') as h5f:

    # use num_lines-1 if the csv file has a column header
    dset1 = h5f.create_dataset('features',
                               shape=(num_lines, num_features),
                               compression=None,
                               dtype='float32')
    dset2 = h5f.create_dataset('labels',
                               shape=(num_lines,),
                               compression=None,
                               dtype='int32')

    # change the range start from 0 -> 1 if your csv file contains a column header
    for i in range(0, num_lines, chunksize):

        df = pd.read_csv(csv_path,
                         header=None,      # the csv file has no column header
                         nrows=chunksize,  # number of rows to read at each iteration
                         skiprows=i)       # skip rows that were already read

        # map the class name strings to integer labels
        df[4] = df[4].map(class_dict)

        features = df.values[:, :4]
        labels = df.values[:, -1]

        # use i-1 and i-1+chunksize if the csv file has a column header
        dset1[i:i+chunksize, :] = features
        dset2[i:i+chunksize] = labels


# After creating the database, let's double-check that everything works correctly:

# In[4]:


with h5py.File('iris.h5', 'r') as h5f:
    print(h5f['features'].shape)
    print(h5f['labels'].shape)


# In[5]:


with h5py.File('iris.h5', 'r') as h5f:
    print('Features of entry no. 99:', h5f['features'][99])
    print('Class label of entry no. 99:', h5f['labels'][99])
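
# Since HDF5 supports efficient random access, we can also read an arbitrary slice of rows without loading the full file into memory. The cell below is just a small illustrative sketch; the slice bounds are arbitrary:

# In[ ]:


with h5py.File('iris.h5', 'r') as h5f:
    # only rows 50-54 are read from disk; the rest of the file is not loaded
    print('Features of entries 50-54:\n', h5f['features'][50:55])
    print('Class labels of entries 50-54:', h5f['labels'][50:55])
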
# ## Implementing a Custom Dataset Class

# Now, we implement a custom `Dataset` class for reading the training examples. The `__getitem__` method will
# 
# 1. read a single training example from HDF5 based on an `index` (more on batching later)
# 2. return a single training example and its corresponding label
# 
# Note that we will keep an open connection to the database for efficiency via `self.h5f = h5py.File(h5_path, 'r')` -- you may want to close it when you are done (more on this later).

# In[6]:


class Hdf5Dataset(Dataset):
    """Custom Dataset for loading entries from HDF5 databases"""

    def __init__(self, h5_path, transform=None):
        self.h5f = h5py.File(h5_path, 'r')
        self.num_entries = self.h5f['labels'].shape[0]
        self.transform = transform

    def __getitem__(self, index):
        features = self.h5f['features'][index]
        label = self.h5f['labels'][index]
        if self.transform is not None:
            features = self.transform(features)
        return features, label

    def __len__(self):
        return self.num_entries


# Now that we have created our custom Dataset class, we can initialize a Dataset instance for the training examples using the 'iris.h5' database file. Then, we initialize a `DataLoader` that allows us to read from the dataset.

# In[7]:


train_dataset = Hdf5Dataset(h5_path='iris.h5',
                            transform=None)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=50,
                          shuffle=True,
                          num_workers=4)


# That's it! Now we can iterate over an epoch by using `train_loader` as an iterator and use the features and labels from the training dataset for model training, as shown in the next section.

# ## Iterating Through the Custom Dataset

# In[8]:


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)

num_epochs = 5
for epoch in range(num_epochs):

    for batch_idx, (x, y) in enumerate(train_loader):

        print('Epoch:', epoch+1, end='')
        print(' | Batch index:', batch_idx, end='')
        print(' | Batch size:', y.size()[0])

        x = x.to(device)
        y = y.to(device)

        # do model training on x and y here


# **Remember that we kept an open connection to the HDF5 database in the `Hdf5Dataset` (via `self.h5f = h5py.File(h5_path, 'r')`). Once we are done, we may want to close this connection:**

# In[9]:


train_dataset.h5f.close()


# In[10]:


get_ipython().run_line_magic('watermark', '-iv')
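
# **A possible variant (a sketch, not the approach used above):** keeping a single `h5py.File` handle open in `__init__` can become problematic when the `DataLoader` uses multiple worker processes, because an open HDF5 handle generally should not be shared across processes. One alternative is to store only the file path and open the file lazily on first access, so that each worker process opens its own handle. The class name `LazyHdf5Dataset` below is only an illustrative choice:

# In[ ]:


class LazyHdf5Dataset(Dataset):
    """Sketch of a Dataset variant that opens the HDF5 file lazily,
    i.e., separately in each DataLoader worker process."""

    def __init__(self, h5_path, transform=None):
        self.h5_path = h5_path
        self.transform = transform
        self.h5f = None
        # read the number of entries once, then close the handle again
        with h5py.File(h5_path, 'r') as h5f:
            self.num_entries = h5f['labels'].shape[0]

    def __getitem__(self, index):
        if self.h5f is None:
            # opened on first access, i.e., inside the worker process
            self.h5f = h5py.File(self.h5_path, 'r')
        features = self.h5f['features'][index]
        label = self.h5f['labels'][index]
        if self.transform is not None:
            features = self.transform(features)
        return features, label

    def __len__(self):
        return self.num_entries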