#!/usr/bin/env python
# coding: utf-8

# # Introduction

# This IPython notebook illustrates how to read a CSV file from disk as a table and set its metadata.

# First, we need to import *py_entitymatching* package and other libraries as follows:

# In[2]:


import py_entitymatching as em
import pandas as pd
import os, sys


# # Different Ways to Read a CSV File and Set Metadata

# First, we need to get the path of the CSV file on disk. For the convenience of the user, we have included some sample files in the package. The path of a sample CSV file can be obtained like this:
# 
# 

# In[3]:


# Get the datasets directory that ships inside the installed package.
# os.path.join leaves the separator handling to the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the path of the input table (sample CSV file for table A)
path_A = os.path.join(datasets_dir, 'person_table_A.csv')


# In[4]:


# Display the first three lines of the file in path_A.
# Plain Python file I/O is used instead of an IPython shell escape
# (`cat ... | head -3`) so that this also runs outside IPython and on
# platforms that lack the Unix `cat`/`head` tools.
with open(path_A) as csv_file:
    for _ in range(3):
        line = csv_file.readline()
        if not line:
            # The file has fewer than three lines; nothing more to show.
            break
        sys.stdout.write(line)


# Once we get the CSV file path, we can use it to read the contents and set metadata.

# ## Different Ways to Read a CSV File and Set Metadata

# There are three different ways to read a CSV file and set  metadata:
# 
# 1. Read a CSV file first, and then set the metadata
# 2. Read a CSV file and set the metadata together
# 3. Read a CSV file and set the metadata from a file on disk

# #### Read the CSV file First and Then Set the Metadata

# First, read the CSV files as follows:

# In[5]:


# Read the CSV file at path_A into table A. No key is passed here; the key
# metadata is set in a separate step.
A = em.read_csv_metadata(path_A)


# In[6]:


# Preview the leading rows of A. As a bare expression, the result is only
# displayed in an interactive (notebook) session.
A.head()


# In[7]:


# Display the 'type' of A
type(A)


# Then set the metadata for the table. We see `ID` is the key attribute (since it contains unique values and no value is missing) for the table. We can set this metadata as follows:

# In[8]:


# Mark the 'ID' column as the key attribute of table A
em.set_key(A, 'ID')


# In[9]:


# Get the key metadata that was set for table A
em.get_key(A)


# Now the CSV file is read into the memory and the metadata (i.e. key) is set for the table. 

# ## Read a CSV File and Set Metadata Together

# In the above, we saw that we first read in the CSV file and then set the metadata. These two steps can be combined into a single step like this:

# In[10]:


# Read the CSV file and set 'ID' as the key of the table in a single call
A = em.read_csv_metadata(path_A, key='ID')


# In[11]:


# Display the 'type' of A
type(A)


# In[12]:


# Get the key metadata that was set for table A
em.get_key(A)


# ## Read a CSV File and Set Metadata from a File in Disk

# The user can specify the metadata in a file.
# 
# This file *MUST* be in the same directory as the CSV file, and its file name
# should be the same as the CSV file's, except that the extension is set to '.metadata'.

# In[13]:


# Specify the metadata for table A (stored in person_table_A.csv).

# Get the file name (with full path) where the metadata file must be stored:
# it sits next to the CSV file and shares its base name, with the extension
# set to '.metadata'.
metadata_fname = 'person_table_A.metadata'
metadata_file = os.path.join(datasets_dir, metadata_fname)

# Write the metadata contents. Here we specify that 'ID' is the key attribute
# for the table.
# Note that this step requires write permission to the datasets directory.
with open(metadata_file, 'w') as the_file:
    the_file.write('#key=ID')


# Note: In the above, we used Python's built-in file API to write the metadata contents, which works the same way on Unix and Windows. Alternatively, you can use the Unix shell command `echo` (or `echo|set /p` on Windows) to achieve the same effect.

# In[14]:


# If you do not have write permission to the datasets directory, first copy the CSV file to the local directory and
# then create a metadata file next to the copy, like this (you need to uncomment the following lines and then execute):

# import shutil
# shutil.copy2(path_A, './person_table_A.csv')
# path_A = 'person_table_A.csv'  # read the local copy, so the metadata file next to it is picked up
# metadata_local_file = 'person_table_A.metadata'
# with open(metadata_local_file, 'w') as the_file:
#     the_file.write('#key=ID')


# In[15]:


# Read the CSV file for table A. No key argument is given: the key is meant
# to come from the '.metadata' file stored alongside the CSV file.
A = em.read_csv_metadata(path_A)


# In[16]:


# Get the key for table A
em.get_key(A)


# In[17]:


# Clean up: delete the metadata file written to the datasets directory, plus
# any 'person_table_A.csv' / metadata file left in the current directory.
# Paths that do not exist are skipped.
for leftover in (metadata_file, 'person_table_A.csv', metadata_fname):
    if os.path.exists(leftover):
        os.remove(leftover)