#!/usr/bin/env python
# coding: utf-8

# This IPython notebook illustrates how to read CSV files from disk as tables
# and set their metadata.

# First, we need to import the *py_entitymatching* package and other libraries
# as follows:

# In[1]:

import os
import sys

import pandas as pd
import py_entitymatching as em


# ## Get the Path of the CSV File

# First, we need the path of the CSV file on disk. For the convenience of the
# user, we have included some sample files in the package. The path of a
# sample CSV file can be obtained like this:

# In[2]:

# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the path of the input table
path_A = os.path.join(datasets_dir, 'person_table_A.csv')


# In[3]:

# Display the first three lines of the file in path_A. Plain file I/O is used
# (rather than a `cat | head` shell pipeline) so that this also runs outside
# IPython, on any platform, and with install paths that contain spaces.
with open(path_A) as csv_file:
    for _ in range(3):
        # readline() returns '' at end of file, so short files are harmless.
        sys.stdout.write(csv_file.readline())


# ## Ways to Read a CSV File and Set Metadata

# There are three different ways to read a CSV file and set metadata:
#
# 1. Read a CSV file first, and then set the metadata
# 2. Read a CSV file and set the metadata together
# 3. Read a CSV file and set the metadata from a file on disk

# ### Read the CSV File First and Then Set the Metadata

# First, read the CSV file as follows:

# In[4]:

A = em.read_csv_metadata(path_A)


# In[5]:

# Printed explicitly: a bare expression shows nothing when run as a script.
print(A.head())


# In[6]:

# Display the 'type' of A
print(type(A))


# Then set the metadata for the table. We see that `ID` is the key attribute
# (since it contains unique values and no value is missing) for the table.
# We can set this metadata as follows:

# In[7]:

em.set_key(A, 'ID')


# In[8]:

# Get the metadata that were set for table A
print(em.get_key(A))


# Now the CSV file is read into memory and the metadata (i.e. the key) is set
# for the table.

# ### Read a CSV File and Set Metadata Together

# In the above, we saw that we first read in the CSV file and then set the
# metadata. These two steps can be combined into a single step like this:

# In[9]:

A = em.read_csv_metadata(path_A, key='ID')


# In[10]:

# Display the 'type' of A
print(type(A))


# In[11]:

# Get the metadata that were set for table A
print(em.get_key(A))


# ### Read a CSV File and Set Metadata from a File on Disk

# The user can specify the metadata in a file.
#
# This file *MUST* be in the same directory as the CSV file, and its name
# must be the same as the CSV file's, except that the extension is
# '.metadata'.

# In[12]:

# We set the metadata for table A (stored in person_table_A.csv).

# Get the path where the metadata file must be stored
metadata_file = os.path.join(datasets_dir, 'person_table_A.metadata')

# Specify the metadata for table A. Here we specify that 'ID' is the key
# attribute for the table. Note that this step requires write permission to
# the datasets directory.
with open(metadata_file, 'w') as metadata_handle:
    metadata_handle.write('#key=ID\n')


# The temporary metadata file lives inside the package's datasets directory,
# so it is removed in a `finally` clause even if reading the table fails.
try:
    # In[13]:

    # Read the CSV file for table A
    A = em.read_csv_metadata(path_A)

    # In[14]:

    print(em.get_key(A))
finally:
    # In[15]:

    os.remove(metadata_file)