#!/usr/bin/env python
# coding: utf-8

# # Introduction

# This IPython notebook illustrates how to read a CSV file from disk as a table and set its metadata.

# First, we need to import the *py_entitymatching* package and other libraries as follows:

# In[2]:


import py_entitymatching as em
import pandas as pd
import os
import sys


# # Different Ways to Read a CSV File and Set Metadata

# First, we need to get the path of the CSV file on disk. For the convenience of the user, we have included some sample files in the package. The path of a sample CSV file can be obtained like this:
#
#

# In[3]:


# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the path of the input table
path_A = os.path.join(datasets_dir, 'person_table_A.csv')


# In[4]:


# Display the first three lines of the file in path_A.
# Plain Python is used here instead of the IPython shell escape
# (`!cat $path_A | head -3`) so that the script also runs outside a notebook
# and on platforms without `cat`/`head`.
with open(path_A) as csv_file:
    # zip() stops after three lines without reading the rest of the file.
    for _, line in zip(range(3), csv_file):
        print(line, end='')


# Once we get the CSV file path, we can use it to read the contents and set metadata.

# ## Different Ways to Read a CSV File and Set Metadata

# There are three different ways to read a CSV file and set metadata:
#
# 1. Read a CSV file first, and then set the metadata
# 2. Read a CSV file and set the metadata together
# 3. Read a CSV file and set the metadata from a file on disk

# #### Read the CSV File First and Then Set the Metadata

# First, read the CSV file as follows:

# In[5]:


A = em.read_csv_metadata(path_A)


# In[6]:


# Outside a notebook a bare expression displays nothing, so print explicitly.
print(A.head())


# In[7]:


# Display the 'type' of A
print(type(A))


# Then set the metadata for the table. We see `ID` is the key attribute (since it contains unique values and no value is missing) for the table. We can set this metadata as follows:

# In[8]:


em.set_key(A, 'ID')


# In[9]:


# Get the metadata that were set for table A
print(em.get_key(A))


# Now the CSV file is read into memory and the metadata (i.e. key) is set for the table.

# ## Read a CSV File and Set Metadata Together

# In the above, we saw that we first read in the CSV file and then set the metadata.
# These two steps can be combined into a single step like this:

# In[10]:


A = em.read_csv_metadata(path_A, key='ID')


# In[11]:


# Display the 'type' of A
# (outside a notebook a bare expression displays nothing, so print explicitly)
print(type(A))


# In[12]:


# Get the metadata that were set for the table A
print(em.get_key(A))


# ## Read a CSV File and Set Metadata from a File on Disk

# The user can specify the metadata in a file.
#
# This file *MUST* be in the same directory as the CSV file and the file name
# should be the same, except the extension is set to '.metadata'.

# In[13]:


# Specify the metadata for table A (stored in person_table_A.csv).

# Get the file name (with full path) where the metadata file must be stored
metadata_fname = 'person_table_A.metadata'
metadata_file = os.path.join(datasets_dir, metadata_fname)

# Specify the metadata for table A. Here we specify that 'ID' is the key attribute for the table.
# Note that this step requires write permission to the datasets directory.
with open(metadata_file, 'w') as the_file:
    the_file.write('#key=ID')


# Note: In the above, we used Python file I/O (rather than a shell command such as `echo`) to write the metadata contents, so the same code can be used on both Unix and Windows.

# In[14]:


# If you do not have write permissions to the datasets directory, first copy the CSV file to the local directory and
# then create a metadata file next to it like this (you need to uncomment the following lines and then execute):
# import shutil
# shutil.copy2(path_A, './person_table_A.csv')
# metadata_local_file = 'person_table_A.metadata'
# with open(metadata_local_file, 'w') as the_file:
#     the_file.write('#key=ID')


# In[15]:


# Read the CSV file for table A
A = em.read_csv_metadata(path_A)


# In[16]:


# Get the key for table A
print(em.get_key(A))


# In[17]:


# Remove the metadata file, plus the local CSV copy and local metadata file
# if the optional fallback in In[14] was used. Missing files are skipped.
for leftover in (metadata_file, 'person_table_A.csv', metadata_fname):
    if os.path.exists(leftover):
        os.remove(leftover)