#!/usr/bin/env python
# coding: utf-8

# This IPython notebook illustrates how to down sample two large tables that are loaded in memory.

# In[1]:

import py_entitymatching as em

# Down sampling is typically done when the input tables are large (e.g., each containing more than 100K tuples). For the purposes of this notebook we will use two large datasets: Citeseer and DBLP. You can download the Citeseer dataset from http://pages.cs.wisc.edu/~anhai/data/falcon_data/citations/citeseer.csv and the DBLP dataset from http://pages.cs.wisc.edu/~anhai/data/falcon_data/citations/dblp.csv. Once downloaded, save these datasets as 'citeseer.csv' and 'dblp.csv' in the current directory.

# In[5]:

# Read the CSV files
A = em.read_csv_metadata('./citeseer.csv', low_memory=False)  # set low_memory to False to speed up loading
B = em.read_csv_metadata('./dblp.csv', low_memory=False)

# In[6]:

len(A), len(B)

# In[7]:

A.head()

# In[8]:

B.head()

# In[9]:

# Set 'id' as the key of each input table
em.set_key(A, 'id')
em.set_key(B, 'id')

# In[10]:

# Display the keys
em.get_key(A), em.get_key(B)

# In[12]:

# Down sample the datasets
sample_A, sample_B = em.down_sample(A, B, size=1000, y_param=1)

# In the down_sample command, set `size` to the number of tuples that should be sampled from B (this will be the size of the sampled B table) and set `y_param` to the number of matching tuples to be picked from A for each tuple sampled from B.
#
# In the above, we set the number of tuples to be sampled from B to 1000, and we set `y_param` to 1, meaning that for each tuple sampled from B we pick one matching tuple from A.

# In[13]:

# Display the lengths of the sampled datasets
len(sample_A), len(sample_B)

# Now the input tables `A` and `B` (with 1.8M and 2.5M tuples) have been down sampled to the much smaller tables `sample_A` and `sample_B` (with sizes shown above).
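
# The down sampled tables can be persisted so that later steps (blocking, sampling, labeling) do not have to repeat this command. The cells below are a sketch of how this might be done: they assume that the key metadata set above carries over to the sampled tables, and they use py_entitymatching's `to_csv_metadata` writer (the counterpart of the `read_csv_metadata` command used earlier). The output file names are placeholders; adjust them as needed.

# In[14]:

# Check that the 'id' key carries over to the sampled tables
em.get_key(sample_A), em.get_key(sample_B)

# In[15]:

# Save the sampled tables (and their metadata) for use in later steps.
# The file names below are hypothetical placeholders.
em.to_csv_metadata(sample_A, './sample_citeseer.csv')
em.to_csv_metadata(sample_B, './sample_dblp.csv')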