Fast Queries on a CSV

About: This is the completion project for Introduction to Algorithims on the Data Engineer Track Goal: Compare methods for accessing data using in-line processsing and pre-cachching

In [1]:
import csv
header=[]
rows=[]
with open('laptops.csv') as f:
    rows = list(csv.reader(f))
    header=rows[0]
    del rows[0]
    
print(header)
print(rows[0:5])
['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']
[['6571244', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 2.3GHz', '8GB', '128GB SSD', 'Intel Iris Plus Graphics 640', 'macOS', '1.37kg', '1339'], ['7287764', 'Apple', 'Macbook Air', 'Ultrabook', '13.3', '1440x900', 'Intel Core i5 1.8GHz', '8GB', '128GB Flash Storage', 'Intel HD Graphics 6000', 'macOS', '1.34kg', '898'], ['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', '575'], ['9722156', 'Apple', 'MacBook Pro', 'Ultrabook', '15.4', 'IPS Panel Retina Display 2880x1800', 'Intel Core i7 2.7GHz', '16GB', '512GB SSD', 'AMD Radeon Pro 455', 'macOS', '1.83kg', '2537'], ['8550527', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 3.1GHz', '8GB', '256GB SSD', 'Intel Iris Plus Graphics 650', 'macOS', '1.37kg', '1803']]
In [2]:
import csv

class Inventory:
    
    ##########################################################################################
    # Constructor csv_filename should be the name/location to a laptops.csv formatted file
    ##########################################################################################
    def __init__(self, csv_filename):
        self.filename=csv_filename
        self.__loadfile()
        self.header=[]
        self.rows=[]
        self.__loadfile() #load the file
        
        for row in self.rows:
            row[-1] = float(row[-1])
                
        self.rows_by_price = sorted(self.rows, key=lambda r: r[-1])
        
        #indexes
        self.id_to_row ={}
        self.prices=set()
        
        #pre-cache valuable lookups
        self.__create_indexes() 
    ############################################################
    # Private method that loads the file passed into the constructor
    ############################################################
    def __loadfile(self):
        with open('laptops.csv') as f:
            self.rows = list(csv.reader(f)) # read all the rows
            self.header=self.rows[0]        # grab row zero as the header
            del self.rows[0]                # remove the header row from the data
    
    ############################################################
    # Private method that denormalizes the data into indexes
    ############################################################
    def __create_indexes(self):
        
        if len(self.id_to_row) == 0:
            for i in range(len(self.rows)):

                #build an id index into the data
                self.id_to_row[int(self.rows[i][0])] = i
                
                #build a price cache 
                self.prices.add(self.rows[i][12])
            
    ############################################################
    # O(N) 
    # Returns a laptop based on its id or None if no matching id exists
    ############################################################            
    def get_laptop_from_id(self, laptop_id): # O(N)
        for row in self.rows:
            if int(row[0]) == laptop_id:
                return row
        
        return None
            
    ############################################################
    # O(1)
    # Returns a laptop based on its id or None if no matching id exists
    ############################################################            
    def get_laptop_from_id_fast(self, laptop_id): # O(1)

                
        # send back the matching row or None if no match O(1)
        return self.rows[self.id_to_row[laptop_id]] if laptop_id in self.id_to_row else None

    ############################################################
    # O(N^2)
    # Find if one or two laptops exist that add up to the 
    # specified amount. If a match is found then True is returned
    # If no match is found then None is return 
    ############################################################  
    def check_promotion_dollars(self, dollars):
        # find if one item will match the dollars
        for row in self.rows:
            if float(row[12]) == dollars: 
                return True
            
        # check if the sum of two items will match the dollars
        for i in range(len(self.rows)): # O(N)
            for j in range(len(self.rows)): # O(N*N)= O(N^2)
                celli = float(self.rows[i][12])
                cellj = float(self.rows[j][12])
                if celli+cellj == dollars:
                    return True
        return None
        
    ############################################################
    # O(N)
    # Find if one or two laptops exist that add up to the 
    # specified amount. If a match is found then True is returned
    # If no match is found then None is return 
    ############################################################          
    def check_promotion_dollars_fast(self, dollars):
        
        # find if one item will match the dollars
        if dollars in self.prices:
            return True # O(1)
    
        # check if the sum of two items will match the dollars
        # this is the clever solution of backing into a single value
        for p in self.prices: # O(N)
            value1 = dollars - p
            if value1 in self.prices:
                return True
            
        return None
    
    

    ############################################################
    # O(log(N))
    # Find the first laptop that is more expensive than the price passed in
    # If no match is found then -1 is returned
    ############################################################  
    def find_first_laptop_more_expensive(self, target_price):
        range_start = 0                                   
        range_end = len(self.rows_by_price) - 1                       
        while range_start < range_end:
            range_middle = (range_end + range_start) // 2  
            price = self.rows_by_price[range_middle][-1]
            if price > target_price:                            
                range_end = range_middle                        
            elif price <= target_price:                           
                range_start = range_middle + 1    
        
        price = self.rows_by_price[range_start][-1]
        
        # if we are returning the max price and the price is greater than that, return -1 
        # This means the target_price is higher than any price in the data
        if range_start == len(self.rows_by_price)-1 and price < target_price: 
            return -1
        
        return range_start
        
In [3]:
c = Inventory('laptops.csv')
print('1')
print(c.header)
print(len(c.rows))
print(c.get_laptop_from_id_fast(3362737))
print(c.get_laptop_from_id_fast(3362736))
1
['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']
1303
['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575.0]
None
In [4]:
# adding a index for promotional dollar calculation

import time
import random

c2 = Inventory('laptops.csv')

ids = [random.randint(1000000,9999999) for _ in range(10000)]

# I Commented this out because it is just tooooooo slow
total_time_no_dict = 0
# for i in ids:
#     start = time.time()
#     c.check_promotion_dollars(i) # O(N^2)
#     end = time.time()
#     total_time_no_dict += end-start

total_time_dict = 0
for i in ids: 
    start = time.time()
    c.check_promotion_dollars_fast(i) # O(N)
    end = time.time()
    total_time_dict += end-start

print(total_time_no_dict, total_time_dict)
    
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-4-be69c655d148> in <module>
     11 for i in ids:
     12     start = time.time()
---> 13     c.check_promotion_dollars(i) # O(N^2)
     14     end = time.time()
     15     total_time_no_dict += end-start

<ipython-input-2-cc4e5587cfa0> in check_promotion_dollars(self, dollars)
     84             for j in range(len(self.rows)): # O(N*N)= O(N^2)
     85                 celli = float(self.rows[i][12])
---> 86                 cellj = float(self.rows[j][12])
     87                 if celli+cellj == dollars:
     88                     return True

KeyboardInterrupt: 
In [ ]:
# Promotional Dollar Test Values (1000 - True, 442 - False)
#Slow
v = c.check_promotion_dollars(1000)
assert v==True, f"Expected True, Received {str(v)}"
v = c.check_promotion_dollars(442)
assert v==None, f"Expected None, Received {str(v)}"

#Fast
v=c.check_promotion_dollars_fast(1000)
assert v==True, f"Expected True, Received {str(v)}"
v=c.check_promotion_dollars_fast(442)
assert v==None, f"Expected None, Received {str(v)}"
In [ ]:
# Adding an index to lookup a laptop by id
prices = [random.randint(100,5000) for _ in range(100)]
c3 = Inventory('laptops.csv')

#Time comparisons
total_time_no_set = 0
for i in ids:
    start = time.time()
    c.get_laptop_from_id(i) # O(N)
    end = time.time()
    total_time_no_set += end-start

total_time_set = 0
for i in ids: 
    start = time.time()
    c.get_laptop_from_id_fast(i) # O(1)
    end = time.time()
    total_time_set += end-start

print(total_time_no_set, total_time_set, f"Using a set is {int(total_time_no_set/total_time_set)} times faster")
In [ ]:
# Binary Search for prices
c3 = Inventory('laptops.csv')

v=c3.find_first_laptop_more_expensive(150) # Boundary Case result should be 0
print(v, c3.rows_by_price[v][12]>150, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==0, f"index should be 0, received {str(v)}"

v=c3.find_first_laptop_more_expensive(174) # Normal Case result should be 1
print(v, c3.rows_by_price[v][12]>174, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==1, f"index should be 1, received {str(v)}"

v=c3.find_first_laptop_more_expensive(1000) # Normal Case result should be 683
print(v, c3.rows_by_price[v][12]>1000, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==683, f"index should be 683, received {str(v)}"

v=c3.find_first_laptop_more_expensive(5500) # Max Case result should be 1302
print(v, c3.rows_by_price[v][12]>5500, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], 'NA')
assert v==1302, f"index should be 1302, received {str(v)}"

v=c3.find_first_laptop_more_expensive(10000) #Error case should be -1
print(v, c3.rows_by_price[v][12]>10000, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], 'NA')
assert v==-1,f"index should be -1, received {str(v)}"

# x =0
# for row in c3.rows_by_price:
#     print(x, row[12])
#     x+=1
In [ ]: