About: This is the completion project for Introduction to Algorithims on the Data Engineer Track Goal: Compare methods for accessing data using in-line processsing and pre-cachching
import csv
header=[]
rows=[]
with open('laptops.csv') as f:
rows = list(csv.reader(f))
header=rows[0]
del rows[0]
print(header)
print(rows[0:5])
import csv
class Inventory:
##########################################################################################
# Constructor csv_filename should be the name/location to a laptops.csv formatted file
##########################################################################################
def __init__(self, csv_filename):
self.filename=csv_filename
self.__loadfile()
self.header=[]
self.rows=[]
self.__loadfile() #load the file
for row in self.rows:
row[-1] = float(row[-1])
self.rows_by_price = sorted(self.rows, key=lambda r: r[-1])
#indexes
self.id_to_row ={}
self.prices=set()
#pre-cache valuable lookups
self.__create_indexes()
############################################################
# Private method that loads the file passed into the constructor
############################################################
def __loadfile(self):
with open('laptops.csv') as f:
self.rows = list(csv.reader(f)) # read all the rows
self.header=self.rows[0] # grab row zero as the header
del self.rows[0] # remove the header row from the data
############################################################
# Private method that denormalizes the data into indexes
############################################################
def __create_indexes(self):
if len(self.id_to_row) == 0:
for i in range(len(self.rows)):
#build an id index into the data
self.id_to_row[int(self.rows[i][0])] = i
#build a price cache
self.prices.add(self.rows[i][12])
############################################################
# O(N)
# Returns a laptop based on its id or None if no matching id exists
############################################################
def get_laptop_from_id(self, laptop_id): # O(N)
for row in self.rows:
if int(row[0]) == laptop_id:
return row
return None
############################################################
# O(1)
# Returns a laptop based on its id or None if no matching id exists
############################################################
def get_laptop_from_id_fast(self, laptop_id): # O(1)
# send back the matching row or None if no match O(1)
return self.rows[self.id_to_row[laptop_id]] if laptop_id in self.id_to_row else None
############################################################
# O(N^2)
# Find if one or two laptops exist that add up to the
# specified amount. If a match is found then True is returned
# If no match is found then None is return
############################################################
def check_promotion_dollars(self, dollars):
# find if one item will match the dollars
for row in self.rows:
if float(row[12]) == dollars:
return True
# check if the sum of two items will match the dollars
for i in range(len(self.rows)): # O(N)
for j in range(len(self.rows)): # O(N*N)= O(N^2)
celli = float(self.rows[i][12])
cellj = float(self.rows[j][12])
if celli+cellj == dollars:
return True
return None
############################################################
# O(N)
# Find if one or two laptops exist that add up to the
# specified amount. If a match is found then True is returned
# If no match is found then None is return
############################################################
def check_promotion_dollars_fast(self, dollars):
# find if one item will match the dollars
if dollars in self.prices:
return True # O(1)
# check if the sum of two items will match the dollars
# this is the clever solution of backing into a single value
for p in self.prices: # O(N)
value1 = dollars - p
if value1 in self.prices:
return True
return None
############################################################
# O(log(N))
# Find the first laptop that is more expensive than the price passed in
# If no match is found then -1 is returned
############################################################
def find_first_laptop_more_expensive(self, target_price):
range_start = 0
range_end = len(self.rows_by_price) - 1
while range_start < range_end:
range_middle = (range_end + range_start) // 2
price = self.rows_by_price[range_middle][-1]
if price > target_price:
range_end = range_middle
elif price <= target_price:
range_start = range_middle + 1
price = self.rows_by_price[range_start][-1]
# if we are returning the max price and the price is greater than that, return -1
# This means the target_price is higher than any price in the data
if range_start == len(self.rows_by_price)-1 and price < target_price:
return -1
return range_start
c = Inventory('laptops.csv')
print('1')
print(c.header)
print(len(c.rows))
print(c.get_laptop_from_id_fast(3362737))
print(c.get_laptop_from_id_fast(3362736))
# adding a index for promotional dollar calculation
import time
import random
c2 = Inventory('laptops.csv')
ids = [random.randint(1000000,9999999) for _ in range(10000)]
# I Commented this out because it is just tooooooo slow
total_time_no_dict = 0
# for i in ids:
# start = time.time()
# c.check_promotion_dollars(i) # O(N^2)
# end = time.time()
# total_time_no_dict += end-start
total_time_dict = 0
for i in ids:
start = time.time()
c.check_promotion_dollars_fast(i) # O(N)
end = time.time()
total_time_dict += end-start
print(total_time_no_dict, total_time_dict)
# Promotional Dollar Test Values (1000 - True, 442 - False)
#Slow
v = c.check_promotion_dollars(1000)
assert v==True, f"Expected True, Received {str(v)}"
v = c.check_promotion_dollars(442)
assert v==None, f"Expected None, Received {str(v)}"
#Fast
v=c.check_promotion_dollars_fast(1000)
assert v==True, f"Expected True, Received {str(v)}"
v=c.check_promotion_dollars_fast(442)
assert v==None, f"Expected None, Received {str(v)}"
# Adding an index to lookup a laptop by id
prices = [random.randint(100,5000) for _ in range(100)]
c3 = Inventory('laptops.csv')
#Time comparisons
total_time_no_set = 0
for i in ids:
start = time.time()
c.get_laptop_from_id(i) # O(N)
end = time.time()
total_time_no_set += end-start
total_time_set = 0
for i in ids:
start = time.time()
c.get_laptop_from_id_fast(i) # O(1)
end = time.time()
total_time_set += end-start
print(total_time_no_set, total_time_set, f"Using a set is {int(total_time_no_set/total_time_set)} times faster")
# Binary Search for prices
c3 = Inventory('laptops.csv')
v=c3.find_first_laptop_more_expensive(150) # Boundary Case result should be 0
print(v, c3.rows_by_price[v][12]>150, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==0, f"index should be 0, received {str(v)}"
v=c3.find_first_laptop_more_expensive(174) # Normal Case result should be 1
print(v, c3.rows_by_price[v][12]>174, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==1, f"index should be 1, received {str(v)}"
v=c3.find_first_laptop_more_expensive(1000) # Normal Case result should be 683
print(v, c3.rows_by_price[v][12]>1000, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==683, f"index should be 683, received {str(v)}"
v=c3.find_first_laptop_more_expensive(5500) # Max Case result should be 1302
print(v, c3.rows_by_price[v][12]>5500, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], 'NA')
assert v==1302, f"index should be 1302, received {str(v)}"
v=c3.find_first_laptop_more_expensive(10000) #Error case should be -1
print(v, c3.rows_by_price[v][12]>10000, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], 'NA')
assert v==-1,f"index should be -1, received {str(v)}"
# x =0
# for row in c3.rows_by_price:
# print(x, row[12])
# x+=1