#!/usr/bin/env python # coding: utf-8 # # Brute Forces, Secretaries, and Dichotomies # # Chapter 11 of [Real World Algorithms](https://mitpress.mit.edu/books/real-world-algorithms). # # --- # # > Panos Louridas
# > Athens University of Economics and Business

# ## Sequential Search
#
# Sequential search is perhaps the most straightforward search method.
#
# We start from the beginning and check each item in turn until we find the one we want.
#
# It can be used for both sorted and unsorted data, but there are much better methods for sorted data.
#
# Here is a straightforward implementation:

# In[1]:


def sequential_search(a, s):
    """Return the position of s in the sequence a, or -1 if s is absent.

    Scans the items left to right and reports the index of the first match.
    """
    position = 0
    for item in a:
        if item == s:
            return position
        position += 1
    return -1


# Let's check it on a random list:

# In[2]:


import random

a = list(range(1000))
random.shuffle(a)

pos = sequential_search(a, 314)
print(a[pos], 'found at', pos)

pos = sequential_search(a, 1001)
print(pos)


# We need not write `sequential_search(a, s)` in Python.
#
# If `a` is a list, we can use `a.index(s)` instead.
#
# In fact that's what we should do, because it is way faster (we saw that also in Chapter 7).

# Here is the timing for our version:

# In[3]:


import timeit

total_elapsed = 0
for trial in range(100):
    a = list(range(10000))
    random.shuffle(a)
    start = timeit.default_timer()
    found_at = sequential_search(a, 314)
    end = timeit.default_timer()
    total_elapsed += end - start
print(total_elapsed)


# And here is the timing for the native version (which actually calls a function written in C):

# In[4]:


total_elapsed = 0
for trial in range(100):
    a = list(range(10000))
    random.shuffle(a)
    start = timeit.default_timer()
    found_at = a.index(314)
    end = timeit.default_timer()
    total_elapsed += end - start
print(total_elapsed)


# ## Matching, Comparing, Records, Keys
#
# When we are searching for an item in the list, Python performs an equality test between each item and the item we are searching for.
#
# The equality test is performed with the operator `==`.
#
# Checking for equality is not the same as checking whether two items are the *same*.
#
# This is called *strict comparison* and in Python it is implemented with the operator `is`.
# That means that the following two are equal:

# In[5]:


an_item = (1, 2)
another_item = (1, 2)
an_item == another_item


# But they are not the same:

# In[6]:


an_item is another_item


# As another example, let's see what happens with Python's support for complex numbers:

# In[7]:


x = 3.14+1.62j
y = 3.14+1.62j
print(x == y)
print(x is y)


# Strict comparison is much faster than equality checking, but it is not what we usually want to use.
#
# A common idiom for identity checking in Python is checking for `None`, like `if a is None` or `if a is not None`.

# In many cases, we hold information for entities in *records*, which are collections of *attributes*.
#
# In that case, we want to search for an entity based on a particular attribute that identifies it.
#
# The attribute is called a *key*.

# In Python we can represent records as *objects* that are instances of a class.
#
# Alternatively, we can represent them as dictionaries.
#
# In fact, Python objects use dictionaries internally.
#
# Let's get a list of two persons.

# In[8]:


john = {
    'first_name': 'John',
    'surname': 'Doe',
    'passport_no': 'AI892495',
    'year_of_birth': 1990,
    'occupation': 'teacher'
}

jane = {
    'first_name': 'Jane',
    'surname': 'Roe',
    'passport_no': 'AI485713',
    'year_of_birth': 1986,
    'occupation': 'civil engineer'
}

persons = [john, jane]


# In this example, the key for our search would be the passport number, because we would like to find the full information for a person with that particular piece of identification.
#
# To do that we could re-implement sequential search so that we provide to it the comparison function.

# In[9]:


def sequential_search_m(a, s, matches):
    """Return the index of the first record in a matching key s, or -1.

    matches(record, s) is a caller-supplied predicate deciding whether a
    record matches the search key.
    """
    hits = (idx for idx, record in enumerate(a) if matches(record, s))
    return next(hits, -1)


def match_passport(person, passport_no):
    """Return True when the person record carries the given passport number."""
    return passport_no == person['passport_no']


pos = sequential_search_m(persons, 'AI485713', match_passport)
print(persons[pos], 'found at', pos)


# Although you would probably use something more Pythonic like:

# In[10]:


results = [(idx, person) for idx, person in enumerate(persons)
           if person['passport_no'] == 'AI485713']
results


# ## Self-Organizing Search
#
# In self-organizing search, we take advantage of an item's popularity to move it to the front of the collection in which we are performing our searches.
#
# In the move-to-front method, when we find an item we move it directly to the front.
#
# In the transposition method, when we find an item we swap it with its predecessor (if any).

# We cannot implement directly the algorithms for lists given in the book (that is, algorithm 11.2 and algorithm 11.3) for the simple reason that Python hides the list implementation from us.
#
# Moreover, Python lists are *not* linked lists. They are variable-length arrays (see the online documentation for details on the [implementation of lists in Python](https://docs.python.org/3/faq/design.html#how-are-lists-implemented)).
#
# We can implement algorithm 11.3, which is the transposition method for arrays.

# In[11]:


def transposition_search(a, s):
    """Search for s in list a, nudging a found item one slot forward.

    On a hit past position 0 the item is swapped with its predecessor
    (mutating a) and its new index is returned; a hit at position 0 stays
    put. Returns -1 when s is not found.
    """
    for where, value in enumerate(a):
        if value != s:
            continue
        if where == 0:
            return where
        a[where], a[where - 1] = a[where - 1], a[where]
        return where - 1
    return -1


# How can we test `transposition_search(a, s)`?
#
# We need to do some groundwork to emulate a situation of popularity-biased searches.
#
# In particular, we will create a setting where the items we are searching for are governed by Zipf's law.
#
# First, we'll write a function that provides the Zipf probability for $n$ items.
# In[12]:


def zipf(n):
    """Return the Zipf probabilities for ranks 1 through n as a list.

    Rank k gets probability (1/k) / H_n, where H_n is the n-th harmonic
    number, so the probabilities sum to 1.
    """
    harmonic = sum(1 / rank for rank in range(1, n + 1))
    return [1 / rank * 1 / harmonic for rank in range(1, n + 1)]


# We'll work with 1000 items:

# In[13]:


zipf_1000 = zipf(1000)


# We can check that they sum up to 1, and see the first 20 of the probabilities.

# In[14]:


print(sum(zipf_1000))
print(zipf_1000[:20])


# Again we will be performing our searches on 1000 items, in random order.

# In[15]:


a = list(range(1000))
random.shuffle(a)
print(a[:20])


# We will perform 100000 searches among these items.
#
# We want the searches to follow Zipf's law.
#
# First, we will create another list of 1000 items in random order.

# In[16]:


b = list(range(1000))
random.shuffle(b)
print(b[:20])


# Then, we will select 100000 items from the second list, using the Zipf probabilities.
#
# That means that we will be selecting the first item with probability `zipf_1000[0]`, the second item with probability `zipf_1000[1]`, and so on.

# In[17]:


searches = random.choices(b, weights=zipf_1000, k=100000)


# Indeed, we can verify that the popularity of items in `searches` mirrors `b`:

# In[18]:


from collections import Counter

counted = Counter(searches)
counted.most_common(20)


# So, we will perform 100000 searches in the first list, using as keys the items in `searches`.
#
# Because `transposition_search(a, s)` changes `a`, we will keep a copy of it to use it to compare the performance with simple sequential search.
#
# At the end, apart from displaying the time elapsed we will also show the first items of the changed `a`, to see how popular searches have gone to the beginning.

# In[19]:


a_copy = a[:]

total_elapsed = 0
for key in searches:
    start = timeit.default_timer()
    index = transposition_search(a, key)
    end = timeit.default_timer()
    total_elapsed += end - start
print(total_elapsed)
print(a[:20])


# We will now perform the same searches with `a_copy` using simple sequential search.
# In[20]:


total_elapsed = 0
for key in searches:
    start = timeit.default_timer()
    index = sequential_search(a_copy, key)
    end = timeit.default_timer()
    total_elapsed += end - start
print(total_elapsed)


# ## The Secretary Problem
#
# The secretary problem requires selecting the best item when we have not seen, and we cannot wait to see, the full set of items.
#
# The solution is an online algorithm. We find the best item among the first $n/e$, where $n$ is the total expected number of items, and $e \approx 2.71828$ is [Euler's number](https://en.wikipedia.org/wiki/E_(mathematical_constant)).
#
# Then we select the first of the remaining items that is better than that. The probability that we'll indeed select the best item is $1/e \approx 37\%$.
#
# Here is how we can do that:

# In[21]:


import math

def secretary_search(a):
    """Online search for the best (maximum) item in a.

    Observes the first ~|a|/e items to set a benchmark, then returns the
    index of the first later item that beats it, or -1 if none does (which
    happens whenever the overall best lies in the observed prefix).
    """
    # Number of items to observe: roughly |a|/e.
    cutoff = int(len(a) // math.e) + 1
    # Index of the best item seen in the observed prefix.
    best = 0
    for j in range(1, cutoff):
        if a[j] > a[best]:
            best = j
    # Take the first remaining item better than the benchmark, if any.
    for j in range(cutoff, len(a)):
        if a[j] > a[best]:
            return j
    return -1


# Does `secretary_search(a)` find the best item in `a` about 37% of the time?
#
# To check that, we'll continue working in a similar fashion. We'll perform 1000 searches in 1000 items and see how often we do come up with the best item.

# In[22]:


total_found = 0
for trial in range(1000):
    a = list(range(1000))
    random.shuffle(a)
    index = secretary_search(a)
    max_index = a.index(max(a))
    if index == max_index:
        total_found += 1
print(f"found {total_found} out of {trial+1}, {100*total_found/(trial+1)}%")


# ## Binary Search
#
# Binary search is the most efficient way to search for an item when the search space is *ordered*.
#
# It is an iterative algorithm, where in each iteration we split the search space in half.
#
# We start by asking if the search item is in the middle of the search space. Let's assume that the items are ordered in ascending order.
# If it is greater than the item in the middle, we repeat the question on the right part of the search space; if it is smaller, we repeat the question on the left part of the search space. We continue until we find the item, or we cannot perform a split any more.

# In[23]:


def binary_search(a, item):
    """Return the index of item in the sorted list a, or -1 if absent."""
    # Borders of the search space.
    lo = 0
    hi = len(a) - 1
    # Keep going while the search space is non-empty.
    while hi >= lo:
        # Midpoint of the current search space.
        mid = lo + (hi - lo) // 2
        # Three-way comparison with the midpoint: -1, 0, or +1.
        c = (a[mid] > item) - (a[mid] < item)
        if c > 0:
            # Midpoint too big: continue in the left half.
            hi = mid - 1
        elif c < 0:
            # Midpoint too small: continue in the right half.
            lo = mid + 1
        else:
            # Found it.
            return mid
    return -1


# In Python 3 there is no `cmp(x, y)` function that compares `x` and `y` and returns -1, 0, or 1, if `x < y`, `x == y`, or `x > y`, respectively. We use the
# ```python
# (x > y) - (x < y)```
# idiom instead.
#
# Note also the line where we calculate the midpoint:
# ```python
# mid = lo + (hi - lo) // 2
# ```
#
# This guards against overflows. In Python that is not necessary, because there is no upper limit in integers, so it could be:
# ```python
# mid = (lo + hi) // 2
# ```
#
# However, this is a problem in most other languages, so we'll stick with the foolproof version.

# To see how binary search works we can add some tracing information in `binary_search(a, item)`:

# In[24]:


def binary_search_trace(a, item):
    """Like binary_search, but prints the borders and midpoint per step."""
    print("Searching for", item)
    lo = 0
    hi = len(a) - 1
    while hi >= lo:
        mid = lo + (hi - lo) // 2
        c = (a[mid] > item) - (a[mid] < item)
        print(f"({lo}, {a[lo]}), ({mid}, {a[mid]}), ({hi}, {a[hi]})")
        if c > 0:
            hi = mid - 1
        elif c < 0:
            lo = mid + 1
        else:
            return mid
    return -1


a = [4, 10, 31, 65, 114, 149, 181, 437, 480, 507, 551, 613, 680, 777, 782, 903]

binary_search_trace(a, 149)
binary_search_trace(a, 181)
binary_search_trace(a, 583)
binary_search_trace(a, 450)
binary_search_trace(a, 3)


# Binary search is very efficient—in fact, it is as efficient as a search method can be (there is a small caveat here, concerning searching in quantum computers, but we can leave that aside).
#
# If we have $n$ items, it will complete the search in $O(\lg(n))$.
#
# Once again, we can verify that theory agrees with practice. We will perform 1000 searches, 500 of them successful and 500 of them unsuccessful and count the average number of iterations required. To do that, we'll change `binary_search(a, item)` so that it also returns the number of iterations.

# In[25]:


def binary_search_count(a, item):
    """Binary search that also counts iterations.

    Returns (iterations, index), with index -1 when item is absent.
    """
    lo = 0
    hi = len(a) - 1
    steps = 0
    while hi >= lo:
        steps += 1
        mid = lo + (hi - lo) // 2
        c = (a[mid] > item) - (a[mid] < item)
        if c > 0:
            hi = mid - 1
        elif c < 0:
            lo = mid + 1
        else:
            return (steps, mid)
    return (steps, -1)


# We build up our test suite. Our items will be 1000 random numbers in the range from 0 to 999,999.
#
# We will select 500 of them to perform matching searches, and another 500, not in them, to perform non-matching searches.

# In[26]:


num_range = 1000000
# Get 1000 random numbers from 0 to 999999.
a = random.sample(range(num_range), k=1000)
# Select 500 from them for our matching searches.
existing = random.sample(a, k=500)
# Select another 500 random numbers in the range,
# not in the set a, for our non-matching searches
non_existing = random.sample(set(range(num_range)) - set(a), k=500)

# Verify that the matching and non-matching sets are distinct.
print(set(existing) & set(non_existing))


# So now we can see how the average number of iterations in practice fares compared to what is predicted by theory.

# In[27]:


total_iters = 0
for hit_key, miss_key in zip(existing, non_existing):
    # One successful and one unsuccessful search per pair.
    hit_iters, _ = binary_search_count(a, hit_key)
    miss_iters, _ = binary_search_count(a, miss_key)
    total_iters += hit_iters + miss_iters
print("Average iterations:", total_iters / (len(existing) + len(non_existing)))
print(f"lg(1000) = {math.log(1000, 2)}")