#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

from tf.fabric import Fabric
from tf.convert.walker import CV

import cProfile, pstats, io
from pstats import SortKey

# In[3]:

TF_PATH = '_temp/tf'

# # Make test set

# In[4]:

TF = Fabric(locations=TF_PATH, silent=True)

# In[5]:

slotType = 'slot'

generic = {
    'name': 'test set for query strategy testing',
    'compiler': 'Dirk Roorda',
}

otext = {
    'fmt:text-orig-full': '{num}{cat} ',
    'sectionTypes': 'chunk',
    'sectionFeatures': 'num',
}

intFeatures = {
    'num',
}

featureMeta = {
    'num': {
        'description': 'node number',
    },
    'cat': {
        'description': 'category: m f n',
    },
}

nSlots = 400000
chunkSize = 4
cats = ['m', 'f', 'n']


def director(cv):
    c = None
    for n in range(nSlots):
        # start a new chunk every chunkSize slots, closing the previous one
        if n % chunkSize == 0:
            cv.terminate(c)
            c = cv.node('chunk')
            cv.feature(c, num=n // chunkSize)
        s = cv.slot()
        cv.feature(s, num=n, cat=cats[n % 3])
    cv.terminate(c)


cv = CV(TF)

good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
)

# # Load test set

# In[4]:

TF = Fabric(locations=TF_PATH, silent='deep')
api = TF.loadAll()
docs = api.makeAvailableIn(globals())
silentOff()

# # Main test 1
#
# This query template consists of a `chunk` and its first and last nodes,
# and an independent slot that is constrained between those nodes.

# In[5]:

query = '''
chunk
  =: a:slot
  < c:slot
  :=

s:slot
a < s
s < c
'''

# First we run it with a few old strategies.
# The strategies are not really documented, except for comments in the code,
# because they are an implementation detail.
# In case you're interested, click the strategy names to go to the code:
#
# * [`small_choice_first`](https://github.com/annotation/text-fabric/blob/85db305f357466d4735edc7aea4cdfaae6ef6774/tf/search/stitch.py#L152-L219)
# * [`small_choice_multi`](https://github.com/annotation/text-fabric/blob/85db305f357466d4735edc7aea4cdfaae6ef6774/tf/search/stitch.py#L222-L347)
# * [`by_yarn_size`](https://github.com/annotation/text-fabric/blob/85db305f357466d4735edc7aea4cdfaae6ef6774/tf/search/stitch.py#L350-L425)
#
# The third one, `by_yarn_size`, behaves virtually identically for the kind of queries we are testing here,
# so we concentrate on the first two.
#
# When we run the experiments, we do these steps:
#
# * study
# * show the plan
# * fetch 10 results under a profiler and collect statistics

# ## Strategy: small choice first

# In[6]:

S.study(query, strategy='small_choice_first')

# In[7]:

S.showPlan(details=True)

# In[8]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[9]:

print('\n'.join(str(r) for r in results))

# In[10]:

S.count(progress=1, limit=50)

# ## Strategy: small choice multi

# In[11]:

S.study(query, strategy='small_choice_multi')

# In[12]:

S.showPlan(details=True)

# Observe how two `<` constraints have been taken together.
# They will be tested in one pass.

# In[13]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[14]:

print('\n'.join(str(r) for r in results))

# In[15]:

S.count(progress=1, limit=50)

# ## Observations
#
# `small_choice_multi` has better performance.
#
# It does only 50% of the function calls that `small_choice_first` does: it cuts out nearly all calls to `stitchOn()`,
# a recursive function that generates new candidates.
#
# If you look at the primitive calls, the gain is 30%.
#
# If you look at the time spent in the `stitchOn()` calls, you see that `small_choice_first` spends 50% more time there than
# `small_choice_multi`.
#
# **N.B.**
#
# In `small_choice_first`, 1,600,000 calls to `stitchOn()` take 1.5 seconds.
#
# In `small_choice_multi`, 121 calls to `stitchOn()` take 1.0 seconds.
#
# That is remarkable: computing the multi-edge costs a lot of time per call, but the net result is positive.
#
# There is a price: the most time consuming bit is concentrated in a single line (see the profile above).
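# To zoom in on `stitchOn()` in the profiler output, the report can be restricted to matching entries.
# A minimal sketch, reusing `pr`, the profile of the most recent run (`small_choice_multi`),
# and the standard restriction argument of `print_stats`:

# In[ ]:

s2 = io.StringIO()
ps2 = pstats.Stats(pr, stream=s2).sort_stats(SortKey.CUMULATIVE)
# only report entries whose name matches 'stitchOn'
ps2.print_stats('stitchOn')
print(s2.getvalue())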
# # Main test 2
#
# We leave out part of the query.

# In[46]:

query = '''
chunk
  =: a:slot
  c:slot
  :=

s:slot
a < s
s < c
'''

# It should not make a difference to the outcome that we omit the `a < c` condition, since all chunks have a length greater than 1,
# so the first slot of a chunk always comes before the last one (and is not identical to it).

# ## Strategy: small choice first

# In[47]:

S.study(query, strategy='small_choice_first')

# In[48]:

S.showPlan(details=True)

# In[49]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[50]:

print('\n'.join(str(r) for r in results))

# In[51]:

S.count(progress=1, limit=50)

# ## Strategy: small choice multi

# In[52]:

S.study(query, strategy='small_choice_multi')

# In[53]:

S.showPlan(details=True)

# Observe how two `<` constraints have been taken together.
# They will be tested in one pass.

# In[54]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[55]:

print('\n'.join(str(r) for r in results))

# In[56]:

S.count(progress=1, limit=50)

# # Main test 3
#
# We add something to the query.

# In[57]:

query = '''
chunk
  =: a:slot
  < b:slot
  < d:slot
  c:slot
  :=

s:slot
b < s
s < d
'''

# It becomes more difficult to constrain `s` within the chunk.
#
# This is a heavy query.

# ## Strategy: small choice first

# In[58]:

S.study(query, strategy='small_choice_first')

# In[59]:

S.showPlan(details=True)

# In[60]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[61]:

print('\n'.join(str(r) for r in results))

# ## Strategy: small choice multi

# In[62]:

S.study(query, strategy='small_choice_multi')

# In[63]:

S.showPlan(details=True)

# In[64]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[65]:

print('\n'.join(str(r) for r in results))

# # Observation
#
# Here is a query where the time spent in `stitchOn()` overtakes the time spent in the `all()` call.
#
# So we really have a mixed bag with these strategies.
#
# For now, I turn on `small_choice_multi`, because it makes really long queries a bit more bearable and does not
# make much of a difference for shorter queries.

# # Main test 4
#
# Quite a different query.

# In[66]:

query = '''
chunk
  .num. slot
'''
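# The `.num.` relation requires that both nodes in a result carry the same value for the `num` feature.
# A minimal sanity check (a sketch, assuming the `F` feature API made global by `makeAvailableIn` above):

# In[ ]:

S.study(query)
for (c, s) in S.fetch(limit=5):
    # chunk and slot in one result should share the same num value
    print(c, s, F.num.v(c), F.num.v(s))
    assert F.num.v(c) == F.num.v(s)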
# ## Strategy: small choice first

# In[67]:

S.study(query, strategy='small_choice_first')

# In[68]:

S.showPlan(details=True)

# In[69]:

S.count(progress=1, limit=10)

# ## Strategy: small choice multi

# In[70]:

S.study(query, strategy='small_choice_multi')

# In[71]:

S.showPlan(details=True)

# In[72]:

S.count(progress=1, limit=10)

# ## Strategy: by yarn size

# In[73]:

S.study(query, strategy='by_yarn_size')

# In[74]:

S.showPlan(details=True)

# In[75]:

S.count(progress=1, limit=10)

# # Main test 5
#
# Yet another feature comparison query.

# In[76]:

query = '''
a:chunk
  n:slot
< b:chunk
  m:slot

n .cat. m
'''

# ## Strategy: small choice first

# In[77]:

S.study(query, strategy='small_choice_first')

# In[78]:

S.showPlan(details=True)

# In[79]:

S.count(progress=100000, limit=1000000)

# ## Strategy: small choice multi

# In[80]:

S.study(query, strategy='small_choice_multi')

# In[81]:

S.showPlan(details=True)

# In[82]:

S.count(progress=100000, limit=1000000)

# ## Strategy: by yarn size

# In[83]:

S.study(query, strategy='by_yarn_size')

# In[84]:

S.showPlan(details=True)

# In[85]:

S.count(progress=100000, limit=1000000)

# # Left overs
#
# Test use of shallow.

# In[5]:

query = '''
chunk
  slot num=1
  < slot
'''

# In[6]:

list(S.search(query))

# In[7]:

list(S.search(query, shallow=True))

# In[8]:

list(S.search(query, shallow=2))

# In[8]:

query = '''
slot
<: slot
< slot
<: slot
< slot
<: slot
'''

# In[20]:

from tf.app import use

A = use('bhsa', mod='cmerwich/bh-reference-system/tf')

# In[ ]:
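# # Profiling helper (sketch)
#
# The profiling boilerplate above (study the query, show the plan, fetch a few results under `cProfile`,
# print cumulative statistics) recurs for every strategy. Below is a minimal sketch of a helper that
# bundles these steps; the name `profileStrategy` is just for this sketch, not part of the Text-Fabric API.

# In[ ]:

def profileStrategy(query, strategy, limit=10):
    # study the query under the given strategy and show the resulting search plan
    S.study(query, strategy=strategy)
    S.showPlan(details=True)

    # fetch a few results under the profiler
    pr = cProfile.Profile()
    pr.enable()
    results = S.fetch(limit=limit)
    pr.disable()

    # print cumulative profiling statistics
    out = io.StringIO()
    pstats.Stats(pr, stream=out).sort_stats(SortKey.CUMULATIVE).print_stats()
    print(out.getvalue())
    return results

# Example use, equivalent to the separate cells above:
#
# results = profileStrategy(query, 'small_choice_multi')
# print('\n'.join(str(r) for r in results))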