#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

from tf.fabric import Fabric
from tf.convert.walker import CV

import cProfile, pstats, io
from pstats import SortKey

# In[3]:

TF_PATH = '_temp/tf'

# # Make test set

# In[4]:

TF = Fabric(locations=TF_PATH, silent=True)

# In[5]:

slotType = 'slot'

generic = {
    'name': 'test set for query strategy testing',
    'compiler': 'Dirk Roorda',
}

otext = {
    'fmt:text-orig-full': '{num}{cat} ',
    'sectionTypes': 'chunk',
    'sectionFeatures': 'num',
}

intFeatures = {
    'num',
}

featureMeta = {
    'num': {
        'description': 'node number',
    },
    'cat': {
        'description': 'category: m f n',
    },
}

nSlots = 400000
chunkSize = 4
cats = ['m', 'f', 'n']


def director(cv):
    c = None
    for n in range(nSlots):
        # start a new chunk every chunkSize slots, closing the previous one
        if n % chunkSize == 0:
            cv.terminate(c)
            c = cv.node('chunk')
            cv.feature(c, num=n // chunkSize)
        s = cv.slot()
        cv.feature(s, num=n, cat=cats[n % 3])
    cv.terminate(c)


cv = CV(TF)

good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
)

# # Load test set

# In[4]:

TF = Fabric(locations=TF_PATH, silent='deep')
api = TF.loadAll()
docs = api.makeAvailableIn(globals())
silentOff()

# # Main test 1
#
# This query template consists of a `chunk` and its first and last nodes,
# and an independent slot that is constrained between those nodes.

# In[5]:

query = '''
chunk
  =: a:slot
  < c:slot
  :=

s:slot
a < s
s < c
'''

# First we run it with a few old strategies.
# The strategies are not really documented, except for comments in the code,
# because they are an implementation detail.
# In case you're interested, click the strategy names to go to the code:
#
# * [`small_choice_first`](https://github.com/annotation/text-fabric/blob/85db305f357466d4735edc7aea4cdfaae6ef6774/tf/search/stitch.py#L152-L219)
# * [`small_choice_multi`](https://github.com/annotation/text-fabric/blob/85db305f357466d4735edc7aea4cdfaae6ef6774/tf/search/stitch.py#L222-L347)
# * [`by_yarn_size`](https://github.com/annotation/text-fabric/blob/85db305f357466d4735edc7aea4cdfaae6ef6774/tf/search/stitch.py#L350-L425)
#
# The third one, `by_yarn_size`, behaves virtually identically for the kind of queries we are testing here,
# so we concentrate on the first two.
#
# When we run the experiments, we do these steps:
#
# * study
# * show the plan
# * fetch 10 results under a profiler and collect statistics

# ## Strategy: small choice first

# In[6]:

S.study(query, strategy='small_choice_first')

# In[7]:

S.showPlan(details=True)

# In[8]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[9]:

print('\n'.join(str(r) for r in results))

# In[10]:

S.count(progress=1, limit=50)

# ## Strategy: small choice multi

# In[11]:

S.study(query, strategy='small_choice_multi')

# In[12]:

S.showPlan(details=True)

# Observe how two `<` constraints have been taken together.
# They will be tested in one pass.

# In[13]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[14]:

print('\n'.join(str(r) for r in results))

# In[15]:

S.count(progress=1, limit=50)

# ## Observations
#
# `small_choice_multi` has better performance.
#
# It does only 50% of the function calls that `small_choice_first` does: it cuts out nearly all calls to `stitchOn()`,
# a recursive function that generates new candidates.
#
# If you look at the primitive calls, the gain is 30%.
#
# If you look at the time spent in the `stitchOn()` calls, you see that `small_choice_first` spends 50% more time there than
# `small_choice_multi`.
#
# **N.B.**
#
# In `small_choice_first`, 1,600,000 calls to `stitchOn()` take 1.5 seconds.
#
# In `small_choice_multi`, 121 calls to `stitchOn()` take 1.0 seconds.
#
# That is remarkable: computing the multi-edge costs a lot of time per call, but the net result is positive.
#
# There is a price: the most time consuming bit is concentrated in a single line (see the profile above).
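# To zoom in on `stitchOn()` in the profiler output, the report can be restricted to matching entries.
# A minimal sketch, reusing `pr`, the profile of the most recent run (`small_choice_multi`),
# and the standard restriction argument of `print_stats`:

# In[ ]:

s2 = io.StringIO()
ps2 = pstats.Stats(pr, stream=s2).sort_stats(SortKey.CUMULATIVE)
# only report entries whose name matches 'stitchOn'
ps2.print_stats('stitchOn')
print(s2.getvalue())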
# # Main test 2
#
# We leave out part of the query.

# In[46]:

query = '''
chunk
  =: a:slot
  c:slot
  :=

s:slot
a < s
s < c
'''

# It should not make a difference to the outcome that we omit the `a < c` condition, since all chunks have a length greater than 1,
# so the first slot of a chunk always comes before the last one (and is not identical to it).

# ## Strategy: small choice first

# In[47]:

S.study(query, strategy='small_choice_first')

# In[48]:

S.showPlan(details=True)

# In[49]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[50]:

print('\n'.join(str(r) for r in results))

# In[51]:

S.count(progress=1, limit=50)

# ## Strategy: small choice multi

# In[52]:

S.study(query, strategy='small_choice_multi')

# In[53]:

S.showPlan(details=True)

# Observe how two `<` constraints have been taken together.
# They will be tested in one pass.

# In[54]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[55]:

print('\n'.join(str(r) for r in results))

# In[56]:

S.count(progress=1, limit=50)

# # Main test 3
#
# We add something to the query.

# In[57]:

query = '''
chunk
  =: a:slot
  < b:slot
  < d:slot
  c:slot
  :=

s:slot
b < s
s < d
'''

# It becomes more difficult to constrain `s` within the chunk.
#
# This is a heavy query.

# ## Strategy: small choice first

# In[58]:

S.study(query, strategy='small_choice_first')

# In[59]:

S.showPlan(details=True)

# In[60]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[61]:

print('\n'.join(str(r) for r in results))

# ## Strategy: small choice multi

# In[62]:

S.study(query, strategy='small_choice_multi')

# In[63]:

S.showPlan(details=True)

# In[64]:

pr = cProfile.Profile()
pr.enable()
results = S.fetch(limit=10)
pr.disable()
s = io.StringIO()
sortby = SortKey.CUMULATIVE
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

# In[65]:

print('\n'.join(str(r) for r in results))

# # Observation
#
# Here is a query where the time spent in `stitchOn()` overtakes the time spent in the `all()` call.
#
# So we really have a mixed bag with these strategies.
#
# For now, I turn on `small_choice_multi`, because it makes really long queries a bit more bearable and does not
# make much of a difference for shorter queries.

# # Main test 4
#
# Quite a different query.

# In[66]:

query = '''
chunk
  .num. slot
'''
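# The `.num.` relation requires that both nodes in a result carry the same value for the `num` feature.
# A minimal sanity check (a sketch, assuming the `F` feature API made global by `makeAvailableIn` above):

# In[ ]:

S.study(query)
for (c, s) in S.fetch(limit=5):
    # chunk and slot in one result should share the same num value
    print(c, s, F.num.v(c), F.num.v(s))
    assert F.num.v(c) == F.num.v(s)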
# ## Strategy: small choice first

# In[67]:

S.study(query, strategy='small_choice_first')

# In[68]:

S.showPlan(details=True)

# In[69]:

S.count(progress=1, limit=10)

# ## Strategy: small choice multi

# In[70]:

S.study(query, strategy='small_choice_multi')

# In[71]:

S.showPlan(details=True)

# In[72]:

S.count(progress=1, limit=10)

# ## Strategy: by yarn size

# In[73]:

S.study(query, strategy='by_yarn_size')

# In[74]:

S.showPlan(details=True)

# In[75]:

S.count(progress=1, limit=10)

# # Main test 5
#
# Yet another feature comparison query.

# In[76]:

query = '''
a:chunk
  n:slot
< b:chunk
  m:slot

n .cat. m
'''

# ## Strategy: small choice first

# In[77]:

S.study(query, strategy='small_choice_first')

# In[78]:

S.showPlan(details=True)

# In[79]:

S.count(progress=100000, limit=1000000)

# ## Strategy: small choice multi

# In[80]:

S.study(query, strategy='small_choice_multi')

# In[81]:

S.showPlan(details=True)

# In[82]:

S.count(progress=100000, limit=1000000)

# ## Strategy: by yarn size

# In[83]:

S.study(query, strategy='by_yarn_size')

# In[84]:

S.showPlan(details=True)

# In[85]:

S.count(progress=100000, limit=1000000)

# # Left overs
#
# Test use of shallow.

# In[5]:

query = '''
chunk
  slot num=1
  < slot
'''

# In[6]:

list(S.search(query))

# In[7]:

list(S.search(query, shallow=True))

# In[8]:

list(S.search(query, shallow=2))

# In[8]:

query = '''
slot
<: slot
< slot
<: slot
< slot
<: slot
'''

# In[20]:

from tf.app import use

A = use('bhsa', mod='cmerwich/bh-reference-system/tf')

# In[ ]:
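# # Profiling helper (sketch)
#
# The profiling boilerplate above (study the query, show the plan, fetch a few results under `cProfile`,
# print cumulative statistics) recurs for every strategy. Below is a minimal sketch of a helper that
# bundles these steps; the name `profileStrategy` is just for this sketch, not part of the Text-Fabric API.

# In[ ]:

def profileStrategy(query, strategy, limit=10):
    # study the query under the given strategy and show the resulting search plan
    S.study(query, strategy=strategy)
    S.showPlan(details=True)

    # fetch a few results under the profiler
    pr = cProfile.Profile()
    pr.enable()
    results = S.fetch(limit=limit)
    pr.disable()

    # print cumulative profiling statistics
    out = io.StringIO()
    pstats.Stats(pr, stream=out).sort_stats(SortKey.CUMULATIVE).print_stats()
    print(out.getvalue())
    return results

# Example use, equivalent to the separate cells above:
#
# results = profileStrategy(query, 'small_choice_multi')
# print('\n'.join(str(r) for r in results))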