# Generating the signatures:
#ls -1 *.head | parallel -j32 sourmash compute -f -o sigs/{}.sig {}
cd -q ..
from collections import defaultdict
from glob import glob
import os
from functools import partial
from IPython.display import Image
from sourmash_lib import signature
from sbt import SBT, GraphFactory
from sbtmh import search_minhashes, SigLeaf
# Path of the signature file used as the query in the searches below.
sig_to_search = "mmetsp/SRR1296807.left.fq.head.sig"

# Factory handed to every SBT constructed in this script.
# NOTE(review): the arguments are presumably (k-mer size, table size,
# number of tables) -- confirm against sbt.GraphFactory.
factory = GraphFactory(31, 1e5, 4)

# Only the first signature stored in the query file is kept.
with open(sig_to_search, 'r') as data:
    loaded = signature.load_signatures(data)
    to_search = loaded[0]
# One tree per value of d, keyed by d; all of them share the same factory.
# (The usage report further down labels these as "d-ary" trees.)
trees = {arity: SBT(factory, d=arity) for arity in (2, 5, 10)}

# Each signature file contributes exactly one leaf (its first signature,
# named after the file's basename); the same leaf object is then added
# to every tree.
for sig_path in glob("mmetsp/*.sig"):
    with open(sig_path, 'r') as handle:
        first_sig = signature.load_signatures(handle)[0]
    leaf = SigLeaf(os.path.basename(sig_path), first_sig)
    for tree in trees.values():
        tree.add_node(leaf)
# results[d] is the dict passed to search_minhashes for the tree keyed by d;
# its length is later reported as the number of searches for that tree.
results = defaultdict(dict)

print('*' * 60)
print("{}:".format(sig_to_search))
for d, tree in trees.items():
    search = partial(search_minhashes, results=results[d])
    # NOTE(review): 0.1 is presumably the minimum-similarity threshold
    # forwarded to the search function -- confirm against SBT.find.
    hits = [
        (str(match.metadata), match.data.similarity(to_search))
        for match in tree.find(search, to_search, 0.1)
    ]
    # One (name, similarity) tuple per line; an empty hit list still
    # produces a single newline, as a bare print() would.
    print('\n'.join(map(str, hits)))
    print()
************************************************************ mmetsp/SRR1296807.left.fq.head.sig: ('SRR1296804.left.fq.head.sig', 0.30399999022483826) ('SRR1296807.left.fq.head.sig', 1.0) ('SRR1296805.left.fq.head.sig', 0.33000001311302185) ('SRR1296806.left.fq.head.sig', 0.28999999165534973) ('SRR1296805.left.fq.head.sig', 0.33000001311302185) ('SRR1296807.left.fq.head.sig', 1.0) ('SRR1296804.left.fq.head.sig', 0.30399999022483826) ('SRR1296806.left.fq.head.sig', 0.28999999165534973) ('SRR1296804.left.fq.head.sig', 0.30399999022483826) ('SRR1296805.left.fq.head.sig', 0.33000001311302185) ('SRR1296807.left.fq.head.sig', 1.0) ('SRR1296806.left.fq.head.sig', 0.28999999165534973)
# Per-tree summary: how many entries the search recorded and how many of
# the tree's node slots are actually occupied (slots that are None count
# as allocated but unused).
for n in sorted(results):
    slots = trees[n].nodes
    total = len(slots)
    used = len([slot for slot in slots if slot is not None])
    percent_used = round(used / total, 3) * 100
    print("{}-ary: {} searches, {} nodes allocated ({} ({:.1f}%) used)".format(
        n, len(results[n]), total, used, percent_used))
2-ary: 59 searches, 1023 nodes allocated (995 (97.3%) used) 5-ary: 61 searches, 781 nodes allocated (623 (79.8%) used) 10-ary: 81 searches, 1111 nodes allocated (554 (49.9%) used)