#!/usr/bin/env python
# coding: utf-8

# Proof of concept comparing Elasticsearch highlighters ("unified", "fvh",
# "plain") on a `text` field that is populated only through `copy_to`,
# first with the same index settings as Aleph and then with `store=True`
# added to the `text` mapping.

# In[13]:

import time
import logging

import structlog

structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))

import dotenv

# Load the environment file before `aleph.core` is imported below.
_ = dotenv.load_dotenv("../aleph.env")

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

import copy
from pprint import pprint

from aleph.core import es


def _search_and_print(index_name, query, highlight):
    """Run a highlighted search and pretty-print every hit.

    Args:
        index_name: Name of the index to search.
        query: Query DSL dict passed to ``es.search`` as ``query``.
        highlight: Highlight settings dict passed as ``highlight``.

    Returns:
        The raw search response, so callers can inspect it afterwards.
    """
    res = es.search(index=index_name, query=query, highlight=highlight)
    for hit in res["hits"]["hits"]:
        pprint(hit)
    return res


# # Index settings same as Aleph

# In[14]:

index = "hl_query_aleph"
# Start from a clean slate; a missing index (404) is not an error here.
es.indices.delete(index=index, ignore=[400, 404])

index_mappings = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "latin_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["latinize"],
                    },
                },
                "filter": {
                    "latinize": {
                        "type": "icu_transform",
                        "id": "Any-Latin; NFKD; Lower(); [:Nonspacing Mark:] Remove; NFKC",
                    },
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # An object field that is literally named "properties".
            "properties": {
                "properties": {
                    # field1/field2 are not indexed themselves; their values
                    # are only copied into the searchable `text` field.
                    "field1": {
                        "type": "text",
                        "index": False,
                        "copy_to": ["text"],
                    },
                    "field2": {
                        "type": "text",
                        "index": False,
                        "copy_to": ["text"],
                    },
                },
            },
            "text": {
                "type": "text",
                "term_vector": "with_positions_offsets",
                "analyzer": "latin_analyzer",
                "search_analyzer": "latin_analyzer",
                "search_quote_analyzer": "latin_analyzer",
            },
        },
    },
}

es.indices.create(index=index, body=index_mappings)
time.sleep(2)


# ## Add some data

# In[15]:

# refresh=True makes the document visible to the searches below right away;
# without it a search issued immediately after indexing can return no hits.
_ = es.index(
    index=index,
    id=1,
    body={"properties.field1": "hello world", "properties.field2": "world peace"},
    refresh=True,
)


# ## Term vectors -> looks ok

# In[16]:

tv = es.termvectors(index=index, id=1, fields=["text"])
pprint(tv["term_vectors"]["text"]["terms"]["world"])


# ## Index settings and highlight same as Aleph -> looks ok

# In[25]:

query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "unified",
    "fields": {
        "properties.*": {
            # The query targets `text`, so highlighting `properties.*`
            # requires matches from other fields to be accepted.
            "require_field_match": False,
        },
    },
}
res = _search_and_print(index, query, highlight)


# ## Index settings same as Aleph, highlight fvh(text) -> highlights missing
# NOTE(review): presumably because `text` is filled only via `copy_to` and is
# therefore absent from `_source` -- confirm against the highlighting docs.

# In[26]:

query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "fvh",
    "fields": {"text": {}},
}
res = _search_and_print(index, query, highlight)


# ## Index settings same as Aleph, highlight unified(text) -> field1 and field2 twisted

# In[27]:

query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "unified",
    "fields": {"text": {}},
}
res = _search_and_print(index, query, highlight)


# ## Index settings same as Aleph, highlight plain(text) -> looks ok

# In[28]:

query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "plain",
    "fields": {"text": {}},
}
res = _search_and_print(index, query, highlight)


# # Custom index settings with `store=True` (v1)

# In[30]:

index_hl_query_poc_v1 = "hl_query_poc_v1"
es.indices.delete(index=index_hl_query_poc_v1, ignore=[400, 404])

# Same mappings as above, except that `text` is additionally stored.
# Deep copy so the Aleph-like `index_mappings` dict stays untouched.
custom_index_mappings = copy.deepcopy(index_mappings)
custom_index_mappings["mappings"]["properties"]["text"]["store"] = True

es.indices.create(index=index_hl_query_poc_v1, body=custom_index_mappings)
time.sleep(2)


# ## Add some data

# In[32]:

# refresh=True for the same reason as above: make the document searchable
# before the highlight queries run.
_ = es.index(
    index=index_hl_query_poc_v1,
    id=1,
    body={"properties.field1": "hello world", "properties.field2": "world peace"},
    refresh=True,
)


# ## Term vectors -> looks ok

# In[33]:

tv = es.termvectors(index=index_hl_query_poc_v1, id=1, fields=["text"])
pprint(tv["term_vectors"]["text"]["terms"]["world"])


# ## Custom index settings (`store=True`), highlight fvh(text) -> now looks ok

# In[34]:

query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "fvh",
    "fields": {"text": {}},
}
res = _search_and_print(index_hl_query_poc_v1, query, highlight)


# ## Custom index settings (`store=True`), highlight unified(text) -> now looks ok

# In[35]:

query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "unified",
    "fields": {"text": {}},
}
res = _search_and_print(index_hl_query_poc_v1, query, highlight)