import time
import logging
import structlog
# Install a filtering bound logger at INFO level for structlog.
structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))
import dotenv
# Load environment variables from ../aleph.env; the return value is discarded.
# NOTE(review): presumably this has to run before `aleph.core` is imported so
# that the client picks up its settings — confirm.
_ = dotenv.load_dotenv("../aleph.env")
import warnings
# Silence DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
import copy
from pprint import pprint
# `es` is the client object that every index/search call below goes through.
from aleph.core import es
# Scratch index for the highlighting experiments. Any earlier copy is dropped
# first; 400/404 responses from the delete are ignored.
index = "hl_query_aleph"
es.indices.delete(index=index, ignore=[400, 404])

# Index definition:
#   * settings: one shard, no replicas, and a custom `latin_analyzer`
#     (standard tokenizer + the `latinize` ICU transform filter).
#   * mappings: `properties.field1` / `properties.field2` are text fields that
#     are not indexed themselves ("index": False) and are copied into the
#     top-level `text` field. `text` keeps term vectors with positions and
#     offsets and uses `latin_analyzer` for indexing, search and quoted search.
index_mappings = dict(
    settings={
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "latin_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["latinize"],
                    },
                },
                "filter": {
                    "latinize": {
                        "type": "icu_transform",
                        "id": "Any-Latin; NFKD; Lower(); [:Nonspacing Mark:] Remove; NFKC",
                    },
                },
            },
        },
    },
    mappings={
        "properties": {
            "properties": {
                "properties": {
                    # Both source fields share the same mapping; each gets its
                    # own fresh dict/list object.
                    source_field: {
                        "type": "text",
                        "index": False,
                        "copy_to": ["text"],
                    }
                    for source_field in ("field1", "field2")
                },
            },
            "text": {
                "type": "text",
                "term_vector": "with_positions_offsets",
                "analyzer": "latin_analyzer",
                "search_analyzer": "latin_analyzer",
                "search_quote_analyzer": "latin_analyzer",
            },
        },
    },
)

es.indices.create(index=index, body=index_mappings)
# Pause briefly after creating the index before using it.
time.sleep(2)
# Index a single document whose two source fields both contain "world", so
# the combined `text` field receives that term twice.
#
# refresh="wait_for" makes the call return only once the document is visible
# to search. Without it, searches issued right after this call race the
# periodic index refresh when the file runs as a script instead of cell by
# cell, and can come back empty.
_ = es.index(
    index=index,
    id=1,
    body={
        "properties.field1": "hello world",
        "properties.field2": "world peace",
    },
    refresh="wait_for",
)

# Inspect what was recorded for the term "world" in the `text` term vector.
tv = es.termvectors(index=index, id=1, fields=["text"])
pprint(tv["term_vectors"]["text"]["terms"]["world"])
# Recorded output:
# {'term_freq': 2,
#  'tokens': [{'end_offset': 11, 'position': 1, 'start_offset': 6},
#             {'end_offset': 17, 'position': 102, 'start_offset': 12}]}
# NOTE(review): the jump to position 102 / offset 12 for the second token
# presumably comes from the gap inserted between the two copied values —
# confirm against the mapping defaults.
# Experiment: query the combined `text` field, but ask the unified highlighter
# to highlight the source fields (`properties.*`) with `require_field_match`
# disabled. The recorded output shows correct highlights on both source
# fields.
query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "unified",
    "fields": {
        "properties.*": {"require_field_match": False},
    },
}
res = es.search(index=index, query=query, highlight=highlight)
for hit in res["hits"]["hits"]:
    pprint(hit)
# Recorded output:
# {'_id': '1',
#  '_index': 'hl_query_aleph',
#  '_score': 0.39556286,
#  '_source': {'properties.field1': 'hello world',
#              'properties.field2': 'world peace'},
#  '_type': '_doc',
#  'highlight': {'properties.field1': ['hello <em>world</em>'],
#                'properties.field2': ['<em>world</em> peace']}}
# Experiment: fast vector highlighter ("fvh") directly on the combined `text`
# field. In the recorded output the hit comes back without any 'highlight'
# key.
query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "fvh",
    "fields": {"text": {}},
}
res = es.search(index=index, query=query, highlight=highlight)
for hit in res["hits"]["hits"]:
    pprint(hit)
# Recorded output:
# {'_id': '1',
#  '_index': 'hl_query_aleph',
#  '_score': 0.39556286,
#  '_source': {'properties.field1': 'hello world',
#              'properties.field2': 'world peace'},
#  '_type': '_doc'}
# Experiment: unified highlighter directly on the combined `text` field.
# In the recorded output the <em> tags wrap the wrong words ("peace" and
# "hello" instead of "world"), and the fragments appear in reverse field
# order.
query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "unified",
    "fields": {"text": {}},
}
res = es.search(index=index, query=query, highlight=highlight)
for hit in res["hits"]["hits"]:
    pprint(hit)
# Recorded output:
# {'_id': '1',
#  '_index': 'hl_query_aleph',
#  '_score': 0.39556286,
#  '_source': {'properties.field1': 'hello world',
#              'properties.field2': 'world peace'},
#  '_type': '_doc',
#  'highlight': {'text': ['world <em>peace</em>', '<em>hello</em> world']}}
# Experiment: plain highlighter directly on the combined `text` field.
# In the recorded output "world" is highlighted correctly in both fragments,
# with the field2 fragment listed before the field1 fragment.
query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "plain",
    "fields": {"text": {}},
}
res = es.search(index=index, query=query, highlight=highlight)
for hit in res["hits"]["hits"]:
    pprint(hit)
# Recorded output:
# {'_id': '1',
#  '_index': 'hl_query_aleph',
#  '_score': 0.39556286,
#  '_source': {'properties.field1': 'hello world',
#              'properties.field2': 'world peace'},
#  '_type': '_doc',
#  'highlight': {'text': ['<em>world</em> peace', 'hello <em>world</em>']}}
# ---------------------------------------------------------------------------
# store=True (v1)
# ---------------------------------------------------------------------------
# Second scratch index: identical to `index_mappings` except that the combined
# `text` field is additionally stored ("store": True). The base mapping is
# deep-copied so the original definition is left untouched.
index_hl_query_poc_v1 = "hl_query_poc_v1"
es.indices.delete(index=index_hl_query_poc_v1, ignore=[400, 404])

custom_index_mappings = copy.deepcopy(index_mappings)
custom_index_mappings["mappings"]["properties"]["text"]["store"] = True

es.indices.create(index=index_hl_query_poc_v1, body=custom_index_mappings)
# Pause briefly after creating the index before using it.
time.sleep(2)

# Same document as in the first index. refresh="wait_for" makes the call
# return only once the document is visible to search, so searches issued right
# afterwards do not race the periodic refresh when run as a script.
_ = es.index(
    index=index_hl_query_poc_v1,
    id=1,
    body={
        "properties.field1": "hello world",
        "properties.field2": "world peace",
    },
    refresh="wait_for",
)

# Inspect what was recorded for the term "world" in the `text` term vector.
tv = es.termvectors(index=index_hl_query_poc_v1, id=1, fields=["text"])
pprint(tv["term_vectors"]["text"]["terms"]["world"])
# Recorded output:
# {'term_freq': 2,
#  'tokens': [{'end_offset': 11, 'position': 1, 'start_offset': 6},
#             {'end_offset': 17, 'position': 102, 'start_offset': 12}]}
# Experiment (v1 index, `text` stored): fast vector highlighter ("fvh") on the
# combined `text` field. The recorded output shows "world" highlighted
# correctly in both fragments.
query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "fvh",
    "fields": {"text": {}},
}
res = es.search(index=index_hl_query_poc_v1, query=query, highlight=highlight)
for hit in res["hits"]["hits"]:
    pprint(hit)
# Recorded output:
# {'_id': '1',
#  '_index': 'hl_query_poc_v1',
#  '_score': 0.25069216,
#  '_source': {'properties.field1': 'hello world',
#              'properties.field2': 'world peace'},
#  '_type': '_doc',
#  'highlight': {'text': ['hello <em>world</em>', '<em>world</em> peace']}}
# Experiment (v1 index, `text` stored): unified highlighter on the combined
# `text` field. The recorded output shows "world" highlighted correctly in
# both fragments.
query = {"match": {"text": "world"}}
highlight = {
    "encoder": "html",
    "type": "unified",
    "fields": {"text": {}},
}
res = es.search(index=index_hl_query_poc_v1, query=query, highlight=highlight)
for hit in res["hits"]["hits"]:
    pprint(hit)
# Recorded output:
# {'_id': '1',
#  '_index': 'hl_query_poc_v1',
#  '_score': 0.25069216,
#  '_source': {'properties.field1': 'hello world',
#              'properties.field2': 'world peace'},
#  '_type': '_doc',
#  'highlight': {'text': ['hello <em>world</em>', '<em>world</em> peace']}}