!pip install opensearch-py requests-aws4auth "awswrangler[opensearch]" --quiet
import boto3
import json
import logging
import awswrangler as wr
import pandas as pd
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
以降の処理を実行する際に必要なヘルパー関数を定義しておきます。
def search_cloudformation_output(stackname, key):
    """Return the value of a single output of a CloudFormation stack.

    Args:
        stackname: Name of the CloudFormation stack to inspect.
        key: ``OutputKey`` of the output whose value is wanted.

    Returns:
        The ``OutputValue`` of the first output whose ``OutputKey`` equals ``key``.

    Raises:
        ValueError: If the stack exposes no output named ``key`` (including the
            case where the stack has no outputs at all).
    """
    # default_region is a module-level global assigned after this definition;
    # it must be set before the first call.
    cloudformation_client = boto3.client("cloudformation", region_name=default_region)
    stack = cloudformation_client.describe_stacks(StackName=stackname)["Stacks"][0]
    # A stack without outputs has no "Outputs" entry in the response; treat that
    # as an empty list so the caller gets the ValueError below, not a KeyError.
    for output in stack.get("Outputs", []):
        if output["OutputKey"] == key:
            return output["OutputValue"]
    raise ValueError(f"{key} is not found in outputs of {stackname}.")
# Region of the default boto3 session; read by search_cloudformation_output and by the request signer.
default_region = boto3.Session().region_name
# Set the root logger threshold to ERROR.
logging.getLogger().setLevel(logging.ERROR)
OpenSearch クラスターへのネットワーク接続性が確保されており、OpenSearch の Security 機能により API リクエストが許可されているかを確認します。
レスポンスに cluster_name や cluster_uuid が含まれていれば、接続確認が無事完了したと判断できます
# Resolve the OpenSearch domain endpoint from the outputs of the lab's CloudFormation stack.
cloudformation_stack_name = "search-lab-jp"
opensearch_cluster_endpoint = search_cloudformation_output(
    cloudformation_stack_name,
    "OpenSearchDomainEndpoint",
)

# Build a SigV4 signer from the credentials of the default boto3 session.
service_code = "es"
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(
    credentials=credentials,
    region=default_region,
    service=service_code,
)

# HTTPS client (port 443) with certificate verification and compressed request bodies.
opensearch_client = OpenSearch(
    hosts=[{"host": opensearch_cluster_endpoint, "port": 443}],
    connection_class=RequestsHttpConnection,
    http_auth=auth,
    http_compress=True,
    use_ssl=True,
    verify_certs=True,
)

# Connectivity check: the response should contain cluster_name and cluster_uuid.
opensearch_client.info()
{'name': '37fa7880d30e6918860bdb0e18c8e91d', 'cluster_name': '123456789012:opensearchservi-lsy27q89mdpe', 'cluster_uuid': 'yHC8ufTTRdWZqY-0J9kE9A', 'version': {'distribution': 'opensearch', 'number': '2.17.0', 'build_type': 'tar', 'build_hash': 'unknown', 'build_date': '2025-02-14T09:38:50.023788640Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}
インデックスを作成し、検索用のサンプルデータを格納します。
# Local directory that receives the sample data set
dataset_dir = "./dataset/sample-movies"
%mkdir -p $dataset_dir
# Download the archive
!curl -s -o $dataset_dir/sample-movies.zip https://docs.aws.amazon.com/opensearch-service/latest/developerguide/samples/sample-movies.zip
# Extract only sample-movies.bulk from the zip file
!unzip -oq $dataset_dir/sample-movies.zip sample-movies.bulk -d $dataset_dir
# Keep only the document lines of the .bulk file (lines containing "_index" are dropped) and save them as a jsonl file
!grep -v "_index" $dataset_dir/sample-movies.bulk > $dataset_dir/sample-movies.jsonl
本ラボでは、 OpenSearch の基本操作と基本概念 をベースに、全文検索を意識した Analyzer や Normalizer の設定を追加しています。詳細は以降のセクションで順次解説していきます。
index_name = "movies"

# Index definition for the lab: one primary shard, no replicas, a custom
# lowercase normalizer for keyword fields, and explicit field mappings.
payload = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
        },
        "analysis": {
            "normalizer": {
                # Lowercases keyword values via the lowercase filter.
                "lowercase_normalizer": {
                    "type": "custom",
                    "filter": ["lowercase"],
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "directors": {"type": "text"},
            "release_date": {"type": "date"},
            "rating": {"type": "scaled_float", "scaling_factor": 10},
            # Exact-match field normalized with lowercase_normalizer.
            "genres": {
                "type": "keyword",
                "normalizer": "lowercase_normalizer",
            },
            "image_url": {"type": "keyword"},
            # Full-text field analyzed with the built-in english analyzer.
            "plot": {
                "type": "text",
                "analyzer": "english",
            },
            # No analyzer is specified for title.
            "title": {"type": "text"},
            "rank": {"type": "integer"},
            "running_time_secs": {"type": "integer"},
            "actors": {"type": "text"},
            "year": {"type": "short"},
            "type": {"type": "keyword"},
        },
    },
}
# Imported locally so this cell is self-contained; indices.delete raises
# NotFoundError (HTTP 404) when the index does not exist.
from opensearchpy import NotFoundError

# If an index with the same name already exists, delete it first so the cell
# can be re-run from a clean state.
print("# delete index")
try:
    response = opensearch_client.indices.delete(index=index_name)
except NotFoundError as e:
    # Nothing to delete (e.g. first run): report it and carry on.
    # Any other failure (authentication, connectivity, ...) is no longer hidden.
    print(e)
else:
    print(json.dumps(response, indent=2))

# Create the index with the settings and mappings held in payload.
print("# create index")
response = opensearch_client.indices.create(index=index_name, body=payload)
print(json.dumps(response, indent=2))
# delete index NotFoundError(404, 'index_not_found_exception', 'no such index [movies]', movies, index_or_alias) # create index { "acknowledged": true, "shards_acknowledged": true, "index": "movies" }
ドキュメントのロードを行います。ドキュメントのロードは "OpenSearch の基本概念・基本操作の理解" でも解説した通り bulk API を使用することで効率よく進められますが、データ処理フレームワークを利用することでより簡単にデータを取り込むことも可能です。本ワークショップでは、AWS SDK for Pandas を使用したデータ取り込みを行います。
%%time
# Load the jsonl file into the index with AWS SDK for pandas (awswrangler).
index_name = "movies"
file_path = f"{dataset_dir}/sample-movies.jsonl"
response = wr.opensearch.index_json(
client=opensearch_client,
path=file_path,
use_threads=True, # parallelize writes automatically according to the client's CPU count
id_keys=["id"],
index=index_name,
bulk_size=200, # write 200 documents at a time
refresh=False # disable refresh (writing out segments) while the load is running
)
CPU times: user 402 ms, sys: 16.8 ms, total: 419 ms Wall time: 1.25 s
response["success"] の値が jsonl ファイルの行数と一致しているかを確認します。True が表示される場合は全件登録に成功していると判断できます。
# Count the lines of the jsonl file inside a context manager so the file
# handle is closed deterministically (the bare open() call leaked it).
with open(file_path, encoding="utf-8") as jsonl_file:
    line_count = sum(1 for _ in jsonl_file)
# True when the number of successfully indexed documents equals the line count.
response["success"] == line_count
True
本ラボではデータ登録時に意図的に Refresh オプションを無効化しているため、念のため Refresh API を実行し、登録されたドキュメントが確実に検索可能となるようにします。あわせて Force merge API を実行し、インデックスのセグメントを 1 つにまとめます。
index_name = "movies"
# Refresh was disabled during the load, so refresh explicitly to make the
# indexed documents searchable. The index is passed by keyword, matching every
# other client call in this notebook.
response = opensearch_client.indices.refresh(index=index_name)
# Merge the index down to a single segment.
response = opensearch_client.indices.forcemerge(index=index_name, max_num_segments=1)
逐次検索とは、検索対象の複数のデータを順次照合することで対象となるデータを抽出する方法です。事前にインデックスを作成せずに検索を行うことができますが、順次データの走査を行うためデータサイズに応じて検索時間が伸びていきます。UNIX の grep コマンドは逐次検索にあたります。
一方、OpenSearch は全文検索向けに、Apache Lucene が提供する転置インデックスを使用しています。転置インデックスは特定の見出し語に対応する文書 ID と出現位置の情報を持っています。これにより、特定語句にマッチする検索結果を素早く返すことを可能としています。
全文検索の対象となるデータは、text 型のフィールドに格納される必要があります。text 型のフィールドに格納されたデータは、以下の流れで処理され、転置インデックスに登録されます
これらの処理を通じて作成されたトークンがインデックスに格納されることで、キーワードによる文字列の検索が可能となります。この一連の処理をアナライズといい、これらのコンポーネントの集合体をアナライザーと呼びます。
アナライザーは、インデックスにドキュメントを格納する際の文字列の処理、また検索時の検索キーワードの処理に用いられます。一般的には格納と検索に同じアナライザーを用いますが、オートコンプリートなど一部のユースケースでは格納時と検索時で異なるアナライザーを使用します。
アナライザーは text 型フィールドの新規定義時にのみ指定が可能です。text 型フィールドに対してアナライザーを明示的に指定しない場合、OpenSearch はデフォルトで standard アナライザーを設定します。フィールドに設定されたアナライザーを後から変更することはできないため、事前に検索要件を確認した上でカスタムアナライザーを定義することがより良い検索体験の提供に繋がります。
OpenSearch では、いくつかの言語向けにアナライザーが標準で提供されています。これらのアナライザーは、予め決められた Character Filter, Tokenizer, Token Filter で構成されています。
デフォルトのアナライザーである Standard Analyzer は以下のコンポーネントで構成されており、主に英語などの半角スペースでターム(語句)が区切られている言語の文章解析に使用できます。トークンは Lower Case Token Filter によりすべて小文字に変換され格納されるため、大文字・小文字の違いに影響されない検索が可能です。
OpenSearch では英語やフランス語、日本語など各言語に応じたアナライザーのプリセットを提供しています。本ワークショップでは plot フィールドに english アナライザーをセットすることで、解説文による部分検索の精度を高めています。
標準で提供されているアナライザーが要件を満たさない場合は、任意の Character Filter, Tokenizer, Token Filter を組み合わせたカスタムアナライザーを使用できます。
例えば、standard analyzer に加えて以下の処理を追加したい場合は、カスタムアナライザーを定義します。
- トークンのステミング(sampling -> sample や cars -> car といった語幹の統一)を行う
- a や the などのストップワードを除去する。ストップワード判定の際、大文字小文字の違いは無視する

カスタムアナライザーは、インデックス作成時の analysis オプション配下で定義可能です。カスタムアナライザーをテストする場合は、カスタムアナライザーが定義されているインデックスに対して _analyze API を発行します。
standard アナライザーに対する _analyze API 結果と比較してみると、on などの不要なトークンが削除され、意味のある単語のみが抽出されていることが分かります。また一部の単語についてはステミングされていることも確認できます。
_analyze API を使用することで、アナライザーの動作確認を行えます。standard と english など、アナライザー毎の処理結果の違いを見ることにも役立ちます
# Run the sample sentence through the standard analyzer via the _analyze API.
payload = {
    "analyzer": "standard",
    "text": "OpenSearch is a distributed search and analytics engine based on Apache Lucene.",
}
response = opensearch_client.indices.analyze(body=payload)

# Keep the tokens as a DataFrame for the side-by-side comparison later on.
df_standard = pd.json_normalize(response["tokens"])
print(json.dumps(response, indent=2))
{ "tokens": [ { "token": "opensearch", "start_offset": 0, "end_offset": 10, "type": "<ALPHANUM>", "position": 0 }, { "token": "is", "start_offset": 11, "end_offset": 13, "type": "<ALPHANUM>", "position": 1 }, { "token": "a", "start_offset": 14, "end_offset": 15, "type": "<ALPHANUM>", "position": 2 }, { "token": "distributed", "start_offset": 16, "end_offset": 27, "type": "<ALPHANUM>", "position": 3 }, { "token": "search", "start_offset": 28, "end_offset": 34, "type": "<ALPHANUM>", "position": 4 }, { "token": "and", "start_offset": 35, "end_offset": 38, "type": "<ALPHANUM>", "position": 5 }, { "token": "analytics", "start_offset": 39, "end_offset": 48, "type": "<ALPHANUM>", "position": 6 }, { "token": "engine", "start_offset": 49, "end_offset": 55, "type": "<ALPHANUM>", "position": 7 }, { "token": "based", "start_offset": 56, "end_offset": 61, "type": "<ALPHANUM>", "position": 8 }, { "token": "on", "start_offset": 62, "end_offset": 64, "type": "<ALPHANUM>", "position": 9 }, { "token": "apache", "start_offset": 65, "end_offset": 71, "type": "<ALPHANUM>", "position": 10 }, { "token": "lucene", "start_offset": 72, "end_offset": 78, "type": "<ALPHANUM>", "position": 11 } ] }
# Run the same sentence through the english analyzer via the _analyze API.
payload = {
    "analyzer": "english",
    "text": "OpenSearch is a distributed search and analytics engine based on Apache Lucene.",
}
response = opensearch_client.indices.analyze(body=payload)

# Keep the tokens as a DataFrame for the side-by-side comparison later on.
df_english = pd.json_normalize(response["tokens"])
print(json.dumps(response, indent=2))
{ "tokens": [ { "token": "opensearch", "start_offset": 0, "end_offset": 10, "type": "<ALPHANUM>", "position": 0 }, { "token": "distribut", "start_offset": 16, "end_offset": 27, "type": "<ALPHANUM>", "position": 3 }, { "token": "search", "start_offset": 28, "end_offset": 34, "type": "<ALPHANUM>", "position": 4 }, { "token": "analyt", "start_offset": 39, "end_offset": 48, "type": "<ALPHANUM>", "position": 6 }, { "token": "engin", "start_offset": 49, "end_offset": 55, "type": "<ALPHANUM>", "position": 7 }, { "token": "base", "start_offset": 56, "end_offset": 61, "type": "<ALPHANUM>", "position": 8 }, { "token": "apach", "start_offset": 65, "end_offset": 71, "type": "<ALPHANUM>", "position": 10 }, { "token": "lucen", "start_offset": 72, "end_offset": 78, "type": "<ALPHANUM>", "position": 11 } ] }
各アナライザーによって出力されるトークンを表形式で比較してみます。
standard analyzer はシンプルに文章の単語分割が行われています。一方、 english analyzer では is や a といった検索上意味を持たないワード(ストップワード)の除去やステミング(distributed -> distribut など、単語の変化しない前方部分のみの抽出)なども行われています。
english analyzer の方が一見すると検索精度が良いようにみえますが、"To be, or not to be, that is the question" を english analyzer で処理すると "question" しか残らないなど、対象のテキストによっては思わぬ副作用を生みます。
例えば映画のタイトル検索では、"avengers" と "avenger" は区別したいという要件がある場合、english analyzer ではなく standard analyzer を使用することが望ましいと考えられます。一方で映画のプロット検索ではそこまで厳密な区別が要求されないと考えられること、ストップワードを除去した方がノイズが少ないと考えられることから、english analyzer を使用することが望ましいと考えられます。
以上の点を踏まえて、本ラボでは、title フィールドについては standard analyzer を、plot フィールドについては english analyzer をセットしていきます。
# Left-join the english tokens onto the standard tokens by offset, position
# and type; cells with no english counterpart are shown as empty strings.
(
    pd.merge(
        df_standard,
        df_english,
        on=["start_offset", "end_offset", "position", "type"],
        how="left",
    )
    .rename(columns={"token_x": "token_standard", "token_y": "token_english"})
    .reindex(
        ["start_offset", "end_offset", "position", "type", "token_standard", "token_english"],
        axis=1,
    )
    .fillna("")
)
start_offset | end_offset | position | type | token_standard | token_english | |
---|---|---|---|---|---|---|
0 | 0 | 10 | 0 | <ALPHANUM> | opensearch | opensearch |
1 | 11 | 13 | 1 | <ALPHANUM> | is | |
2 | 14 | 15 | 2 | <ALPHANUM> | a | |
3 | 16 | 27 | 3 | <ALPHANUM> | distributed | distribut |
4 | 28 | 34 | 4 | <ALPHANUM> | search | search |
5 | 35 | 38 | 5 | <ALPHANUM> | and | |
6 | 39 | 48 | 6 | <ALPHANUM> | analytics | analyt |
7 | 49 | 55 | 7 | <ALPHANUM> | engine | engin |
8 | 56 | 61 | 8 | <ALPHANUM> | based | base |
9 | 62 | 64 | 9 | <ALPHANUM> | on | |
10 | 65 | 71 | 10 | <ALPHANUM> | apache | apach |
11 | 72 | 78 | 11 | <ALPHANUM> | lucene | lucen |
match query は、単一フィールドに対する全文検索を実行するものです。以下は "avengers" を含むタイトルの映画を検索するクエリです。
# match query: full-text search of the title field for "avengers".
index_name = "movies"
payload = {
    "query": {
        "match": {"title": "avengers"},
    },
}
response = opensearch_client.search(body=payload, index=index_name)
print(json.dumps(response, indent=2))
{ "took": 1, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped": 0, "failed": 0 }, "hits": { "total": { "value": 3, "relation": "eq" }, "max_score": 8.117743, "hits": [ { "_index": "movies", "_id": "tt0848228", "_score": 8.117743, "_source": { "directors": [ "Joss Whedon" ], "release_date": "2012-04-11T00:00:00Z", "rating": 8.2, "genres": [ "Action", "Fantasy" ], "image_url": "https://m.media-amazon.com/images/M/MV5BMTk2NTI1MTU4N15BMl5BanBnXkFtZTcwODg0OTY0Nw@@._V1_SX400_.jpg", "plot": "Nick Fury of S.H.I.E.L.D. assembles a team of superhumans to save the planet from Loki and his army.", "title": "The Avengers", "rank": 48, "running_time_secs": 8580, "actors": [ "Robert Downey Jr.", "Chris Evans", "Scarlett Johansson" ], "year": 2012, "id": "tt0848228", "type": "add" } }, { "_index": "movies", "_id": "tt0118661", "_score": 8.117743, "_source": { "directors": [ "Jeremiah S. Chechik" ], "release_date": "1998-08-13T00:00:00Z", "rating": 3.5, "genres": [ "Action", "Adventure", "Sci-Fi", "Thriller" ], "image_url": "https://m.media-amazon.com/images/M/MV5BMjA3NzcxODYzNV5BMl5BanBnXkFtZTcwOTcwOTcxMQ@@._V1_SX400_.jpg", "plot": "Two British agents (John Steed and Emma Peel) team up to stop Sir August De Wynter from destroying the world with a weather changing machine.", "title": "The Avengers", "rank": 3276, "running_time_secs": 5340, "actors": [ "Ralph Fiennes", "Uma Thurman", "Sean Connery" ], "year": 1998, "id": "tt0118661", "type": "add" } }, { "_index": "movies", "_id": "tt2395427", "_score": 5.3775997, "_source": { "directors": [ "Joss Whedon" ], "release_date": "2015-04-29T00:00:00Z", "genres": [ "Action", "Adventure", "Fantasy", "Sci-Fi" ], "image_url": "https://m.media-amazon.com/images/M/MV5BNjA3MTY1NzQxM15BMl5BanBnXkFtZTgwMjMwMDUzMDE@._V1_SX400_.jpg", "title": "The Avengers: Age of Ultron", "rank": 40, "year": 2015, "actors": [ "Scarlett Johansson", "Chris Hemsworth", "James Spader" ], "id": "tt2395427", "type": "add" } } ] } }
OpenSearch の検索 API の実行結果は JSON 形式で返却されます。これを表形式に変換、出力した結果は以下の通りです。
本ワークショップでは、以降見やすさを重視して結果を表形式で出力します。pd.json_normalize
から始まる行をコメントアウトし、コメントアウトされている print
から始まる行のコメントを解除することで JSON による出力結果を確認することもできます
# Uncomment the print line (and comment out the json_normalize line) to see the raw JSON instead.
# print(json.dumps(response, indent=2))
# Render the hits as a table. The expression was previously evaluated twice,
# with the first result discarded; once is enough.
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0848228 | 8.117743 | [Joss Whedon] | 2012-04-11T00:00:00Z | 8.2 | [Action, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTk2NT... | Nick Fury of S.H.I.E.L.D. assembles a team of ... | The Avengers | 48 | 8580.0 | [Robert Downey Jr., Chris Evans, Scarlett Joha... | 2012 | tt0848228 | add |
1 | movies | tt0118661 | 8.117743 | [Jeremiah S. Chechik] | 1998-08-13T00:00:00Z | 3.5 | [Action, Adventure, Sci-Fi, Thriller] | https://m.media-amazon.com/images/M/MV5BMjA3Nz... | Two British agents (John Steed and Emma Peel) ... | The Avengers | 3276 | 5340.0 | [Ralph Fiennes, Uma Thurman, Sean Connery] | 1998 | tt0118661 | add |
2 | movies | tt2395427 | 5.377600 | [Joss Whedon] | 2015-04-29T00:00:00Z | NaN | [Action, Adventure, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BNjA3MT... | NaN | The Avengers: Age of Ultron | 40 | NaN | [Scarlett Johansson, Chris Hemsworth, James Sp... | 2015 | tt2395427 | add |
title フィールドについては avenger と avengers は分けて扱われるため、avengers ではなく avenger で検索すると異なる結果が得られます
# Same match query on title, this time with the singular form "avenger".
index_name = "movies"
payload = {
    "query": {
        "match": {"title": "avenger"},
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0458339 | 6.004808 | [Joe Johnston] | 2011-07-19T00:00:00Z | 6.8 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTYzOT... | After being deemed unfit for military service,... | Captain America: The First Avenger | 253 | 7440 | [Chris Evans, Hugo Weaving, Samuel L. Jackson] | 2011 | tt0458339 | add |
一方、plot フィールドは english analyzer によりステミングが行われるため、superhero で検索した際に superheroes を含むドキュメントもヒットします
Highlight と呼ばれる機能を使うことで、フィールド内のどの文字列(トークン)でヒットしたかを確認することができます。
デフォルトでは、ヒットしたトークンは em タグで囲まれて出力されます。タグは変更が可能です。
# match query on plot for "superhero", limited to 3 hits, with highlighting
# requested for the plot field.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "match": {"plot": "superhero"},
    },
    "highlight": {
        "fields": {"plot": {}},
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0383060 | 8.186255 | [Peter Hewitt] | 2006-08-11T00:00:00Z | 3.9 | [Action, Adventure, Family, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0Nj... | Former superhero Jack is called back to work t... | Zoom | 1395 | 5580.0 | [Tim Allen, Courteney Cox, Chevy Chase] | 2006.0 | tt0383060 | add | [Former <em>superhero</em> Jack is called back... |
1 | movies | tt0132347 | 8.049664 | [Kinka Usher] | 1999-07-22T00:00:00Z | 5.9 | [Action, Comedy, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM2Nj... | A group of inept amateur superheroes must try ... | Mystery Men | 2279 | 7260.0 | [Ben Stiller, Janeane Garofalo, William H. Macy] | 1999.0 | tt0132347 | add | [A group of inept amateur <em>superheroes</em>... |
2 | movies | tt0451279 | 7.012846 | NaN | NaN | NaN | [Action, Adventure, Fantasy, Sci-Fi] | NaN | An Amazon princess comes to the world of Man t... | Wonder Woman | 2806 | NaN | NaN | NaN | tt0451279 | add | [An Amazon princess comes to the world of Man ... |
テキスト検索を行う際、複数のキーワードによる OR 検索または AND 検索を行う要件があります。match query も複数キーワード検索に対応しています。 複数キーワードを入力した場合、デフォルトでは OR 検索となります。
# match query with two keywords on title; no operator is given, so the
# default (OR) applies.
index_name = "movies"
payload = {
    "query": {
        "match": {
            "title": {
                "query": "avenger avengers",
            },
        },
    },
    # Highlight the field that is actually queried. The cell previously asked
    # for "plot", which this query never matches, so no highlight was returned.
    "highlight": {
        "fields": {
            "title": {},
        },
    },
}
response = opensearch_client.search(
    index=index_name,
    body=payload,
)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0848228 | 8.117743 | [Joss Whedon] | 2012-04-11T00:00:00Z | 8.2 | [Action, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTk2NT... | Nick Fury of S.H.I.E.L.D. assembles a team of ... | The Avengers | 48 | 8580.0 | [Robert Downey Jr., Chris Evans, Scarlett Joha... | 2012 | tt0848228 | add |
1 | movies | tt0118661 | 8.117743 | [Jeremiah S. Chechik] | 1998-08-13T00:00:00Z | 3.5 | [Action, Adventure, Sci-Fi, Thriller] | https://m.media-amazon.com/images/M/MV5BMjA3Nz... | Two British agents (John Steed and Emma Peel) ... | The Avengers | 3276 | 5340.0 | [Ralph Fiennes, Uma Thurman, Sean Connery] | 1998 | tt0118661 | add |
2 | movies | tt0458339 | 6.004808 | [Joe Johnston] | 2011-07-19T00:00:00Z | 6.8 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTYzOT... | After being deemed unfit for military service,... | Captain America: The First Avenger | 253 | 7440.0 | [Chris Evans, Hugo Weaving, Samuel L. Jackson] | 2011 | tt0458339 | add |
3 | movies | tt2395427 | 5.377600 | [Joss Whedon] | 2015-04-29T00:00:00Z | NaN | [Action, Adventure, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BNjA3MT... | NaN | The Avengers: Age of Ultron | 40 | NaN | [Scarlett Johansson, Chris Hemsworth, James Sp... | 2015 | tt2395427 | add |
AND 検索は、operator オプションに明示的に and と指定することで実装可能です。同オプションのデフォルトは or となっています。
# match query on plot with operator "and", plus highlighting of the plot field.
index_name = "movies"
payload = {
    "query": {
        "match": {
            "plot": {
                "operator": "and",
                "query": "superhero transform academy",
            },
        },
    },
    "highlight": {
        "fields": {"plot": {}},
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0383060 | 20.229496 | [Peter Hewitt] | 2006-08-11T00:00:00Z | 3.9 | [Action, Adventure, Family, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0Nj... | Former superhero Jack is called back to work t... | Zoom | 1395 | 5580 | [Tim Allen, Courteney Cox, Chevy Chase] | 2006 | tt0383060 | add | [Former <em>superhero</em> Jack is called back... |
minimum_should_match オプションを使うことで、クエリに含まれるキーワードにおいて、特定数、もしくは特定割合のワード数マッチすればヒットしたとみなすことも可能です。
# match query on plot with minimum_should_match set to "2", plus highlighting
# of the plot field.
index_name = "movies"
payload = {
    "query": {
        "match": {
            "plot": {
                "minimum_should_match": "2",
                "query": "superhero transform academy",
            },
        },
    },
    "highlight": {
        "fields": {"plot": {}},
    },
}
response = opensearch_client.search(body=payload, index=index_name)
pd.json_normalize(response["hits"]["hits"])
# print(json.dumps(response, indent=2))
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0383060 | 20.229496 | [Peter Hewitt] | 2006-08-11T00:00:00Z | 3.9 | [Action, Adventure, Family, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0Nj... | Former superhero Jack is called back to work t... | Zoom | 1395 | 5580 | [Tim Allen, Courteney Cox, Chevy Chase] | 2006 | tt0383060 | add | [Former <em>superhero</em> Jack is called back... |
1 | movies | tt0120903 | 12.028833 | [Bryan Singer] | 2000-07-13T00:00:00Z | 7.4 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTYxMT... | Two mutants come to a private academy for thei... | X-Men | 471 | 6240 | [Patrick Stewart, Hugh Jackman, Ian McKellen] | 2000 | tt0120903 | add | [Two mutants come to a private <em>academy</em... |
2 | movies | tt1512235 | 9.683128 | [James Gunn] | 2010-09-10T00:00:00Z | 6.7 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTcxMD... | After his wife falls under the influence of a ... | Super | 1449 | 5760 | [Rainn Wilson, Ellen Page, Liv Tyler] | 2010 | tt1512235 | add | [After his wife falls under the influence of a... |
同一の検索条件で複数フィールドにまたがった検索を実行したい場合は、Multi-match クエリを使用します。
title もしくは plot フィールドに wind の文字列を含むドキュメントを検索する場合、Multi-match を利用して以下のように記述することができます。
# multi_match query: search title and plot for "wind", limited to 3 hits,
# with highlighting requested for both fields.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "multi_match": {
            "fields": ["title", "plot"],
            "query": "wind",
        },
    },
    "highlight": {
        "fields": {
            "title": {},
            "plot": {},
        },
    },
}
# filter_path="hits.hits" is passed through to the search request unchanged.
response = opensearch_client.search(
    body=payload,
    filter_path="hits.hits",
    index=index_name,
)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | highlight.title | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0204175 | 6.769348 | [Robert Iscove] | 2000-06-16T00:00:00Z | 5.0 | [Comedy, Romance, Drama] | https://m.media-amazon.com/images/M/MV5BMTgyNj... | A friendship is put to the ultimate test when ... | Boys and Girls | 4520 | 5640 | [Freddie Prinze Jr., Claire Forlani, Brendon R... | 2000 | tt0204175 | add | [A friendship is put to the ultimate test when... | NaN |
1 | movies | tt0031381 | 6.340030 | [Victor Fleming, George Cukor, Sam Wood] | 1939-12-15T00:00:00Z | 8.2 | [Drama, Romance, War] | https://m.media-amazon.com/images/M/MV5BNDUwMj... | A manipulative Southern belle carries on a tur... | Gone with the Wind | 799 | 14280 | [Clark Gable, Vivien Leigh, Thomas Mitchell] | 1939 | tt0031381 | add | NaN | [Gone with the <em>Wind</em>] |
2 | movies | tt1210042 | 6.266635 | [Antoine Fuqua] | 2009-01-16T00:00:00Z | 6.7 | [Crime, Drama, Thriller] | https://m.media-amazon.com/images/M/MV5BMTUyMT... | Three unconnected Brooklyn cops wind up at the... | Brooklyn's Finest | 2296 | 7920 | [Richard Gere, Don Cheadle, Ethan Hawke] | 2009 | tt1210042 | add | [Three unconnected Brooklyn cops <em>wind</em>... | NaN |
語句が特定の順序で並んでいるドキュメントのみを検索したい場合、フレーズ検索の機能が有用です。 以下のクエリは "iron man" を match query で検索した例ですが、"Iron man" シリーズだけではなく "The Man with the Iron Fists" などもヒットしています。
# match query on title for "iron man" with operator "and", limited to 5 hits.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "match": {
            "title": {
                "operator": "and",
                "query": "iron man",
            },
        },
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0371746 | 11.561799 | [Jon Favreau] | 2008-04-14T00:00:00Z | 7.9 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTczNT... | When wealthy industrialist Tony Stark is force... | Iron Man | 171 | 7560 | [Robert Downey Jr., Gwyneth Paltrow, Terrence ... | 2008 | tt0371746 | add |
1 | movies | tt1300854 | 9.883154 | [Shane Black] | 2013-04-18T00:00:00Z | 7.4 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMjIzMz... | When Tony Stark's world is torn apart by a for... | Iron Man 3 | 22 | 7800 | [Robert Downey Jr., Guy Pearce, Gwyneth Paltrow] | 2013 | tt1300854 | add |
2 | movies | tt1228705 | 9.883154 | [Jon Favreau] | 2010-04-26T00:00:00Z | 7.0 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0MD... | Tony Stark has declared himself Iron Man and i... | Iron Man 2 | 276 | 7440 | [Robert Downey Jr., Mickey Rourke, Gwyneth Pal... | 2010 | tt1228705 | add |
3 | movies | tt1258972 | 6.884497 | [RZA] | 2012-11-02T00:00:00Z | 5.4 | [Action] | https://m.media-amazon.com/images/M/MV5BMTg5OD... | On the hunt for a fabled treasure of gold, a b... | The Man with the Iron Fists | 513 | 5700 | [Russell Crowe, Cung Le, Lucy Liu] | 2012 | tt1258972 | add |
4 | movies | tt0120744 | 6.884497 | [Randall Wallace] | 1998-03-13T00:00:00Z | 6.3 | [Action, Adventure] | https://m.media-amazon.com/images/M/MV5BMTczMD... | The cruel King Louis XIV of France has a secre... | The Man in the Iron Mask | 1276 | 7920 | [Leonardo DiCaprio, Jeremy Irons, John Malkovich] | 1998 | tt0120744 | add |
match_phrase クエリを使用することで、特定の並び順でトークンが配置されているドキュメントのみを検索することが可能です。
# match_phrase query on title for the phrase "iron man", limited to 5 hits.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "match_phrase": {
            "title": {"query": "iron man"},
        },
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0371746 | 11.561798 | [Jon Favreau] | 2008-04-14T00:00:00Z | 7.9 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTczNT... | When wealthy industrialist Tony Stark is force... | Iron Man | 171 | 7560 | [Robert Downey Jr., Gwyneth Paltrow, Terrence ... | 2008 | tt0371746 | add |
1 | movies | tt1300854 | 9.883154 | [Shane Black] | 2013-04-18T00:00:00Z | 7.4 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMjIzMz... | When Tony Stark's world is torn apart by a for... | Iron Man 3 | 22 | 7800 | [Robert Downey Jr., Guy Pearce, Gwyneth Paltrow] | 2013 | tt1300854 | add |
2 | movies | tt1228705 | 9.883154 | [Jon Favreau] | 2010-04-26T00:00:00Z | 7.0 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0MD... | Tony Stark has declared himself Iron Man and i... | Iron Man 2 | 276 | 7440 | [Robert Downey Jr., Mickey Rourke, Gwyneth Pal... | 2010 | tt1228705 | add |
slop パラメーターを変更することで、ワードの順序が入れ替わっていてもフレーズ検索にマッチさせることが可能です。デフォルトは 0 であるため、厳密なマッチングが要求されます。
slop は、フレーズ内のトークンの位置をずらすことを許容する回数の上限です。隣り合う 2 つのキーワードの入れ替えには 2 回分の移動が必要となるため、slop を 2 に変更すると、2 つのキーワードが入れ替えの対象になります。
# match_phrase query on title for "man iron 2" with slop 2, limited to 5 hits.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "match_phrase": {
            "title": {
                "slop": 2,
                "query": "man iron 2",
            },
        },
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt1228705 | 6.305923 | [Jon Favreau] | 2010-04-26T00:00:00Z | 7 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0MD... | Tony Stark has declared himself Iron Man and i... | Iron Man 2 | 276 | 7440 | [Robert Downey Jr., Mickey Rourke, Gwyneth Pal... | 2010 | tt1228705 | add |
3 つのキーワードを入れ替えたい場合は、slop を 3 にセットします。
# match_phrase query on title for "man 2 iron" with slop 3, limited to 5 hits.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "match_phrase": {
            "title": {
                "slop": 3,
                "query": "man 2 iron",
            },
        },
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt1228705 | 4.983189 | [Jon Favreau] | 2010-04-26T00:00:00Z | 7 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0MD... | Tony Stark has declared himself Iron Man and i... | Iron Man 2 | 276 | 7440 | [Robert Downey Jr., Mickey Rourke, Gwyneth Pal... | 2010 | tt1228705 | add |
検索キーワードに完全に一致するドキュメントを取得する場合は、Term query を使用します。主に keyword タイプのフィールドに対する検索で使用します。 以下のクエリは、genres フィールドに "Comedy" という値を持つドキュメントを取得しています。検索結果の件数は size と呼ばれるパラメーターで制御可能です。
# term query on the genres keyword field for the value "Comedy", limited to 3 hits.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "term": {
            "genres": {"value": "Comedy"},
        },
    },
}
response = opensearch_client.search(body=payload, index=index_name)
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.398089 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
1 | movies | tt1245492 | 1.398089 | [Evan Goldberg, Seth Rogen] | 2013-06-03T00:00:00Z | 7.2 | [Comedy, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTQxOD... | While attending a party at James Franco's hous... | This Is the End | 6 | 6420 | [James Franco, Jonah Hill, Seth Rogen] | 2013 | tt1245492 | add |
2 | movies | tt1723121 | 1.398089 | [Rawson Marshall Thurber] | 2013-08-03T00:00:00Z | 7.2 | [Comedy, Crime] | https://m.media-amazon.com/images/M/MV5BMjA5Nj... | A veteran pot dealer creates a fake family as ... | We're the Millers | 13 | 6600 | [Jason Sudeikis, Jennifer Aniston, Emma Roberts] | 2013 | tt1723121 | add |
完全一致検索の中でも、case-sensitive (大文字小文字の区別) の要否が分かれる場合があります。今回のようにジャンル名での検索であれば、Comedy ではなく comedy でも同様の結果を得られた方が好ましい場合があります。こうした要件に対応するために、OpenSearch では Normalizer と呼ばれる機能を提供しています。Normalizer はデータ格納時、およびクエリ実行時に文字列の正規化を行う機能です。
本ワークショップでは、genres フィールドに lowercase_normalizer と呼ばれる Normalizer をセットしています。この Normalizer は lowercase filter により入力された文字を自動的に小文字に変換するよう設定されています。
Normalizer はインデックスの設定(settings/analysis)内で定義します。定義した Normalizer は、フィールドの normalizer オプションで指定することができます。以下はインデックス設定の抜粋です。
{
"settings": {
"analysis": {
"normalizer": {
"lowercase_normalizer": {
"type": "custom",
"filter": ["lowercase"]
}
}
}
},
"mappings": {
"properties": {
"genres": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
}
}
}
}
Normalizer により入力された文字が小文字に統一されるため、以下のように "cOmEdY" という文字列で一致検索を行っても、"Comedy" で検索した時と同様の結果が得られます。
# Same term query as before, but with a mixed-case value ("cOmEdY") to show
# the effect of the normalizer configured on the "genres" field.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "term": {
            "genres": {"value": "cOmEdY"},
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# This cell prints the full JSON response instead of a DataFrame.
print(json.dumps(response, indent=2))
{ "took": 0, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped": 0, "failed": 0 }, "hits": { "total": { "value": 1776, "relation": "eq" }, "max_score": 1.3980892, "hits": [ { "_index": "movies", "_id": "tt2229499", "_score": 1.3980892, "_source": { "directors": [ "Joseph Gordon-Levitt" ], "release_date": "2013-01-18T00:00:00Z", "rating": 7.4, "genres": [ "Comedy", "Drama" ], "image_url": "https://m.media-amazon.com/images/M/MV5BMTQxNTc3NDM2MF5BMl5BanBnXkFtZTcwNzQ5NTQ3OQ@@._V1_SX400_.jpg", "plot": "A New Jersey guy dedicated to his family, friends, and church, develops unrealistic expectations from watching porn and works to find happiness and intimacy with his potential true love.", "title": "Don Jon", "rank": 1, "running_time_secs": 5400, "actors": [ "Joseph Gordon-Levitt", "Scarlett Johansson", "Julianne Moore" ], "year": 2013, "id": "tt2229499", "type": "add" } }, { "_index": "movies", "_id": "tt1245492", "_score": 1.3980892, "_source": { "directors": [ "Evan Goldberg", "Seth Rogen" ], "release_date": "2013-06-03T00:00:00Z", "rating": 7.2, "genres": [ "Comedy", "Fantasy" ], "image_url": "https://m.media-amazon.com/images/M/MV5BMTQxODE3NjM1Ml5BMl5BanBnXkFtZTcwMzkzNjc4OA@@._V1_SX400_.jpg", "plot": "While attending a party at James Franco's house, Seth Rogen, Jay Baruchel and many other celebrities are faced with the apocalypse.", "title": "This Is the End", "rank": 6, "running_time_secs": 6420, "actors": [ "James Franco", "Jonah Hill", "Seth Rogen" ], "year": 2013, "id": "tt1245492", "type": "add" } }, { "_index": "movies", "_id": "tt1723121", "_score": 1.3980892, "_source": { "directors": [ "Rawson Marshall Thurber" ], "release_date": "2013-08-03T00:00:00Z", "rating": 7.2, "genres": [ "Comedy", "Crime" ], "image_url": "https://m.media-amazon.com/images/M/MV5BMjA5Njc0NDUxNV5BMl5BanBnXkFtZTcwMjYzNzU1OQ@@._V1_SX400_.jpg", "plot": "A veteran pot dealer creates a fake family as part of his plan to move a huge shipment of weed into the U.S. 
from Mexico.", "title": "We're the Millers", "rank": 13, "running_time_secs": 6600, "actors": [ "Jason Sudeikis", "Jennifer Aniston", "Emma Roberts" ], "year": 2013, "id": "tt1723121", "type": "add" } } ] } }
# Terms query: "genres" is matched against a list of candidate values
# ("comedy", "drama"); at most 3 hits are returned.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "terms": {"genres": ["comedy", "drama"]},
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.0 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
1 | movies | tt1979320 | 1.0 | [Ron Howard] | 2013-09-02T00:00:00Z | 8.3 | [Action, Biography, Drama, Sport] | https://m.media-amazon.com/images/M/MV5BMTQyMD... | A re-creation of the merciless 1970s rivalry b... | Rush | 2 | 7380 | [Daniel Brühl, Chris Hemsworth, Olivia Wilde] | 2013 | tt1979320 | add |
2 | movies | tt1392214 | 1.0 | [Denis Villeneuve] | 2013-08-30T00:00:00Z | 8.2 | [Crime, Drama, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0NT... | When Keller Dover's daughter and her friend go... | Prisoners | 3 | 9180 | [Hugh Jackman, Jake Gyllenhaal, Viola Davis] | 2013 | tt1392214 | add |
Terms は配列内のいずれかの文字列と完全一致していればマッチした文書とみなして検索結果を返していました。一方で、配列内のすべて、もしくは一部の文字列と完全一致している場合にマッチしたとみなしたいケースもあります。 こうしたケースでは Terms set クエリが有用です。以下は comedy, drama, sci-fi, family のすべてをジャンルに含む映画を検索するものです。
# Terms set query on "genres". The minimum_should_match_script returns
# params.num_terms, i.e. the number of terms supplied in the query.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "terms_set": {
            "genres": {
                "terms": ["comedy", "drama", "sci-fi", "family"],
                "minimum_should_match_script": {"source": "params.num_terms"},
            },
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0100758 | 8.41433 | [Steve Barron] | 1990-03-30T00:00:00Z | 6.4 | [Action, Adventure, Comedy, Crime, Drama, Fami... | https://m.media-amazon.com/images/M/MV5BMTk3OT... | A quartet of mutated humanoid turtles clash wi... | Teenage Mutant Ninja Turtles | 440 | 5580 | [Judith Hoag, Elias Koteas, Josh Pais] | 1990 | tt0100758 | add |
Terms lookup を利用することで、他インデックスのデータを使用した Terms query が実行可能です。
以下のサンプルコードでは、ユーザーごとのお気に入り情報を格納した movie-users インデックスを作成・データを格納し、ユーザーに対応したお気に入りジャンルの映画を検索しています。
# (Re)create the "movie-users" lookup index with two keyword fields:
# "userid" and "favorite-genres".
lookup_index_name = "movie-users"
payload = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
        },
    },
    "mappings": {
        "properties": {
            "userid": {"type": "keyword"},
            "favorite-genres": {"type": "keyword"},
        },
    },
}

# If an index with the same name already exists, delete it first.
# ignore_unavailable=True turns "index does not exist" into a no-op, so a
# blanket `except Exception` is no longer needed and genuine failures
# (authentication, connectivity, ...) surface here instead of being swallowed.
print("# delete index")
response = opensearch_client.indices.delete(
    index=lookup_index_name,
    ignore_unavailable=True,
)
print(json.dumps(response, indent=2))

# Create the index.
print("# create index")
response = opensearch_client.indices.create(index=lookup_index_name, body=payload)
print(json.dumps(response, indent=2))
# delete index { "acknowledged": true } # create index { "acknowledged": true, "shards_acknowledged": true, "index": "movie-users" }
# Store one user document (id "00000001") with its favorite genres in the
# lookup index. refresh=True is passed so the write is refreshed immediately.
lookup_index_name = "movie-users"
payload = {"userid": "00000001", "favorite-genres": ["comedy", "drama"]}
response = opensearch_client.index(
    index=lookup_index_name,
    body=payload,
    id="00000001",
    refresh=True,
)
print(json.dumps(response, indent=2))
{ "_index": "movie-users", "_id": "00000001", "_version": 1, "result": "created", "forced_refresh": true, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "_seq_no": 0, "_primary_term": 1 }
# Terms lookup: instead of an inline list, the terms for "genres" are read
# from the "favorite-genres" field of document "00000001" in the lookup index.
index_name = "movies"
lookup_index_name = "movie-users"
payload = {
    "size": 3,
    "query": {
        "terms": {
            "genres": {
                "index": lookup_index_name,
                "id": "00000001",
                "path": "favorite-genres",
            },
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.0 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
1 | movies | tt1979320 | 1.0 | [Ron Howard] | 2013-09-02T00:00:00Z | 8.3 | [Action, Biography, Drama, Sport] | https://m.media-amazon.com/images/M/MV5BMTQyMD... | A re-creation of the merciless 1970s rivalry b... | Rush | 2 | 7380 | [Daniel Brühl, Chris Hemsworth, Olivia Wilde] | 2013 | tt1979320 | add |
2 | movies | tt1392214 | 1.0 | [Denis Villeneuve] | 2013-08-30T00:00:00Z | 8.2 | [Crime, Drama, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0NT... | When Keller Dover's daughter and her friend go... | Prisoners | 3 | 9180 | [Hugh Jackman, Jake Gyllenhaal, Viola Davis] | 2013 | tt1392214 | add |
Terms lookup で与える要素数が 10,000 を超える場合は、パフォーマンス向上のために Bitmap Filtering の利用を検討してください。
Keyword フィールドに対して部分的に一致する検索条件を記載したい場合や、多少の表記ゆれや誤字脱字をフォローした検索を行いたい場合の検索手法について解説します。
全ドキュメントに対して横断的に検索が実行されるため、通常の term クエリと比較してパフォーマンスは大きく劣ります。
これらの検索は、インデックス内の全ドキュメントが検索対象となる可能性があり、パフォーマンスへの影響があります。
対象フィールドに対する検索クエリの大部分を部分一致検索やあいまい検索が占める場合は、text 型フィールドを使用した全文検索に切り替えるか、対象のフィールドを wildcard 型に変更することを検討してください。
Prefix query は、前方一致に基づく検索を行うクエリです。特定の文字列から始まるテキストを検索する際に役立ちます。
以下のサンプルクエリでは、com から始まる genres の映画を検索します。
# Prefix query: match "genres" values that start with "com".
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "prefix": {"genres": "com"},
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.0 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
1 | movies | tt1245492 | 1.0 | [Evan Goldberg, Seth Rogen] | 2013-06-03T00:00:00Z | 7.2 | [Comedy, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTQxOD... | While attending a party at James Franco's hous... | This Is the End | 6 | 6420 | [James Franco, Jonah Hill, Seth Rogen] | 2013 | tt1245492 | add |
2 | movies | tt1723121 | 1.0 | [Rawson Marshall Thurber] | 2013-08-03T00:00:00Z | 7.2 | [Comedy, Crime] | https://m.media-amazon.com/images/M/MV5BMjA5Nj... | A veteran pot dealer creates a fake family as ... | We're the Millers | 13 | 6600 | [Jason Sudeikis, Jennifer Aniston, Emma Roberts] | 2013 | tt1723121 | add |
Regexp query は、正規表現に基づく検索を行うクエリです。特定のパターンに一致する文字列を抽出する際に使用できます。
以下のサンプルクエリでは、アルファベット 5 文字の genres の映画を検索します。
# Regexp query: match "genres" values against the pattern [a-z]{5}
# (five lowercase letters); only the first hit is returned.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "regexp": {"genres": "[a-z]{5}"},
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.0 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
Wildcard query は、ワイルドカードを使用した部分一致検索を提供します。以下のクエリでは、 c から始まって y で終わる genres の映画を検索します。
# Wildcard query: match "genres" values of the form c*y
# (starts with "c", ends with "y"); only the first hit is returned.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "wildcard": {"genres": "c*y"},
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.0 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
Fuzzy query は、以下のような表記ゆれをクエリ側で吸収する機能です。
以下のサンプルクエリでは、comedy から二文字欠けた cmdy でも検索にヒットするように fuzziness を 2 にセットして Fuzzy query を実行しています。
fuzziness はデフォルトで AUTO という値がセットされており、OpenSearch 側で語句の長さから自動的に補完文字数をセットする仕様になっています。fuzziness に数値を明示的にセットすることで、何文字まで補完されるかをコントロールすることが可能です。
# Fuzzy query: search "genres" for "cmdy" with an explicit fuzziness of 2.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {"value": "cmdy", "fuzziness": 2},
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 0.699045 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
以下のサンプルクエリでは、comedy のタイプミスである ocmedy で検索しています。
タイプミスにより隣り合った文字が入れ替わったケースに対応するかどうかは、transpositions オプションで制御します。同オプションのデフォルト値は true となっているため、Fuzzy query のデフォルトの動作としては、タイプミスをフォローする形になっているといえます。
# Fuzzy query for the typo "ocmedy" using only default options
# (neither fuzziness nor transpositions is set explicitly).
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {"value": "ocmedy"},
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.165074 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
文字の入れ替わりを何か所まで許容するかは、fuzziness パラメーターで制御可能です。
ocemdy で comedy が含まれるドキュメントをヒットさせたい場合、fuzziness が 1 ではヒットしません。fuzziness を 2 にセットするとヒットします。
# Fuzzy query for "ocemdy" with fuzziness limited to 1.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {"value": "ocemdy", "fuzziness": 1},
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
# Same "ocemdy" fuzzy query, now with fuzziness raised to 2.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {"value": "ocemdy", "fuzziness": 2},
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 0.932059 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
タイプミスを Fuzzy query でフォローしない場合、transpositions に false をセットします。
# Fuzzy query for "ocemdy" with fuzziness 2 and transpositions disabled.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {
                "value": "ocemdy",
                "fuzziness": 2,
                "transpositions": False,
            },
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
transpositions による補正は隣り合った文字同士でのみ機能します。mocedy のように飛び石で入れ替わってしまったケースは対応できません。
# Fuzzy query for "mocedy" (swapped letters are not adjacent) with
# fuzziness 1 and transpositions enabled.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {
                "value": "mocedy",
                "fuzziness": 1,
                "transpositions": True,
            },
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
このようなケースでも、Fuzzy query は誤字の補正対応で対応可能です。 fuzziness の値に対応した文字数までであれば、入れ替わりや誤字に対して対処可能です。
# Fuzzy query for "mocedy" with fuzziness 2 and transpositions disabled.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {
                "value": "mocedy",
                "fuzziness": 2,
                "transpositions": False,
            },
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 0.932059 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
以下のケースでは、入れ替わりに加えて m ではなく n のタイプミスも加わっていますが、nocedy のうち n と c の 2 文字が fuzziness = 2 の設定により補完され、comedy とマッチすると判定されています。
# Fuzzy query for "nocedy" with fuzziness 2 and transpositions disabled.
index_name = "movies"
payload = {
    "size": 1,
    "query": {
        "fuzzy": {
            "genres": {
                "value": "nocedy",
                "fuzziness": 2,
                "transpositions": False,
            },
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 0.932059 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
本セクションでは、検索結果の並べ替え手法について解説します。
OpenSearch では、キーワード検索における文書の関連度計算に Okapi BM25 を使用しています。BM25 は、クエリ内に出現する単語の語彙検索を実行するキーワードベースのアルゴリズムです。
文書の関連性を判断する際、BM25 は用語頻度(TF - Term Frequency) および 逆文書頻度 (IDF - Inverse Document Frequency) を考慮します。
TF(用語頻度)は、特定文書における、特定の単語の出現頻度を示します。検索対象の用語がより頻繁に出現する文書を、関連性が高い文書として扱います。 一方、IDF(逆文書頻度)は、コーパス内のすべての文書に共通して頻出する単語の重みを低くするものです。the や a のような冠詞が該当します。
計算後の score は "_score" フィールドに格納されています。Search API 実行時に explain オプションを付与することで、詳細なスコア計算の過程を確認することが可能です。
# Match query on "plot" for "superhero", with highlighting of the "plot"
# field and explain=True so each hit carries its score explanation.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "match": {"plot": "superhero"},
    },
    "highlight": {
        "fields": {"plot": {}},
    },
    "explain": True,
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_shard | _node | _index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | ... | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | _explanation.value | _explanation.description | _explanation.details | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [movies][0] | kdnLueFaT0quni1aPfT5-g | movies | tt0383060 | 8.186255 | [Peter Hewitt] | 2006-08-11T00:00:00Z | 3.9 | [Action, Adventure, Family, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0Nj... | ... | 1395 | 5580.0 | [Tim Allen, Courteney Cox, Chevy Chase] | 2006.0 | tt0383060 | add | [Former <em>superhero</em> Jack is called back... | 8.186255 | weight(plot:superhero in 1394) [PerFieldSimila... | [{'value': 8.1862545, 'description': 'score(fr... |
1 | [movies][0] | kdnLueFaT0quni1aPfT5-g | movies | tt0132347 | 8.049664 | [Kinka Usher] | 1999-07-22T00:00:00Z | 5.9 | [Action, Comedy, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM2Nj... | ... | 2279 | 7260.0 | [Ben Stiller, Janeane Garofalo, William H. Macy] | 1999.0 | tt0132347 | add | [A group of inept amateur <em>superheroes</em>... | 8.049664 | weight(plot:superhero in 2278) [PerFieldSimila... | [{'value': 8.049664, 'description': 'score(fre... |
2 | [movies][0] | kdnLueFaT0quni1aPfT5-g | movies | tt0451279 | 7.012846 | NaN | NaN | NaN | [Action, Adventure, Fantasy, Sci-Fi] | NaN | ... | 2806 | NaN | NaN | NaN | tt0451279 | add | [An Amazon princess comes to the world of Man ... | 7.012846 | weight(plot:superhero in 2805) [PerFieldSimila... | [{'value': 7.012846, 'description': 'score(fre... |
3 rows × 22 columns
Multi match query など、複数フィールドに対して横断的に検索を行う場合、フィールドごとに重みづけを設定することが可能です。以下の例では、title と plot に対する横断検索を行う際、title フィールドの重みを 3 倍にしています。
# Multi-match query for "wind" across "title" and "plot"; "title^3" applies a
# 3x boost to the title field. Highlighting and explain are enabled.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "multi_match": {
            "query": "wind",
            "fields": ["title^3", "plot"],
        },
    },
    "highlight": {
        "fields": {"title": {}, "plot": {}},
    },
    "explain": True,
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_shard | _node | _index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | ... | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.title | _explanation.value | _explanation.description | _explanation.details | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [movies][0] | kdnLueFaT0quni1aPfT5-g | movies | tt0031381 | 19.020092 | [Victor Fleming, George Cukor, Sam Wood] | 1939-12-15T00:00:00Z | 8.2 | [Drama, Romance, War] | https://m.media-amazon.com/images/M/MV5BNDUwMj... | ... | 14280 | [Clark Gable, Vivien Leigh, Thomas Mitchell] | 1939 | tt0031381 | add | [Gone with the <em>Wind</em>] | 19.020092 | max of: | [{'value': 19.020092, 'description': 'weight(t... | NaN |
1 | [movies][0] | kdnLueFaT0quni1aPfT5-g | movies | tt0460989 | 15.172821 | [Ken Loach] | 2006-05-18T00:00:00Z | 7.4 | [Drama, History, War] | https://m.media-amazon.com/images/M/MV5BMTc1Mj... | ... | 7620 | [Cillian Murphy, Padraic Delaney, Liam Cunning... | 2006 | tt0460989 | add | [The <em>Wind</em> That Shakes the Barley] | 15.172821 | max of: | [{'value': 15.172821, 'description': 'weight(t... | NaN |
2 | [movies][0] | kdnLueFaT0quni1aPfT5-g | movies | tt0204175 | 6.769348 | [Robert Iscove] | 2000-06-16T00:00:00Z | 5.0 | [Comedy, Romance, Drama] | https://m.media-amazon.com/images/M/MV5BMTgyNj... | ... | 5640 | [Freddie Prinze Jr., Claire Forlani, Brendon R... | 2000 | tt0204175 | add | NaN | 6.769348 | max of: | [{'value': 6.7693477, 'description': 'weight(p... | [A friendship is put to the ultimate test when... |
3 rows × 23 columns
Script score クエリを使用することで、スコア計算を script ベースで行うことも可能です。以下の例では、関連度に基づいて算出された score を rank の値で割った値を最終的な score としています。
# Script score query: the relevance score of the inner match query is divided
# by the document's "rank" value to produce the final score.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "script_score": {
            "query": {
                "match": {"plot": "superhero"},
            },
            "script": {"source": "_score / doc['rank'].value"},
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0317705 | 0.056092 | [Brad Bird] | 2004-10-27T00:00:00Z | 8.0 | [Animation, Action, Adventure, Family] | https://m.media-amazon.com/images/M/MV5BMTY5OT... | A family of undercover superheroes, while tryi... | The Incredibles | 112 | 6900 | [Craig T. Nelson, Samuel L. Jackson, Holly Hun... | 2004 | tt0317705 | add |
1 | movies | tt0458339 | 0.020116 | [Joe Johnston] | 2011-07-19T00:00:00Z | 6.8 | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTYzOT... | After being deemed unfit for military service,... | Captain America: The First Avenger | 253 | 7440 | [Chris Evans, Hugo Weaving, Samuel L. Jackson] | 2011 | tt0458339 | add |
2 | movies | tt0409459 | 0.015745 | [Zack Snyder] | 2009-02-23T00:00:00Z | 7.6 | [Action, Drama, Mystery, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTc0Nj... | In an alternate 1985 where former superheroes ... | Watchmen | 304 | 9720 | [Jackie Earle Haley, Patrick Wilson, Carla Gug... | 2009 | tt0409459 | add |
Script score では sigmoid といったいくつかの関数も提供しています。
# Script score query that passes the relevance score through the built-in
# sigmoid function: sigmoid(_score, 2, 1).
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "script_score": {
            "query": {
                "match": {"plot": "superhero"},
            },
            "script": {"source": "sigmoid(_score, 2, 1)"},
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0383060 | 0.803657 | [Peter Hewitt] | 2006-08-11T00:00:00Z | 3.9 | [Action, Adventure, Family, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0Nj... | Former superhero Jack is called back to work t... | Zoom | 1395 | 5580.0 | [Tim Allen, Courteney Cox, Chevy Chase] | 2006.0 | tt0383060 | add |
1 | movies | tt0132347 | 0.800988 | [Kinka Usher] | 1999-07-22T00:00:00Z | 5.9 | [Action, Comedy, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM2Nj... | A group of inept amateur superheroes must try ... | Mystery Men | 2279 | 7260.0 | [Ben Stiller, Janeane Garofalo, William H. Macy] | 1999.0 | tt0132347 | add |
2 | movies | tt0451279 | 0.778095 | NaN | NaN | NaN | [Action, Adventure, Fantasy, Sci-Fi] | NaN | An Amazon princess comes to the world of Man t... | Wonder Woman | 2806 | NaN | NaN | NaN | tt0451279 | add |
# Boosting query: documents must match "superhero" in "plot" (positive);
# those that also match "private" (negative) are scored with negative_boost 0.9.
index_name = "movies"
payload = {
    "size": 3,
    "query": {
        "boosting": {
            "positive": {
                "match": {"plot": "superhero"},
            },
            "negative": {
                "match": {"plot": "private"},
            },
            "negative_boost": 0.9,
        },
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0132347 | 8.049664 | [Kinka Usher] | 1999-07-22T00:00:00Z | 5.9 | [Action, Comedy, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM2Nj... | A group of inept amateur superheroes must try ... | Mystery Men | 2279 | 7260.0 | [Ben Stiller, Janeane Garofalo, William H. Macy] | 1999.0 | tt0132347 | add |
1 | movies | tt0383060 | 7.367629 | [Peter Hewitt] | 2006-08-11T00:00:00Z | 3.9 | [Action, Adventure, Family, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTM0Nj... | Former superhero Jack is called back to work t... | Zoom | 1395 | 5580.0 | [Tim Allen, Courteney Cox, Chevy Chase] | 2006.0 | tt0383060 | add |
2 | movies | tt0451279 | 7.012846 | NaN | NaN | NaN | [Action, Adventure, Fantasy, Sci-Fi] | NaN | An Amazon princess comes to the world of Man t... | Wonder Woman | 2806 | NaN | NaN | NaN | tt0451279 | add |
# Match all documents and sort them by "year" in descending order,
# returning the first 5.
index_name = "movies"
payload = {
    "size": 5,
    "query": {"match_all": {}},
    "sort": [
        {"year": {"order": "desc"}},
    ],
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw JSON response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | sort | _source.directors | _source.genres | _source.plot | _source.title | _source.rank | _source.year | _source.id | _source.type | _source.release_date | _source.actors | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt1502407 | None | [2018] | [Patrick Lussier] | [Horror] | The plot is unknown at this time. | Halloween III | 4896 | 2018 | tt1502407 | add | NaN | NaN |
1 | movies | tt0974015 | None | [2017] | NaN | [Action, Adventure, Fantasy, Sci-Fi] | The world's greatest heroes are assembled to f... | Justice League | 1295 | 2017 | tt0974015 | add | 2017-01-01T00:00:00Z | NaN |
2 | movies | tt1790809 | None | [2016] | [Joachim Rønning, Espen Sandberg] | [Action, Adventure, Comedy, Fantasy] | The fifth installment of the blockbuster franc... | Pirates of the Caribbean: Dead Men Tell No Tales | 384 | 2016 | tt1790809 | add | NaN | [Johnny Depp] |
3 | movies | tt1630029 | None | [2016] | [James Cameron] | [Action, Adventure, Fantasy, Sci-Fi] | A sequel to the 2009 global phenomenon. | Avatar 2 | 909 | 2016 | tt1630029 | add | 2016-12-01T00:00:00Z | [Zoe Saldana, Sigourney Weaver, Sam Worthington] |
4 | movies | tt0439572 | None | [2016] | [Greg Berlanti] | [Action, Adventure, Drama, Fantasy, Sci-Fi] | NaN | The Flash | 2281 | 2016 | tt0439572 | add | 2016-01-01T00:00:00Z | NaN |
sort には複数条件を記載することができます。上記のクエリでは 2016 年の映画が下位 3 件を占めていました。2016 年に公開された映画を ID 順に並び替えたいと思います。 以下のように year によるソート条件の下に id によるソート条件を追加することで、year によるソートの後に id によるソートを実行することが可能です。複数条件で sort を行う場合は、先に実行したいソート条件を上に記載します。
# Multi-key sort: year descending first, then id ascending among movies that
# share the same year. Sort clauses are listed in priority order.
index_name = "movies"
payload = {
    "size": 5,
    "query": {"match_all": {}},
    "sort": [
        {"year": {"order": "desc"}},
        {"id": {"order": "asc"}},
    ],
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | sort | _source.directors | _source.genres | _source.plot | _source.title | _source.rank | _source.year | _source.id | _source.type | _source.release_date | _source.actors | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt1502407 | None | [2018, tt1502407] | [Patrick Lussier] | [Horror] | The plot is unknown at this time. | Halloween III | 4896 | 2018 | tt1502407 | add | NaN | NaN |
1 | movies | tt0974015 | None | [2017, tt0974015] | NaN | [Action, Adventure, Fantasy, Sci-Fi] | The world's greatest heroes are assembled to f... | Justice League | 1295 | 2017 | tt0974015 | add | 2017-01-01T00:00:00Z | NaN |
2 | movies | tt0439572 | None | [2016, tt0439572] | [Greg Berlanti] | [Action, Adventure, Drama, Fantasy, Sci-Fi] | NaN | The Flash | 2281 | 2016 | tt0439572 | add | 2016-01-01T00:00:00Z | NaN |
3 | movies | tt1630029 | None | [2016, tt1630029] | [James Cameron] | [Action, Adventure, Fantasy, Sci-Fi] | A sequel to the 2009 global phenomenon. | Avatar 2 | 909 | 2016 | tt1630029 | add | 2016-12-01T00:00:00Z | [Zoe Saldana, Sigourney Weaver, Sam Worthington] |
4 | movies | tt1790809 | None | [2016, tt1790809] | [Joachim Rønning, Espen Sandberg] | [Action, Adventure, Comedy, Fantasy] | The fifth installment of the blockbuster franc... | Pirates of the Caribbean: Dead Men Tell No Tales | 384 | 2016 | tt1790809 | add | NaN | [Johnny Depp] |
数値や日付を元に連続した範囲で検索を行いたい場合は Range query を使用します。 以下の例では 1920 年から 1923 年に公開された映画を検索しています。
# Range query: movies whose year lies between 1920 and 1923, both inclusive
# (gte / lte).
index_name = "movies"
payload = {
    "query": {
        "range": {
            "year": {"gte": 1920, "lte": 1923},
        }
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0013442 | 1.0 | [F.W. Murnau] | 1922-02-17T00:00:00Z | 8.0 | [Horror] | https://m.media-amazon.com/images/M/MV5BMTYyNj... | Vampire Count Orlok expresses interest in a ne... | Nosferatu, eine Symphonie des Grauens | 2993 | 5640 | [Max Schreck, Greta Schröder, Ruth Landshoff] | 1922 | tt0013442 | add |
1 | movies | tt0012349 | 1.0 | [Charles Chaplin] | 1921-01-21T00:00:00Z | 8.3 | [Comedy, Drama, Family] | https://m.media-amazon.com/images/M/MV5BMTg2Nj... | The Tramp cares for an abandoned child, but ev... | The Kid | 4925 | 4080 | [Charles Chaplin, Edna Purviance, Jackie Coogan] | 1921 | tt0012349 | add |
2 | movies | tt0010323 | 1.0 | [Robert Wiene] | 1920-02-26T00:00:00Z | 8.0 | [Horror] | https://m.media-amazon.com/images/M/MV5BMTQwMz... | Dr. Caligari's somnambulist, Cesare, and his d... | Das Cabinet des Dr. Caligari | 4950 | 4680 | [Werner Krauss, Conrad Veidt, Friedrich Feher] | 1920 | tt0010323 | add |
Boolean query を利用することで、複数の条件を組み合わせた検索を行うことが可能です。以下の要素を使用します。
実際に複合条件による検索を行っていきます。まず、クエリに含まれる条件を列挙します
# Compound (bool) query combining every clause type:
#   must      - plot matches "school" AND genres contains "Music"
#   should    - genres contains "Romance" or "Comedy" (at least one required,
#               see minimum_should_match)
#   filter    - year between 2000 and 2009 inclusive
#   must_not  - genres contains neither "Thriller" nor "Horror"
# Matching plot fragments are highlighted and hits are ordered by rating,
# highest first.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0332379 | None | [7.0] | [Richard Linklater] | 2003-09-09T00:00:00Z | 7.0 | [Comedy, Music] | https://m.media-amazon.com/images/M/MV5BMjEwOT... | A wannabe rock star in need of cash poses as a... | The School of Rock | 727 | 6480 | [Jack Black, Mike White, Joan Cusack] | 2003 | tt0332379 | add | [A wannabe rock star in need of cash poses as ... |
1 | movies | tt0981227 | None | [6.7] | [Peter Sollett] | 2008-09-06T00:00:00Z | 6.7 | [Comedy, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BOTY2Mj... | High school student Nick O'Leary, member of th... | Nick and Norah's Infinite Playlist | 1701 | 5400 | [Michael Cera, Kat Dennings, Aaron Yoo] | 2008 | tt0981227 | add | [High <em>school</em> student Nick O'Leary, me... |
2 | movies | tt0462590 | None | [6.2] | [Anne Fletcher] | 2006-08-07T00:00:00Z | 6.2 | [Crime, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTIxMD... | Tyler Gage receives the opportunity of a lifet... | Step Up | 947 | 6240 | [Channing Tatum, Jenna Dewan-Tatum, Damaine Ra... | 2006 | tt0462590 | add | [Tyler Gage receives the opportunity of a life... |
3 | movies | tt1023481 | None | [5.800000000000001] | [Jon M. Chu] | 2008-02-14T00:00:00Z | 5.8 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTc0Nz... | Romantic sparks occur between two dance studen... | Step Up 2: The Streets | 1427 | 5880 | [Robert Hoffman, Briana Evigan, Cassie Ventura] | 2008 | tt1023481 | add | [Romantic sparks occur between two dance stude... |
4 | movies | tt0361696 | None | [5.5] | [Sean McNamara] | 2004-10-03T00:00:00Z | 5.5 | [Family, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTM1MT... | A girl from a small town heads to the big city... | Raise Your Voice | 3425 | 6180 | [Hilary Duff, John Corbett, Rebecca De Mornay] | 2004 | tt0361696 | add | [from a small town heads to the big city of Lo... |
検索においては、match クエリに与えられるトークンが実質的に 0 件という状況が発生します。例えば以下のようなケースが考えられます。
この場合、OpenSearch のデフォルトの挙動は検索を行わない、つまり 0 件ヒットとなります。
しかしながら、キーワードが入力されない場合はフィルタ条件として入力したジャンルや公開年に基づいて絞り込まれた映画の、rating が高いものを返却するのが自然な挙動と考えられます。 この挙動は、zero_terms_query のオプションに all と指定することで実現可能です。以下 2 つのクエリを実行することで、zero_terms_query の設定による結果の差異を確認することが可能です。1 つ目のクエリは zero_terms_query オプションにデフォルトの none を、2 つ目のクエリは all をセットしています。
# Same compound query as before, but the plot match receives an empty query
# string with zero_terms_query set to "none" (the default), to show what
# happens when no search terms remain.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": {"query": "", "zero_terms_query": "none"}}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
# Identical to the previous query except zero_terms_query is "all": with an
# empty query string the plot clause no longer blocks results, so the genre
# and year conditions alone decide the hits (ordered by rating, descending).
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": {"query": "", "zero_terms_query": "all"}}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0907657 | None | [7.9] | [John Carney] | 2006-07-15T00:00:00Z | 7.9 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTEwNj... | A modern-day musical about a busker and an imm... | Once | 1863 | 5100 | [Glen Hansard, Markéta Irglová, Hugh Walsh] | 2006 | tt0907657 | add |
1 | movies | tt0358273 | None | [7.800000000000001] | [James Mangold] | 2005-09-04T00:00:00Z | 7.8 | [Biography, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMjIyOT... | A chronicle of country music legend Johnny Cas... | Walk the Line | 794 | 8160 | [Joaquin Phoenix, Reese Witherspoon, Ginnifer ... | 2005 | tt0358273 | add |
2 | movies | tt0857191 | None | [7.7] | [Thomas McCarthy] | 2007-09-07T00:00:00Z | 7.7 | [Crime, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTIzNT... | A college professor travels to New York City t... | The Visitor | 4743 | 6240 | [Richard Jenkins, Haaz Sleiman, Danai Gurira] | 2007 | tt0857191 | add |
3 | movies | tt0249462 | None | [7.6000000000000005] | [Stephen Daldry] | 2000-05-19T00:00:00Z | 7.6 | [Comedy, Drama, Music] | https://m.media-amazon.com/images/M/MV5BMjA3Mz... | A talented young boy becomes torn between his ... | Billy Elliot | 2268 | 6600 | [Jamie Bell, Julie Walters, Jean Heywood] | 2000 | tt0249462 | add |
4 | movies | tt0146882 | None | [7.5] | [Stephen Frears] | 2000-03-17T00:00:00Z | 7.5 | [Comedy, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTgxMT... | Rob, a record store owner and compulsive list ... | High Fidelity | 2134 | 6780 | [John Cusack, Iben Hjejle, Todd Louiso] | 2000 | tt0146882 | add |
ユーザーが検索を行う際、キーワードを入力して検索する方法とは別に、ジャンルなどの分類を元に絞り込んでいく方法があります。このような検索方法をファセットナビゲーションやファセット検索と呼びます。
ファセットナビゲーションにより、ユーザーはどのような分類が存在するのか、各分類ごとに検索対象のドキュメント件数がどの程度存在するかを事前に確認することが可能です。またおおまかな検索結果に対してファセットナビゲーションを併用することで検索結果のフィルタリングを行うことも可能です。
AWS のドキュメント検索 でもファセットナビゲーションが採用されています。上部のプルダウンメニューには検索結果から得られたサービス一覧などがあり、このメニューからサービスごとのドキュメントに絞った検索を行うことができます。
OpenSearch では、Aggregations クエリで集計を実行します。Aggregations は、ファセット検索で有用なキーワードごとの件数集計から、平均値 (avg) や最大値 (max) といった統計値の算出まで、様々な集計方法を提供しています。Aggregations クエリを使用する場合、Search API で aggs オプションを使用します。
以下のサンプルでは、plot に school を含みかつ genres に Music を持つ映画について、ジャンルごとのドキュメント件数を集計しています。
# Facet counts: for movies whose plot matches "school" and whose genres
# contain "Music", run a terms aggregation on genres and display the
# per-genre document counts (the aggregation buckets, not the hits).
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": {"query": "school"}}},
                {"term": {"genres": {"value": "Music"}}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
    "aggs": {
        "genres": {"terms": {"field": "genres"}},
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["aggregations"]["genres"]["buckets"])
key | doc_count | |
---|---|---|
0 | music | 14 |
1 | drama | 8 |
2 | romance | 8 |
3 | comedy | 6 |
4 | family | 4 |
5 | adventure | 1 |
6 | animation | 1 |
7 | crime | 1 |
8 | fantasy | 1 |
上記の集計結果を見て、family もフィルタリング対象のジャンルに追加したとします。この場合、以下のようにクエリを書き換えて実行することで、ファセットによるドリルダウン相当の処理を実現できます。
# Facet drill-down: narrow the previous result to movies tagged with BOTH
# "music" and "family". terms_set with a minimum_should_match_script that
# returns params.num_terms requires every listed term to match. The genres
# aggregation is recomputed on the narrowed result set.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": {"query": "school"}}},
                {
                    "terms_set": {
                        "genres": {
                            "terms": ["music", "family"],
                            "minimum_should_match_script": {
                                "source": "params.num_terms",
                            },
                        }
                    }
                },
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
    "aggs": {
        "genres": {"terms": {"field": "genres"}},
    },
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["aggregations"]["genres"]["buckets"])
key | doc_count | |
---|---|---|
0 | family | 4 |
1 | music | 4 |
2 | comedy | 3 |
3 | romance | 2 |
4 | adventure | 1 |
5 | animation | 1 |
6 | fantasy | 1 |
from + size は最もシンプルな実装方法です。 from に指定した開始位置から size 件数分のデータを返却します。from に 100、size に 20 と指定した場合は、先頭から数えて 100 件目から 20 件のデータが返却されます。SQL の OFFSET と LIMIT の組み合わせと似ているところがあります。
以下のサンプルでは、8 件の検索結果を 2 つのクエリで分割取得しています。1 つめのクエリで from に 0(デフォルト)、size に 4 を指定し先頭 4 件の検索結果を、2 つめのクエリで from に 4、size に 4 を指定して後続の 4 件の検索結果を取得しています。
# from + size paging, page 1: start at offset 0 and return four hits of the
# compound query, ordered by rating (descending).
index_name = "movies"
payload = {
    "from": 0,
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0332379 | None | [7.0] | [Richard Linklater] | 2003-09-09T00:00:00Z | 7.0 | [Comedy, Music] | https://m.media-amazon.com/images/M/MV5BMjEwOT... | A wannabe rock star in need of cash poses as a... | The School of Rock | 727 | 6480 | [Jack Black, Mike White, Joan Cusack] | 2003 | tt0332379 | add | [A wannabe rock star in need of cash poses as ... |
1 | movies | tt0981227 | None | [6.7] | [Peter Sollett] | 2008-09-06T00:00:00Z | 6.7 | [Comedy, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BOTY2Mj... | High school student Nick O'Leary, member of th... | Nick and Norah's Infinite Playlist | 1701 | 5400 | [Michael Cera, Kat Dennings, Aaron Yoo] | 2008 | tt0981227 | add | [High <em>school</em> student Nick O'Leary, me... |
2 | movies | tt0462590 | None | [6.2] | [Anne Fletcher] | 2006-08-07T00:00:00Z | 6.2 | [Crime, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTIxMD... | Tyler Gage receives the opportunity of a lifet... | Step Up | 947 | 6240 | [Channing Tatum, Jenna Dewan-Tatum, Damaine Ra... | 2006 | tt0462590 | add | [Tyler Gage receives the opportunity of a life... |
3 | movies | tt1023481 | None | [5.800000000000001] | [Jon M. Chu] | 2008-02-14T00:00:00Z | 5.8 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTc0Nz... | Romantic sparks occur between two dance studen... | Step Up 2: The Streets | 1427 | 5880 | [Robert Hoffman, Briana Evigan, Cassie Ventura] | 2008 | tt1023481 | add | [Romantic sparks occur between two dance stude... |
# from + size paging, page 2: same query as page 1, but skip the first four
# hits (from = 4) and return the next four. index_name is reused from the
# previous cell.
payload = {
    "from": 4,
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
}
response = opensearch_client.search(index=index_name, body=payload)
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
pd.json_normalize(response["hits"]["hits"])
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0361696 | None | [5.5] | [Sean McNamara] | 2004-10-03T00:00:00Z | 5.5 | [Family, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTM1MT... | A girl from a small town heads to the big city... | Raise Your Voice | 3425 | 6180 | [Hilary Duff, John Corbett, Rebecca De Mornay] | 2004 | tt0361696 | add | [from a small town heads to the big city of Lo... |
1 | movies | tt0306841 | None | [4.9] | [Jim Fall] | 2003-04-26T00:00:00Z | 4.9 | [Adventure, Comedy, Family, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTMzND... | Lizzie McGuire has graduated from middle schoo... | The Lizzie McGuire Movie | 3219 | 5640 | [Hilary Duff, Adam Lamberg, Clayton Snyder] | 2003 | tt0306841 | add | [Lizzie McGuire has graduated from middle <em>... |
2 | movies | tt1231580 | None | [4.2] | [Betty Thomas] | 2009-12-11T00:00:00Z | 4.2 | [Animation, Comedy, Family, Fantasy, Music] | https://m.media-amazon.com/images/M/MV5BMjI0NT... | The world famous singing pre-teen chipmunk tri... | Alvin and the Chipmunks: The Squeakquel | 3223 | 5280 | [Jason Lee, Zachary Levi, David Cross] | 2009 | tt1231580 | add | [The world famous singing pre-teen chipmunk tr... |
3 | movies | tt0804452 | None | [2.7] | [Sean McNamara] | 2007-08-03T00:00:00Z | 2.7 | [Comedy, Family, Music] | https://m.media-amazon.com/images/M/MV5BMTUxND... | During their first year of high school, four b... | Bratz | 3064 | 6600 | [Skyler Shaye, Janel Parrish, Logan Browning] | 2007 | tt0804452 | add | [During their first year of high <em>school</e... |
このように最もシンプルにページングを実装できる from + size ですが、以下の懸念事項があります
index.max_result_window
のインデックス設定を変更することでこの制限を緩和できますが、パフォーマンスへの影響があるため一般的には推奨されません。数万件の結果を取得し paging を行う場合は、異なる手法を検討する必要があります。
# Demonstrates the from + size ceiling: from (9999) + size (2) = 10001, which
# exceeds the result window, so the request is expected to fail. The error is
# caught and printed so the notebook keeps running.
index_name = "movies"
payload = {
    "from": 9999,
    "size": 2,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
}
try:
    response = opensearch_client.search(index=index_name, body=payload)
    print(json.dumps(response, indent=2))
except Exception as e:
    # Deliberately broad: the point of this cell is to display the error.
    print(e)
RequestError(400, 'search_phase_execution_exception', 'Result window is too large, from + size must be less than or equal to: [10000] but was [10001]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting.')
scroll は機械学習ジョブといった PB クラスのデータを取得するバッチ処理に向いている手法です。Search API のリクエストに scroll パラメーター (クエリパラメーター) を付与することで有効化されます。scroll はリクエスト時点のスナップショットを取得するため、メモリを大量に消費する場合があります。この特性上、頻繁に実行されるクエリには向いていません。
scroll オプションを使用して search クエリを実行すると、実行結果に "_scroll_id" が付与されます。_scroll_id を次回のリクエストに付与することで、後続の結果を得ることができます。
# Scroll, page 1: run the search with scroll="10m" so the response carries a
# _scroll_id, then keep that id for fetching the following pages.
index_name = "movies"
payload = {
    "size": 5,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": {"query": "school"}}},
            ],
        }
    },
    "sort": [{"rating": {"order": "desc"}}],
}
response = opensearch_client.search(index=index_name, body=payload, scroll="10m")
scroll_id = response["_scroll_id"]
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
print("ScrollId: " + scroll_id)
pd.json_normalize(response["hits"]["hits"])
ScrollId: FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFjUtVkhIeWVUUTI2RTN3SlNRWEU1ancAAAAAAABCyxZrZG5MdWVGYVQwcXVuaTFhUGZUNS1n
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0088847 | None | [7.9] | [John Hughes] | 1985-02-07T00:00:00Z | 7.9 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMzYyNT... | Five high school students, all different stere... | The Breakfast Club | 120 | 5820 | [Emilio Estevez, Judd Nelson, Molly Ringwald] | 1985 | tt0088847 | add |
1 | movies | tt1714206 | None | [7.7] | [James Ponsoldt] | 2013-01-18T00:00:00Z | 7.7 | [Comedy, Drama, Romance] | https://m.media-amazon.com/images/M/MV5BMjA5MT... | A hard-partying high school senior's philosoph... | The Spectacular Now | 140 | 5700 | [Miles Teller, Shailene Woodley, Kyle Chandler] | 2013 | tt1714206 | add |
2 | movies | tt0074285 | None | [7.4] | [Brian De Palma] | 1976-11-03T00:00:00Z | 7.4 | [Horror, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg1Nj... | A young, abused and timid 17-year-old girl dis... | Carrie | 147 | 5880 | [Sissy Spacek, Piper Laurie, Amy Irving] | 1976 | tt0074285 | add |
3 | movies | tt1981677 | None | [7.1000000000000005] | [Jason Moore] | 2012-09-28T00:00:00Z | 7.1 | [Comedy, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTcyMT... | Beca, a freshman at Barden University, is cajo... | Pitch Perfect | 46 | 6720 | [Anna Kendrick, Brittany Snow, Rebel Wilson] | 2012 | tt1981677 | add |
4 | movies | tt1650554 | None | [7.0] | [Jeff Wadlow] | 2013-08-14T00:00:00Z | 7.0 | [Action, Comedy, Crime] | https://m.media-amazon.com/images/M/MV5BMTQ4OT... | The costumed high-school hero Kick-Ass joins w... | Kick-Ass 2 | 53 | 6180 | [Aaron Taylor-Johnson, Chloë Grace Moretz, Chr... | 2013 | tt1650554 | add |
# Scroll, page 2: fetch the next batch of the scroll opened by the previous
# search.
#
# A follow-up page is requested through the Scroll API by sending the
# _scroll_id from the previous response; the query, sort and size come from
# the scroll context created by the initial search and are not sent again.
#
# Fix: the earlier version built a payload keyed by the *value* of scroll_id
# and then called search() without any body, which started an unrelated
# default search over the whole index instead of continuing the scroll.
payload = {
    "scroll": "10m",  # extend the scroll context for another 10 minutes
    "scroll_id": scroll_id,
}
response = opensearch_client.scroll(body=payload)
# Keep the id from the latest response for the next page request.
scroll_id = response["_scroll_id"]
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
print("ScrollId: " + scroll_id)
pd.json_normalize(response["hits"]["hits"])
ScrollId: FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFjUtVkhIeWVUUTI2RTN3SlNRWEU1ancAAAAAAABCzBZrZG5MdWVGYVQwcXVuaTFhUGZUNS1n
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.0 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400.0 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
1 | movies | tt1979320 | 1.0 | [Ron Howard] | 2013-09-02T00:00:00Z | 8.3 | [Action, Biography, Drama, Sport] | https://m.media-amazon.com/images/M/MV5BMTQyMD... | A re-creation of the merciless 1970s rivalry b... | Rush | 2 | 7380.0 | [Daniel Brühl, Chris Hemsworth, Olivia Wilde] | 2013 | tt1979320 | add |
2 | movies | tt1392214 | 1.0 | [Denis Villeneuve] | 2013-08-30T00:00:00Z | 8.2 | [Crime, Drama, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0NT... | When Keller Dover's daughter and her friend go... | Prisoners | 3 | 9180.0 | [Hugh Jackman, Jake Gyllenhaal, Viola Davis] | 2013 | tt1392214 | add |
3 | movies | tt1951264 | 1.0 | [Francis Lawrence] | 2013-11-11T00:00:00Z | NaN | [Action, Adventure, Sci-Fi, Thriller] | https://m.media-amazon.com/images/M/MV5BMTAyMj... | Katniss Everdeen and Peeta Mellark become targ... | The Hunger Games: Catching Fire | 4 | 8760.0 | [Jennifer Lawrence, Josh Hutcherson, Liam Hems... | 2013 | tt1951264 | add |
4 | movies | tt1981115 | 1.0 | [Alan Taylor] | 2013-10-30T00:00:00Z | NaN | [Action, Adventure, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTQyNz... | Faced with an enemy that even Odin and Asgard ... | Thor: The Dark World | 5 | NaN | [Chris Hemsworth, Natalie Portman, Tom Hiddles... | 2013 | tt1981115 | add |
5 | movies | tt1245492 | 1.0 | [Evan Goldberg, Seth Rogen] | 2013-06-03T00:00:00Z | 7.2 | [Comedy, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTQxOD... | While attending a party at James Franco's hous... | This Is the End | 6 | 6420.0 | [James Franco, Jonah Hill, Seth Rogen] | 2013 | tt1245492 | add |
6 | movies | tt2226417 | 1.0 | [James Wan] | 2013-09-13T00:00:00Z | 7.1 | [Horror, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0OT... | The haunted Lambert family seeks to uncover th... | Insidious: Chapter 2 | 7 | 6360.0 | [Patrick Wilson, Rose Byrne, Barbara Hershey] | 2013 | tt2226417 | add |
7 | movies | tt0816711 | 1.0 | [Marc Forster] | 2013-06-02T00:00:00Z | 7.1 | [Action, Adventure, Horror, Sci-Fi, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0NT... | United Nations employee Gerry Lane traverses t... | World War Z | 8 | 6960.0 | [Brad Pitt, Mireille Enos, Daniella Kertesz] | 2013 | tt0816711 | add |
8 | movies | tt1877832 | 1.0 | [Bryan Singer] | 2014-05-21T00:00:00Z | NaN | [Action, Adventure, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTQ0Nz... | The X-Men send Wolverine to the past to change... | X-Men: Days of Future Past | 9 | NaN | [Jennifer Lawrence, Hugh Jackman, Michael Fass... | 2014 | tt1877832 | add |
9 | movies | tt2109248 | 1.0 | [Michael Bay] | 2014-06-25T00:00:00Z | NaN | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTQyMD... | A mechanic and his daughter make a discovery t... | Transformers: Age of Extinction | 10 | NaN | [Mark Wahlberg, Nicola Peltz, Jack Reynor] | 2014 | tt2109248 | add |
Search after は from + size に似たアプローチですが、前回取得した範囲は検索結果から除外できるメリットがあります。検索結果を全件取得してから部分的に切り出しを行う from + size と比較して、ページングによるオーバーヘッドを抑えられます。
以下は Search after のサンプルです。全部で 8 件の検索結果を、4 件ずつに分けて取得しています。
1 つ目のクエリでは rating フィールドで降順ソートされた結果から size = 4 で先頭 4 件を取得しています。2 つ目のクエリでは、search_after オプションに 1 つ目のクエリ実行結果の 4 件目の sort 値 (rating の値) を入れています。これにより、1 つ目のクエリで取得した結果より後の結果から 4 件のドキュメントを取得することに成功しています。
# search_after, page 1: fetch the first four hits ordered by rating
# (descending) and remember the sort values of the last hit; they become the
# search_after cursor for the next page.
# NOTE(review): rating is the only sort key, so hits sharing the boundary
# rating may be skipped on the next page - consider adding a unique tiebreaker.
index_name = "movies"
payload = {
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
}
response = opensearch_client.search(index=index_name, body=payload)
last_hit = response["hits"]["hits"][-1]
search_after_rating = last_hit["sort"]
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
print("search_after_rating: " + str(search_after_rating))
pd.json_normalize(response["hits"]["hits"])
search_after_rating: [5.800000000000001]
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0332379 | None | [7.0] | [Richard Linklater] | 2003-09-09T00:00:00Z | 7.0 | [Comedy, Music] | https://m.media-amazon.com/images/M/MV5BMjEwOT... | A wannabe rock star in need of cash poses as a... | The School of Rock | 727 | 6480 | [Jack Black, Mike White, Joan Cusack] | 2003 | tt0332379 | add | [A wannabe rock star in need of cash poses as ... |
1 | movies | tt0981227 | None | [6.7] | [Peter Sollett] | 2008-09-06T00:00:00Z | 6.7 | [Comedy, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BOTY2Mj... | High school student Nick O'Leary, member of th... | Nick and Norah's Infinite Playlist | 1701 | 5400 | [Michael Cera, Kat Dennings, Aaron Yoo] | 2008 | tt0981227 | add | [High <em>school</em> student Nick O'Leary, me... |
2 | movies | tt0462590 | None | [6.2] | [Anne Fletcher] | 2006-08-07T00:00:00Z | 6.2 | [Crime, Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTIxMD... | Tyler Gage receives the opportunity of a lifet... | Step Up | 947 | 6240 | [Channing Tatum, Jenna Dewan-Tatum, Damaine Ra... | 2006 | tt0462590 | add | [Tyler Gage receives the opportunity of a life... |
3 | movies | tt1023481 | None | [5.800000000000001] | [Jon M. Chu] | 2008-02-14T00:00:00Z | 5.8 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTc0Nz... | Romantic sparks occur between two dance studen... | Step Up 2: The Streets | 1427 | 5880 | [Robert Hoffman, Briana Evigan, Cassie Ventura] | 2008 | tt1023481 | add | [Romantic sparks occur between two dance stude... |
# search_after, page 2: repeat the same query and sort, passing the previous
# page's last sort values as search_after so only later hits are returned.
# The cursor is then advanced to this page's last hit.
index_name = "movies"
payload = {
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ],
            "should": [
                {"term": {"genres": {"value": "Romance"}}},
                {"term": {"genres": {"value": "Comedy"}}},
            ],
            "minimum_should_match": 1,
            "filter": [
                {"range": {"year": {"gte": 2000, "lte": 2009}}},
            ],
            "must_not": [
                {"terms": {"genres": ["Thriller", "Horror"]}},
            ],
        }
    },
    "highlight": {"fields": {"plot": {}}},
    "sort": [{"rating": {"order": "desc"}}],
    "search_after": search_after_rating,
}
response = opensearch_client.search(index=index_name, body=payload)
last_hit = response["hits"]["hits"][-1]
search_after_rating = last_hit["sort"]
# Uncomment to inspect the raw response:
# print(json.dumps(response, indent=2))
print("search_after_rating: " + str(search_after_rating))
pd.json_normalize(response["hits"]["hits"])
search_after_rating: [2.7]
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | highlight.plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0361696 | None | [5.5] | [Sean McNamara] | 2004-10-03T00:00:00Z | 5.5 | [Family, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTM1MT... | A girl from a small town heads to the big city... | Raise Your Voice | 3425 | 6180 | [Hilary Duff, John Corbett, Rebecca De Mornay] | 2004 | tt0361696 | add | [from a small town heads to the big city of Lo... |
1 | movies | tt0306841 | None | [4.9] | [Jim Fall] | 2003-04-26T00:00:00Z | 4.9 | [Adventure, Comedy, Family, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTMzND... | Lizzie McGuire has graduated from middle schoo... | The Lizzie McGuire Movie | 3219 | 5640 | [Hilary Duff, Adam Lamberg, Clayton Snyder] | 2003 | tt0306841 | add | [Lizzie McGuire has graduated from middle <em>... |
2 | movies | tt1231580 | None | [4.2] | [Betty Thomas] | 2009-12-11T00:00:00Z | 4.2 | [Animation, Comedy, Family, Fantasy, Music] | https://m.media-amazon.com/images/M/MV5BMjI0NT... | The world famous singing pre-teen chipmunk tri... | Alvin and the Chipmunks: The Squeakquel | 3223 | 5280 | [Jason Lee, Zachary Levi, David Cross] | 2009 | tt1231580 | add | [The world famous singing pre-teen chipmunk tr... |
3 | movies | tt0804452 | None | [2.7] | [Sean McNamara] | 2007-08-03T00:00:00Z | 2.7 | [Comedy, Family, Music] | https://m.media-amazon.com/images/M/MV5BMTUxND... | During their first year of high school, four b... | Bratz | 3064 | 6600 | [Skyler Shaye, Janel Parrish, Logan Browning] | 2007 | tt0804452 | add | [During their first year of high <em>school</e... |
デフォルトでは OpenSearch はスコア順にドキュメントのソートを行います。明示的にソート対象のフィールドに "_score" を指定した場合と挙動は同じです。この特性を利用して、sort に _score による降順ソートの条件を書くことでスコアによるソートと Search after を両立できます。ただし、ソート条件が _score だけだとエラーになってしまうため、id など何かしらのフィールドとセットで sort 条件を書く必要があります。
# First page of a relevance-sorted search that can be continued with
# search_after: results are ordered by _score (descending) and then by id
# (ascending).
index_name = "movies"
payload = {
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ]
        }
    },
    "sort": [
        {"_score": {"order": "desc"}},
        {"id": {"order": "asc"}},
    ],
}
response = opensearch_client.search(index=index_name, body=payload)

# The sort values of the last hit become the cursor for the following page.
search_after_score = response["hits"]["hits"][-1]["sort"]
# print(json.dumps(response, indent=2))  # uncomment to dump the raw response
print(f"search_after_score: {search_after_score}")
pd.json_normalize(response["hits"]["hits"])
search_after_score: [8.158286, 'tt0080716']
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0113862 | 8.766865 | [8.766865, tt0113862] | [Stephen Herek] | 1995-12-29T00:00:00Z | 7.1 | [Drama, Music] | https://m.media-amazon.com/images/M/MV5BMTUxOD... | A frustrated composer finds fulfillment as a h... | Mr. Holland's Opus | 1841 | 8580 | [Richard Dreyfuss, Glenne Headly, Jay Thomas] | 1995 | tt0113862 | add |
1 | movies | tt0085549 | 8.437494 | [8.437494, tt0085549] | [Adrian Lyne] | 1983-04-15T00:00:00Z | 5.8 | [Drama, Romance, Music] | https://m.media-amazon.com/images/M/MV5BMjA5Nj... | A Pittsburgh woman with two jobs as a welder a... | Flashdance | 1423 | 5700 | [Jennifer Beals, Michael Nouri, Lilia Skala] | 1983 | tt0085549 | add |
2 | movies | tt1023481 | 8.246508 | [8.246508, tt1023481] | [Jon M. Chu] | 2008-02-14T00:00:00Z | 5.8 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTc0Nz... | Romantic sparks occur between two dance studen... | Step Up 2: The Streets | 1427 | 5880 | [Robert Hoffman, Briana Evigan, Cassie Ventura] | 2008 | tt1023481 | add |
3 | movies | tt0080716 | 8.158286 | [8.158286, tt0080716] | [Alan Parker] | 1980-05-16T00:00:00Z | 6.4 | [Drama, Music] | https://m.media-amazon.com/images/M/MV5BMjAyOT... | A chronicle of the lives of several teenagers ... | Fame | 3755 | 8040 | [Eddie Barth, Irene Cara, Lee Curreri] | 1980 | tt0080716 | add |
# Next page of the relevance-sorted search. `search_after_score` must already
# hold the [_score, id] sort values of the last hit of the previous page.
index_name = "movies"
payload = {
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ]
        }
    },
    # One array element per sort key, in priority order. The search_after
    # values are positional, so the key order must be explicit rather than
    # implied by the member order of a single JSON object.
    "sort": [
        {"_score": {"order": "desc"}},
        {"id": {"order": "asc"}},
    ],
    "search_after": search_after_score,
}
response = opensearch_client.search(index=index_name, body=payload)

# Advance the cursor to the last hit of this page.
# Raises IndexError when the page is empty, i.e. there is nothing left to read.
search_after_score = response["hits"]["hits"][-1]["sort"]
# print(json.dumps(response, indent=2))  # uncomment to dump the raw response
print(f"search_after_score: {search_after_score}")
pd.json_normalize(response["hits"]["hits"])
search_after_score: [7.8460574, 'tt1447972']
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0361696 | 8.074432 | [8.074432, tt0361696] | [Sean McNamara] | 2004-10-03T00:00:00Z | 5.5 | [Family, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTM1MT... | A girl from a small town heads to the big city... | Raise Your Voice | 3425 | 6180 | [Hilary Duff, John Corbett, Rebecca De Mornay] | 2004 | tt0361696 | add |
1 | movies | tt0332379 | 7.994629 | [7.994629, tt0332379] | [Richard Linklater] | 2003-09-09T00:00:00Z | 7.0 | [Comedy, Music] | https://m.media-amazon.com/images/M/MV5BMjEwOT... | A wannabe rock star in need of cash poses as a... | The School of Rock | 727 | 6480 | [Jack Black, Mike White, Joan Cusack] | 2003 | tt0332379 | add |
2 | movies | tt1231580 | 7.846057 | [7.8460574, tt1231580] | [Betty Thomas] | 2009-12-11T00:00:00Z | 4.2 | [Animation, Comedy, Family, Fantasy, Music] | https://m.media-amazon.com/images/M/MV5BMjI0NT... | The world famous singing pre-teen chipmunk tri... | Alvin and the Chipmunks: The Squeakquel | 3223 | 5280 | [Jason Lee, Zachary Levi, David Cross] | 2009 | tt1231580 | add |
3 | movies | tt1447972 | 7.846057 | [7.8460574, tt1447972] | [Max Giwa, Dania Pasquini] | 2010-05-19T00:00:00Z | 5.6 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMzc4MD... | In order to win the Street Dance Championships... | StreetDance 3D | 1210 | 5880 | [Nichola Burley, Richard Winsor, Ukweli Roach] | 2010 | tt1447972 | add |
PIT(Point in Time) は、インデックスのある時点の固定された状態を作り出す機能です。PIT と Search after を組み合わせることで一貫性のあるページングされた結果を取得することができます。
PIT を使用した検索の流れは以下の通りです。PIT 作成時に keep_alive パラメーターで PIT の保持時間を指定するため、PIT の削除は任意です。
PIT を使用する場合、PIT 側に Index の情報が入っているため、Search API 実行時にインデックス名の指定は行いません。リクエストパラメーターにインデックス名を含めてはいけない点に注意が必要です。
# Create a Point in Time (PIT) for the index named by `index_name` (assigned in
# an earlier cell) and ask the cluster to keep it for 10 minutes.
response = opensearch_client.create_point_in_time(index=index_name, keep_alive="10m")

# The PIT id is what the following searches pass in their "pit" section.
pit_id = response.get("pit_id")
print("\n Point in time ID: " + pit_id)
Point in time ID: 87mEQQEGbW92aWVzFlhUVXJsS1VIU2N1NHpLTFNkUndZSUEAFmtkbkx1ZUZhVDBxdW5pMWFQZlQ1LWcAAAAAAAAAQtEWNS1WSEh5ZVRRMjZFM3dKU1FYRTVqdwEWWFRVcmxLVUhTY3U0ektMU2RSd1lJQQAA
# First page of a PIT-backed search. `pit_id` must already hold the id returned
# when the Point in Time was created. No index is passed to search(): the PIT
# identifies the data to search.
payload = {
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ]
        }
    },
    # One array element per sort key, in priority order, so that the
    # positional search_after values of later pages line up unambiguously.
    "sort": [
        {"_score": {"order": "desc"}},
        {"id": {"order": "asc"}},
    ],
    "pit": {
        "id": pit_id,
        "keep_alive": "10m",
    },
}
response = opensearch_client.search(body=payload)

# The sort values of the last hit become the cursor for the following page.
search_after_score = response["hits"]["hits"][-1]["sort"]
print(f"search_after_score: {search_after_score}")
# print(json.dumps(response, indent=2))  # uncomment to dump the raw response
pd.json_normalize(response["hits"]["hits"])
search_after_score: [8.158286, 'tt0080716']
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0113862 | 8.766865 | [8.766865, tt0113862] | [Stephen Herek] | 1995-12-29T00:00:00Z | 7.1 | [Drama, Music] | https://m.media-amazon.com/images/M/MV5BMTUxOD... | A frustrated composer finds fulfillment as a h... | Mr. Holland's Opus | 1841 | 8580 | [Richard Dreyfuss, Glenne Headly, Jay Thomas] | 1995 | tt0113862 | add |
1 | movies | tt0085549 | 8.437494 | [8.437494, tt0085549] | [Adrian Lyne] | 1983-04-15T00:00:00Z | 5.8 | [Drama, Romance, Music] | https://m.media-amazon.com/images/M/MV5BMjA5Nj... | A Pittsburgh woman with two jobs as a welder a... | Flashdance | 1423 | 5700 | [Jennifer Beals, Michael Nouri, Lilia Skala] | 1983 | tt0085549 | add |
2 | movies | tt1023481 | 8.246508 | [8.246508, tt1023481] | [Jon M. Chu] | 2008-02-14T00:00:00Z | 5.8 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTc0Nz... | Romantic sparks occur between two dance studen... | Step Up 2: The Streets | 1427 | 5880 | [Robert Hoffman, Briana Evigan, Cassie Ventura] | 2008 | tt1023481 | add |
3 | movies | tt0080716 | 8.158286 | [8.158286, tt0080716] | [Alan Parker] | 1980-05-16T00:00:00Z | 6.4 | [Drama, Music] | https://m.media-amazon.com/images/M/MV5BMjAyOT... | A chronicle of the lives of several teenagers ... | Fame | 3755 | 8040 | [Eddie Barth, Irene Cara, Lee Curreri] | 1980 | tt0080716 | add |
# Next page of the PIT-backed search. Requires `pit_id` (from PIT creation) and
# `search_after_score` (sort values of the previous page's last hit). No index
# is passed to search(): the PIT identifies the data to search.
payload = {
    "size": 4,
    "query": {
        "bool": {
            "must": [
                {"match": {"plot": "school"}},
                {"term": {"genres": {"value": "Music"}}},
            ]
        }
    },
    # One array element per sort key, in priority order. The search_after
    # values are positional, so the key order must be explicit rather than
    # implied by the member order of a single JSON object.
    "sort": [
        {"_score": {"order": "desc"}},
        {"id": {"order": "asc"}},
    ],
    "pit": {
        "id": pit_id,
        "keep_alive": "10m",
    },
    "search_after": search_after_score,
}
response = opensearch_client.search(body=payload)

# Advance the cursor to the last hit of this page.
# Raises IndexError when the page is empty, i.e. there is nothing left to read.
search_after_score = response["hits"]["hits"][-1]["sort"]
print(f"search_after_score: {search_after_score}")
# print(json.dumps(response, indent=2))  # uncomment to dump the raw response
pd.json_normalize(response["hits"]["hits"])
search_after_score: [7.8460574, 'tt1447972']
_index | _id | _score | sort | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt0361696 | 8.074432 | [8.074432, tt0361696] | [Sean McNamara] | 2004-10-03T00:00:00Z | 5.5 | [Family, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMTM1MT... | A girl from a small town heads to the big city... | Raise Your Voice | 3425 | 6180 | [Hilary Duff, John Corbett, Rebecca De Mornay] | 2004 | tt0361696 | add |
1 | movies | tt0332379 | 7.994629 | [7.994629, tt0332379] | [Richard Linklater] | 2003-09-09T00:00:00Z | 7.0 | [Comedy, Music] | https://m.media-amazon.com/images/M/MV5BMjEwOT... | A wannabe rock star in need of cash poses as a... | The School of Rock | 727 | 6480 | [Jack Black, Mike White, Joan Cusack] | 2003 | tt0332379 | add |
2 | movies | tt1231580 | 7.846057 | [7.8460574, tt1231580] | [Betty Thomas] | 2009-12-11T00:00:00Z | 4.2 | [Animation, Comedy, Family, Fantasy, Music] | https://m.media-amazon.com/images/M/MV5BMjI0NT... | The world famous singing pre-teen chipmunk tri... | Alvin and the Chipmunks: The Squeakquel | 3223 | 5280 | [Jason Lee, Zachary Levi, David Cross] | 2009 | tt1231580 | add |
3 | movies | tt1447972 | 7.846057 | [7.8460574, tt1447972] | [Max Giwa, Dania Pasquini] | 2010-05-19T00:00:00Z | 5.6 | [Drama, Music, Romance] | https://m.media-amazon.com/images/M/MV5BMzc4MD... | In order to win the Street Dance Championships... | StreetDance 3D | 1210 | 5880 | [Nichola Burley, Richard Winsor, Ukweli Roach] | 2010 | tt1447972 | add |
大規模なデータ検索、特にウォームノードや複数のリモートクラスターにまたがって検索が実行される場合、完了までに時間がかかることがあります。 完了までクライアントが待機するにはタイムアウトを延長するなどの措置が必要ですが、何らかの問題で接続が切断された場合は改めて検索リクエストを発行する必要があります。
このような課題に対処するために、OpenSearch の非同期検索を使用することができます。非同期検索では、検索リクエストを送信してバックグラウンドで実行させることができます。検索の進行状況は監視可能であり、結果を段階的に取得することも可能です。検索結果は任意の期間保存することが可能であり、後から取得することもできます。
# Submit an asynchronous search against the index. The target index travels in
# the query string and no request body is sent, so no query is specified here.
index_name = "movies"
response = opensearch_client.http.post(
    url=f"/_plugins/_asynchronous_search?index={index_name}"
)

# The id identifies this asynchronous search in later requests.
asynchronous_search_id = response["id"]
# print(json.dumps(response, indent=2))  # uncomment to dump the raw response
print(f"asynchronous_search_id: {asynchronous_search_id}")
# The hits are nested under the "response" key of the asynchronous search reply.
pd.json_normalize(response["response"]["hits"]["hits"])
asynchronous_search_id: Fmtkbkx1ZUZhVDBxdW5pMWFQZlQ1LWcGMzc5MDU5FG1zNkhnNVVCX2dZZmp1N2R2T05EATI=
_index | _id | _score | _source.directors | _source.release_date | _source.rating | _source.genres | _source.image_url | _source.plot | _source.title | _source.rank | _source.running_time_secs | _source.actors | _source.year | _source.id | _source.type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | movies | tt2229499 | 1.0 | [Joseph Gordon-Levitt] | 2013-01-18T00:00:00Z | 7.4 | [Comedy, Drama] | https://m.media-amazon.com/images/M/MV5BMTQxNT... | A New Jersey guy dedicated to his family, frie... | Don Jon | 1 | 5400.0 | [Joseph Gordon-Levitt, Scarlett Johansson, Jul... | 2013 | tt2229499 | add |
1 | movies | tt1979320 | 1.0 | [Ron Howard] | 2013-09-02T00:00:00Z | 8.3 | [Action, Biography, Drama, Sport] | https://m.media-amazon.com/images/M/MV5BMTQyMD... | A re-creation of the merciless 1970s rivalry b... | Rush | 2 | 7380.0 | [Daniel Brühl, Chris Hemsworth, Olivia Wilde] | 2013 | tt1979320 | add |
2 | movies | tt1392214 | 1.0 | [Denis Villeneuve] | 2013-08-30T00:00:00Z | 8.2 | [Crime, Drama, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0NT... | When Keller Dover's daughter and her friend go... | Prisoners | 3 | 9180.0 | [Hugh Jackman, Jake Gyllenhaal, Viola Davis] | 2013 | tt1392214 | add |
3 | movies | tt1951264 | 1.0 | [Francis Lawrence] | 2013-11-11T00:00:00Z | NaN | [Action, Adventure, Sci-Fi, Thriller] | https://m.media-amazon.com/images/M/MV5BMTAyMj... | Katniss Everdeen and Peeta Mellark become targ... | The Hunger Games: Catching Fire | 4 | 8760.0 | [Jennifer Lawrence, Josh Hutcherson, Liam Hems... | 2013 | tt1951264 | add |
4 | movies | tt1981115 | 1.0 | [Alan Taylor] | 2013-10-30T00:00:00Z | NaN | [Action, Adventure, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTQyNz... | Faced with an enemy that even Odin and Asgard ... | Thor: The Dark World | 5 | NaN | [Chris Hemsworth, Natalie Portman, Tom Hiddles... | 2013 | tt1981115 | add |
5 | movies | tt1245492 | 1.0 | [Evan Goldberg, Seth Rogen] | 2013-06-03T00:00:00Z | 7.2 | [Comedy, Fantasy] | https://m.media-amazon.com/images/M/MV5BMTQxOD... | While attending a party at James Franco's hous... | This Is the End | 6 | 6420.0 | [James Franco, Jonah Hill, Seth Rogen] | 2013 | tt1245492 | add |
6 | movies | tt2226417 | 1.0 | [James Wan] | 2013-09-13T00:00:00Z | 7.1 | [Horror, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0OT... | The haunted Lambert family seeks to uncover th... | Insidious: Chapter 2 | 7 | 6360.0 | [Patrick Wilson, Rose Byrne, Barbara Hershey] | 2013 | tt2226417 | add |
7 | movies | tt0816711 | 1.0 | [Marc Forster] | 2013-06-02T00:00:00Z | 7.1 | [Action, Adventure, Horror, Sci-Fi, Thriller] | https://m.media-amazon.com/images/M/MV5BMTg0NT... | United Nations employee Gerry Lane traverses t... | World War Z | 8 | 6960.0 | [Brad Pitt, Mireille Enos, Daniella Kertesz] | 2013 | tt0816711 | add |
8 | movies | tt1877832 | 1.0 | [Bryan Singer] | 2014-05-21T00:00:00Z | NaN | [Action, Adventure, Fantasy, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTQ0Nz... | The X-Men send Wolverine to the past to change... | X-Men: Days of Future Past | 9 | NaN | [Jennifer Lawrence, Hugh Jackman, Michael Fass... | 2014 | tt1877832 | add |
9 | movies | tt2109248 | 1.0 | [Michael Bay] | 2014-06-25T00:00:00Z | NaN | [Action, Adventure, Sci-Fi] | https://m.media-amazon.com/images/M/MV5BMTQyMD... | A mechanic and his daughter make a discovery t... | Transformers: Age of Extinction | 10 | NaN | [Mark Wahlberg, Nicola Peltz, Jack Reynor] | 2014 | tt2109248 | add |
# Retrieve the asynchronous search statistics. The endpoint takes neither an
# index name nor a request body.
response = opensearch_client.http.get(url="/_plugins/_asynchronous_search/stats")
print(json.dumps(response, indent=2))
{ "_nodes": { "total": 1, "successful": 1, "failed": 0 }, "cluster_name": "123456789012:opensearchservi-lsy27q89mdpe", "nodes": { "kdnLueFaT0quni1aPfT5-g": { "asynchronous_search_stats": { "submitted": 2, "initialized": 2, "running_current": 0, "persisted": 0, "search_failed": 0, "search_completed": 2, "rejected": 0, "persist_failed": 0, "cancelled": 0 } } } }
OpenSearch では、query で抽出したドキュメントの一括更新や削除を行うことが可能です。以降のセクションで、具体的な実行方法を確認していきます。
Update by query を使用することで、特定条件に一致する複数ドキュメントのデータ更新をまとめて行うことが可能です。条件は Search API と同様に query で指定します。
以下のサンプルでは、year が 1920 から 1923 までの movies 内のドキュメントについて、rating の値を 0.1 増加させています。レスポンスの total フィールドに、処理対象となったドキュメントの件数が記載されています。
# Update by query: add params.delta (0.1) to the rating of every document whose
# year is between 1920 and 1923 inclusive.
index_name = "movies"
payload = {
    "query": {
        "range": {"year": {"gte": 1920, "lte": 1923}},
    },
    "script": {
        "source": "ctx._source.rating += params.delta",
        "lang": "painless",
        "params": {"delta": 0.1},
    },
}
# refresh=True is forwarded to the API together with the index and the body.
response = opensearch_client.update_by_query(index=index_name, body=payload, refresh=True)
print(json.dumps(response, indent=2))
{ "took": 15, "timed_out": false, "total": 3, "updated": 3, "deleted": 0, "batches": 1, "version_conflicts": 0, "noops": 0, "retries": { "bulk": 0, "search": 0 }, "throttled_millis": 0, "requests_per_second": -1.0, "throttled_until_millis": 0, "failures": [] }
Delete by query は、特定条件に一致する複数ドキュメントを一括で削除します。条件は Search API と同様に query で指定します。
以下のサンプルでは、year が 1989 以前の movies 内のドキュメントを削除します。レスポンスの total フィールドに、処理対象となったドキュメントの件数が記載されています。
# Delete by query: remove every document whose year is 1989 or earlier.
index_name = "movies"
payload = {
    "query": {
        "range": {"year": {"lte": 1989}},
    },
}
# refresh=True is forwarded to the API together with the index and the body.
response = opensearch_client.delete_by_query(index=index_name, body=payload, refresh=True)
print(json.dumps(response, indent=2))
{ "took": 134, "timed_out": false, "total": 782, "deleted": 782, "batches": 1, "version_conflicts": 0, "noops": 0, "retries": { "bulk": 0, "search": 0 }, "throttled_millis": 0, "requests_per_second": -1.0, "throttled_until_millis": 0, "failures": [] }
本ラボでは、OpenSearch の基本的な検索クエリについて、ユースケースと実際の使い方を解説しました。本ラボで学習した内容を元に、次のステップとして以下のラボを実行してみましょう。
本ワークショップで使用したインデックスを削除します。インデックスの削除は Delete index API で行います。インデックスを削除するとインデックス内のドキュメントも削除されます。
# Clean-up: delete the workshop index. Any failure is printed instead of
# raised, so the cell always runs to completion.
index_name = "movies"
try:
    response = opensearch_client.indices.delete(index=index_name)
    print(json.dumps(response, indent=2))
except Exception as err:
    print(err)
{ "acknowledged": true }
ダウンロードしたデータセットを削除します。./dataset ディレクトリ配下に何もない場合は、./dataset ディレクトリも合わせて削除します。
# Remove the downloaded dataset directory; IPython expands {dataset_dir} from
# the notebook variable of that name.
%rm -rf {dataset_dir}
# rmdir only removes an empty directory, so ./dataset is deleted only when
# nothing else is left inside it.
%rmdir ./dataset