💡
Elasticsearch(OpenSearch)の日本語の全文検索設定【雑メモ】
概要
- OpenSearch2.11
- opensearch-ruby
client生成
# Returns the memoized OpenSearch client, building it on first use.
#
# In development a plain OpenSearch::Client is created from OPEN_SEARCH_HOST.
# In every other environment a SigV4-signing client is created, using the
# OPEN_SEARCH_ACCESS_KEY / OPEN_SEARCH_SECRET_ACCESS_KEY credentials.
#
# The whole construction sits on the right-hand side of `||=`, so the signer
# is only built when no client has been memoized yet (previously it was
# rebuilt on every call).
def self.client
  @@client ||=
    if Rails.env.development?
      OpenSearch::Client.new(
        url: ENV["OPEN_SEARCH_HOST"],
        retry_on_failure: 5,
        request_timeout: 120,
        log: false
      )
    else
      signer = Aws::Sigv4::Signer.new(
        service: 'es',
        region: 'ap-northeast-1',
        access_key_id: ENV["OPEN_SEARCH_ACCESS_KEY"],
        secret_access_key: ENV["OPEN_SEARCH_SECRET_ACCESS_KEY"]
      )
      OpenSearch::Aws::Sigv4Client.new(
        {
          host: ENV["OPEN_SEARCH_HOST"],
          log: true
        },
        signer
      )
    end
end
index生成
# Recreates INDEX_NAME with the Japanese analysis settings.
#
# If the index already exists it is deleted first, so this is destructive.
# Two analyzers are defined: `kuromoji_analyzer` (morphological) and
# `ja_ngram_analyzer` (2-3 character n-grams).
#
# NOTE(review): the kuromoji_* and icu_normalizer components presumably require
# the kuromoji / ICU analysis plugins on the cluster - confirm.
def self.create_index
  client.indices.delete(index: INDEX_NAME) if client.indices.exists?(index: INDEX_NAME)

  client.indices.create(
    index: INDEX_NAME,
    body: {
      settings: {
        number_of_shards: 1,
        number_of_replicas: 1,
        analysis: {
          filter: {
            # Defined but not referenced by any analyzer below.
            kuromoji_multiplexer: {
              type: 'multiplexer',
              filters: [
                'lowercase' # lowercasing (AbC -> abc)
                # 'lowercase,porter_stem', # stem to the base form (running -> run)
                # kuromoji_readingform: 寿司 -> "寿司" and "スシ". Misbehaves when the
                # reading is wrong; only katakana queries match; 明かるい -> メイ・カルイ;
                # 飼う and 買う both hit.
                # 'kuromoji_readingform'
              ]
            },
            # Strips the prolonged sound mark "ー": プリンター -> プリンタ
            # (minimum_length etc. are configurable).
            # https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-stemmer.html
            my_katakana_stemmer: {
              type: 'kuromoji_stemmer',
              minimum_length: 4
            }
          },
          # `filter` is closed above so that char_filter / tokenizer / analyzer
          # are siblings of it under `analysis`.
          #
          # Spelling variants to normalize are configured here. (Note: this
          # rewrites the text being indexed, so using it as-is is not ideal;
          # registering dictionary entries would be preferable.)
          char_filter: {
            my_char_filter: {
              type: 'mapping',
              mappings: [
                'ヴァ=>バ',
                'ヴィ=>ビ',
                'ヴゥ=>ブ',
                'ヴェ=>ベ',
                'ヴォ=>ボ'
              ]
            }
          },
          tokenizer: {
            ja_kuromoji_tokenizer: {
              mode: 'search',
              discard_compound_token: true, # do not emit the compound token
              type: 'kuromoji_tokenizer'
            },
            ja_ngram_tokenizer: {
              type: 'ngram',
              min_gram: 2,
              max_gram: 3,
              token_chars: %w[letter digit]
            }
          },
          # Not configured here: user_dictionary_rules, synonyms
          analyzer: {
            kuromoji_analyzer: {
              type: 'custom',
              tokenizer: 'ja_kuromoji_tokenizer',
              # tokenizer: 'kuromoji_tokenizer',
              filter: [
                'kuromoji_baseform',
                # 寿司がおいしいね -> keeps only 寿司 / おいしい, dropping が and ね
                'kuromoji_part_of_speech',
                'cjk_width', # character width normalization
                'ja_stop',
                'my_katakana_stemmer',
                'lowercase'
                # 'kuromoji_multiplexer',
                # 'ngram_filter'
                # 'kuromoji_number', # 一〇〇〇 (kanji zero) -> 1000 (did not take effect)
              ],
              # half-width / full-width handling (㌀ -> アパート)
              char_filter: [
                'icu_normalizer', # character normalization
                'html_strip',     # strip HTML
                'my_char_filter'
              ]
            },
            ja_ngram_analyzer: {
              type: 'custom',
              char_filter: [
                'icu_normalizer', # character normalization
                'html_strip',     # strip HTML
                'my_char_filter'
              ],
              tokenizer: 'ja_ngram_tokenizer',
              filter: [
                'lowercase'
              ]
            }
          }
        }
      }
    }
  )
end
mapping
- kuromoji解析とngram解析のフィールドを用意
# Registers the field mappings on INDEX_NAME. Text fields get a
# kuromoji-analyzed main field plus an `ngram` sub-field.
client.indices.put_mapping(
  index: INDEX_NAME,
  body: {
    properties: {
      # Employment category
      employment: {
        type: 'nested',
        properties: {
          code: { type: 'keyword' }, # key2
          type: { type: 'keyword' }, # full-time, contract, part-time, etc.
          year: { type: 'keyword' }  # target year
        }
      },
      corporation_name: {
        type: 'text', analyzer: 'kuromoji_analyzer',
        fields: {
          ngram: { type: 'text', analyzer: 'ja_ngram_analyzer' }
        }
      }
      # (remaining fields omitted)
    }
  } # closes `body`; this brace was missing, leaving the call unbalanced
)
検索クエリ
- or検索でkuromojiフィールドとngramフィールドを繋ぐ
{
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"multi_match": {
"query": "ニューアワジ",
"fields": ["offer", "catch_copy", "corporation_name", "corporation_name_furigana", "desc_work", "ideal_type", "location", "occupation", "search_tags"],
"operator": "and"
}
},
{
"multi_match": {
"query": "エンジニア",
"fields": ["offer", "catch_copy", "corporation_name", "corporation_name_furigana", "desc_work", "ideal_type", "location", "occupation", "search_tags"],
"operator": "and"
}
}
]
}
},
{
"bool": {
"must": [
{
"multi_match": {
"query": "ニューアワジ",
"fields": ["offer.ngram", "catch_copy.ngram", "corporation_name.ngram", "corporation_name_furigana.ngram", "desc_work.ngram", "ideal_type.ngram", "location.ngram", "occupation.ngram", "search_tags.ngram"],
"operator": "and",
"type": "phrase"
}
},
{
"multi_match": {
"query": "エンジニア",
"fields": ["offer.ngram", "catch_copy.ngram", "corporation_name.ngram", "corporation_name_furigana.ngram", "desc_work.ngram", "ideal_type.ngram", "location.ngram", "occupation.ngram", "search_tags.ngram"],
"operator": "and",
"type": "phrase"
}
}
]
}
}
]
}
},
"size": 10000,
"sort": [{ "_id": "asc" }],
"_source": ["_id"]
}
動的に生成
# Root query. Clauses are collected under `filter`; switch filter -> must
# when score calculation is needed.
base_query = { bool: { filter: [] } }

# [Free word] only add keyword clauses when a keyword was supplied.
set_free_word_query(base_query: base_query, keyword: search_params[:keyword]) if search_params[:keyword].present?
# Free-word search: appends one clause per search term to
# base_query[:bool][:filter].
#
# The keyword is split on whitespace ([[:space:]], which also covers the
# full-width space). Each term becomes its own filter entry, so multiple terms
# are ANDed. Within a term, either a match on
# JobOpenSearch::MULTI_MATCH_FIELDS or a phrase match on
# JobOpenSearch::MULTI_MATCH_NGRAM_FIELDS satisfies the clause.
#
# @param base_query [Hash] query hash holding an Array at [:bool][:filter]; mutated in place
# @param keyword [String, nil] raw search input; nil or blank input adds no clause
def set_free_word_query(base_query:, keyword:)
  # Splitting on a regexp keeps a leading empty string when the input starts
  # with whitespace (" foo" -> ["", "foo"]). Blank terms are dropped so they do
  # not turn into empty-query multi_match clauses.
  terms = keyword.to_s.split(/[[:space:]]+/).reject(&:empty?)

  terms.each do |term|
    analyzed_match = {
      multi_match: {
        query: term,
        fields: JobOpenSearch::MULTI_MATCH_FIELDS,
        operator: "and"
      }
    }
    ngram_phrase_match = {
      multi_match: {
        query: term,
        fields: JobOpenSearch::MULTI_MATCH_NGRAM_FIELDS, # ngram
        operator: "and",
        type: "phrase"
      }
    }

    # The nested bool/should shape of the generated clause is kept as before.
    term_condition = {
      bool: {
        should: [
          { bool: { should: [analyzed_match, ngram_phrase_match] } }
        ]
      }
    }
    base_query[:bool][:filter] << term_condition
  end
end
Discussion