💡

Elasticsearch(OpenSearch)の日本語の全文検索設定【雑メモ】

2024/02/12に公開

概要

  • OpenSearch2.11
  • opensearch-ruby

client生成

  # Returns the memoized OpenSearch client.
  #
  # * Rails development environment: a plain OpenSearch::Client pointed at
  #   ENV["OPEN_SEARCH_HOST"].
  # * Any other environment: an OpenSearch::Aws::Sigv4Client that signs
  #   requests with the OPEN_SEARCH_ACCESS_KEY / OPEN_SEARCH_SECRET_ACCESS_KEY
  #   credentials.
  #
  # The SigV4 signer is only constructed when the client has not been built
  # yet, instead of on every call.
  #
  # NOTE(review): the class variable @@client is kept as-is because other code
  # may reference it directly — confirm before migrating to a class-level
  # instance variable.
  def self.client
    @@client ||=
      if Rails.env.development?
        OpenSearch::Client.new(
          url: ENV["OPEN_SEARCH_HOST"],
          retry_on_failure: 5,
          request_timeout: 120,
          log: false
        )
      else
        signer = Aws::Sigv4::Signer.new(
          service: 'es',
          region: 'ap-northeast-1',
          access_key_id: ENV["OPEN_SEARCH_ACCESS_KEY"],
          secret_access_key: ENV["OPEN_SEARCH_SECRET_ACCESS_KEY"]
        )

        OpenSearch::Aws::Sigv4Client.new(
          {
            host: ENV["OPEN_SEARCH_HOST"],
            log: true
          },
          signer
        )
      end
  end

index生成

  # Recreates the search index with the Japanese full-text analysis settings.
  #
  # If INDEX_NAME already exists it is deleted first, so this is destructive.
  # Two custom analyzers are defined:
  #   * kuromoji_analyzer - morphological analysis (ja_kuromoji_tokenizer)
  #   * ja_ngram_analyzer - 2..3 character n-grams (ja_ngram_tokenizer)
  #
  # NOTE(review): the kuromoji_* and icu_normalizer components presumably need
  # the analysis-kuromoji / analysis-icu plugins on the cluster — confirm.
  def self.create_index
    client.indices.delete(index: INDEX_NAME) if client.indices.exists?(index: INDEX_NAME)
    client.indices.create(
      index: INDEX_NAME,
      body: {
        settings: {
          number_of_shards:   1,
          number_of_replicas: 1,
          analysis: {
            filter: {
              # Defined but currently not referenced by any analyzer below.
              kuromoji_multiplexer: {
                type: 'multiplexer',
                filters: [
                  'lowercase', # convert to lowercase (AbC → abc)
                  # 'lowercase,porter_stem', # stem to the base form (running → run)
                  # kuromoji_readingform: 寿司 → "寿司" and "スシ". When the reading is
                  # guessed wrong the results are poor, and matching only works in
                  # katakana (明かるい → メイ・カルイ; 飼う and 買う both hit).
                  # 'kuromoji_readingform'
                ]
              },
              # Removes the trailing prolonged sound mark "ー" (プリンター → プリンタ).
              # Options such as minimum_length are configurable.
              # https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-stemmer.html
              my_katakana_stemmer: {
                type: 'kuromoji_stemmer',
                minimum_length: 4
              }
            },
            # Normalizes common spelling variants. Caveat: this rewrites the text
            # that gets indexed, so using it as-is is not ideal; registering the
            # variants in a dictionary would be preferable.
            char_filter: {
              my_char_filter: {
                type: 'mapping',
                mappings: [
                  'ヴァ=>バ',
                  'ヴィ=>ビ',
                  'ヴゥ=>ブ',
                  'ヴェ=>ベ',
                  'ヴォ=>ボ',
                ]
              }
            },
            tokenizer: {
              ja_kuromoji_tokenizer: {
                mode: "search",
                discard_compound_token: true, # do not emit the original compound token
                type: "kuromoji_tokenizer"
              },
              ja_ngram_tokenizer: {
                type: "ngram",
                min_gram: 2,
                max_gram: 3,
                token_chars: [
                  "letter",
                  "digit"
                ]
              }
            },
            # Not configured here: user_dictionary_rules, synonyms
            analyzer: {
              kuromoji_analyzer: {
                type: 'custom',
                tokenizer: 'ja_kuromoji_tokenizer',
                # tokenizer: 'kuromoji_tokenizer',
                filter: [
                  "kuromoji_baseform",
                  # 寿司がおいしいね → keeps only 寿司 and おいしい, dropping the
                  # particles が and ね
                  'kuromoji_part_of_speech',
                  'cjk_width', # normalize character width
                  "ja_stop",
                  'my_katakana_stemmer',
                  'lowercase',
                  # 'kuromoji_multiplexer',
                  # 'ngram_filter'
                  # 'kuromoji_number', # kanji numerals 一〇〇〇 → 1000 (note: had no visible effect)
                ],
                # Half-width / full-width normalization (㌀ → アパート)
                char_filter: [
                  'icu_normalizer', # character normalization
                  "html_strip", # strip HTML
                  'my_char_filter'
                ]
              },
              ja_ngram_analyzer: {
                type: "custom",
                char_filter: [
                  'icu_normalizer', # character normalization
                  "html_strip", # strip HTML
                  'my_char_filter'
                ],
                tokenizer: "ja_ngram_tokenizer",
                filter: [
                  "lowercase"
                ]
              }
            }
          }
        }
      }
    )
  end

mapping

  • kuromoji解析とngram解析のフィールドを用意
    # Registers the field mappings on INDEX_NAME. Text fields are analyzed with
    # kuromoji_analyzer and additionally expose an `ngram` sub-field analyzed
    # with ja_ngram_analyzer.
    client.indices.put_mapping(
      index: INDEX_NAME,
      body: {
        properties: {
          # Employment category
          employment: {
            type: 'nested',
            properties: {
              code: { type: 'keyword' }, # key2
              type: { type: 'keyword' }, # e.g. full-time, contract, part-time
              year: { type: 'keyword' }  # target fiscal year
            }
          },
          corporation_name: {
            type: 'text', analyzer: 'kuromoji_analyzer',
            fields: {
              ngram: { type: 'text', analyzer: 'ja_ngram_analyzer' }
            }
          },
          # (remaining fields omitted)
        }
      }
    )

検索クエリ

  • or検索でkuromojiフィールドとngramフィールドを繋ぐ
{
  "query": {
    "bool": {
      "should": [
        {
          "bool": {
            "must": [
              {
                "multi_match": {
                  "query": "ニューアワジ",
                  "fields": ["offer", "catch_copy", "corporation_name", "corporation_name_furigana", "desc_work", "ideal_type", "location", "occupation", "search_tags"],
                  "operator": "and"
                }
              },
              {
                "multi_match": {
                  "query": "エンジニア",
                  "fields": ["offer", "catch_copy", "corporation_name", "corporation_name_furigana", "desc_work", "ideal_type", "location", "occupation", "search_tags"],
                  "operator": "and"
                }
              }
            ]
          }
        },
        {
          "bool": {
            "must": [
              {
                "multi_match": {
                  "query": "ニューアワジ",
                  "fields": ["offer.ngram", "catch_copy.ngram", "corporation_name.ngram", "corporation_name_furigana.ngram", "desc_work.ngram", "ideal_type.ngram", "location.ngram", "occupation.ngram", "search_tags.ngram"],
                  "operator": "and",
                  "type": "phrase"
                }
              },
              {
                "multi_match": {
                  "query": "エンジニア",
                  "fields": ["offer.ngram", "catch_copy.ngram", "corporation_name.ngram", "corporation_name_furigana.ngram", "desc_work.ngram", "ideal_type.ngram", "location.ngram", "occupation.ngram", "search_tags.ngram"],
                  "operator": "and",
                  "type": "phrase"
                }
              }
            ]
          }
        }
      ]
    }
  },
  "size": 10000,
  "sort": [{ "_id": "asc" }],
  "_source": ["_id"]
}

動的に生成

      # Root query. Conditions are collected under `filter`; switch
      # filter → must when relevance scoring is needed.
      base_query = {
        bool: {
          filter: []
        }
      }

      # [Free-word search] only added when a keyword was supplied.
      set_free_word_query(base_query: base_query, keyword: search_params[:keyword]) if search_params[:keyword].present?
    # Free-word search: appends one filter condition per search term to
    # base_query[:bool][:filter] (mutating base_query in place).
    #
    # The keyword string is split on runs of whitespace (POSIX [[:space:]],
    # which for UTF-8 strings also covers the full-width space). Each term must
    # match either the kuromoji-analyzed fields or, as a phrase, the n-gram
    # fields; several terms are AND-ed because each becomes its own filter entry.
    #
    # Empty terms are discarded: splitting a string that starts with whitespace
    # yields a leading "", which would otherwise add an empty-query clause.
    #
    # @param base_query [Hash] query hash with an Array at [:bool][:filter]
    # @param keyword [String] raw user input, possibly containing several terms
    def set_free_word_query(base_query:, keyword:)
      terms = keyword.split(/[[:space:]]+/).reject(&:empty?)

      terms.each do |term|
        # NOTE(review): the outer bool/should wrapper is redundant, but it is
        # kept so the emitted query shape stays unchanged.
        base_query[:bool][:filter] << {
          bool: {
            should: [
              {
                bool: {
                  should: [
                    {
                      multi_match: {
                        query: term,
                        fields: JobOpenSearch::MULTI_MATCH_FIELDS,
                        operator: "and"
                      }
                    },
                    {
                      multi_match: {
                        query: term,
                        fields: JobOpenSearch::MULTI_MATCH_NGRAM_FIELDS, # ngram
                        operator: "and",
                        type: "phrase"
                      }
                    }
                  ]
                }
              }
            ]
          }
        }
      end
    end

Discussion