Open6
sudachi

初期化
# Create the project directory and move into it.
mkdir py-sudachi
cd py-sudachi
# Create a virtual environment in ./venv and activate it for the current shell.
python3 -m venv venv
source venv/bin/activate
# Upgrade pip inside the activated environment.
pip install --upgrade pip
# Create an empty main.py.
touch main.py

sudachiインストール
# Install the SudachiPy package.
pip install sudachipy
# Install the sudachidict_small dictionary package.
pip install sudachidict_small

ユーザー辞書込みでの形態素解析
from sudachipy import tokenizer
from sudachipy import dictionary
import json
# Sudachi settings serialized to a JSON string; "user" presumably lists the
# user dictionary files to load (here ./user_dict.dic).
config_json = json.dumps({"user": ["./user_dict.dic"]})
# Build the tokenizer from the 'small' dictionary together with the JSON settings.
tokenizer_obj = dictionary.Dictionary(
    config=config_json,
    dict='small',
).create()
# Split mode handed to tokenize().
# NOTE(review): the original comment said "use longest match", but per the
# Sudachi documentation SplitMode.A is the shortest split unit and C the
# longest -- confirm which one is intended.
mode = tokenizer.Tokenizer.SplitMode.A
def extract_noun(sentence: str) -> list:
    """Return the dictionary forms of the nouns found in ``sentence``.

    The text is stripped of leading/trailing whitespace and tokenized with
    the module-level ``tokenizer_obj`` using the module-level split ``mode``.
    Every token whose first part-of-speech field equals '名詞' (noun)
    contributes its dictionary form, in order of appearance.

    Args:
        sentence: Text to analyze.

    Returns:
        A list with one dictionary form per noun token; empty when the text
        contains no nouns.
    """
    morphemes = tokenizer_obj.tokenize(sentence.strip(), mode)
    return [
        morpheme.dictionary_form()
        for morpheme in morphemes
        if morpheme.part_of_speech()[0] == '名詞'
    ]
# Sample input: one line of text wrapped in newlines (extract_noun strips them).
sentence = "\nこれはテストです\n"
# Extract the nouns and show them.
result = extract_noun(sentence)
print(result)

ユーザー辞書の作り方
- csvファイルを作成(例: akane_and_rei.csv。下のコマンドの入力として使用)
アカネ,5146,5146,8000,アカネ,名詞,固有名詞,人名,名,*,*,アカネ,アカネ,*,*,*,*,*
レイ,5146,5146,8000,レイ,名詞,固有名詞,人名,名,*,*,レイ,レイ,*,*,*,*,*
- shコマンドでcsvから辞書を作成
# Build user_dict.dic (-o) from akane_and_rei.csv, using the system dictionary
# shipped with sudachidict_small (-s).
# NOTE: the -s path contains "python3.10"; adjust it to the Python version of your venv.
sudachipy ubuild -o user_dict.dic -s ./venv/lib/python3.10/site-packages/sudachidict_small/resources/system.dic ./akane_and_rei.csv

csvファイル作成スクリプト
import csv
def create_user_dict_csv(words, filename):
    """Write one user-dictionary CSV row per word to ``filename``.

    Each row has 18 columns: the word, the constants 5146, 5146 and 8000
    (presumably left-id, right-id and cost in Sudachi's user-dictionary
    format -- confirm against the Sudachi docs), the word again, six
    part-of-speech fields ("名詞", "固有名詞" and four "*"), the word twice
    more, and five trailing "*" placeholders.

    Args:
        words: Iterable of surface strings to register.
        filename: Path of the CSV file to create; an existing file is
            overwritten. The file is written as UTF-8.
    """
    part_of_speech = ["名詞", "固有名詞", "*", "*", "*", "*"]
    placeholders = ["*"] * 5
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", encoding="utf-8", newline="") as csv_file:
        csv.writer(csv_file).writerows(
            [surface, 5146, 5146, 8000, surface,
             *part_of_speech, surface, surface, *placeholders]
            for surface in words
        )

python内からshを実行
import subprocess
def run_sudachipy_ubuild(
    userDictFileName,
    csvFileName,
    systemDictPath='./venv/lib/python3.10/site-packages/sudachidict_small/resources/system.dic',
):
    """Build a Sudachi user dictionary by running ``sudachipy ubuild``.

    Args:
        userDictFileName: Value passed to ``-o`` (the dictionary file to write).
        csvFileName: Source CSV, passed as the positional argument.
        systemDictPath: Value passed to ``-s``. The default is the
            sudachidict_small system dictionary inside a ``./venv`` created
            with Python 3.10; pass another path for a different Python
            version or environment layout.

    Returns:
        None. Prints 'Success:' followed by the command's stdout when the
        exit status is 0, otherwise 'Error:' followed by its stderr.
    """
    # Argument list with no shell involved, so file names are passed verbatim.
    cmd = [
        'sudachipy', 'ubuild',
        '-o', userDictFileName,
        '-s', systemDictPath,
        csvFileName,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print('Success:', result.stdout)
    else:
        print('Error:', result.stderr)
ログインするとコメントできます