Closed2021/02/26にクローズ3

分類問題のためのテキストの前処理クラス実装

yagiyuki 2021/02/13に更新

分類問題を解くためのテキストデータの前処理を実装する。

yagiyuki 2021/02/13に更新

最終形(暫定)

from unicodedata import normalize
import string
from sudachipy import tokenizer
from sudachipy import dictionary

from sklearn.feature_extraction.text import CountVectorizer

#
# 前処理
#
class TextPreprocessing:
    """Text preprocessing pipeline for classification tasks.

    Cleans raw text, tokenizes it with Sudachi, keeps only nouns, and
    turns the resulting space-separated strings into a bag-of-words
    matrix through a ``CountVectorizer``.
    """

    def __init__(self, max_features=None):
        """Set up the tokenizer, the noise table and the vectorizer.

        Args:
            max_features: Forwarded to ``CountVectorizer(max_features=...)``.
        """
        self.tokenizer_obj = dictionary.Dictionary().create()
        self.mode = tokenizer.Tokenizer.SplitMode.C

        # Characters treated as noise. They are NFKC-normalized first so
        # the table matches the text that `_preprocess` has normalized.
        noise_chars = normalize(
            'NFKC', string.punctuation + '。、×÷ 【】『』 「」”“')
        # Every noise character is replaced by a single space.
        self.noises = str.maketrans(dict.fromkeys(noise_chars, ' '))

        self.vectorizer = CountVectorizer(
            max_features=max_features, vocabulary=None, stop_words=[])

    def _preprocess(self, text):
        """Normalize and tokenize ``text``, keeping only nouns.

        Args:
            text: Raw input string.

        Returns:
            The surface forms of all noun morphemes, joined by single spaces.
        """
        # Unicode normalization, lowercasing, then noise removal.
        cleaned = normalize('NFKC', text).lower()
        cleaned = cleaned.translate(self.noises).strip()

        # Tokenize and keep morphemes whose top-level POS tag is noun.
        nouns = [
            morpheme.surface()
            for morpheme in self.tokenizer_obj.tokenize(cleaned, self.mode)
            if morpheme.part_of_speech()[0] == '名詞'
        ]
        return " ".join(nouns)

    def get_vectorizer(self, text_series, mode='train'):
        """Preprocess ``text_series`` and vectorize it.

        Args:
            text_series: Object exposing ``.map(func)`` (e.g. a pandas
                Series of strings — TODO confirm against callers).
            mode: ``'train'`` fits the vectorizer and transforms; any
                other value only transforms with the already-fitted
                vectorizer.

        Returns:
            A ``(bag, preprocessed_series)`` tuple.
        """
        processed = text_series.map(self._preprocess)
        vectorize = (
            self.vectorizer.fit_transform
            if mode == 'train'
            else self.vectorizer.transform
        )
        return vectorize(processed), processed
# Usage example: build the preprocessor and run it on a sample headline.
tp = TextPreprocessing()
tp._preprocess("【Sports Watch】ノムさん、斎藤佑樹のニックネームを考案!?") # output reported by the author: sports watch ノム 斎藤 佑樹 ニックネーム 考案

yagiyuki 2021/02/13

最低限の前処理

形態素解析(sudachi)
Bag of Words でのベクトル変換

from unicodedata import normalize
from sudachipy import tokenizer
from sudachipy import dictionary
import string

from sklearn.feature_extraction.text import CountVectorizer

#
# 前処理
#
class TextPreprocessing:
    """Minimal text preprocessing: Sudachi tokenization plus bag-of-words."""

    def __init__(self):
        """Create the Sudachi tokenizer and a default ``CountVectorizer``."""
        self.tokenizer_obj = dictionary.Dictionary().create()
        self.mode = tokenizer.Tokenizer.SplitMode.C
        self.vectorizer = CountVectorizer()

    def _preprocess(self, text):
        """Tokenize ``text`` and return its surface forms joined by spaces.

        Args:
            text: Raw input string.

        Returns:
            All morpheme surface forms, separated by single spaces.
        """
        morphemes = self.tokenizer_obj.tokenize(text, self.mode)
        return " ".join(morpheme.surface() for morpheme in morphemes)

    def get_vectorizer(self, text_series, mode='train'):
        """Preprocess ``text_series`` and vectorize it.

        Args:
            text_series: Object exposing ``.map(func)`` (e.g. a pandas
                Series of strings — TODO confirm against callers).
            mode: ``'train'`` builds the vocabulary and the document
                matrix; any other value builds only the matrix, reusing
                the vocabulary learned during training.

        Returns:
            A ``(bag, preprocessed_series)`` tuple.
        """
        tokenized = text_series.map(self._preprocess)
        vectorize = (
            self.vectorizer.fit_transform
            if mode == 'train'
            else self.vectorizer.transform
        )
        return vectorize(tokenized), tokenized

# Usage example: tokenize a sample headline; this version keeps every
# morpheme's surface form and joins them with single spaces.
tp = TextPreprocessing()
tp._preprocess("【Sports Watch】ノムさん、斎藤佑樹のニックネームを考案!?")

このスクラップは2021/02/26にクローズされました