Text Classification with EmbeddingGemma-300m (LIBLINEAR)
Introduction
Google has released EmbeddingGemma-300m, so I tried it out.
In "Text Classification with EmbeddingGemma-300m (LoRA)", the accuracy was 95.7%.
This time I extract the embeddings first and then train LIBLINEAR on them.
The dataset is the livedoor news corpus, same as before.
The result this time was accuracy = 76.6%.
That is worse even than MLP classification on EmbeddingGemma-300m embeddings.
Installation
%%capture
!pip install liblinear-official
Data preparation
Download the livedoor news corpus
%%capture
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar xvf ldcc-20140209.tar.gz
Load the articles by genre
import os
import glob

livedoor_news = {}
for folder in glob.glob("text/*"):
    if os.path.isdir(folder):
        texts = []
        for txt in glob.glob(os.path.join(folder, "*.txt")):
            with open(txt, "r") as f:
                lines = f.readlines()
            # Skip the first three lines (URL, date, title) and keep the article body
            texts.append('\n'.join([line.strip() for line in lines[3:]]))
        label = os.path.basename(folder)
        livedoor_news[label] = texts
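As a quick sanity check (my addition, not in the original), print each genre and its article count; the corpus has 9 genres:

# Show each genre and how many articles were loaded (9 genres expected)
for label, texts in livedoor_news.items():
    print(label, len(texts))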
Create the training and test datasets
from datasets import Dataset, concatenate_datasets

classes = list(livedoor_news.keys())
train_dataset = None
val_dataset = None
for label, texts in livedoor_news.items():
    data = []
    for text in texts:
        data.append({"text": text, "label": label, "labels": classes.index(label)})
    dataset = Dataset.from_list(data)
    tmp_train = dataset.train_test_split(test_size=0.25, shuffle=True, seed=0)
    if train_dataset is None:
        train_dataset = tmp_train["train"]
        val_dataset = tmp_train["test"]
    else:
        train_dataset = concatenate_datasets([train_dataset, tmp_train["train"]])
        val_dataset = concatenate_datasets([val_dataset, tmp_train["test"]])
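Since each genre is split 75/25 independently, class proportions are preserved in both splits (an effectively stratified split). A quick size check (my addition; the full corpus has 7,367 articles):

# Expect roughly a 75/25 split of the 7,367 articles
print(len(train_dataset), len(val_dataset))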
Create the embedding model
from transformers import AutoTokenizer, AutoModel
import torch
model_name = "google/embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
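Per the model card, EmbeddingGemma-300m produces 768-dimensional embeddings; this can be confirmed from the loaded config (my addition):

# hidden_size is the width of last_hidden_state, i.e. the embedding dimension
print(model.config.hidden_size)  # expected: 768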
Generate the embeddings
@torch.no_grad()
def compute_embedding(batch):
    inputs = tokenizer(batch["text"], truncation=True, padding=True, max_length=2048, return_tensors="pt").to(device)
    outputs = model(**inputs)
    # Mean pooling over the sequence dimension (note: this also averages over padding positions)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu()
    batch["embeddings"] = [e.numpy() for e in embeddings]
    return batch

train_ds = train_dataset.map(compute_embedding, batched=True, batch_size=64)
val_ds = val_dataset.map(compute_embedding, batched=True, batch_size=64)
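Because last_hidden_state.mean(dim=1) averages padding tokens into the embedding, a mask-aware mean pooling is the usual alternative. A sketch of that variant (my addition; not what produced the 76.6% above):

@torch.no_grad()
def compute_embedding_masked(batch):
    inputs = tokenizer(batch["text"], truncation=True, padding=True, max_length=2048, return_tensors="pt").to(device)
    outputs = model(**inputs)
    mask = inputs["attention_mask"].unsqueeze(-1)            # (batch, seq_len, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)   # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1)                    # true token counts per sequence
    batch["embeddings"] = [e.numpy() for e in (summed / counts).cpu()]
    return batch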
Training
Train!
import numpy as np
from liblinear.liblinearutil import train, predict, problem, parameter

y_train = np.array(train_ds['labels'])
x_train = np.array(train_ds['embeddings'])
prob = problem(y_train, x_train)
# -s 0: L2-regularized logistic regression (primal), -c: cost, -B 1: add a bias term
param = parameter('-s 0 -c 1.0 -B 1')
clf = train(prob, param)  # named clf to avoid shadowing the embedding model above
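The cost parameter C is simply fixed at 1.0 above. As a sketch (my addition), LIBLINEAR's built-in cross validation could be used to pick it; when '-v' is passed, train() returns the CV accuracy instead of a model:

# Hypothetical 5-fold CV sweep over the cost parameter C
for c in (0.01, 0.1, 1.0, 10.0):
    cv_acc = train(y_train, x_train, f'-s 0 -c {c} -B 1 -v 5 -q')
    print(c, cv_acc)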
Inference
y_val = np.array(val_ds['labels'])
x_val = np.array(val_ds['embeddings'])
p_label, p_acc, p_vals = predict(y_val, x_val, clf, '-b 0')
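p_acc is a tuple of (accuracy, mean squared error, squared correlation coefficient); for classification only the first entry matters:

acc, mse, scc = p_acc
print(f"accuracy = {acc:.1f}%")  # 76.6% in this run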
Evaluation
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

t = [classes[i] for i in y_val]          # true labels as genre names
p = [classes[int(i)] for i in p_label]   # predicted labels as genre names
figure, ax1 = plt.subplots()
confusion_df = pd.crosstab(t, p, rownames=['Actual'], colnames=['Predicted'], normalize='index')
sns.heatmap(confusion_df, annot=True, cmap="YlGnBu", ax=ax1, cbar=False)

from sklearn.metrics import classification_report
# Pass labels=classes so the report rows line up with target_names; otherwise
# sklearn sorts the string labels alphabetically, which may not match `classes`
report = classification_report(y_pred=p, y_true=t, labels=classes, target_names=classes, output_dict=True)
pd.DataFrame(report).T
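If the classifier is to be reused, liblinearutil provides save_model/load_model (the filename below is my own, hypothetical):

from liblinear.liblinearutil import save_model, load_model

save_model('embeddinggemma_liblinear.model', clf)   # hypothetical filename
clf_loaded = load_model('embeddinggemma_liblinear.model')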
