Text Classification with EmbeddingGemma-300m (MLP)

Introduction

Google has released EmbeddingGemma-300m, so I gave it a try.
In Text Classification with EmbeddingGemma-300m (LoRA), the accuracy was 95.7%.

This time I extract embeddings first and then train an MLP classifier on top of them.
The dataset is again the livedoor news corpus.
The result this time was accuracy = 80%.
That is considerably worse even than logistic regression on EmbeddingGemma-300m embeddings.


Data preparation

Download the livedoor news corpus

%%capture
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar xvf ldcc-20140209.tar.gz

Load the texts by genre

import os
import glob

livedoor_news = {}
for folder in glob.glob("text/*"):
  if os.path.isdir(folder):
    texts = []
    for txt in glob.glob(os.path.join(folder, "*.txt")):
      # Each genre folder also contains a LICENSE.txt; skip it.
      if os.path.basename(txt) == "LICENSE.txt":
        continue
      with open(txt, "r") as f:
        lines = f.readlines()
        # The first three lines are the article URL, date, and title; keep only the body.
        texts.append('\n'.join([line.strip() for line in lines[3:]]))

    label = os.path.basename(folder)
    livedoor_news[label] = texts
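
As a quick sanity check (not in the original code), the per-genre document counts can be printed:

for label, texts in livedoor_news.items():
    print(f"{label}: {len(texts)} documents")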

Create the training and test datasets

from datasets import Dataset, concatenate_datasets

classes = list(livedoor_news.keys())

train_dataset = None
val_dataset = None

for label, texts in livedoor_news.items():
  data = []
  for text in texts:
    data.append({"text": text, "label": label, "labels": classes.index(label)})
  dataset = Dataset.from_list(data)
  tmp_train = dataset.train_test_split(test_size=0.25, shuffle=True, seed=0)
  if train_dataset is None:
    train_dataset = tmp_train["train"]
    val_dataset = tmp_train["test"]
  else:
    train_dataset = concatenate_datasets([train_dataset, tmp_train["train"]])
    val_dataset = concatenate_datasets([val_dataset, tmp_train["test"]])
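
A quick check of the resulting split sizes (again, not part of the original):

print(f"train: {len(train_dataset)}  val: {len(val_dataset)}")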

Create the embedding model

from transformers import AutoTokenizer, AutoModel
import torch

model_name = "google/embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only; the embedding model is not fine-tuned

Generate embeddings

@torch.no_grad()
def compute_embedding(batch):
    inputs = tokenizer(batch["text"], truncation=True, padding=True, max_length=2048, return_tensors="pt").to(device)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu()  # mean-pool over all token positions (padding included)
    batch["embeddings"] = [e.numpy() for e in embeddings]
    return batch

train_ds = train_dataset.map(compute_embedding, batched=True, batch_size=64)
val_ds = val_dataset.map(compute_embedding, batched=True, batch_size=64)
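
Note that compute_embedding above averages over every token position, padding included. A common alternative is to mask out padding before averaging; the sketch below is my own variant (compute_masked_embedding is not used for the 80% result reported here):

@torch.no_grad()
def compute_masked_embedding(batch):
    # Hypothetical variant: mean-pool only over non-padding tokens using the attention mask.
    inputs = tokenizer(batch["text"], truncation=True, padding=True, max_length=2048, return_tensors="pt").to(device)
    outputs = model(**inputs)
    mask = inputs["attention_mask"].unsqueeze(-1).float()    # (batch, seq_len, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)   # sum of real-token vectors
    counts = mask.sum(dim=1).clamp(min=1)                    # number of real tokens per example
    embeddings = (summed / counts).cpu()
    batch["embeddings"] = [e.numpy() for e in embeddings]
    return batch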

Create the DataLoaders

from torch.utils.data import DataLoader

def collate_fn(batch):
    embeddings = torch.tensor([b["embeddings"] for b in batch], dtype=torch.float)
    labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
    return {"embeddings": embeddings, "labels": labels}

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(val_ds, batch_size=64, collate_fn=collate_fn)

Training

Define the MLP classifier

import torch.nn as nn

class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_labels):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_labels)
        )

    def forward(self, x):
        return self.model(x)

embedding_dim = model.config.hidden_size
print(f"EmbeddingGemma hidden size: {embedding_dim}")

classifier = MLPClassifier(embedding_dim, hidden_dim=256, num_labels=len(classes)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)

Train!

from tqdm import tqdm

epochs = 20
for epoch in range(epochs):
    classifier.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        embeddings = batch["embeddings"].to(device)
        labels = batch["labels"].to(device)
        logits = classifier(embeddings)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

Inference

from sklearn.metrics import classification_report

classifier.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        embeddings = batch["embeddings"].to(device)
        labels = batch["labels"].cpu().numpy()
        logits = classifier(embeddings)
        preds = logits.argmax(dim=1).cpu().numpy()
        y_true.extend(labels)
        y_pred.extend(preds)

with open('predict.txt', 'w') as f:
    for t, p in zip(y_true, y_pred):
      f.write(f"{classes[t]},{classes[p]}\n")

Aggregating the results

import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

results = pd.read_csv('predict.txt', header=None)

figure, ax1 = plt.subplots()

confusion_df = pd.crosstab(results[0], results[1], rownames=['Actual'], colnames=['Predicted'], normalize='index')
sn.heatmap(confusion_df, annot=True, cmap="YlGnBu", ax=ax1, cbar=False)

from sklearn.metrics import classification_report

report = classification_report(y_true=results[0], y_pred=results[1], output_dict=True)
pd.DataFrame(report).T
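
For reference, the logistic-regression comparison mentioned in the introduction can be run on the same precomputed embeddings with scikit-learn. This is my own sketch of such a baseline (the max_iter value is my choice), not necessarily the exact setup of the earlier post:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Reuse the embeddings computed above as plain feature matrices.
X_train = np.array(train_ds["embeddings"])
y_train = np.array(train_ds["labels"])
X_val = np.array(val_ds["embeddings"])
y_val = np.array(val_ds["labels"])

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("logistic regression accuracy:", accuracy_score(y_val, clf.predict(X_val)))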
