Transformers Basics
Goal: get a simple grasp of the bare minimum of transformers
Model
Use the Hugging Face Hub
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" already places the weights on the available GPU(s),
# so a separate model.to("cuda") is unnecessary (and can error on a dispatched model)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
Prepare the model and tokenizer as a pair. model_name can be looked up on the Hugging Face Hub.
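If you prefer to search programmatically, a hedged sketch using huggingface_hub (the search/limit arguments are as in recent versions of the library):
from huggingface_hub import list_models
for m in list_models(search="distilgpt2", limit=5):
    print(m.id)  # the attribute may be called modelId in older huggingface_hub versions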
AutoModelForCausalLM: a model that predicts the next token
AutoModelForSequenceClassification: classification tasks
AutoModelForQuestionAnswering: question answering
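For example, a classification model is loaded the same way; a minimal sketch (the distilbert-base-uncased-finetuned-sst-2-english checkpoint is just an illustrative choice):
from transformers import AutoModelForSequenceClassification, AutoTokenizer
clf_name = "distilbert-base-uncased-finetuned-sst-2-english"
clf_tokenizer = AutoTokenizer.from_pretrained(clf_name)
clf_model = AutoModelForSequenceClassification.from_pretrained(clf_name)
inputs = clf_tokenizer("I love this movie!", return_tensors="pt")
logits = clf_model(**inputs).logits
print(clf_model.config.id2label[logits.argmax(-1).item()])  # e.g. "POSITIVE"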
Dataset
Use the Hugging Face Hub
from datasets import list_datasets, load_dataset
print(list_datasets())  # list the available datasets
dataset = load_dataset('squad', split='train')
split='validation' loads the validation split instead.
split='train[:10%]' : loads only the first 10% of the train split.
split='train[:100]+validation[:100]' : builds a split from the first 100 train examples plus the first 100 validation examples.
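A quick sanity check of these split expressions (exact counts depend on the dataset version):
from datasets import load_dataset
train_10pct = load_dataset('squad', split='train[:10%]')
mixed = load_dataset('squad', split='train[:100]+validation[:100]')
print(len(train_10pct))  # roughly 10% of the train split
print(len(mixed))        # 200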
To list every available dataset: import huggingface_hub; [dat for dat in huggingface_hub.list_datasets()]
The contents behave like a feature-rich JSON that supports NumPy-style slicing
> dataset[:2]
> {'id': ['5733be284776f41900661182', '5733be284776f4190066117f'],
'title': ['University_of_Notre_Dame', 'University_of_Notre_Dame'],
'context': ['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'],
'question': ['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
'What is in front of the Notre Dame Main Building?'],
'answers': [{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]},
{'text': ['a copper statue of Christ'], 'answer_start': [188]}]}
Some datasets, such as GLUE, also require specifying a sub-dataset (configuration)
from datasets import load_dataset
dataset = load_dataset('glue', 'sst2')
print(dataset)
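The available configurations can be listed with get_dataset_config_names:
from datasets import get_dataset_config_names
print(get_dataset_config_names('glue'))  # ['cola', 'sst2', 'mrpc', ...]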
Use local data
# pandas
import pandas as pd
from datasets import Dataset
df = pd.DataFrame({"a": [1, 2, 3]})
dataset = Dataset.from_pandas(df)
# csv
from datasets import load_dataset
dataset = load_dataset('csv', data_files='my_file.csv')
dataset = load_dataset('csv', data_files=['my_file_1.csv', 'my_file_2.csv', 'my_file_3.csv'])
dataset = load_dataset('csv', data_files={'train': ['my_train_file_1.csv', 'my_train_file_2.csv'], 'test': 'my_test_file.csv'})
# json
dataset = load_dataset('json', data_files='my_file.json')
# text
dataset = load_dataset('text', data_files={'train': ['my_text_1.txt', 'my_text_2.txt'], 'test': 'my_test_file.txt'})
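A self-contained illustration: write a tiny CSV and load it (file name and columns are arbitrary):
import csv
from datasets import load_dataset
with open('tiny.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['text', 'label'])
    writer.writerow(['great movie', 1])
    writer.writerow(['terrible movie', 0])
dataset = load_dataset('csv', data_files='tiny.csv')
print(dataset['train'][0])  # {'text': 'great movie', 'label': 1}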
Inference
# model_name= "distilgpt2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
input_text = "The market for personal computers"
# convert the prompt to a torch tensor of token ids
token_ids = tokenizer.encode(input_text, return_tensors="pt")
token_ids = token_ids.to(model.device)
# inference (temperature only applies when do_sample=True; a very small value makes sampling near-greedy)
output_ids = model.generate(token_ids, do_sample=True, temperature=0.0001, max_new_tokens=200)
# decode the generated token ids back to text
raw_pred = tokenizer.decode(output_ids[0])
The model's inputs and outputs are torch tensors of token ids
token_ids = tensor([[ 464, 1910, 329, 2614, 9061]])
# the ids can be turned back into text like this
tokenizer.decode(token_ids[0])
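A small illustration of the round trip (calling the tokenizer directly also returns an attention_mask):
enc = tokenizer(input_text, return_tensors="pt")
print(enc["input_ids"])       # the token id tensor
print(enc["attention_mask"])  # 1 for every real token
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0]))  # ids -> token strings
print(tokenizer.decode(enc["input_ids"][0]))                 # ids -> original text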
Pull out text to match the dataset's schema, e.g. dataset[:2]['title']
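For example, a hedged sketch that takes a couple of titles from the SQuAD split loaded earlier (assuming dataset still points to it) and feeds them to the model:
for title in dataset[:2]['title']:
    ids = tokenizer.encode(title, return_tensors="pt").to(model.device)
    out = model.generate(ids, max_new_tokens=20)
    print(tokenizer.decode(out[0]))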
Training
from transformers import TrainingArguments, Trainer
# the model's generation config (decoding defaults used by generate)
print(model.generation_config)
# training hyperparameters are set separately via TrainingArguments
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
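The Trainer above assumes that tokenized_squad and data_collator have already been prepared, and that model is a question-answering model (e.g. AutoModelForQuestionAnswering), not the causal LM loaded at the top. A minimal sketch of that preparation in the usual SQuAD fine-tuning style (the checkpoint, max_length=384, and the train/test split are illustrative choices):
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, DefaultDataCollator

# a QA model/tokenizer, unlike the causal LM used in the inference section
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
squad = load_dataset("squad", split="train[:5000]").train_test_split(test_size=0.2)

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_positions, end_positions = [], []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # locate the context tokens (sequence id 1) in the tokenized question/context pair
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        # if the answer is not fully inside the (possibly truncated) context, label it (0, 0)
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
data_collator = DefaultDataCollator()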
Evaluation with the Trainer
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
# load the dataset and the matching metric
dataset = load_dataset("glue", "cola")
metric = load_metric("glue", "cola")
# load the tokenizer and the model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)
encoded_dataset = dataset.map(preprocess_function, batched=True)
# configure a Trainer for evaluation only
training_args = TrainingArguments(
    output_dir="./results",
    do_train=False,  # no training
    do_eval=True,
)
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)
# run the evaluation
eval_result = trainer.evaluate()
# show the results
print(eval_result)
A metric matched to each task is provided as well; it can be computed from predictions and labels.
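For example, a metric can also be computed directly from predictions and references (the evaluate package shown here is the successor to datasets.load_metric; the values are toy data):
import evaluate
metric = evaluate.load("glue", "cola")  # CoLA is scored with Matthews correlation
print(metric.compute(predictions=[0, 1, 1, 0], references=[0, 1, 0, 0]))
# -> {'matthews_correlation': ...}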