Transformers Basics
Goal: get a simple grasp of the bare minimum of transformers
Model
Use the Hugging Face Hub
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" already places the weights on the available GPU(s),
# so a separate model.to("cuda") is unnecessary (and can error on a dispatched model)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
Prepare the model and tokenizer as a pair. model_name can be looked up on the Hugging Face Hub.
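If you prefer to search programmatically, a hedged sketch using huggingface_hub (the search/limit arguments are as in recent versions of the library):
from huggingface_hub import list_models
for m in list_models(search="distilgpt2", limit=5):
    print(m.id)  # the attribute may be called modelId in older huggingface_hub versions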
AutoModelForCausalLM: a model that predicts the next token
AutoModelForSequenceClassification: classification tasks
AutoModelForQuestionAnswering: question answering
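For example, a classification model is loaded the same way; a minimal sketch (the distilbert-base-uncased-finetuned-sst-2-english checkpoint is just an illustrative choice):
from transformers import AutoModelForSequenceClassification, AutoTokenizer
clf_name = "distilbert-base-uncased-finetuned-sst-2-english"
clf_tokenizer = AutoTokenizer.from_pretrained(clf_name)
clf_model = AutoModelForSequenceClassification.from_pretrained(clf_name)
inputs = clf_tokenizer("I love this movie!", return_tensors="pt")
logits = clf_model(**inputs).logits
print(clf_model.config.id2label[logits.argmax(-1).item()])  # e.g. "POSITIVE"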
Dataset
Use the Hugging Face Hub
from datasets import list_datasets, load_dataset
print(list_datasets())  # list the available datasets
dataset = load_dataset('squad', split='train')
split='validation' loads the validation split instead.
split='train[:10%]' : loads only the first 10% of the train split.
split='train[:100]+validation[:100]' : builds a split from the first 100 train examples plus the first 100 validation examples.
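A quick sanity check of these split expressions (exact counts depend on the dataset version):
from datasets import load_dataset
train_10pct = load_dataset('squad', split='train[:10%]')
mixed = load_dataset('squad', split='train[:100]+validation[:100]')
print(len(train_10pct))  # roughly 10% of the train split
print(len(mixed))        # 200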
To list every available dataset: import huggingface_hub; [dat for dat in huggingface_hub.list_datasets()]
The contents behave like a feature-rich JSON that supports NumPy-style slicing
> dataset[:2]
> {'id': ['5733be284776f41900661182', '5733be284776f4190066117f'],
'title': ['University_of_Notre_Dame', 'University_of_Notre_Dame'],
'context': ['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'],
'question': ['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
'What is in front of the Notre Dame Main Building?'],
'answers': [{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]},
{'text': ['a copper statue of Christ'], 'answer_start': [188]}]}
Some datasets, such as GLUE, also require specifying a sub-dataset (configuration)
from datasets import load_dataset
dataset = load_dataset('glue', 'sst2')
print(dataset)
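The available configurations can be listed with get_dataset_config_names:
from datasets import get_dataset_config_names
print(get_dataset_config_names('glue'))  # ['cola', 'sst2', 'mrpc', ...]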
Use local data
# pandas
import pandas as pd
from datasets import Dataset
df = pd.DataFrame({"a": [1, 2, 3]})
dataset = Dataset.from_pandas(df)
# csv
from datasets import load_dataset
dataset = load_dataset('csv', data_files='my_file.csv')
dataset = load_dataset('csv', data_files=['my_file_1.csv', 'my_file_2.csv', 'my_file_3.csv'])
dataset = load_dataset('csv', data_files={'train': ['my_train_file_1.csv', 'my_train_file_2.csv'], 'test': 'my_test_file.csv'})
# json
dataset = load_dataset('json', data_files='my_file.json')
# text
dataset = load_dataset('text', data_files={'train': ['my_text_1.txt', 'my_text_2.txt'], 'test': 'my_test_file.txt'})
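A self-contained illustration: write a tiny CSV and load it (file name and columns are arbitrary):
import csv
from datasets import load_dataset
with open('tiny.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['text', 'label'])
    writer.writerow(['great movie', 1])
    writer.writerow(['terrible movie', 0])
dataset = load_dataset('csv', data_files='tiny.csv')
print(dataset['train'][0])  # {'text': 'great movie', 'label': 1}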
Inference
# model_name= "distilgpt2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
input_text = "The market for personal computers"
# convert the prompt to a torch tensor of token ids
token_ids = tokenizer.encode(input_text, return_tensors="pt")
token_ids = token_ids.to(model.device)
# inference (temperature only applies when do_sample=True; a very small value makes sampling near-greedy)
output_ids = model.generate(token_ids, do_sample=True, temperature=0.0001, max_new_tokens=200)
# decode the generated token ids back to text
raw_pred = tokenizer.decode(output_ids[0])
The model's inputs and outputs are torch tensors of token ids
token_ids = tensor([[ 464, 1910, 329, 2614, 9061]])
# the ids can be turned back into text like this
tokenizer.decode(token_ids[0])
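A small illustration of the round trip (calling the tokenizer directly also returns an attention_mask):
enc = tokenizer(input_text, return_tensors="pt")
print(enc["input_ids"])       # the token id tensor
print(enc["attention_mask"])  # 1 for every real token
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0]))  # ids -> token strings
print(tokenizer.decode(enc["input_ids"][0]))                 # ids -> original text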
Pull out text to match the dataset's schema, e.g. dataset[:2]['title']
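For example, a hedged sketch that takes a couple of titles from the SQuAD split loaded earlier (assuming dataset still points to it) and feeds them to the model:
for title in dataset[:2]['title']:
    ids = tokenizer.encode(title, return_tensors="pt").to(model.device)
    out = model.generate(ids, max_new_tokens=20)
    print(tokenizer.decode(out[0]))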
Training
from transformers import TrainingArguments, Trainer
# the model's generation config (decoding defaults used by generate)
print(model.generation_config)
# training hyperparameters are set separately via TrainingArguments
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
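The Trainer above assumes that tokenized_squad and data_collator have already been prepared, and that model is a question-answering model (e.g. AutoModelForQuestionAnswering), not the causal LM loaded at the top. A minimal sketch of that preparation in the usual SQuAD fine-tuning style (the checkpoint, max_length=384, and the train/test split are illustrative choices):
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, DefaultDataCollator

# a QA model/tokenizer, unlike the causal LM used in the inference section
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
squad = load_dataset("squad", split="train[:5000]").train_test_split(test_size=0.2)

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_positions, end_positions = [], []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # locate the context tokens (sequence id 1) in the tokenized question/context pair
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        # if the answer is not fully inside the (possibly truncated) context, label it (0, 0)
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
data_collator = DefaultDataCollator()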
Evaluation with the Trainer
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
# load the dataset and the matching metric
dataset = load_dataset("glue", "cola")
metric = load_metric("glue", "cola")
# load the tokenizer and the model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)
encoded_dataset = dataset.map(preprocess_function, batched=True)
# configure a Trainer for evaluation only
training_args = TrainingArguments(
    output_dir="./results",
    do_train=False,  # no training
    do_eval=True,
)
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)
# run the evaluation
eval_result = trainer.evaluate()
# show the results
print(eval_result)
A metric matched to each task is provided as well; it can be computed from predictions and labels.
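For example, a metric can also be computed directly from predictions and references (the evaluate package shown here is the successor to datasets.load_metric; the values are toy data):
import evaluate
metric = evaluate.load("glue", "cola")  # CoLA is scored with Matthews correlation
print(metric.compute(predictions=[0, 1, 1, 0], references=[0, 1, 0, 0]))
# -> {'matthews_correlation': ...}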