Hugging Face NLP Course - 7. MAIN NLP TASKS 後編
概要
後編。
Training a causal language model from scratch
まったく新しいモデルをゼロから訓練するケース。
データがたくさんあり、利用可能なモデルに使用される事前学習データとは大きく異なる場合に取るべきアプローチ。
計算リソースも多く必要になる。
このセクションではコード生成モデルを作成する。
matplotlib、seaborn、pandas、scikit-learnなどのコードを1行ずつ生成することを目標とする。
Gathering the data
データサイエンス系のコードに限定、かつストリーミングで適宜ダウンロードできるようにする。
フィルタするコード。
def any_keyword_in_string(string, keywords):
for keyword in keywords:
if keyword in string:
return True
return False
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"
print(
any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters)
)
False True
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset
def filter_streaming_dataset(dataset, filters):
filtered_dict = defaultdict(list)
total = 0
for sample in tqdm(iter(dataset)):
total += 1
if any_keyword_in_string(sample["content"], filters):
for k, v in sample.items():
filtered_dict[k].append(v)
print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.")
return Dataset.from_dict(filtered_dict)
ストリーミングで適応する。
6 GB
600,000 Python scripts
になる。
# This cell will take a very long time to execute, so you should skip it and go to
# the next one!
from datasets import load_dataset
split = "train" # "valid"
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
filtered_data = filter_streaming_dataset(data, filters)
3.26% of data after filtering.
実際にフィルタすると2,3時間かかるので、フィルタ済みの物を使用する。
※最初は.shuffle().select(行を外して少ない量で訓練を行うと良い。
from datasets import load_dataset, DatasetDict
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
raw_datasets = DatasetDict(
{
"train": ds_train, # .shuffle().select(range(50000)),
"valid": ds_valid, # .shuffle().select(range(500))
}
)
raw_datasets
DatasetDict({
train: Dataset({
features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
num_rows: 606720
})
valid: Dataset({
features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
num_rows: 3322
})
})
データの確認。
for key in raw_datasets["train"][0]:
print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")
'REPO_NAME: kmike/scikit-learn'
'PATH: sklearn/utils/__init__.py'
'COPIES: 3'
'SIZE: 10094'
'''CONTENT: """
The :mod:`sklearn.utils` module includes various utilites.
"""
from collections import Sequence
import numpy as np
from scipy.sparse import issparse
import warnings
from .murmurhash import murm
LICENSE: bsd-3-clause'''
Preparing the dataset
コード1行分を予測すれば良いので、コンテキストサイズは比較的小さくできる。
それによりメモリサイズが小さくなるし、速度も早くなる。
ここでは128トークンに固定することにする。
return_overflowing_tokensを使って文章全体をトークン化し、その後チャンク化する方法を取る。
from transformers import AutoTokenizer
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
outputs = tokenizer(
raw_datasets["train"][:2]["content"],
truncation=True,
max_length=context_length,
return_overflowing_tokens=True,
return_length=True,
)
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
def tokenize(element):
outputs = tokenizer(
element["content"],
truncation=True,
max_length=context_length,
return_overflowing_tokens=True,
return_length=True,
)
input_batch = []
for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
if length == context_length:
input_batch.append(input_ids)
return {"input_ids": input_batch}
tokenized_datasets = raw_datasets.map(
tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets
DatasetDict({
train: Dataset({
features: ['input_ids'],
num_rows: 16702061
})
valid: Dataset({
features: ['input_ids'],
num_rows: 93164
})
})
Initializing a new model
モデル設定のロード。
GPT-2 modelと同じ設定を利用する。
※事前学習された設定をロードする。
トーカナイザのサイズと、モデルの語彙サイズが同じになる様にする。
bosとeos(シーケンスの開始と終了)のトークンIDを設定する。
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
config = AutoConfig.from_pretrained(
"gpt2",
vocab_size=len(tokenizer),
n_ctx=context_length,
bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
)
モデルを初期化する。
ゼロベースの学習なのでfrom_pretrainedではない。
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")
GPT-2 size: 124.2M parameters
言語モデリング専用に設計されたDataCollatorForLanguageModelingコレーターを使う。
DataCollatorForLanguageModelingはマスク言語モデリング(MLM)と因果言語モデリング(CLM)の両方をサポートしている。デフォルトではMLM用、引数mlm=Falseを設定することでCLMに切り替える。
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
適用してみる。
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
print(f"{key} shape: {out[key].shape}")
input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])
ログイン。
from huggingface_hub import notebook_login
notebook_login()
コンソールからログイン。
huggingface-cli login
Trainerを作成する。
コサイン学習率スケジュールを利用する。
from transformers import Trainer, TrainingArguments
args = TrainingArguments(
output_dir="codeparrot-ds",
per_device_train_batch_size=32,
per_device_eval_batch_size=32,
evaluation_strategy="steps",
eval_steps=5_000,
logging_steps=5_000,
gradient_accumulation_steps=8,
num_train_epochs=1,
weight_decay=0.1,
warmup_steps=1_000,
lr_scheduler_type="cosine",
learning_rate=5e-4,
save_steps=5_000,
fp16=True,
push_to_hub=True,
)
trainer = Trainer(
model=model,
tokenizer=tokenizer,
args=args,
data_collator=data_collator,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["valid"],
)
訓練実行する。
全レコードなら20時間、サブセットのみなら2時間程度。
trainer.train()
pushする。
trainer.push_to_hub()
Code generation with a pipeline
pipelineで実行してみる。
import torch
from transformers import pipeline
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
"text-generation", model="huggingface-course/codeparrot-ds", device=device
)
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)
# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
# create some data
x = np.random.randn(100)
y = np.random.randn(100)
# create scatter plot with x, y
plt.scatter(x, y)
# create scatter
pandasのコードも生成できるか見てみる。
最後のforはカットされる。
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)
# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
# create some data
x = np.random.randn(100)
y = np.random.randn(100)
# create dataframe from x and y
df = pd.DataFrame({'x': x, 'y': y})
df.insert(0,'x', x)
for
groupbyに対応できるか見てみる。
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})
# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})
# calculate the mean income per profession
profession = df.groupby(['profession']).mean()
# compute the
scikit-learnのコードも生成できるか見てみる。
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor
# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor
# fit random forest model with 300 estimators on X, y:
rf = RandomForestRegressor(n_estimators=300, random_state=random_state, max_depth=3)
rf.fit(X, y)
rf
Training with 🤗 Accelerate
トレーニングループを完全にコントロールしたい場合や特別な変更を加えたい場合はAccelerateを使用する。
データサイエンス系のコードの特定のキーワードに注目し、該当のトークンが含まれる場合はlossを少なく見積もる。
キーワードは単一のトークンとして扱われること、"testtest"が複数のトークンに分割される事を確認する。
keytoken_ids = []
for keyword in [
"plt",
"pd",
"sk",
"fit",
"predict",
" plt",
" pd",
" sk",
" fit",
" predict",
"testtest",
]:
ids = tokenizer([keyword]).input_ids[0]
if len(ids) == 1:
keytoken_ids.append(ids[0])
else:
print(f"Keyword has not single token: {keyword}")
'Keyword has not single token: testtest'
カスタム損失関数を定義する。
指定したキーワードが入っていた場合、lossを少なく見積もる。
shift_labelsは予測トークンなので1つシフトする。
最後のトークンの次のトークンは無いので、最後のlogitsをカットする。
from torch.nn import CrossEntropyLoss
import torch
def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
# Shift so that tokens < n predict n
shift_labels = inputs[..., 1:].contiguous()
shift_logits = logits[..., :-1, :].contiguous()
# Calculate per-token loss
loss_fct = CrossEntropyLoss(reduce=False)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
# Resize and average loss per sample
loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
# Calculate and scale weighting
weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
axis=[0, 2]
)
weights = alpha * (1.0 + weights)
# Calculate weighted average
weighted_loss = (loss_per_sample * weights).mean()
return weighted_loss
データローダーを準備する。
from torch.utils.data.dataloader import DataLoader
tokenized_dataset.set_format("torch")
train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["valid"], batch_size=32)
バイアスとレイヤー正規化の重み以外に減衰がかかるようにする。
weight_decay = 0.1
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
params_with_wd, params_without_wd = [], []
for n, p in model.named_parameters():
if any(nd in n for nd in no_decay):
params_without_wd.append(p)
else:
params_with_wd.append(p)
return [
{"params": params_with_wd, "weight_decay": weight_decay},
{"params": params_without_wd, "weight_decay": 0.0},
]
評価関数の作成。
def evaluate():
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(batch["input_ids"], labels=batch["input_ids"])
losses.append(accelerator.gather(outputs.loss))
loss = torch.mean(torch.cat(losses))
try:
perplexity = torch.exp(loss)
except OverflowError:
perplexity = float("inf")
return loss.item(), perplexity.item()
モデルの作成。
model = GPT2LMHeadModel(config)
オプティマイザの作成。
from torch.optim import AdamW
optimizer = AdamW(get_grouped_params(model), lr=5e-4)
Acceleratorの作成。
from accelerate import Accelerator
accelerator = Accelerator(fp16=True)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
学習スケジューラの設定。
from transformers import get_scheduler
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
name="linear",
optimizer=optimizer,
num_warmup_steps=1_000,
num_training_steps=num_training_steps,
)
リポジトリ名の設定。
from huggingface_hub import Repository, get_full_repo_name
model_name = "codeparrot-ds-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name
'sgugger/codeparrot-ds-accelerate'
リポジトリの作成。
output_dir = "codeparrot-ds-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
評価関数が動くか確認。
evaluate()
(10.934126853942871, 56057.14453125)
学習ループの作成。
勾配累積ステップ数で損失をスケーリングしている。
from tqdm.notebook import tqdm
gradient_accumulation_steps = 8
eval_steps = 5_000
model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
for step, batch in tqdm(
enumerate(train_dataloader, start=1), total=num_training_steps
):
logits = model(batch["input_ids"]).logits
loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
if step % 100 == 0:
accelerator.print(
{
"lr": get_lr(),
"samples": step * samples_per_step,
"steps": completed_steps,
"loss/train": loss.item() * gradient_accumulation_steps,
}
)
loss = loss / gradient_accumulation_steps
accelerator.backward(loss)
if step % gradient_accumulation_steps == 0:
accelerator.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
completed_steps += 1
if (step % (eval_steps * gradient_accumulation_steps)) == 0:
eval_loss, perplexity = evaluate()
accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
model.train()
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir)
repo.push_to_hub(
commit_message=f"Training in progress step {step}", blocking=False
)
Question answering
質疑応答モデルのファインチューニング。
BERTをSQuAD dataset(Wikipediaを使った質問回答集)を使ってファインチューニングする。
以下のようなモデルを作成する。
BERTの様なencoder-only modelsは「トランスフォーマーアーキテクチャを発明したのは誰か?」の様な質問には答えられるが、「なぜ空は青いのか?」の様な自由形式の質問質問にはうまく答えられない。
こういったケースではT5 and BARTの様なencoder-decoderが適している。
Preparing the data
The SQuAD dataset
データセットのロード。
from datasets import load_dataset
raw_datasets = load_dataset("squad")
raw_datasets
DatasetDict({
train: Dataset({
features: ['id', 'title', 'context', 'question', 'answers'],
num_rows: 87599
})
validation: Dataset({
features: ['id', 'title', 'context', 'question', 'answers'],
num_rows: 10570
})
})
データの確認。
answer_startフィールドはコンテキストの各回答の開始文字インデックス。
print("Context: ", raw_datasets["train"][0]["context"])
print("Question: ", raw_datasets["train"][0]["question"])
print("Answer: ", raw_datasets["train"][0]["answers"])
Context: 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'
Question: 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'
Answer: {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}
回答テキストが無いデータが存在しないか確認。
raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1)
Dataset({
features: ['id', 'title', 'context', 'question', 'answers'],
num_rows: 0
})
評価セットには複数の回答が含まれる。
print(raw_datasets["validation"][0]["answers"])
print(raw_datasets["validation"][2]["answers"])
{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}
{'text': ['Santa Clara, California', "Levi's Stadium", "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."], 'answer_start': [403, 355, 355]}
コンテキストや質問文も確認。
正解が複数のケースがあり得ることが分かる。
print(raw_datasets["validation"][2]["context"])
print(raw_datasets["validation"][2]["question"])
'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'
'Where did Super Bowl 50 take place?'
Processing the training data
コンテキスト内の答えに対応するトークンの開始位置と終了位置を予測するモデルになる。
まずはトーカナイザをロードする。
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
fastトーカナイザを持っているかどうか確認。
tokenizer.is_fast
True
質問とコンテキストをトーカナイザに渡したときのスペシャルトークンの挿入位置。
[CLS] question [SEP] context [SEP]
確認する。
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]
inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])
'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, '
'the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin '
'Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms '
'upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred '
'Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a '
'replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette '
'Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 statues '
'and the Gold Dome ), is a simple, modern stone statue of Mary. [SEP]'
ラベルは、回答の開始と終了のトークンのインデックスとなり、モデルは、入力のトークンごとに開始と終了のロジットを1つずつ予測するタスクを与えられ、理論的なラベルは次のようになる。
入力に収まらない長いコンテキストに対応するため、データセットの1つのサンプルから複数のトレーニング特徴を作成し、それらの間にスライディングウィンドウを設けることで、長いコンテキストに対処する。
truncation="only_second"で長いコンテキストの切り捨てを行う。
strideでチャンク間のオーバーラップを指定する。
return_overflowing_tokens=Trueでオーバーフローしたトークンも取得する。
inputs = tokenizer(
question,
context,
max_length=100,
truncation="only_second",
stride=50,
return_overflowing_tokens=True,
)
for ids in inputs["input_ids"]:
print(tokenizer.decode(ids))
'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basi [SEP]'
'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin [SEP]'
'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 [SEP]'
'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP]. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 statues and the Gold Dome ), is a simple, modern stone statue of Mary. [SEP]'
質問の答えが含まれないinputのラベルは
start_position = end_position = 0
となる。
return_offsets_mapping=True
を渡すことでトークナイザに正解のインデックスを返させる。
inputs = tokenizer(
question,
context,
max_length=100,
truncation="only_second",
stride=50,
return_overflowing_tokens=True,
return_offsets_mapping=True,
)
inputs.keys()
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])
この場合は1センテンスしか与えてないので、全部0。
inputs["overflow_to_sample_mapping"]
[0, 0, 0, 0]
複数のセンテンスを与えた場合。
inputs = tokenizer(
raw_datasets["train"][2:6]["question"],
raw_datasets["train"][2:6]["context"],
max_length=100,
truncation="only_second",
stride=50,
return_overflowing_tokens=True,
return_offsets_mapping=True,
)
print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")
'The 4 examples gave 19 features.'
'Here is where each comes from: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3].'
答えのテキストから開始と終了のインデックスを取得する。
answers = raw_datasets["train"][2:6]["answers"]
start_positions = []
end_positions = []
for i, offset in enumerate(inputs["offset_mapping"]):
sample_idx = inputs["overflow_to_sample_mapping"][i]
answer = answers[sample_idx]
start_char = answer["answer_start"][0]
end_char = answer["answer_start"][0] + len(answer["text"][0])
sequence_ids = inputs.sequence_ids(i)
# Find the start and end of the context
idx = 0
while sequence_ids[idx] != 1:
idx += 1
context_start = idx
while sequence_ids[idx] == 1:
idx += 1
context_end = idx - 1
# If the answer is not fully inside the context, label is (0, 0)
if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
start_positions.append(0)
end_positions.append(0)
else:
# Otherwise it's the start and end token positions
idx = context_start
while idx <= context_end and offset[idx][0] <= start_char:
idx += 1
start_positions.append(idx - 1)
idx = context_end
while idx >= context_start and offset[idx][1] >= end_char:
idx -= 1
end_positions.append(idx + 1)
start_positions, end_positions
([83, 51, 19, 0, 0, 64, 27, 0, 34, 0, 0, 0, 67, 34, 0, 0, 0, 0, 0],
[85, 53, 21, 0, 0, 70, 33, 0, 40, 0, 0, 0, 68, 35, 0, 0, 0, 0, 0])
正しいインデックスを指しているか確認する。
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]
start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])
print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
'Theoretical answer: the Main Building, labels give: the Main Building'
答えがコンテキストに無いケース。
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]
decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")
'Theoretical answer: a Marian place of prayer and reflection, decoded example: [CLS] What is the Grotto at Notre Dame? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grot [SEP]'
すべてのinputsに適応する関数を定義する。
コンテキストがある程度長いのでダイナミックパディングは使用しない。
max_length = 384
stride = 128
def preprocess_training_examples(examples):
questions = [q.strip() for q in examples["question"]]
inputs = tokenizer(
questions,
examples["context"],
max_length=max_length,
truncation="only_second",
stride=stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
offset_mapping = inputs.pop("offset_mapping")
sample_map = inputs.pop("overflow_to_sample_mapping")
answers = examples["answers"]
start_positions = []
end_positions = []
for i, offset in enumerate(offset_mapping):
sample_idx = sample_map[i]
answer = answers[sample_idx]
start_char = answer["answer_start"][0]
end_char = answer["answer_start"][0] + len(answer["text"][0])
sequence_ids = inputs.sequence_ids(i)
# Find the start and end of the context
idx = 0
while sequence_ids[idx] != 1:
idx += 1
context_start = idx
while sequence_ids[idx] == 1:
idx += 1
context_end = idx - 1
# If the answer is not fully inside the context, label is (0, 0)
if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
start_positions.append(0)
end_positions.append(0)
else:
# Otherwise it's the start and end token positions
idx = context_start
while idx <= context_end and offset[idx][0] <= start_char:
idx += 1
start_positions.append(idx - 1)
idx = context_end
while idx >= context_start and offset[idx][1] >= end_char:
idx -= 1
end_positions.append(idx + 1)
inputs["start_positions"] = start_positions
inputs["end_positions"] = end_positions
return inputs
トレーニングデータセットに適用する。
train_dataset = raw_datasets["train"].map(
preprocess_training_examples,
batched=True,
remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)
(87599, 88729)
Processing the validation data
検証データにはラベルを生成する必要がないので少し簡単。
def preprocess_validation_examples(examples):
questions = [q.strip() for q in examples["question"]]
inputs = tokenizer(
questions,
examples["context"],
max_length=max_length,
truncation="only_second",
stride=stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
sample_map = inputs.pop("overflow_to_sample_mapping")
example_ids = []
for i in range(len(inputs["input_ids"])):
sample_idx = sample_map[i]
example_ids.append(examples["id"][sample_idx])
sequence_ids = inputs.sequence_ids(i)
offset = inputs["offset_mapping"][i]
inputs["offset_mapping"][i] = [
o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
]
inputs["example_id"] = example_ids
return inputs
データセットに適用する。
validation_dataset = raw_datasets["validation"].map(
preprocess_validation_examples,
batched=True,
remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)
(10570, 10822)
Fine-tuning the model with the Trainer API
Post-processing
実際のメトリクスではsoftmaxを実行しないようにする。
start_token, end_tokenのすべてのペアを採点するのではなく、n_best=20 logitsに対応するものだけをスコアに使用する。
適当な訓練済みモデルを使って、メトリクスを取ってみる。
small_eval_set = raw_datasets["validation"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
preprocess_validation_examples,
batched=True,
remove_columns=raw_datasets["validation"].column_names,
)
上記のトーカナイザはもう使わないので戻しておく。
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
モデルのロード。
一部のフィールドを除外する。
import torch
from transformers import AutoModelForQuestionAnswering
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
device
)
with torch.no_grad():
outputs = trained_model(**batch)
logitsの取得。
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()
small_eval_setの各例をeval_setの対応する特徴にマップする。
import collections
example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
example_to_features[feature["example_id"]].append(idx)
各質問に対してベストアンサーを取得する。
import numpy as np
n_best = 20
max_answer_length = 30
predicted_answers = []
for example in small_eval_set:
example_id = example["id"]
context = example["context"]
answers = []
for feature_index in example_to_features[example_id]:
start_logit = start_logits[feature_index]
end_logit = end_logits[feature_index]
offsets = eval_set["offset_mapping"][feature_index]
start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
for start_index in start_indexes:
for end_index in end_indexes:
# Skip answers that are not fully in the context
if offsets[start_index] is None or offsets[end_index] is None:
continue
# Skip answers with a length that is either < 0 or > max_answer_length.
if (
end_index < start_index
or end_index - start_index + 1 > max_answer_length
):
continue
answers.append(
{
"text": context[offsets[start_index][0] : offsets[end_index][1]],
"logit_score": start_logit[start_index] + end_logit[end_index],
}
)
best_answer = max(answers, key=lambda x: x["logit_score"])
predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
評価用のライブラリをロード。
import evaluate
metric = evaluate.load("squad")
期待する回答を作成。
theoretical_answers = [
{"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]
予測した値と比べてみる。
print(predicted_answers[0])
print(theoretical_answers[0])
{'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'}
{'id': '56be4db0acb8001400a502ec', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}}
メトリクスを取ってみる。
metric.compute(predictions=predicted_answers, references=theoretical_answers)
{'exact_match': 83.0, 'f1': 88.25}
全体のメトリクスを取る関数。
from tqdm.auto import tqdm
def compute_metrics(start_logits, end_logits, features, examples):
example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(features):
example_to_features[feature["example_id"]].append(idx)
predicted_answers = []
for example in tqdm(examples):
example_id = example["id"]
context = example["context"]
answers = []
# Loop through all features associated with that example
for feature_index in example_to_features[example_id]:
start_logit = start_logits[feature_index]
end_logit = end_logits[feature_index]
offsets = features[feature_index]["offset_mapping"]
start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
for start_index in start_indexes:
for end_index in end_indexes:
# Skip answers that are not fully in the context
if offsets[start_index] is None or offsets[end_index] is None:
continue
# Skip answers with a length that is either < 0 or > max_answer_length
if (
end_index < start_index
or end_index - start_index + 1 > max_answer_length
):
continue
answer = {
"text": context[offsets[start_index][0] : offsets[end_index][1]],
"logit_score": start_logit[start_index] + end_logit[end_index],
}
answers.append(answer)
# Select the answer with the best score
if len(answers) > 0:
best_answer = max(answers, key=lambda x: x["logit_score"])
predicted_answers.append(
{"id": example_id, "prediction_text": best_answer["text"]}
)
else:
predicted_answers.append({"id": example_id, "prediction_text": ""})
theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
return metric.compute(predictions=predicted_answers, references=theoretical_answers)
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)
{'exact_match': 83.0, 'f1': 88.25}
Fine-tuning the model
ファインチューニングするモデルを準備。
ヘッドを付け替えるのでワーニングが出る。
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
ログイン。
from huggingface_hub import notebook_login
notebook_login()
コンソールからログインする場合。
huggingface-cli login
TrainingArgumentsの作成。
from transformers import TrainingArguments
args = TrainingArguments(
"bert-finetuned-squad",
evaluation_strategy="no",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
fp16=True,
push_to_hub=True,
)
Trainerを作成して、訓練実行。
from transformers import Trainer
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=validation_dataset,
tokenizer=tokenizer,
)
trainer.train()
メトリクスを取る。
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])
{'exact_match': 81.18259224219489, 'f1': 88.67381321905516}
pushする。
trainer.push_to_hub(commit_message="Training complete")
A custom training loop
Preparing everything for training
データローダーの準備。
from torch.utils.data import DataLoader
from transformers import default_data_collator
train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")
train_dataloader = DataLoader(
train_dataset,
shuffle=True,
collate_fn=default_data_collator,
batch_size=8,
)
eval_dataloader = DataLoader(
validation_set, collate_fn=default_data_collator, batch_size=8
)
モデルのロード。
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
オプティマイザの設定。
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)
Acceleratorの作成。
from accelerate import Accelerator
accelerator = Accelerator(fp16=True)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
学習スケジューラの設定。
from transformers import get_scheduler
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
リポジトリ名の作成。
from huggingface_hub import Repository, get_full_repo_name
model_name = "bert-finetuned-squad-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name
'sgugger/bert-finetuned-squad-accelerate'
リポジトリの作成。
output_dir = "bert-finetuned-squad-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
Training loop
トレーニングループの作成。
from tqdm.auto import tqdm
import torch
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
# Training
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
# Evaluation
model.eval()
start_logits = []
end_logits = []
accelerator.print("Evaluation!")
for batch in tqdm(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())
start_logits = np.concatenate(start_logits)
end_logits = np.concatenate(end_logits)
start_logits = start_logits[: len(validation_dataset)]
end_logits = end_logits[: len(validation_dataset)]
metrics = compute_metrics(
start_logits, end_logits, validation_dataset, raw_datasets["validation"]
)
print(f"epoch {epoch}:", metrics)
# Save and upload
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False
)
以下の3行に注意する。
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
Using the fine-tuned model
訓練済みモデルを使ってみる。
from transformers import pipeline
# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-squad"
question_answerer = pipeline("question-answering", model=model_checkpoint)
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)
{'score': 0.9979003071784973,
'start': 78,
'end': 105,
'answer': 'Jax, PyTorch and TensorFlow'}
Discussion