🍣
Elyza-tasks-100の自動評価スクリプト
概要
Elyza-tasks-100などのベンチマークを簡単に動かせるようなスクリプトです。
vLLMを使用してBatch処理を行っているため高速です。
スクリプト
generate_answers.py
import argparse
import json
from transformers import AutoTokenizer
from datasets import load_dataset
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", help="評価するモデル", required=True)
parser.add_argument("-t", "--tokenizer", help="使用するトークナイザ")
parser.add_argument("-o", "--output", help="出力jsonlファイルの名前")
parser.add_argument("-q", "--quantization", action='store_true', help="量子化するか否か")
parser.add_argument("--low_mem", action='store_true', help="vLLMを使用せず使用VRAMを減らす")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.output is None:
args.output = f"answers-{args.model.split('/')[-1]}.jsonl"
ds = load_dataset("elyza/ELYZA-tasks-100", split="test")
tokenizer = AutoTokenizer.from_pretrained(args.model)
temperature = 0.8
top_p = 1.0
max_tokens = 2048
if args.low_mem:
import torch
from transformers import AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained(args.model)
model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto", load_in_4bit=True)
else:
from vllm import LLM, SamplingParams
if args.quantization:
llm = LLM(
model=args.model,
tokenizer=args.tokenizer,
dtype="bfloat16",
quantization="bitsandbytes",
load_format="bitsandbytes"
)
else:
llm = LLM(
model=args.model,
tokenizer=args.tokenizer,
dtype="bfloat16"
)
sampling_params = SamplingParams(
temperature=temperature,
top_p=top_p,
max_tokens=max_tokens
)
def add_index(batch, idx):
batch["task_id"] = idx
return batch
ds = ds.map(add_index, with_indices=True)
def apply_chat_template(item):
chat = [
{"role": "user", "content": item["input"]}
]
item["prompt"] = tokenizer.apply_chat_template(chat, tokenize=False)
return item
ds = ds.map(apply_chat_template, batched=False).rename_column("output", "sample_output")
if args.low_mem:
def generate_answer(item):
input_ids = tokenizer(item["prompt"], return_tensors="pt")
outputs = model.generate(**input_ids, max_new_tokens=2048)
item["output"] =tokenizer.decode(outputs[0])
return item
ds = ds.map(generate_answer)
else:
def generate_answer(batch):
responses = llm.generate(batch["prompt"], sampling_params=sampling_params)
outputs = []
for response in responses:
outputs.append(response.outputs[0].text)
batch["output"] = outputs
return batch
ds = ds.map(generate_answer, batched=True, batch_size=100)
ds = ds.remove_columns("prompt")
with open(args.output, "w", encoding="utf-8") as f:
for row in ds:
json.dump(row, f, ensure_ascii=False)
f.write("\n")
evaluate_with_gpt4o.py
# 作成中
使い方
pip install datasets vllm accelerate
# もし量子化するなら
pip install bitsandbytes>=0.42.0
# -m 使用するモデル(ローカルフォルダ指定可) 必須
# -t 使用するトークナイザ
# -o 出力するjsonlファイルの名前
# -b バッチサイズ
# -q 量子化するか否か
# --low_mem vLLMを使わず使用するVRAMを削減する 4bit量子化
python generate_answers.py -m google/gemma-2-2b-it -o answers.jsonl -b 3
補足
google/gemma-2-2b-jpn-it
およびその派生モデルを使用するときに次のエラーが発生する保それがあります。その時は次のように修正してください。
INFO 11-17 12:58:13 model_runner.py:1072] Starting to load model ./gemma-2-2b-jpn-it...
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/user/codes/auto_evaluate/generate_answers.py", line 41, in <module>
[rank0]: llm = LLM(
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/utils.py", line 1028, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 210, in __init__
[rank0]: self.llm_engine = self.engine_class.from_engine_args(
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 585, in from_engine_args
[rank0]: engine = cls(
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 347, in __init__
[rank0]: self.model_executor = executor_class(vllm_config=vllm_config, )
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 36, in __init__
[rank0]: self._init_executor()
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 40, in _init_executor
[rank0]: self.driver_worker.load_model()
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/worker/worker.py", line 152, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1074, in load_model
[rank0]: self.model = get_model(vllm_config=self.vllm_config)
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 12, in get_model
[rank0]: return loader.load_model(vllm_config=vllm_config)
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 332, in load_model
[rank0]: model = _initialize_model(vllm_config=vllm_config)
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 100, in _initialize_model
[rank0]: return model_class(vllm_config=vllm_config, prefix=prefix)
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 409, in __init__
[rank0]: self.model = Gemma2Model(vllm_config=vllm_config,
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 126, in __init__
[rank0]: old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 258, in __init__
[rank0]: self.start_layer, self.end_layer, self.layers = make_layers(
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 509, in make_layers
[rank0]: [PPMissingLayer() for _ in range(start_layer)] + [
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 510, in <listcomp>
[rank0]: maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 260, in <lambda>
[rank0]: lambda prefix: Gemma2DecoderLayer(int(prefix.split(".")[
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 202, in __init__
[rank0]: hidden_act=config.hidden_act,
[rank0]: File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/transformers/configuration_utils.py", line 205, in __getattribute__
[rank0]: return super().__getattribute__(key)
[rank0]: AttributeError: 'Gemma2Config' object has no attribute 'hidden_act'. Did you mean: 'hidden_size'?
モデルファイルのダウンロード
git clone https://huggingface.co/google/gemma-2-2b-jpn-it
config.jsonの修正
{
"architectures": [
"Gemma2ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attn_logit_softcapping": 50.0,
"bos_token_id": 2,
"cache_implementation": "hybrid",
"dtype": "bfloat16",
"eos_token_id": 1,
"final_logit_softcapping": 30.0,
"head_dim": 256,
- "hidden_activation": "gelu_pytorch_tanh",
+ "hidden_act": "gelu_pytorch_tanh",
"hidden_size": 2304,
"initializer_range": 0.02,
"intermediate_size": 9216,
"max_position_embeddings": 8192,
"model_type": "gemma2",
"num_attention_heads": 8,
"num_hidden_layers": 26,
"num_key_value_heads": 4,
"pad_token_id": 0,
"query_pre_attn_scalar": 224,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"sliding_window": 4096,
"torch_dtype": "bfloat16",
"transformers_version": "4.44.2",
"use_cache": true,
"vocab_size": 256000
}
回答の生成
python generate_answers.py -m ./gemma-2-2b-jpn-it -o answers.jsonl -b 3
生成時間の目安
環境
Intel Xeon Gold 6226R @ 2.90GHz * 2
RTX A5000 24GB ≈ L4 24GB
生成時間
time python generate_answers.py -m ./gemma-2-2b-jpn-it # 1m10.947s
time python generate_answers.py -m google/gemma-2-9b-it # 2m0.204s
time python generate_answers.py -m ./gemma-2-2b-jpn-it --low_mem # 25m53.964s
Discussion