🍣

Elyza-tasks-100の自動評価スクリプト

2024/11/17に公開

概要

Elyza-tasks-100などのベンチマークを簡単に動かせるようなスクリプトです。
vLLMを使用してBatch処理を行っているため高速です。

スクリプト

generate_answers.py
import argparse
import json
from transformers import AutoTokenizer
from datasets import load_dataset

parser = argparse.ArgumentParser()

parser.add_argument("-m", "--model", help="評価するモデル", required=True)
parser.add_argument("-t", "--tokenizer", help="使用するトークナイザ")
parser.add_argument("-o", "--output", help="出力jsonlファイルの名前")
parser.add_argument("-q", "--quantization", action='store_true', help="量子化するか否か")
parser.add_argument("--low_mem", action='store_true', help="vLLMを使用せず使用VRAMを減らす")

args = parser.parse_args()

if args.tokenizer is None:
    args.tokenizer = args.model

if args.output is None:
    args.output = f"answers-{args.model.split('/')[-1]}.jsonl"


ds = load_dataset("elyza/ELYZA-tasks-100", split="test")
tokenizer = AutoTokenizer.from_pretrained(args.model)

temperature = 0.8
top_p = 1.0
max_tokens = 2048

if args.low_mem:
    import torch
    from transformers import AutoModelForCausalLM
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto", load_in_4bit=True)
else:
    from vllm import LLM, SamplingParams
    if args.quantization:
        llm = LLM(
            model=args.model,
            tokenizer=args.tokenizer,
            dtype="bfloat16",
            quantization="bitsandbytes", 
            load_format="bitsandbytes"
        )
    else:
        llm = LLM(
            model=args.model,
            tokenizer=args.tokenizer,
            dtype="bfloat16"
        )

    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )


def add_index(batch, idx):
    batch["task_id"] = idx
    return batch

ds = ds.map(add_index, with_indices=True)

def apply_chat_template(item):
    chat = [
        {"role": "user", "content": item["input"]}
    ]
    item["prompt"] = tokenizer.apply_chat_template(chat, tokenize=False)
    return item

ds = ds.map(apply_chat_template, batched=False).rename_column("output", "sample_output")

if args.low_mem:
    def generate_answer(item):
        input_ids = tokenizer(item["prompt"], return_tensors="pt")
        outputs = model.generate(**input_ids, max_new_tokens=2048)
        item["output"] =tokenizer.decode(outputs[0])
        return item
    ds = ds.map(generate_answer)
else:
    def generate_answer(batch):
        responses = llm.generate(batch["prompt"], sampling_params=sampling_params)
        outputs = []
        for response in responses:
            outputs.append(response.outputs[0].text)
        batch["output"] = outputs
        return batch

    ds = ds.map(generate_answer, batched=True, batch_size=100)


ds = ds.remove_columns("prompt")

with open(args.output, "w", encoding="utf-8") as f:
    for row in ds:
        json.dump(row, f, ensure_ascii=False)
        f.write("\n")
evaluate_with_gpt4o.py
# 作成中

使い方

pip install datasets vllm accelerate
# もし量子化するなら
pip install bitsandbytes>=0.42.0

# -m 使用するモデル(ローカルフォルダ指定可) 必須
# -t 使用するトークナイザ
# -o 出力するjsonlファイルの名前
# -b バッチサイズ
# -q 量子化するか否か
# --low_mem vLLMを使わず使用するVRAMを削減する 4bit量子化
python generate_answers.py -m google/gemma-2-2b-it -o answers.jsonl -b 3

補足

google/gemma-2-2b-jpn-itおよびその派生モデルを使用するときに次のエラーが発生する保それがあります。その時は次のように修正してください。

INFO 11-17 12:58:13 model_runner.py:1072] Starting to load model ./gemma-2-2b-jpn-it...
[rank0]: Traceback (most recent call last):
[rank0]:   File "/home/user/codes/auto_evaluate/generate_answers.py", line 41, in <module>
[rank0]:     llm = LLM(
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/utils.py", line 1028, in inner
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 210, in __init__
[rank0]:     self.llm_engine = self.engine_class.from_engine_args(
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 585, in from_engine_args
[rank0]:     engine = cls(
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 347, in __init__
[rank0]:     self.model_executor = executor_class(vllm_config=vllm_config, )
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 36, in __init__
[rank0]:     self._init_executor()
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 40, in _init_executor
[rank0]:     self.driver_worker.load_model()
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/worker/worker.py", line 152, in load_model
[rank0]:     self.model_runner.load_model()
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1074, in load_model
[rank0]:     self.model = get_model(vllm_config=self.vllm_config)
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 12, in get_model
[rank0]:     return loader.load_model(vllm_config=vllm_config)
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 332, in load_model
[rank0]:     model = _initialize_model(vllm_config=vllm_config)
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 100, in _initialize_model
[rank0]:     return model_class(vllm_config=vllm_config, prefix=prefix)
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 409, in __init__
[rank0]:     self.model = Gemma2Model(vllm_config=vllm_config,
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 126, in __init__
[rank0]:     old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 258, in __init__
[rank0]:     self.start_layer, self.end_layer, self.layers = make_layers(
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 509, in make_layers
[rank0]:     [PPMissingLayer() for _ in range(start_layer)] + [
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 510, in <listcomp>
[rank0]:     maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 260, in <lambda>
[rank0]:     lambda prefix: Gemma2DecoderLayer(int(prefix.split(".")[
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py", line 202, in __init__
[rank0]:     hidden_act=config.hidden_act,
[rank0]:   File "/home/user/miniconda3/envs/auto_evaluate/lib/python3.10/site-packages/transformers/configuration_utils.py", line 205, in __getattribute__
[rank0]:     return super().__getattribute__(key)
[rank0]: AttributeError: 'Gemma2Config' object has no attribute 'hidden_act'. Did you mean: 'hidden_size'?

モデルファイルのダウンロード

git clone https://huggingface.co/google/gemma-2-2b-jpn-it

config.jsonの修正

{
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "dtype": "bfloat16",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
- "hidden_activation": "gelu_pytorch_tanh",
+ "hidden_act": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 224,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 256000
}

回答の生成

python generate_answers.py -m ./gemma-2-2b-jpn-it -o answers.jsonl -b 3

生成時間の目安

環境

Intel Xeon Gold 6226R @ 2.90GHz * 2
RTX A5000 24GB ≈ L4 24GB

生成時間

time python generate_answers.py -m ./gemma-2-2b-jpn-it # 1m10.947s
time python generate_answers.py -m google/gemma-2-9b-it # 2m0.204s
time python generate_answers.py -m ./gemma-2-2b-jpn-it --low_mem # 25m53.964s

Discussion