🤗
Transformersの学習済みモデルでtokenizer.jsonを作成
学習済みモデルのtokenizer.jsonを作成するコードです。
from transformers import PreTrainedTokenizerFast
from transformers import BertJapaneseTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

# Model whose vocabulary we export; change this to target another checkpoint.
MODEL_NAME = "tohoku-nlp/bert-base-japanese-v3"

# Load the original (slow, MeCab-based) tokenizer only to obtain its
# WordPiece vocabulary and unknown-token symbol.
original_tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# Rebuild a standalone `tokenizers` WordPiece model from that vocabulary.
tokenizer = Tokenizer(
    WordPiece(vocab=original_tokenizer.vocab, unk_token=original_tokenizer.unk_token)
)

# Wrap it as a Transformers fast tokenizer (usable anywhere a
# PreTrainedTokenizerFast is expected); not required for the file export below.
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

# Serialize the tokenizers-library object to a single-file tokenizer.json.
tokenizer.save("tokenizer.json")
Discussion