Source code for EduNLP.Pretrain.bert_vec

import os
import json
from EduNLP import logger
from EduNLP.Tokenizer import PureTextTokenizer, TOKENIZER
from typing import Optional, Union
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers.file_utils import TensorType
from torch.utils.data import Dataset
from EduNLP.SIF import Symbol, FORMULA_SYMBOL, FIGURE_SYMBOL, QUES_MARK_SYMBOL, TAG_SYMBOL, SEP_SYMBOL

__all__ = ["BertTokenizer", "finetune_bert"]


class BertTokenizer(object):
    """
    Tokenizer that first runs an EduNLP text tokenizer over the raw item(s),
    re-joins the resulting tokens into plain text, and then hands that text to
    a pretrained ``transformers`` tokenizer.

    Parameters
    ----------
    pretrain_model:
        used pretrained model
    add_special_tokens:
        Whether to add tokens like [FIGURE], [TAG], etc.
    text_tokenizer:
        Which text tokenizer to use.
        Must be consistent with TOKENIZER dictionary.

    Returns
    ----------

    Examples
    ----------
    >>> tokenizer = BertTokenizer(add_special_tokens=True)
    >>> item = "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"
    >>> token_item = tokenizer(item)
    >>> print(token_item.input_ids[:10])
    [101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276]
    >>> print(tokenizer.tokenize(item)[:10])
    ['公', '式', '如', '图', '[FIGURE]', 'x', ',', 'y', '约', '束']
    >>> items = [item, item]
    >>> token_items = tokenizer(items, return_tensors='pt')
    >>> print(token_items.input_ids.shape)
    torch.Size([2, 27])
    >>> print(len(tokenizer.tokenize(items)))
    2
    >>> tokenizer.save_pretrained('test_dir') # doctest: +SKIP
    >>> tokenizer = BertTokenizer.from_pretrained('test_dir') # doctest: +SKIP
    """

    def __init__(self, pretrain_model="bert-base-chinese", add_special_tokens=False, text_tokenizer=None):
        self.tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
        self.add_special_tokens = add_special_tokens
        if add_special_tokens:
            # ``or []`` guards against a tokenizer reporting no list at all.
            existing = list(self.tokenizer.additional_special_tokens or [])
            missing = [
                Symbol(symbol)
                for symbol in (FORMULA_SYMBOL, FIGURE_SYMBOL, QUES_MARK_SYMBOL, TAG_SYMBOL, SEP_SYMBOL)
                if symbol not in existing
            ]
            if missing:
                # Registering 'additional_special_tokens' replaces the tokenizer's
                # current list by default, so the tokens that are already
                # registered are passed again to keep them from being dropped.
                self.tokenizer.add_special_tokens({'additional_special_tokens': existing + missing})

        if text_tokenizer:
            assert text_tokenizer in TOKENIZER, f"text_tokenizer should be one of {list(TOKENIZER.keys())}"
            self.text_tokenizer = TOKENIZER[text_tokenizer]()
            self.text_tokenizer_name = text_tokenizer
        else:
            self.text_tokenizer = PureTextTokenizer()
            self.text_tokenizer_name = 'pure_text'

    def _to_plain_text(self, item):
        """
        Run the text tokenizer and join each token sequence back into a string.

        A ``str`` input gives a ``str``; any other input is treated as a
        collection of items and gives a list with one string per yielded
        token sequence.
        """
        if isinstance(item, str):
            return ''.join(next(self.text_tokenizer([item])))
        # The text tokenizer is consumed as a generator producing one token
        # sequence per input item.
        return [''.join(tokens) for tokens in self.text_tokenizer(item)]

    def __call__(self, item: Union[list, str], return_tensors: Optional[Union[str, "TensorType"]] = None,
                 *args, **kwargs):
        """
        Encode one item or a list of items with the underlying tokenizer.

        Truncation and padding are always enabled. Extra positional and
        keyword arguments are accepted but not used.

        Parameters
        ----------
        item: str or list
            the raw item, or a list of raw items
        return_tensors: str or TensorType, optional
            forwarded unchanged to the underlying tokenizer

        Returns
        ----------
        whatever the underlying tokenizer returns for the cleaned text
        """
        text = self._to_plain_text(item)
        return self.tokenizer(text, truncation=True, padding=True, return_tensors=return_tensors)

    def tokenize(self, item: Union[list, str], *args, **kwargs):
        """
        Split one item into tokens, or a list of items into a list of token lists.

        Extra positional and keyword arguments are accepted but not used.
        """
        text = self._to_plain_text(item)
        if isinstance(text, str):
            return self.tokenizer.tokenize(text)
        return [self.tokenizer.tokenize(piece) for piece in text]

    def save_pretrained(self, tokenizer_config_dir):
        """
        Save the underlying tokenizer to ``tokenizer_config_dir`` and write
        this wrapper's own settings next to it as ``custom_config.json``.
        """
        self.tokenizer.save_pretrained(tokenizer_config_dir)
        custom_config = {
            'add_special_tokens': self.add_special_tokens,
            'text_tokenizer': self.text_tokenizer_name
        }
        config_path = os.path.join(tokenizer_config_dir, 'custom_config.json')
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(custom_config, f, indent=2)

    @classmethod
    def from_pretrained(cls, tokenizer_config_dir):
        """
        Rebuild a tokenizer from a directory written by ``save_pretrained``.

        When ``custom_config.json`` is absent, the directory is used as the
        pretrained model with the constructor defaults for everything else.
        """
        config_path = os.path.join(tokenizer_config_dir, 'custom_config.json')
        if not os.path.exists(config_path):
            return cls(tokenizer_config_dir)
        with open(config_path, 'r', encoding='utf-8') as f:
            custom_config = json.load(f)
        return cls(tokenizer_config_dir, custom_config['add_special_tokens'], custom_config['text_tokenizer'])
class FinetuneDataset(Dataset):
    """Minimal map-style dataset that serves pre-tokenized items by index."""

    def __init__(self, items):
        # The length is captured once, at construction time; later changes to
        # the size of ``items`` are not reflected by ``len(dataset)``.
        self.items, self.len = items, len(items)

    def __getitem__(self, index):
        """Return the item stored at ``index``."""
        sample = self.items[index]
        return sample

    def __len__(self):
        """Return the number of items recorded at construction time."""
        return self.len
def finetune_bert(items, output_dir, pretrain_model="bert-base-chinese", train_params=None):
    """
    Fine-tune a pretrained masked-language model on tokenized items and save
    the model and the tokenizer to ``output_dir``.

    Parameters
    ----------
    items: list
        the tokenization results of questions, one entry per question
    output_dir: str
        the path to save the model
    pretrain_model: str
        the name or path of pre-trained model
    train_params: dict
        optional overrides for the training parameters. Recognised keys and
        their defaults: ``mlm_probability`` (0.15), ``epochs`` (1),
        ``batch_size`` (64), ``save_steps`` (1000), ``save_total_limit`` (2),
        ``logging_steps`` (5), ``gradient_accumulation_steps`` (1).
        Other keys are ignored.

    Examples
    ----------
    >>> tokenizer = BertTokenizer()
    >>> stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
    ...          "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
    >>> token_item = [tokenizer(i) for i in stems]
    >>> print(token_item[0].keys())
    dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
    >>> finetune_bert(token_item, "examples/test_model/data/data/bert") # doctest: +SKIP
    {'train_runtime': ..., ..., 'epoch': 1.0}
    """
    model = AutoModelForMaskedLM.from_pretrained(pretrain_model)
    tokenizer = BertTokenizer(pretrain_model, add_special_tokens=True)
    # resize embedding for additional special tokens
    model.resize_token_embeddings(len(tokenizer.tokenizer))

    # A single table of defaults, so that a missing key resolves to the same
    # value whether ``train_params`` is omitted entirely or only partly filled.
    params = {
        'mlm_probability': 0.15,
        'epochs': 1,
        'batch_size': 64,
        'save_steps': 1000,
        'save_total_limit': 2,
        'logging_steps': 5,
        'gradient_accumulation_steps': 1,
    }
    if train_params:
        params.update(train_params)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer.tokenizer, mlm=True, mlm_probability=params['mlm_probability']
    )

    dataset = FinetuneDataset(items)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=params['epochs'],
        per_device_train_batch_size=params['batch_size'],
        save_steps=params['save_steps'],
        save_total_limit=params['save_total_limit'],
        logging_steps=params['logging_steps'],
        gradient_accumulation_steps=params['gradient_accumulation_steps'],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        tokenizer=tokenizer.tokenizer,
        train_dataset=dataset,
    )

    trainer.train()
    # Persist both the weights and the tokenizer (with its custom config) so
    # the output directory can be reloaded on its own.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)