# Source code for EduNLP.Pretrain.elmo_vec

import torch
import os
from copy import deepcopy
from transformers import TrainingArguments, Trainer
from typing import Optional, Union, List
from ..ModelZoo.rnn import ElmoLM, ElmoLMForPreTraining, ElmoLMForPropertyPrediction, ElmoLMForKnowledgePrediction
from ..ModelZoo.utils import pad_sequence
from .pretrian_utils import PretrainedEduTokenizer, EduDataset

# Public API of this module: the names exported by a star-import.
__all__ = ["ElmoTokenizer", "ElmoDataset", "train_elmo", "train_elmo_for_property_prediction",
           "train_elmo_for_knowledge_prediction"]

# Baseline keyword arguments for ``transformers.TrainingArguments``.
# The training functions in this module deep-copy this mapping, set
# ``output_dir`` from their own argument and then apply the caller's
# ``train_params`` on top.
DEFAULT_TRAIN_PARAMS = {
    # Placeholder; overwritten with the caller's output directory before use.
    "output_dir": None,
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 32,
    # Not set here (library default applies); supply through ``train_params``
    # if needed, e.g. evaluation_strategy="steps", eval_steps=200.
    "save_steps": 1000,
    "save_total_limit": 2,
    # Not set here (library default applies): load_best_model_at_end,
    # metric_for_best_model (e.g. "loss"), greater_is_better.
    "logging_dir": None,
    "logging_steps": 5,
    "gradient_accumulation_steps": 1,
    "learning_rate": 5e-4,
    # Not set here (library default applies): disable_tqdm, no_cuda.
}


class ElmoTokenizer(PretrainedEduTokenizer):
    """
    Tokenizer used by the ELMo models in this module.

    A thin subclass of ``PretrainedEduTokenizer``: the constructor only fixes
    the default argument values and forwards everything to the base class.

    Parameters
    ----------
    vocab_path: str, optional, default=None
        Forwarded unchanged to ``PretrainedEduTokenizer``.
    max_length: int, optional, default=250
        Forwarded unchanged to ``PretrainedEduTokenizer``.
    tokenize_method: str, optional, default="pure_text"
        Forwarded unchanged to ``PretrainedEduTokenizer``.
    add_specials: optional, default=True
        Forwarded unchanged to ``PretrainedEduTokenizer``.
    **kwargs
        Any further keyword arguments, forwarded unchanged to the base class.

    Examples
    --------
    >>> t=ElmoTokenizer()
    >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
    >>> len(t)
    14
    >>> t.tokenize(items[0])
    ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
    >>> t(items[0])
    {'seq_idx': tensor([1, 1, 6, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 7]), 'seq_len': tensor(17)}
    >>> t.set_vocab(items[0])
    ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
    >>> len(t)
    45
    >>> t(items[0])
    {'seq_idx': tensor([ 1, 1, 6, 26, 27, 28, 1, 1, 9, 35, 36, 26, 37, 38, 28, 1, 7]), 'seq_len': tensor(17)}
    """

    def __init__(self, vocab_path=None, max_length=250, tokenize_method="pure_text",
                 add_specials=True, **kwargs):
        # No behaviour of its own: delegate construction to the base tokenizer.
        super().__init__(
            vocab_path=vocab_path,
            max_length=max_length,
            tokenize_method=tokenize_method,
            add_specials=add_specials,
            **kwargs,
        )
class ElmoDataset(EduDataset):
    """
    Dataset for the ELMo training functions.

    Construction is delegated to ``EduDataset``; this class adds the batch
    collation used as the ``data_collator`` of the Hugging Face ``Trainer``.

    Parameters
    ----------
    tokenizer: ElmoTokenizer
        Tokenizer forwarded to ``EduDataset``; its ``vocab.pad_idx`` is used
        as the padding value when batching.
    **kwargs
        Forwarded unchanged to ``EduDataset``.
    """

    def __init__(self, tokenizer: ElmoTokenizer, **kwargs):
        super().__init__(tokenizer=tokenizer, **kwargs)

    def collate_fn(self, batch_data):
        """
        Merge a list of samples into one batch of tensors.

        Parameters
        ----------
        batch_data: list of dict
            Samples sharing the keys of the first sample; must contain
            ``"seq_idx"``.

        Returns
        -------
        dict
            One entry per key of the first sample. ``"seq_idx"`` is padded
            with the tokenizer's pad index via ``pad_sequence``; every value
            is converted with ``torch.as_tensor``.
        """
        padding_value = self.tokenizer.vocab.pad_idx
        field_names = batch_data[0].keys()

        # Turn the list of per-sample dicts into one list per field.
        columns = {}
        for name in field_names:
            columns[name] = [sample[name] for sample in batch_data]

        # Only the token-id sequences are padded before tensor conversion.
        columns["seq_idx"] = pad_sequence(columns["seq_idx"], pad_val=padding_value)

        return {name: torch.as_tensor(values) for name, values in columns.items()}
def train_elmo(items: Union[List[dict], List[str]], output_dir: str, pretrained_dir: str = None,
               tokenizer_params=None, data_params=None, model_params=None, train_params=None):
    """
    Pre-train an ELMo language model and save model, config and tokenizer.

    Parameters
    ----------
    items: list, required
        The training corpus. The type of the first item decides how the
        vocabulary is built: plain strings are used directly, otherwise the
        text is read from ``item[stem_key]``.
    output_dir: str, required
        The directory to save trained model files
    pretrained_dir: str, optional
        The pretrained directory for model and tokenizer
    tokenizer_params: dict, optional, default=None
        The parameters passed to ElmoTokenizer
    data_params: dict, optional, default=None
        Only ``stem_key`` (default ``"ques_content"``) is read here; it is
        used for vocabulary building and passed to ElmoDataset.
    model_params: dict, optional, default=None
        The parameters passed to ElmoLMForPreTraining
    train_params: dict, optional, default=None
        Overrides applied on top of ``DEFAULT_TRAIN_PARAMS`` before building
        ``TrainingArguments``.

    Returns
    -------
    str
        ``output_dir``
    """
    tokenizer_params = tokenizer_params or {}
    data_params = {} if data_params is None else data_params
    model_params = {} if model_params is None else model_params
    train_params = {} if train_params is None else train_params

    stem_key = data_params.get("stem_key", "ques_content")

    # tokenizer configuration
    # NOTE(review): the tokenizer is only loaded when ``pretrained_dir`` exists
    # on disk, while the model below is loaded whenever ``pretrained_dir`` is
    # truthy -- confirm that this asymmetry is intended.
    load_tokenizer = pretrained_dir is not None and os.path.exists(pretrained_dir)
    if load_tokenizer:
        tokenizer = ElmoTokenizer.from_pretrained(pretrained_dir, **tokenizer_params)
    else:
        tokenizer_config = {"add_specials": True, "tokenize_method": "pure_text"}
        tokenizer_config.update(tokenizer_params or {})
        tokenizer = ElmoTokenizer(**tokenizer_config)
        # A fresh tokenizer has no corpus vocabulary yet: build it from items.
        if isinstance(items[0], str):
            tokenizer.set_vocab(items)
        else:
            tokenizer.set_vocab(items, key=lambda item: item[stem_key])

    # dataset configuration
    dataset = ElmoDataset(tokenizer=tokenizer, items=items, stem_key=stem_key)

    # model configuration
    if pretrained_dir:
        model = ElmoLMForPreTraining.from_pretrained(pretrained_dir, **model_params)
    else:
        model_config = {
            "vocab_size": len(tokenizer),
            "embedding_dim": 300,
            "hidden_size": 300,
        }
        model_config.update(model_params or {})
        model = ElmoLMForPreTraining(**model_config)
    model.elmo.LM_layer.rnn.flatten_parameters()

    # training configuration
    training_kwargs = deepcopy(DEFAULT_TRAIN_PARAMS)
    training_kwargs["output_dir"] = output_dir
    training_kwargs.update(train_params or {})
    trainer = Trainer(
        model=model,
        args=TrainingArguments(**training_kwargs),
        train_dataset=dataset,
        data_collator=dataset.collate_fn,
    )
    trainer.train()

    assert isinstance(trainer.model, ElmoLMForPreTraining)
    # Weights, model config and tokenizer all go to the same directory.
    trainer.save_model(output_dir)
    trainer.model.save_config(output_dir)
    tokenizer.save_pretrained(output_dir)
    return output_dir
def train_elmo_for_property_prediction(
        train_items: list, output_dir: str, pretrained_dir=None, eval_items=None,
        tokenizer_params=None, data_params=None, train_params=None, model_params=None
):
    """
    Train an ELMo model for property prediction and save model, config and tokenizer.

    Parameters
    ----------
    train_items: list, required
        The training items. Each item is indexed with ``stem_key`` to obtain
        its text when the vocabulary is built.
    output_dir: str, required
        The directory to save trained model files
    pretrained_dir: str, optional
        The pretrained directory for model and tokenizer
    eval_items: list, optional, default=None
        The evaluating items. When given they are added to the vocabulary
        corpus and passed to the Trainer as ``eval_dataset``.
    tokenizer_params: dict, optional, default=None
        The parameters passed to ElmoTokenizer
    data_params: dict, optional, default=None
        ``stem_key`` (default ``"ques_content"``) and ``label_key`` (default
        ``"difficulty"``), passed to ElmoDataset; ``stem_key`` is also used
        for vocabulary building.
    train_params: dict, optional, default=None
        Overrides applied on top of ``DEFAULT_TRAIN_PARAMS`` before building
        ``TrainingArguments``.
    model_params: dict, optional, default=None
        The parameters passed to ElmoLMForPropertyPrediction

    Returns
    -------
    str
        ``output_dir``
    """
    tokenizer_params = tokenizer_params if tokenizer_params else {}
    data_params = data_params if data_params is not None else {}
    model_params = model_params if model_params is not None else {}
    train_params = train_params if train_params is not None else {}

    stem_key = data_params.get("stem_key", "ques_content")
    label_key = data_params.get("label_key", "difficulty")

    # tokenizer configuration
    if pretrained_dir is not None:
        tokenizer = ElmoTokenizer.from_pretrained(pretrained_dir, **tokenizer_params)
    else:
        work_tokenizer_params = {
            # Same key as the ElmoTokenizer constructor argument and as train_elmo.
            "add_specials": True,
            "tokenize_method": "pure_text",
        }
        work_tokenizer_params.update(tokenizer_params)
        tokenizer = ElmoTokenizer(**work_tokenizer_params)
        # The parentheses matter: without them the conditional expression
        # swallows ``train_items`` too and the corpus is empty whenever
        # ``eval_items`` is None or empty.
        corpus_items = train_items + (eval_items if eval_items else [])
        tokenizer.set_vocab(corpus_items, key=lambda x: x[stem_key])

    # dataset configuration
    train_dataset = ElmoDataset(tokenizer=tokenizer, items=train_items,
                                stem_key=stem_key, label_key=label_key)
    if eval_items is not None:
        eval_dataset = ElmoDataset(tokenizer=tokenizer, items=eval_items,
                                   stem_key=stem_key, label_key=label_key)
    else:
        eval_dataset = None

    # model configuration
    if pretrained_dir is not None:
        model = ElmoLMForPropertyPrediction.from_pretrained(pretrained_dir, **model_params)
    else:
        work_model_params = {
            "vocab_size": len(tokenizer),
            "embedding_dim": 512,
            "hidden_size": 512
        }
        work_model_params.update(model_params if model_params else {})
        model = ElmoLMForPropertyPrediction(**work_model_params)
    model.elmo.LM_layer.rnn.flatten_parameters()

    # training configuration
    work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS)
    work_train_params["output_dir"] = output_dir
    work_train_params.update(train_params if train_params else {})
    work_args = TrainingArguments(**work_train_params)
    trainer = Trainer(
        model=model,
        args=work_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=train_dataset.collate_fn,
    )
    trainer.train()

    assert isinstance(trainer.model, ElmoLMForPropertyPrediction)
    # Weights, model config and tokenizer all go to the same directory.
    trainer.save_model(output_dir)
    trainer.model.save_config(output_dir)
    tokenizer.save_pretrained(output_dir)
    return output_dir
def train_elmo_for_knowledge_prediction(
        train_items: list, output_dir: str, pretrained_dir=None, eval_items=None,
        tokenizer_params=None, data_params=None, train_params=None, model_params=None
):
    """
    Train an ELMo model for knowledge prediction and save model, config and tokenizer.

    Parameters
    ----------
    train_items: list, required
        The training items. Each item is indexed with ``stem_key`` to obtain
        its text when the vocabulary is built.
    output_dir: str, required
        The directory to save trained model files
    pretrained_dir: str, optional
        The pretrained directory for model and tokenizer
    eval_items: list, optional, default=None
        The evaluating items. When given they are added to the vocabulary
        corpus and passed to the Trainer as ``eval_dataset``.
    tokenizer_params: dict, optional, default=None
        The parameters passed to ElmoTokenizer
    data_params: dict, optional, default=None
        ``stem_key`` (default ``"ques_content"``) and ``label_key`` (default
        ``"know_list"``), passed to ElmoDataset; ``stem_key`` is also used
        for vocabulary building.
    train_params: dict, optional, default=None
        Overrides applied on top of ``DEFAULT_TRAIN_PARAMS`` before building
        ``TrainingArguments``.
    model_params: dict, optional, default=None
        The parameters passed to ElmoLMForKnowledgePrediction

    Returns
    -------
    str
        ``output_dir``
    """
    tokenizer_params = tokenizer_params if tokenizer_params else {}
    data_params = data_params if data_params is not None else {}
    model_params = model_params if model_params is not None else {}
    train_params = train_params if train_params is not None else {}

    stem_key = data_params.get("stem_key", "ques_content")
    label_key = data_params.get("label_key", "know_list")

    # tokenizer configuration
    if pretrained_dir is not None:
        tokenizer = ElmoTokenizer.from_pretrained(pretrained_dir, **tokenizer_params)
    else:
        work_tokenizer_params = {
            # Same key as the ElmoTokenizer constructor argument and as train_elmo.
            "add_specials": True,
            "tokenize_method": "pure_text",
        }
        work_tokenizer_params.update(tokenizer_params)
        tokenizer = ElmoTokenizer(**work_tokenizer_params)
        # The parentheses matter: without them the conditional expression
        # swallows ``train_items`` too and the corpus is empty whenever
        # ``eval_items`` is None or empty.
        corpus_items = train_items + (eval_items if eval_items else [])
        tokenizer.set_vocab(corpus_items, key=lambda x: x[stem_key])

    # dataset configuration
    train_dataset = ElmoDataset(tokenizer=tokenizer, items=train_items,
                                stem_key=stem_key, label_key=label_key)
    if eval_items is not None:
        eval_dataset = ElmoDataset(tokenizer=tokenizer, items=eval_items,
                                   stem_key=stem_key, label_key=label_key)
    else:
        eval_dataset = None

    # model configuration
    if pretrained_dir is not None:
        model = ElmoLMForKnowledgePrediction.from_pretrained(pretrained_dir, **model_params)
    else:
        work_model_params = {
            "vocab_size": len(tokenizer),
            "embedding_dim": 512,
            "hidden_size": 512
        }
        work_model_params.update(model_params if model_params else {})
        model = ElmoLMForKnowledgePrediction(**work_model_params)
    model.elmo.LM_layer.rnn.flatten_parameters()

    # training configuration
    work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS)
    work_train_params["output_dir"] = output_dir
    work_train_params.update(train_params if train_params else {})
    work_args = TrainingArguments(**work_train_params)
    trainer = Trainer(
        model=model,
        args=work_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=train_dataset.collate_fn,
    )
    trainer.train()

    assert isinstance(trainer.model, ElmoLMForKnowledgePrediction)
    # Weights, model config and tokenizer all go to the same directory.
    trainer.save_model(output_dir)
    trainer.model.save_config(output_dir)
    tokenizer.save_pretrained(output_dir)
    return output_dir