[1]:
import torch
import numpy as np
import os
import json
from EduNLP.Pretrain import BertTokenizer, finetune_bert
from EduNLP.Vector import T2V
from EduNLP.I2V import Bert, get_pretrained_i2v

import os
# Set before training starts; presumably this turns off Weights & Biases
# reporting in the Hugging Face trainer (TODO confirm). The recorded training
# output flags this variable as deprecated in favour of `--report_to none`.
os.environ["WANDB_DISABLED"] = "true"
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)

训练自己的Bert模型

1. 数据

[2]:
# Set your data path and output path.
# BASE_DIR is relative to the notebook's working directory.
BASE_DIR = "../.."

# Directory that is searched for the training data file.
data_dir = f"{BASE_DIR}/static/test_data"
# Directory that receives the fine-tuned model and tokenizer files.
output_dir = f"{BASE_DIR}/examples/test_model/data/data/bert"
[3]:
def raw_data():
    """Load the raw items stored in ``OpenLUNA.json`` under ``data_dir``.

    The file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and the parsed objects are returned in file order.

    Returns:
        list: One parsed JSON value per non-blank line of the file.

    Raises:
        OSError: If the data file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_path = os.path.join(data_dir, "OpenLUNA.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate the handle directly instead of copying every line with
        # readlines(), and skip whitespace-only lines (e.g. a trailing blank
        # line), which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

def stem_data(data):
    """Tokenize the ``"stem"`` field of every item with a ``BertTokenizer``.

    Args:
        data: Iterable of mappings, each providing a ``"stem"`` entry.

    Returns:
        list: The tokenizer results in input order, with ``None`` results
        dropped.

    Raises:
        AssertionError: If no item produced a non-``None`` result.
    """
    tokenize = BertTokenizer()
    encoded_stems = [
        encoded
        for encoded in (tokenize(entry["stem"]) for entry in data)
        if encoded is not None
    ]
    # An empty result means there is nothing to train on.
    assert encoded_stems
    return encoded_stems

# Store the loaded items under a name distinct from the loader function.
# Rebinding `raw_data` to its own result would turn the name into a list, so
# re-running this cell would fail with "'list' object is not callable".
raw_items = raw_data()
train_items = stem_data(raw_items)

2. 训练和评估

[4]:
# Custom training parameters passed through to finetune_bert.
train_params = dict(
    epochs=1,
    save_steps=50,
    batch_size=1,
    logging_steps=3,
)

# Fine-tune on the first 50 tokenized items only, to keep the demo run short;
# the results are written to output_dir.
finetune_bert(train_items[:50], output_dir, train_params=train_params)
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 50
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 50
 10%|█         | 5/50 [00:00<00:05,  8.26it/s]
{'loss': 1.5967, 'learning_rate': 4.7e-05, 'epoch': 0.06}
 14%|█▍        | 7/50 [00:00<00:04,  9.09it/s]
{'loss': 1.9648, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.12}
 22%|██▏       | 11/50 [00:01<00:03, 10.18it/s]
{'loss': 4.4029, 'learning_rate': 4.1e-05, 'epoch': 0.18}
 26%|██▌       | 13/50 [00:01<00:03, 10.49it/s]
{'loss': 1.8761, 'learning_rate': 3.8e-05, 'epoch': 0.24}
 34%|███▍      | 17/50 [00:01<00:02, 11.02it/s]
{'loss': 1.0214, 'learning_rate': 3.5e-05, 'epoch': 0.3}
 38%|███▊      | 19/50 [00:01<00:02, 10.86it/s]
{'loss': 2.0055, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.36}
 46%|████▌     | 23/50 [00:02<00:02, 11.17it/s]
{'loss': 2.7323, 'learning_rate': 2.9e-05, 'epoch': 0.42}
 50%|█████     | 25/50 [00:02<00:02, 11.36it/s]
{'loss': 1.6171, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.48}
 54%|█████▍    | 27/50 [00:02<00:02, 11.26it/s]
{'loss': 3.3698, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.54}
 62%|██████▏   | 31/50 [00:03<00:01, 10.78it/s]
{'loss': 1.5574, 'learning_rate': 2e-05, 'epoch': 0.6}
 70%|███████   | 35/50 [00:03<00:01, 11.47it/s]
{'loss': 2.0541, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.66}
 74%|███████▍  | 37/50 [00:03<00:01, 11.28it/s]
{'loss': 2.0624, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.72}
 82%|████████▏ | 41/50 [00:03<00:00, 11.33it/s]
{'loss': 2.8201, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
 86%|████████▌ | 43/50 [00:04<00:00, 11.44it/s]
{'loss': 1.5396, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.84}
 94%|█████████▍| 47/50 [00:04<00:00, 11.66it/s]
{'loss': 3.2493, 'learning_rate': 5e-06, 'epoch': 0.9}
 98%|█████████▊| 49/50 [00:04<00:00, 10.19it/s]Saving model checkpoint to ../../examples/test_model/data/data/bert\checkpoint-50
Configuration saved in ../../examples/test_model/data/data/bert\checkpoint-50\config.json
{'loss': 2.4102, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.96}
Model weights saved in ../../examples/test_model/data/data/bert\checkpoint-50\pytorch_model.bin
tokenizer config file saved in ../../examples/test_model/data/data/bert\checkpoint-50\tokenizer_config.json
Special tokens file saved in ../../examples/test_model/data/data/bert\checkpoint-50\special_tokens_map.json
Deleting older checkpoint [..\..\examples\test_model/data\data\bert\checkpoint-5] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 50/50 [00:07<00:00,  6.26it/s]
Saving model checkpoint to ../../examples/test_model/data/data/bert
Configuration saved in ../../examples/test_model/data/data/bert\config.json
{'train_runtime': 8.0115, 'train_samples_per_second': 6.241, 'train_steps_per_second': 6.241, 'train_loss': 2.197276794910431, 'epoch': 1.0}
Model weights saved in ../../examples/test_model/data/data/bert\pytorch_model.bin
tokenizer config file saved in ../../examples/test_model/data/data/bert\tokenizer_config.json
Special tokens file saved in ../../examples/test_model/data/data/bert\special_tokens_map.json
tokenizer config file saved in ../../examples/test_model/data/data/bert\tokenizer_config.json
Special tokens file saved in ../../examples/test_model/data/data/bert\special_tokens_map.json

3. 使用模型

3.1 使用训练好的Bert模型

[5]:
# Two sample questions, each a dict whose 'stem' entry holds the question text.
# NOTE: the backslash at the end of the first stem line continues the string
# literal, so the leading spaces of the following line are part of the text.
item = [
        {'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
        若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
        {'stem': '已知圆$x^{2}+y^{2}-6 x=0$,过点(1,2)的直线被该圆所截得的弦的长度的最小值为'}
]

# Build the item-to-vector model from the fine-tuned files in output_dir; the
# tokenizer is pointed at the same directory.
tokenizer_kwargs = {"tokenizer_config_dir": output_dir}
i2v = Bert('bert', 'bert', output_dir, tokenizer_kwargs=tokenizer_kwargs)

# A single question can be vectorized; the call returns an item-level vector
# and the per-token vectors.
i_vec, t_vec = i2v(item[0]['stem'])
print(i_vec.shape) # torch.Size([1, 768]) in the recorded run
print(t_vec.shape) # torch.Size([1, 21, 768]) in the recorded run
print()

# A list of questions can be vectorized as well.
i_vec, t_vec = i2v([ item[0]['stem'], item[1]['stem'] ])
print(i_vec.shape) # torch.Size([2, 768]) in the recorded run
print(t_vec.shape) # torch.Size([2, 32, 768]) in the recorded run
loading configuration file ../../examples/test_model/data/data/bert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

loading weights file ../../examples/test_model/data/data/bert\pytorch_model.bin
Some weights of the model checkpoint at ../../examples/test_model/data/data/bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../../examples/test_model/data/data/bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
loading file ../../examples/test_model/data/data/bert\vocab.txt
loading file ../../examples/test_model/data/data/bert\tokenizer.json
loading file ../../examples/test_model/data/data/bert\added_tokens.json
loading file ../../examples/test_model/data/data/bert\special_tokens_map.json
loading file ../../examples/test_model/data/data/bert\tokenizer_config.json
torch.Size([1, 768])
torch.Size([1, 21, 768])

torch.Size([2, 768])
torch.Size([2, 32, 768])

3.2 使用BertTokenizer

[6]:
# Load the tokenizer files saved in output_dir by the fine-tuning run (that
# run started from the bert-base-chinese checkpoint, per its training log).
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Question texts to tokenize.
# NOTE: the backslash at the end of the first line continues the string
# literal, so the leading spaces of the following line are part of the text.
items = [
    "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$,过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]

# A single question can be tokenized; the recorded result holds 'input_ids',
# 'token_type_ids' and 'attention_mask'.
print(tokenizer(items[0]))
print()

# A list of questions can be tokenized as well; in the recorded result the
# shorter sequence is padded with zeros to the length of the longer one.
token_items = tokenizer(items)
print(token_items)
print()

# The return_tensors argument selects the type of the returned tensors
# ('pt' yields torch tensors in the recorded output).
print(tokenizer(items[0], return_tensors='pt'))
loading file ../../examples/test_model/data/data/bert\vocab.txt
loading file ../../examples/test_model/data/data/bert\tokenizer.json
loading file ../../examples/test_model/data/data/bert\added_tokens.json
loading file ../../examples/test_model/data/data/bert\special_tokens_map.json
loading file ../../examples/test_model/data/data/bert\tokenizer_config.json
{'input_ids': [101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

{'input_ids': [[101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2347, 4761, 1749, 166, 141, 169, 123, 171, 116, 167, 141, 169, 123, 171, 118, 127, 8206, 134, 121, 6814, 4157, 4684, 5296, 1749, 2779, 2533, 2478, 7270, 2428, 3297, 2207, 966, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

{'input_ids': tensor([[  101,  1062,  2466,  1963,  1745, 21129,   166,   117,   167,  5276,
          3338,  3340,   816,  1062,  2466,   102,   168,   134,   166,   116,
           128,  8179,  3297,  1920,   966, 21130,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}
[7]:
# The tokenize method shows the text after tokenization: per the recorded
# output, a single string gives one list of tokens and a list of strings gives
# one token list per string.
print(tokenizer.tokenize(items[0]))
print(tokenizer.tokenize(items))
['公', '式', '如', '图', '[FIGURE]', 'x', ',', 'y', '约', '束', '条', '件', '公', '式', '[SEP]', 'z', '=', 'x', '+', '7', '##y', '最', '大', '值', '[MARK]']
[['公', '式', '如', '图', '[FIGURE]', 'x', ',', 'y', '约', '束', '条', '件', '公', '式', '[SEP]', 'z', '=', 'x', '+', '7', '##y', '最', '大', '值', '[MARK]'], ['已', '知', '圆', 'x', '^', '{', '2', '}', '+', 'y', '^', '{', '2', '}', '-', '6', '##x', '=', '0', '过', '点', '直', '线', '圆', '截', '得', '弦', '长', '度', '最', '小', '值']]

3.3 使用EduNLP中公开的预训练模型

[8]:
# Fetch the publicly released pretrained model.
# NOTE: pretrained_dir has the same value as output_dir, so the download (a
# LUNABert sub-directory in the recorded run) lands next to the fine-tuned
# model files. This also rebinds `i2v`, replacing the model built from
# output_dir earlier in the notebook.
pretrained_dir = f"{BASE_DIR}/examples/test_model/data/data/bert"
i2v = get_pretrained_i2v("luna_bert", model_dir=pretrained_dir)
EduNLP, INFO model_path: ..\..\examples\test_model/data\data\bert\LUNABert
EduNLP, INFO Use pretrained t2v model luna_bert
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/LUNABert.zip is saved as ..\..\examples\test_model/data\data\bert\LUNABert.zip
Downloading ..\..\examples\test_model/data\data\bert\LUNABert.zip 100.00%: 362MB | 362MB
downloader, INFO ..\..\examples\test_model/data\data\bert\LUNABert.zip is unzip to ..\..\examples\test_model/data\data\bert\LUNABert

loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

loading weights file ..\..\examples\test_model/data\data\bert\LUNABert\pytorch_model.bin
Some weights of the model checkpoint at ..\..\examples\test_model/data\data\bert\LUNABert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ..\..\examples\test_model/data\data\bert\LUNABert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

Didn't find file ..\..\examples\test_model/data\data\bert\LUNABert\tokenizer.json. We won't load it.
loading file ..\..\examples\test_model/data\data\bert\LUNABert\vocab.txt
loading file None
loading file ..\..\examples\test_model/data\data\bert\LUNABert\added_tokens.json
loading file ..\..\examples\test_model/data\data\bert\LUNABert\special_tokens_map.json
loading file ..\..\examples\test_model/data\data\bert\LUNABert\tokenizer_config.json
loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

Adding [FORMULA] to the vocabulary
Adding [FIGURE] to the vocabulary
Adding [MARK] to the vocabulary
Adding [TAG] to the vocabulary
loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

[9]:
# Question texts to vectorize with the pretrained model.
# NOTE: the backslash at the end of the first line continues the string
# literal, so the leading spaces of the following line are part of the text.
items = [
    "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$,过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]
# Vectorize the whole list: one item-level vector per question plus the
# per-token vectors.
i_vec, t_vec = i2v(items)
print(i_vec.shape)
print(t_vec.shape)
print()

# The item vectors and the token vectors can also be requested separately.
i_vec = i2v.infer_item_vector(items)
print(i_vec.shape)
t_vec = i2v.infer_token_vector(items)
print(t_vec.shape)
print()

# Likewise, a single question can be vectorized.
# Use items[0] (a string from the list above): `item[0]` is a dict left over
# from an earlier cell, not question text, and yielded only 2 tokens.
i_vec, t_vec = i2v(items[0])
print(i_vec.shape)
print(t_vec.shape)
torch.Size([2, 768])
torch.Size([2, 32, 768])

torch.Size([2, 768])
torch.Size([2, 32, 768])

torch.Size([1, 768])
torch.Size([1, 2, 768])
[ ]: