[13]:

import torch
import numpy as np
import os
import json
from transformers import BertModel as HFBertModel
from EduNLP.Pretrain import BertTokenizer, finetune_bert
from EduNLP.Vector import T2V, BertModel
from EduNLP.I2V import Bert, get_pretrained_i2v

import os
# Set before training so the experiment-logging integration stays off.
# NOTE(review): the training log further down reports this environment variable
# as deprecated in favour of `report_to` — TODO confirm whether finetune_bert
# forwards a `report_to` entry from train_params.
os.environ["WANDB_DISABLED"] = "true"

训练自己的 BERT 模型

1. 数据

[2]:

# Set your data path and output path here; both are resolved against BASE_DIR.
BASE_DIR = "../.."

# Directory that holds the sample items read by the data-loading cell.
data_dir = BASE_DIR + "/static/test_data"
# Directory that receives the fine-tuned model and tokenizer files.
output_dir = BASE_DIR + "/examples/test_model/data/pretrain_test_models/bert"

[3]:

def stem_data(data_path=None):
    """Load the training items from a JSON-lines file.

    Args:
        data_path: Path of the file to read, holding one JSON document per
            line. Defaults to ``standard_luna_data.json`` inside ``data_dir``.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "standard_luna_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate over the file object instead of readlines() so the whole
        # file is never held in memory twice, and skip blank lines (for
        # example an extra newline at the end) that json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]

# Items passed to finetune_bert as the training corpus in the next section.
train_items = stem_data()

2. 训练和评估

[6]:

# Custom training parameters: one epoch, batch size 1 per device,
# a loss log every 3 steps and a checkpoint every 50 steps.
train_params = dict(
    num_train_epochs=1,
    save_steps=50,
    per_device_train_batch_size=1,
    logging_steps=3,
)

# Fine-tune on the loaded items and write the result to output_dir.
# "stem_key" presumably names the field of each item that holds the question
# text — TODO confirm against finetune_bert's documentation.
finetune_bert(
    train_items,
    output_dir,
    data_params={"stem_key": "ques_content"},
    train_params=train_params,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization.
The tokenizer class you load from this checkpoint is 'BertTokenizer'.
The class this function is called from is 'EduTokenizerForBert'.
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
/home/qlh/anaconda3/envs/dev/lib/python3.6/site-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  FutureWarning,
***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 13
/home/qlh/anaconda3/envs/dev/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
  warnings.warn('Was asked to gather along dimension 0, but all '

[13/13 00:02, Epoch 1/1]

Step	Training Loss
3	4.236200
6	3.176300
9	1.507000
12	1.878000



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ../../examples/test_model/data/pretrain_test_models/bert
Configuration saved in ../../examples/test_model/data/pretrain_test_models/bert/config.json
Model weights saved in ../../examples/test_model/data/pretrain_test_models/bert/pytorch_model.bin
tokenizer config file saved in ../../examples/test_model/data/pretrain_test_models/bert/tokenizer_config.json
Special tokens file saved in ../../examples/test_model/data/pretrain_test_models/bert/special_tokens_map.json
added tokens file saved in ../../examples/test_model/data/pretrain_test_models/bert/added_tokens.json

3. 使用模型

[15]:

# Two sample items used by every inference example below; the question text of
# each item is stored under the 'ques_content' key.
# The trailing backslashes sit inside the string literals, so each string
# continues on the next source line and that line's leading spaces are part of
# the string value.
# NOTE(review): the first item additionally carries \FormFigureID and
# \FormFigureBase64 markers whose payloads ("wrong1?", "wrong2?") look
# intentionally malformed — confirm that this is deliberate.
test_items = [
    {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$，\
            如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'}
]

3.1 直接加载令牌化容器（tokenizer）和模型

[10]:

# The fine-tuned model and its tokenizer were both saved to output_dir.
pretrained_model_dir = output_dir

model = HFBertModel.from_pretrained(pretrained_model_dir)
tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)

# Inference only: disable autograd so the forward passes do not build a graph
# (the outputs would otherwise carry a grad_fn and keep activations alive).
with torch.no_grad():
    # A single item ...
    encodes = tokenizer(test_items[0], key=lambda x: x['ques_content'])
    single_output = model(**encodes)
    # ... and a list of items go through the same call. `key` is passed by
    # keyword, as in the other cells of this notebook.
    encodes = tokenizer(test_items, key=lambda x: x['ques_content'])
    batch_output = model(**encodes)

# Last expression of the cell: show the batch result as the cell output.
batch_output

loading configuration file ../../examples/test_model/data/pretrain_test_models/bert/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21137
}

loading weights file ../../examples/test_model/data/pretrain_test_models/bert/pytorch_model.bin
Some weights of the model checkpoint at ../../examples/test_model/data/pretrain_test_models/bert were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../../examples/test_model/data/pretrain_test_models/bert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
loading file ../../examples/test_model/data/pretrain_test_models/bert/vocab.txt
loading file ../../examples/test_model/data/pretrain_test_models/bert/added_tokens.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/special_tokens_map.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/tokenizer_config.json
Adding [TEXT] to the vocabulary
Adding [FORMULA] to the vocabulary
Adding [FIGURE] to the vocabulary
Adding [MARK] to the vocabulary
Adding [TAG] to the vocabulary
Adding [TEXT_BEGIN] to the vocabulary
Adding [TEXT_END] to the vocabulary
Adding [FORMULA_BEGIN] to the vocabulary
Adding [FORMULA_END] to the vocabulary
Assigning ['[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]'] to the additional_special_tokens key of the tokenizer

[10]:

BaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                               tensor([[[ 0.4089,  1.0626,  0.0125,  ...,  0.9473, -1.1455, -0.1160],
                                                        [-0.5247,  0.2449, -0.2175,  ..., -0.0708, -0.4598, -0.3746],
                                                        [-0.5054,  0.5353, -0.9247,  ...,  1.1696,  0.0792, -0.3338],
                                                        ...,
                                                        [-0.1391,  0.0392, -0.8276,  ...,  0.9213, -0.1554, -0.2917],
                                                        [ 0.5237,  0.2678,  0.6923,  ...,  0.1681, -0.9408, -0.2269],
                                                        [ 0.3768,  0.2677,  0.3381,  ...,  0.9611, -2.1952, -0.0641]],

                                                       [[ 0.3635,  1.0077,  0.0537,  ...,  0.8781, -1.2010, -0.1730],
                                                        [-0.4258,  0.3437, -0.1443,  ..., -0.0933, -0.3453, -0.3237],
                                                        [ 0.1931, -0.2688,  0.8572,  ...,  1.2704, -0.6482, -0.4281],
                                                        ...,
                                                        [ 0.4101,  0.1993,  0.5072,  ...,  0.8726, -2.0718, -0.1272],
                                                        [ 0.6080,  0.2398,  0.9711,  ...,  0.4306, -1.1894, -0.3648],
                                                        [ 0.2173,  0.1151,  1.1694,  ...,  0.6153, -1.1397, -0.2648]]],
                                                      grad_fn=<NativeLayerNormBackward>)),
                                              ('pooler_output',
                                               tensor([[ 0.4122, -0.3051, -0.0791,  ...,  0.3698, -0.4794, -0.4627],
                                                       [ 0.4386, -0.2620, -0.0524,  ...,  0.3713, -0.4795, -0.3963]],
                                                      grad_fn=<TanhBackward>))])

[12]:

# Wrap the fine-tuned model as an item-to-vector converter; the tokenizer
# configuration is read from the same output directory.
tokenizer_kwargs = dict(tokenizer_config_dir=output_dir)
i2v = Bert('bert', 'bert', output_dir, tokenizer_kwargs=tokenizer_kwargs)

# The same call embeds a single item and a list of items.
# Shapes printed in the recorded run:
#   single item -> torch.Size([1, 768]) and torch.Size([1, 17, 768])
#   two items   -> torch.Size([2, 768]) and torch.Size([2, 17, 768])
for items in (test_items[0], test_items):
    i_vec, t_vec = i2v(items, key=lambda x: x["ques_content"])
    print(i_vec.shape)
    print(t_vec.shape)

loading configuration file ../../examples/test_model/data/pretrain_test_models/bert/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21137
}

loading weights file ../../examples/test_model/data/pretrain_test_models/bert/pytorch_model.bin
Some weights of the model checkpoint at ../../examples/test_model/data/pretrain_test_models/bert were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../../examples/test_model/data/pretrain_test_models/bert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
loading file ../../examples/test_model/data/pretrain_test_models/bert/vocab.txt
loading file ../../examples/test_model/data/pretrain_test_models/bert/added_tokens.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/special_tokens_map.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/tokenizer_config.json
Adding [TEXT] to the vocabulary
Adding [FORMULA] to the vocabulary
Adding [FIGURE] to the vocabulary
Adding [MARK] to the vocabulary
Adding [TAG] to the vocabulary
Adding [TEXT_BEGIN] to the vocabulary
Adding [TEXT_END] to the vocabulary
Adding [FORMULA_BEGIN] to the vocabulary
Adding [FORMULA_END] to the vocabulary
Assigning ['[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]'] to the additional_special_tokens key of the tokenizer

torch.Size([1, 768])
torch.Size([1, 17, 768])
torch.Size([2, 768])
torch.Size([2, 17, 768])

3.2 使用 BertTokenizer 与 BertModel

[23]:

# Load the tokenizer that was saved to the fine-tuning output directory.
tokenizer = BertTokenizer.from_pretrained(output_dir)

# A single item can be tokenized ...
single_encoding = tokenizer(test_items[0], key=lambda x: x['ques_content'])
print(single_encoding, end="\n\n")

# ... and so can a list of items.
token_items = tokenizer(test_items, key=lambda x: x['ques_content'])
print(token_items, end="\n\n")

# return_tensors selects the tensor type of the returned encoding.
tensor_encoding = tokenizer(
    test_items[0], key=lambda x: x['ques_content'], return_tensors='pt'
)
print(tensor_encoding)

# tokenize() shows the token text instead of the ids, for one item or a list.
for sample in (test_items[0], test_items):
    print(tokenizer.tokenize(sample, key=lambda x: x['ques_content']))

loading file ../../examples/test_model/data/pretrain_test_models/bert/vocab.txt
loading file ../../examples/test_model/data/pretrain_test_models/bert/added_tokens.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/special_tokens_map.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/tokenizer_config.json
Adding [TEXT] to the vocabulary
Adding [FORMULA] to the vocabulary
Adding [FIGURE] to the vocabulary
Adding [MARK] to the vocabulary
Adding [TAG] to the vocabulary
Adding [TEXT_BEGIN] to the vocabulary
Adding [TEXT_END] to the vocabulary
Adding [FORMULA_BEGIN] to the vocabulary
Adding [FORMULA_END] to the vocabulary
Assigning ['[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]'] to the additional_special_tokens key of the tokenizer

{'input_ids': tensor([[  101,   100,   100,   100, 21130,   166,   117,   167,   100,   102,
           168,   134,   166,   116,   128,   167,   100, 21131,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

{'input_ids': tensor([[  101,   100,   100,   100, 21130,   166,   117,   167,   100,   102,
           168,   134,   166,   116,   128,   167,   100, 21131,   102],
        [  101,   100, 21130,   166,   117,   167,   100,   102,   168,   134,
           166,   116,   128,   167,   100, 21131,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}

{'input_ids': tensor([[  101,   100,   100,   100, 21130,   166,   117,   167,   100,   102,
           168,   134,   166,   116,   128,   167,   100, 21131,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['公式', '公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
[['公式', '公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'], ['如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']]

[24]:

# Load the tokenizer saved with the fine-tuned model and encode both items.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])

t2v = BertModel(pretrained_model_dir)

# Calling the model directly; the recorded run printed a 3-D shape here,
# torch.Size([2, 19, 768]).
i_vec = t2v(encodes)
print(i_vec.shape, end="\n\n")

# Dedicated helpers for the item-level and the token-level vectors; the
# recorded run printed torch.Size([2, 768]) and torch.Size([2, 17, 768]).
i_vec = t2v.infer_vector(encodes)
t_vec = t2v.infer_tokens(encodes)
for vec in (i_vec, t_vec):
    print(vec.shape)
print()

loading file ../../examples/test_model/data/pretrain_test_models/bert/vocab.txt
loading file ../../examples/test_model/data/pretrain_test_models/bert/added_tokens.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/special_tokens_map.json
loading file ../../examples/test_model/data/pretrain_test_models/bert/tokenizer_config.json
Adding [TEXT] to the vocabulary
Adding [FORMULA] to the vocabulary
Adding [FIGURE] to the vocabulary
Adding [MARK] to the vocabulary
Adding [TAG] to the vocabulary
Adding [TEXT_BEGIN] to the vocabulary
Adding [TEXT_END] to the vocabulary
Adding [FORMULA_BEGIN] to the vocabulary
Adding [FORMULA_END] to the vocabulary
Assigning ['[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]'] to the additional_special_tokens key of the tokenizer
loading configuration file ../../examples/test_model/data/pretrain_test_models/bert/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21137
}

loading weights file ../../examples/test_model/data/pretrain_test_models/bert/pytorch_model.bin
Some weights of the model checkpoint at ../../examples/test_model/data/pretrain_test_models/bert were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../../examples/test_model/data/pretrain_test_models/bert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

torch.Size([2, 19, 768])

torch.Size([2, 768])
torch.Size([2, 17, 768])

3.3 使用 EduNLP 中公开的预训练模型

[8]:

# Directory into which the published model archive is downloaded and unpacked.
pretrained_dir = BASE_DIR + "/examples/test_model/data/data/bert"
# Fetch the publicly released "luna_bert" model as an item-to-vector converter.
i2v = get_pretrained_i2v("luna_bert", model_dir=pretrained_dir)

EduNLP, INFO model_path: ..\..\examples\test_model/data\data\bert\LUNABert
EduNLP, INFO Use pretrained t2v model luna_bert
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/EduNLP/LUNABert.zip is saved as ..\..\examples\test_model/data\data\bert\LUNABert.zip

Downloading ..\..\examples\test_model/data\data\bert\LUNABert.zip 100.00%: 362MB | 362MB

downloader, INFO ..\..\examples\test_model/data\data\bert\LUNABert.zip is unzip to ..\..\examples\test_model/data\data\bert\LUNABert

loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

loading weights file ..\..\examples\test_model/data\data\bert\LUNABert\pytorch_model.bin
Some weights of the model checkpoint at ..\..\examples\test_model/data\data\bert\LUNABert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ..\..\examples\test_model/data\data\bert\LUNABert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

Didn't find file ..\..\examples\test_model/data\data\bert\LUNABert\tokenizer.json. We won't load it.
loading file ..\..\examples\test_model/data\data\bert\LUNABert\vocab.txt
loading file None
loading file ..\..\examples\test_model/data\data\bert\LUNABert\added_tokens.json
loading file ..\..\examples\test_model/data\data\bert\LUNABert\special_tokens_map.json
loading file ..\..\examples\test_model/data\data\bert\LUNABert\tokenizer_config.json
loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

Adding [FORMULA] to the vocabulary
Adding [FIGURE] to the vocabulary
Adding [MARK] to the vocabulary
Adding [TAG] to the vocabulary
loading configuration file ..\..\examples\test_model/data\data\bert\LUNABert\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21132
}

[9]:

# Item-level and token-level vectors for the whole list in one call.
batch_vectors = i2v(test_items, key=lambda x: x['ques_content'])
i_vec, t_vec = batch_vectors
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")

# The item vectors and the per-token vectors can also be requested separately.
i_vec = i2v.infer_item_vector(test_items, key=lambda x: x['ques_content'])
print(i_vec.shape)
t_vec = i2v.infer_token_vector(test_items, key=lambda x: x['ques_content'])
print(t_vec.shape, end="\n\n")

# Likewise, a single item can be embedded.
# NOTE(review): the recorded run printed torch.Size([1, 2, 768]) for the token
# vectors here versus torch.Size([2, 32, 768]) for the list — confirm that the
# single-item path tokenizes the question as intended.
single_vectors = i2v(test_items[0], key=lambda x: x['ques_content'])
i_vec, t_vec = single_vectors
print(i_vec.shape, t_vec.shape, sep="\n")

torch.Size([2, 768])
torch.Size([2, 32, 768])

torch.Size([2, 768])
torch.Size([2, 32, 768])

torch.Size([1, 768])
torch.Size([1, 2, 768])

[ ]: