使用QuesNet向量化容器¶
导入功能块¶
[6]:
from EduNLP.Pretrain import BertTokenizer
from EduNLP.Vector import T2V, BertModel
[7]:
BASE_DIR = "../.."
data_dir = f"{BASE_DIR}/static/test_data"
output_dir = f"{BASE_DIR}/examples/test_model/data/bert"
items = [
r"题目一:如图几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$",
r"题目二: 如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$"
]
令牌化¶
[13]:
tokenizer = BertTokenizer.from_pretrained(output_dir)
# 对题目文本进行令牌化
items = [
"有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
"已知圆$x^{2}+y^{2}-6 x=0$,过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]
# 可以对单个题目进行令牌化
print(tokenizer(items[0]))
print()
# 也可以对题目列表进行令牌化
token_items = tokenizer(items)
print(token_items)
print()
# 可以使用return_tensors参数指定返回张量的类型
print(tokenizer(items[0], return_tensors='pt'))
token_items = tokenizer(items, return_tensors='pt')
{'input_ids': [101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [[101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2347, 4761, 1749, 166, 141, 169, 123, 171, 116, 167, 141, 169, 123, 171, 118, 127, 8206, 134, 121, 6814, 4157, 4684, 5296, 1749, 2779, 2533, 2478, 7270, 2428, 3297, 2207, 966, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
{'input_ids': tensor([[ 101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276,
3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116,
128, 8179, 3297, 1920, 966, 21130, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1]])}
向量化¶
[15]:
t2v = BertModel(output_dir)
# 或 t2v = T2V("bert", output_dir)
# 获得句表征(含有CLS和SEP)
i_vec = t2v(token_items)
print(i_vec.shape)
print()
# 获得句表征
i_vec = t2v.infer_vector(token_items, pooling_strategy='CLS')
print(i_vec.shape)
i_vec = t2v.infer_vector(token_items, pooling_strategy='average')
print(i_vec.shape)
print()
# 获得词表征
t_vec = t2v.infer_tokens(token_items)
print(t_vec.shape)
print()
Some weights of the model checkpoint at ../../examples/test_model/data/data/bert were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../../examples/test_model/data/data/bert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
torch.Size([2, 34, 768])
torch.Size([2, 768])
torch.Size([2, 768])
torch.Size([2, 32, 768])