使用QuesNet向量化容器¶

导入功能块¶

[6]:

from EduNLP.Pretrain import BertTokenizer

from EduNLP.Vector import T2V, BertModel

[7]:

BASE_DIR = "../.."
data_dir = f"{BASE_DIR}/static/test_data"
output_dir = f"{BASE_DIR}/examples/test_model/data/bert"

items = [
    r"题目一：如图几何图形．此图由三个半圆构成，三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点，此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$",
    r"题目二: 如图来自古希腊数学家希波克拉底所研究的几何图形．此图由三个半圆构成，三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点，此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\SIFChoice$$\FigureID{1}$"
]

令牌化¶

[13]:

tokenizer = BertTokenizer.from_pretrained(output_dir)

# 对题目文本进行令牌化
items = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]

# 可以对单个题目进行令牌化
print(tokenizer(items[0]))
print()

# 也可以对题目列表进行令牌化
token_items = tokenizer(items)
print(token_items)
print()

# 可以使用return_tensors参数指定返回张量的类型
print(tokenizer(items[0], return_tensors='pt'))

token_items = tokenizer(items, return_tensors='pt')

{'input_ids': [101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

{'input_ids': [[101, 1062, 2466, 1963, 1745, 21129, 166, 117, 167, 5276, 3338, 3340, 816, 1062, 2466, 102, 168, 134, 166, 116, 128, 8179, 3297, 1920, 966, 21130, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2347, 4761, 1749, 166, 141, 169, 123, 171, 116, 167, 141, 169, 123, 171, 118, 127, 8206, 134, 121, 6814, 4157, 4684, 5296, 1749, 2779, 2533, 2478, 7270, 2428, 3297, 2207, 966, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

{'input_ids': tensor([[  101,  1062,  2466,  1963,  1745, 21129,   166,   117,   167,  5276,
          3338,  3340,   816,  1062,  2466,   102,   168,   134,   166,   116,
           128,  8179,  3297,  1920,   966, 21130,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}

向量化¶

[15]:

t2v = BertModel(output_dir)
# 或 t2v = T2V("bert", output_dir)

# 获得句表征(含有CLS和SEP)
i_vec = t2v(token_items)
print(i_vec.shape)
print()

# 获得句表征
i_vec = t2v.infer_vector(token_items, pooling_strategy='CLS')
print(i_vec.shape)
i_vec = t2v.infer_vector(token_items, pooling_strategy='average')
print(i_vec.shape)
print()

# 获得词表征
t_vec = t2v.infer_tokens(token_items)
print(t_vec.shape)
print()

Some weights of the model checkpoint at ../../examples/test_model/data/data/bert were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../../examples/test_model/data/data/bert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

torch.Size([2, 34, 768])

torch.Size([2, 768])
torch.Size([2, 768])

torch.Size([2, 32, 768])