使用QuesNet向量化容器

导入模块

[1]:
from EduNLP.Pretrain import QuesNetTokenizer, Question
from EduNLP.Vector import T2V, QuesNetModel
import os
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
[2]:
# Path configuration. BASE_DIR is relative to the notebook's working directory
# (presumably the repository root -- confirm if the notebook is moved).
BASE_DIR = "../.."
# Test-data directory; the tokenization cell looks for "quesnet_img" inside it.
data_dir = f"{BASE_DIR}/static/test_data"
# Directory handed to QuesNetTokenizer.from_pretrained and QuesNetModel below.
output_dir = f"{BASE_DIR}/examples/test_model/quesnet/quesnet_test"

# Two sample questions. The cells below read "ques_content" through the
# tokenizer's `key` callable; the remaining fields travel along as metadata.
raw_data = [
    {
        "ques_content": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "ques_subject": 1,
        "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb",
        "ques_type": 7,
        "ques_system": 10,
        "ques_period": 2,
        "ques_options": [
            "\\\\{-4,1\\\\}",
            "\\\\{1,5\\\\}",
            "\\\\{3,5\\\\}",
            "\\\\{1,3\\\\}",
        ],
        "ques_answer": "D",
        "know_list": [0, 10, 57],
        "know_name": ["代数", "集合", "集合的相等"],
        "difficulty": 0.424379,
        "ques_figure_ids": None,
        "ques_figure_paths": None,
    },
    {
        "ques_content": "若复数$z=1+2 i+i^{3}$,则$|z|=$",
        "ques_subject": 1,
        "ques_id": "726e139c-33a9-11ec-bd9e-98fa9b625adb",
        "ques_type": 7,
        "ques_system": 10,
        "ques_period": 2,
        "ques_options": [
            "0",
            "1",
            "$\\\\sqrt{2}$",
            "2",
        ],
        "ques_answer": "C",
        "know_list": [0, 19, 269],
        "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"],
        "difficulty": 0.566538,
        "ques_figure_ids": None,
        "ques_figure_paths": None,
    },
]

令牌化

[3]:
def content_key(question):
    """Return the text field of a raw question dict for the tokenizer to read."""
    return question['ques_content']


# Load the tokenizer that was saved under output_dir; question images are
# looked up in the "quesnet_img" folder of the test-data directory.
quesnet_img_dir = os.path.join(data_dir, "quesnet_img")
tokenizer = QuesNetTokenizer.from_pretrained(output_dir, img_dir=quesnet_img_dir)

# tokenize() accepts a single question ...
print(tokenizer.tokenize(raw_data[0], key=content_key), end="\n\n")
# ... or a list of questions (raw_data has two items, so [:5] is all of them).
print(tokenizer.tokenize(raw_data[:5], key=content_key), end="\n\n")

# Calling the tokenizer converts tokens to indices. Per the recorded output,
# return_text=True also keeps the 'content' / 'meta' text next to the indices.
# NOTE(review): padding=True presumably pads content_idx to a fixed length -- confirm.
print(tokenizer(raw_data[0], key=content_key, return_text=True, padding=True), end="\n\n")
print(tokenizer(raw_data[:3], key=content_key, padding=True))

# Index-only encoding of every question, consumed by the vectorization cell.
token_items = tokenizer(raw_data, key=content_key)
['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '=']

[['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '='], ['复数', 'z', '=', '1', '+', '2', 'i', '+', 'i', '^', '{', '3', '}', '|', 'z', '|', '=']]

{'content_idx': [0, 0, 0, 12, 18, 0, 29, 0, 29, 24, 31, 9, 32, 7, 0, 29, 7, 0, 0, 0, 22, 0, 6, 0, 0, 12, 0, 7, 0, 6, 8, 6, 0, 6, 0, 0, 6, 0, 0, 0, 0, 12, 2, 2, 2, 2, 2, 2, 2, 2], 'meta_idx': {'know_name': [0, 0, 0]}, 'content': ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '='], 'meta': {'know_name': ['代数', '集合', '集合的相等']}}

{'content_idx': [[0, 0, 0, 12, 18, 0, 29, 0, 29, 24, 31, 9, 32, 7, 0, 29, 7, 0, 0, 0, 22, 0, 6, 0, 0, 12, 0, 7, 0, 6, 8, 6, 0, 6, 0, 0, 6, 0, 0, 0, 0, 12, 2, 2, 2, 2, 2, 2, 2, 2], [0, 0, 12, 8, 5, 9, 27, 5, 27, 24, 31, 0, 32, 0, 0, 0, 12, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]], 'meta_idx': [{'know_name': [0, 0, 0]}, {'know_name': [0, 0, 0]}]}

向量化

[4]:
# Load the QuesNet model saved under output_dir.
t2v = QuesNetModel(output_dir)

# Convert the tokenizer output into one Question per input item.
content = token_items['content_idx']
meta_idx = token_items['meta_idx']
# Both fields are per-question lists and must line up one-to-one.
assert len(content) == len(meta_idx), "content_idx and meta_idx differ in length"
# token_items is a dict keyed by field name, so len(token_items) is the number
# of keys (2), not the number of questions. Looping over range(len(token_items))
# only worked because there happen to be exactly two questions. Pair the
# per-question lists directly so any batch size is handled.
# NOTE(review): the positional Question fields are presumably
# (id, content, answer, false_options, labels) -- confirm against EduNLP.
qs = [
    Question("", token_idx, [0], [[0], [0], [0]], meta)
    for token_idx, meta in zip(content, meta_idx)
]

# Vectorize. The recorded output shows shape (2, 256) for infer_vector and
# (2, 43, 256) for infer_tokens.
i_vec = t2v.infer_vector(qs)
t_vec = t2v.infer_tokens(qs)
print(i_vec.shape)
print(t_vec.shape)
print()
torch.Size([2, 256])
torch.Size([2, 43, 256])