使用QuesNet向量化容器
[1]:
from EduNLP.I2V import QuesNet, get_pretrained_i2v
import os
# Configure the data path and the output path.
# BASE_DIR = "/your/own/base/path"
BASE_DIR = "../../"
# Build the paths with os.path.join so that a trailing separator on BASE_DIR
# does not produce a doubled "//" in the resulting paths.
data_dir = os.path.join(BASE_DIR, "static", "test_data")
output_dir = os.path.join(BASE_DIR, "examples", "test_model", "quesnet")
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
[2]:
# Two sample question records used throughout this notebook. Each record is a
# plain dict; the cells below read only the "ques_content" field.
raw_data = [
    {
        "ques_content": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "ques_subject": 1,
        "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb",
        "ques_type": 7,
        "ques_system": 10,
        "ques_period": 2,
        "ques_options": [
            "\\\\{-4,1\\\\}",
            "\\\\{1,5\\\\}",
            "\\\\{3,5\\\\}",
            "\\\\{1,3\\\\}",
        ],
        "ques_answer": "D",
        "know_list": [0, 10, 57],
        "know_name": ["代数", "集合", "集合的相等"],
        "difficulty": 0.424379,
        "ques_figure_ids": None,
        "ques_figure_paths": None,
    },
    {
        "ques_content": "若复数$z=1+2 i+i^{3}$,则$|z|=$",
        "ques_subject": 1,
        "ques_id": "726e139c-33a9-11ec-bd9e-98fa9b625adb",
        "ques_type": 7,
        "ques_system": 10,
        "ques_period": 2,
        "ques_options": [
            "0",
            "1",
            "$\\\\sqrt{2}$",
            "2",
        ],
        "ques_answer": "C",
        "know_list": [0, 19, 269],
        "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"],
        "difficulty": 0.566538,
        "ques_figure_ids": None,
        "ques_figure_paths": None,
    },
]
使用get_pretrained_i2v加载公开模型
[3]:
# Get the publicly released pretrained model named "quesnet_test_256".
# `output_dir` is passed as `model_dir` (presumably the directory where the
# model files are stored -- confirm against the EduNLP documentation).
i2v = get_pretrained_i2v("quesnet_test_256", model_dir=output_dir)
[ ]:
def _question_text(item):
    """Return the question text stored under the "ques_content" key."""
    return item["ques_content"]


# Represent a single question: calling the container directly returns the
# item vector first and the token vectors second.
i_vec, t_vec = i2v(raw_data[0], key=_question_text)
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")

# The token vectors and the item vector can also be requested separately.
t_vec = i2v.infer_token_vector(raw_data[0], key=_question_text)
i_vec = i2v.infer_item_vector(raw_data[0], key=_question_text)
print(t_vec.shape, i_vec.shape, sep="\n", end="\n\n")

# Represent a list of questions in one call.
t_vec = i2v.infer_token_vector(raw_data[:2], key=_question_text)
i_vec = i2v.infer_item_vector(raw_data[:2], key=_question_text)
print(t_vec.shape, i_vec.shape, sep="\n")
torch.Size([1, 256])
torch.Size([1, 43, 256])
torch.Size([1, 43, 256])
torch.Size([1, 256])
torch.Size([2, 43, 256])
torch.Size([2, 256])
使用I2V加载本地模型
[ ]:
# Directory of the local pretrained model; the tokenizer configuration is
# read from the same directory.
pretrained_dir = os.path.join(output_dir, "quesnet_test_256")
tokenizer_kwargs = dict(tokenizer_config_dir=pretrained_dir)

# Build the QuesNet container on the CPU from the local files.
i2v = QuesNet(
    'quesnet',
    'quesnet',
    pretrained_dir,
    tokenizer_kwargs=tokenizer_kwargs,
    device="cpu",
)
[ ]:
# The locally loaded container is used exactly like any other I2V object.
def _question_text(item):
    """Return the question text stored under the "ques_content" key."""
    return item["ques_content"]


# Represent a single question: the call returns the item vector first and
# the token vectors second.
i_vec, t_vec = i2v(raw_data[0], key=_question_text)
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")

# Token vectors and item vectors can also be requested separately, first for
# a single question and then for a list of questions. A blank line separates
# the two groups of output; nothing is printed after the last group.
for _sample, _trailer in ((raw_data[0], "\n\n"), (raw_data[:2], "\n")):
    t_vec = i2v.infer_token_vector(_sample, key=_question_text)
    i_vec = i2v.infer_item_vector(_sample, key=_question_text)
    print(t_vec.shape, i_vec.shape, sep="\n", end=_trailer)
torch.Size([1, 256])
torch.Size([1, 43, 256])
torch.Size([1, 43, 256])
torch.Size([1, 256])
torch.Size([2, 43, 256])
torch.Size([2, 256])