使用QuesNet向量化容器

[1]:
from EduNLP.I2V import QuesNet, get_pretrained_i2v
import os

# Set your own data path and output path here.
# BASE_DIR = "/your/own/base/path"
BASE_DIR = "../../"

# Join with os.path.join instead of string formatting: BASE_DIR may or may not
# end with a separator, and plain formatting produced "../..//static/..."
# (doubled slash) for the default value above.
data_dir = os.path.join(BASE_DIR, "static/test_data")
output_dir = os.path.join(BASE_DIR, "examples/test_model/quesnet")
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
[2]:
# Two sample question records used by the cells below; only "ques_content"
# is read by the inference calls visible in this notebook.
# NOTE(review): the "ques_options" strings carry doubled backslashes at
# runtime (e.g. `\\{`), unlike "ques_content" (`\left`) — kept exactly as in
# the original data; confirm whether that is intended.
raw_data = [
    {
        "ques_content": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "ques_subject": 1,
        "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb",
        "ques_type": 7,
        "ques_system": 10,
        "ques_period": 2,
        "ques_options": [
            "\\\\{-4,1\\\\}",
            "\\\\{1,5\\\\}",
            "\\\\{3,5\\\\}",
            "\\\\{1,3\\\\}",
        ],
        "ques_answer": "D",
        "know_list": [0, 10, 57],
        "know_name": ["代数", "集合", "集合的相等"],
        "difficulty": 0.424379,
        "ques_figure_ids": None,
        "ques_figure_paths": None,
    },
    {
        "ques_content": "若复数$z=1+2 i+i^{3}$,则$|z|=$",
        "ques_subject": 1,
        "ques_id": "726e139c-33a9-11ec-bd9e-98fa9b625adb",
        "ques_type": 7,
        "ques_system": 10,
        "ques_period": 2,
        "ques_options": [
            "0",
            "1",
            "$\\\\sqrt{2}$",
            "2",
        ],
        "ques_answer": "C",
        "know_list": [0, 19, 269],
        "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"],
        "difficulty": 0.566538,
        "ques_figure_ids": None,
        "ques_figure_paths": None,
    },
]

使用get_pretrained_i2v加载公开模型

[3]:
# Obtain the publicly released pretrained model by name.
# NOTE(review): model_dir presumably is where the model files are stored or
# looked up — confirm against the EduNLP documentation.
i2v = get_pretrained_i2v(
    "quesnet_test_256",
    model_dir=output_dir,
)
[ ]:
def _ques_content(item):
    """Return the question text stored under the "ques_content" key."""
    return item["ques_content"]


# Representation of a single question: calling the I2V object yields the
# item vector and the token vectors in one go.
i_vec, t_vec = i2v(raw_data[0], key=_ques_content)
print(i_vec.shape, t_vec.shape, sep="\n")
print()

# The token-level and item-level representations can also be requested
# separately.
t_vec = i2v.infer_token_vector(raw_data[0], key=_ques_content)
i_vec = i2v.infer_item_vector(raw_data[0], key=_ques_content)
print(t_vec.shape, i_vec.shape, sep="\n")
print()

# Representations for a list of questions.
t_vec = i2v.infer_token_vector(raw_data[:2], key=_ques_content)
i_vec = i2v.infer_item_vector(raw_data[:2], key=_ques_content)
print(t_vec.shape, i_vec.shape, sep="\n")
torch.Size([1, 256])
torch.Size([1, 43, 256])

torch.Size([1, 43, 256])
torch.Size([1, 256])

torch.Size([2, 43, 256])
torch.Size([2, 256])

使用I2V加载本地模型

[ ]:
# Directory holding the locally available "quesnet_test_256" model files.
pretrained_dir = os.path.join(output_dir, "quesnet_test_256")

# The tokenizer configuration is read from the same directory as the model.
tokenizer_kwargs = dict(tokenizer_config_dir=pretrained_dir)

# NOTE(review): the two leading "quesnet" arguments are presumably the
# tokenizer name and the model name — confirm against the EduNLP I2V signature.
i2v = QuesNet(
    "quesnet",
    "quesnet",
    pretrained_dir,
    tokenizer_kwargs=tokenizer_kwargs,
    device="cpu",
)
[ ]:
# Usage is the same as with I2V.

def _ques_content(item):
    """Return the question text stored under the "ques_content" key."""
    return item["ques_content"]


# Representation of a single question: calling the object yields the item
# vector and the token vectors in one go.
i_vec, t_vec = i2v(raw_data[0], key=_ques_content)
print(i_vec.shape, t_vec.shape, sep="\n")
print()

# The token-level and item-level representations can also be requested
# separately.
t_vec = i2v.infer_token_vector(raw_data[0], key=_ques_content)
i_vec = i2v.infer_item_vector(raw_data[0], key=_ques_content)
print(t_vec.shape, i_vec.shape, sep="\n")
print()

# Representations for a list of questions.
t_vec = i2v.infer_token_vector(raw_data[:2], key=_ques_content)
i_vec = i2v.infer_item_vector(raw_data[:2], key=_ques_content)
print(t_vec.shape, i_vec.shape, sep="\n")
torch.Size([1, 256])
torch.Size([1, 43, 256])

torch.Size([1, 43, 256])
torch.Size([1, 256])

torch.Size([2, 43, 256])
torch.Size([2, 256])