使用DisenQ向量化容器¶
导入功能块¶
[1]:
from EduNLP.Pretrain import DisenQTokenizer
from EduNLP.Vector import T2V, DisenQModel
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
[2]:
# Base directory for every path below, given relative to the notebook's working
# directory (presumably the repository root -- confirm).
BASE_DIR = "../.."
# NOTE(review): data_dir is not referenced by any of the visible cells.
data_dir = f"{BASE_DIR}/static/test_data"
# Directory the DisenQ tokenizer is loaded from in the tokenization cell.
output_dir = f"{BASE_DIR}/examples/test_model/disenq"
# Sample question items (space-segmented text with embedded LaTeX) that the
# following cells tokenize and vectorize. The first item contains
# \FormFigureID / \FigureID placeholders; the second is a plain math question.
items = [
"有 公 式 $\\FormFigureID{wrong1?}$ ,如 图 $\\FigureID{088f15ea-xxx}$",
"已知 圆 $x^{2}+y^{2}-6 x=0$ ,过 点 (1,2) 的 直 线 被 该 圆 所 截 得 的 弦 的 长度 的 最小 值 为"
]
令牌化¶
[3]:
# Load the tokenizer from the directory configured as output_dir.
tokenizer = DisenQTokenizer.from_pretrained(output_dir)

# A single item can be tokenized on its own.
print(tokenizer(items[0]))
print()

# A list of items can also be tokenized as one batch; in the recorded output
# the shorter item is padded up to the length of the longest item.
# Tokenize exactly once and keep the result: the vectorization cell reuses
# token_items, so a second identical tokenizer(items) call is unnecessary.
token_items = tokenizer(items)
print(token_items)
print()
{'content_idx': tensor([[3548, 1, 2752, 1, 1, 1821, 1]]), 'content_len': tensor([7])}
{'content_idx': tensor([[3548, 1, 2752, 1, 1, 1821, 1, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
[2568, 1829, 1, 1, 1, 4364, 1, 4737, 4772, 5196, 5699, 5813,
1829, 2938, 2921, 2817, 4737, 1, 4737, 6428, 4737, 3527, 855, 463]]), 'content_len': tensor([ 7, 24])}
向量化¶
[4]:
# Load the DisenQ model wrapper from the pretrained directory
# (the same path string as output_dir defined in the configuration cell).
pretrained_dir = f"{BASE_DIR}/examples/test_model/disenq"
t2v = DisenQModel(pretrained_dir)
# Get the sentence-level and token-level representations in one call.
# Per the recorded shapes, t_vec is (items, tokens, dim) and i_vec_k / i_vec_i
# are (items, dim) each -- presumably DisenQ's concept-related ("k") and
# individual ("i") item vectors; confirm against the EduNLP documentation.
t_vec, i_vec_k, i_vec_i = t2v(token_items)
print(i_vec_k.shape, i_vec_i.shape)
print(t_vec.shape)
print()
# Get only the token-level representations.
t_vec = t2v.infer_tokens(token_items)
# Get both sentence-level representations at once.
i_vec_k, i_vec_i = t2v.infer_vector(token_items)
# Get a single sentence-level representation, selected via vector_type ("k" or "i").
i_vec_k = t2v.infer_vector(token_items, vector_type="k")
i_vec_i = t2v.infer_vector(token_items, vector_type="i")
torch.Size([2, 128]) torch.Size([2, 128])
torch.Size([2, 24, 128])