使用 ELMo 向量化容器
导入功能块
[1]:
from EduNLP.Pretrain import ElmoTokenizer
from EduNLP.Vector import T2V, ElmoModel
import os
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
[2]:
# Configure the data location and the model output location (adjust to your setup)
BASE_DIR = "../.."
data_dir = BASE_DIR + "/static/test_data"
output_dir = BASE_DIR + "/examples/test_model/elmo"
令牌化
[3]:
# Load the tokenizer from the vocabulary file saved by a previous training run
tokenizer = ElmoTokenizer(os.path.join(output_dir, "vocab.json"))

# Sample question texts to tokenize
items = [
    "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$,过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]

# A single question can be tokenized on its own
print(tokenizer(items[0], freeze_vocab=True))
print()

# A list of questions can also be tokenized in one call
print(tokenizer(items, freeze_vocab=True))
print()

# Padded batch that is fed to the model below.
# freeze_vocab=True is passed here as well, matching the calls above.
# NOTE(review): presumably this keeps the loaded vocabulary from growing, so no
# token id falls outside what the pretrained model was trained with — confirm
# against the EduNLP ElmoTokenizer documentation.
token_items, lengths = tokenizer(items, freeze_vocab=True, pad_to_max_length=True)
([527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], 17)
([[527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], [7, 104, 13, 15, 16, 17, 18, 34, 79, 15, 16, 17, 18, 19, 105, 13, 10, 23, 106, 107, 104, 108, 109, 110, 111]], [17, 25])
向量化
[4]:
# Build the ELMo vectorizer from the model directory configured earlier
t2v = ElmoModel(output_dir)

# Sentence representation via the callable interface
i_vec = t2v(token_items)
print(i_vec.shape, end="\n\n")

# Sentence representation and token representation via the explicit methods,
# this time passing the unpadded lengths returned by the tokenizer
i_vec = t2v.infer_vector(token_items, lengths=lengths)
t_vec = t2v.infer_tokens(token_items, lengths=lengths)
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")
torch.Size([2, 512])
torch.Size([2, 512])
torch.Size([2, 25, 512])