使用Elmo向量化容器
[1]:
from EduNLP.I2V import Elmo, get_pretrained_i2v
# Configuration: adjust these to your own data location and output location.
# BASE_DIR is the prefix from which both paths below are derived.
BASE_DIR = "../.."
data_dir = BASE_DIR + "/static/test_data"
output_dir = BASE_DIR + "/examples/test_model/elmo"
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
使用I2V加载本地模型
[2]:
# Load the locally stored ELMo model through the I2V wrapper.
# The tokenizer is given the path <output_dir>/vocab.json. The path is built
# with an f-string, the same way the configuration cell builds its paths,
# because `os` is not imported by the import cell and `os.path.join` would
# raise NameError on a freshly restarted kernel.
tokenizer_kwargs = {"path": f"{output_dir}/vocab.json"}
i2v = Elmo('elmo', 'elmo', output_dir, tokenizer_kwargs=tokenizer_kwargs)
[3]:
# Two sample questions; the text of each one is stored under the 'stem' key.
# The continuation line of the first stem stays at column 0 so that no leading
# whitespace becomes part of the string.
item = [
    {'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'stem': '已知圆$x^{2}+y^{2}-6 x=0$,过点(1,2)的直线被该圆所截得的弦的长度的最小值为'},
]

# A single question string can be vectorized on its own.
# Recorded run: i_vec -> torch.Size([512]), t_vec -> torch.Size([15, 512]).
i_vec, t_vec = i2v(item[0]['stem'])
print(i_vec.shape, t_vec.shape, sep='\n')

# A list of question strings is vectorized as a batch.
# Recorded run: i_vec -> torch.Size([2, 512]), t_vec -> torch.Size([2, 25, 512]).
i_vec, t_vec = i2v([entry['stem'] for entry in item])
print(i_vec.shape, t_vec.shape, sep='\n')
torch.Size([512])
torch.Size([15, 512])
torch.Size([2, 512])
torch.Size([2, 25, 512])
使用get_pretrained_i2v加载公开模型
[5]:
# Obtain the publicly released pretrained model named "elmo_test".
# model_dir is the directory handed to the loader; in the recorded run the
# archive elmo_test.zip was already present there and the download was skipped.
# Note: this rebinds `i2v`, replacing the locally loaded model from the cell above.
i2v = get_pretrained_i2v(
    "elmo_test",
    model_dir=output_dir,
)
EduNLP, INFO model_path: ..\..\examples\test_model\elmo\elmo_test
EduNLP, INFO Use pretrained t2v model elmo_test
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/elmo_pub/1/elmo_test.zip is saved as ..\..\examples\test_model\elmo\elmo_test.zip
downloader, INFO file existed, skipped
[6]:
# Two raw question strings. The continuation line of the first one stays at
# column 0 so that no leading whitespace becomes part of the string.
items = [
    "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$,过点(1,2)的直线被该圆所截得的弦的长度的最小值为",
]

# Calling the container on the list returns the item vectors and the token vectors together.
i_vec, t_vec = i2v(items)
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")

# The item-level and token-level representations can also be requested separately.
i_vec = i2v.infer_item_vector(items)
print(i_vec.shape)
t_vec = i2v.infer_token_vector(items)
print(t_vec.shape, end="\n\n")

# Likewise, a single question string can be vectorized on its own.
i_vec, t_vec = i2v(items[0])
print(i_vec.shape, t_vec.shape, sep="\n")
torch.Size([2, 64])
torch.Size([2, 25, 64])
torch.Size([2, 64])
torch.Size([2, 25, 64])
torch.Size([64])
torch.Size([17, 64])