[1]:
import torch
import numpy as np
import os
import json
import codecs
from EduNLP.Pretrain import QuesNetTokenizer, pretrain_quesnet
from EduNLP.Vector import T2V
from EduNLP.I2V import QuesNet, get_pretrained_i2v

# Set the WANDB_DISABLED environment variable for this process.
# NOTE(review): presumably this switches off Weights & Biases logging in the
# training utilities used below -- confirm against the trainer in use.
os.environ["WANDB_DISABLED"] = "true"
d:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)

训练自己的QuesNet模型

1. 数据

[2]:
# Configure where the sample data is read from and where artifacts are written.
BASE_DIR = "../.."

# Both locations are expressed relative to BASE_DIR.
data_dir, output_dir = (
    f"{BASE_DIR}/static/test_data",
    f"{BASE_DIR}/examples/test_model/quesnet",
)
[3]:
def load_raw_data(data_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        data_path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(data_path, encoding="utf-8") as f:
        # Iterate over the file lazily instead of materialising readlines(),
        # and skip blank lines (e.g. a trailing newline at the end of the
        # file), which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]


# The loader and its result use different names, so the list of records no
# longer shadows the function that produced it.
raw_data = load_raw_data(os.path.join(data_dir, "quesnet_data.json"))

2. 训练Tokenizer

[4]:
# Create the tokenizer; images referenced by the questions are looked up under
# <data_dir>/quesnet_img.
tokenizer = QuesNetTokenizer(
    meta=['know_name'],
    max_length=50,
    img_dir=os.path.join(data_dir, "quesnet_img"),
)

# Build the vocabulary from the question text of every raw item.
# NOTE(review): trim_min_count=3 presumably drops words seen fewer than three
# times -- confirm against the QuesNetTokenizer documentation.
tokenizer.set_vocab(
    raw_data,
    key=lambda item: item['ques_content'],
    trim_min_count=3,
    silent=False,
)

# Report the vocabulary size, followed by one blank line.
print("vocab_size: ", tokenizer.vocab_size, end="\n\n")

save words(3): 64/249 = 0.2570                  with frequency 696/927=0.7508
save meta information know_name: 48
vocab_size:  67

[5]:
# Save the tokenizer (with the vocabulary built above) to output_dir
tokenizer.save_pretrained(output_dir)

3. 训练QuesNet

[6]:
# Custom training parameters
train_params = dict(
    # train params
    n_epochs=1,
    batch_size=1,
    lr=1e-3,
    save_every=1,
    log_steps=10,
    # device='cpu',
    max_steps=2,
    # model params
    emb_size=256,
    feat_size=256,
)

# Training is currently supported on Linux only
# pretrain_quesnet(os.path.join(os.path.abspath(data_dir), 'quesnet_data.json'),
#                  output_dir, tokenizer, True, train_params)

4. 使用模型

[7]:
# Directory of the QuesNet checkpoint used by the cells below.
pretrain_dir = os.path.join(output_dir, "quesnet_test_256")

# The training call above is commented out, so on a fresh checkout nothing in
# this notebook has created pretrain_dir before the following cells read it.
# Fetch the public "quesnet_test_256" model into output_dir when the directory
# is missing so that Restart & Run All works; an existing directory (for
# example a self-trained model) is left untouched.
if not os.path.isdir(pretrain_dir):
    get_pretrained_i2v("quesnet_test_256", model_dir=output_dir)

4.1 使用训练好的QuesNet Tokenizer

[8]:
# Load the saved tokenizer from pretrain_dir; question images are looked up
# under <data_dir>/quesnet_img.
tokenizer = QuesNetTokenizer.from_pretrained(pretrain_dir,
                                             img_dir=os.path.join(data_dir, "quesnet_img"))
[9]:
# Tokenization
# A single question item can be tokenized ...
print(tokenizer.tokenize(raw_data[0], key=lambda item: item['ques_content']), end="\n\n")
# ... and so can a list of question items.
print(tokenizer.tokenize(raw_data[:5], key=lambda item: item['ques_content']), end="\n\n")

# Convert tokens to indices: calling the tokenizer returns the index form
# (return_text=True additionally keeps the token text in the result).
print(
    tokenizer(raw_data[0], key=lambda item: item['ques_content'], return_text=True, padding=True),
    end="\n\n",
)
print(tokenizer(raw_data[:3], key=lambda item: item['ques_content'], padding=True))
['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '=']

[['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '='], ['复数', 'z', '=', '1', '+', '2', 'i', '+', 'i', '^', '{', '3', '}', '|', 'z', '|', '='], ['埃及', '胡夫', '金字塔', '古代', '世界', '建筑', '奇迹', '形状', '视为', '正四', '棱锥', '以该', '四', '棱锥', '高为', '边长', '正方形', '面积', '等于', '四', '棱锥', '侧面', '三角形', '面积', '侧面', '三角形', '底边', '高', '底面', '正方形', '边长', '比值'], ['设', 'O', '正方形', 'ABCD', '中心', 'O', ',', 'A', ',', 'B', ',', 'C', ',', 'D', '中任取', '3', '点', '取到', '3', '点', '共线', '概率'], ['某校', '课外', '学习', '小组', '研究', '作物', '发芽率', 'y', '温度', 'x', '单位', '^', '{', '\\circ', '}', '\\mathrm', '{', 'C', '}', '关系', '20', '温度', '条件', '种子', '发芽', '实验', '实验', '数据', '\\left', '(', 'x', '_', '{', 'i', '}', ',', 'y', '_', '{', 'i', '}', '\\right', ')', '(', 'i', '=', '1', ',', '2', ',', '\\cdots', ',', '20', ')', '散点图', \FigureID{000004d6-0479-11ec-829b-797d5eb43535}, '散点图', '10', '^', '{', '\\circ', '}', '\\mathrm', '{', 'C', '}', '40', '^', '{', '\\circ', '}', '\\mathrm', '{', 'C', '}', '之间', '四个', '回归方程', '类型', '中', '适宜', '发芽率', 'y', '温度', 'x', '回归方程', '类型']]

{'content_idx': [0, 0, 0, 14, 21, 0, 32, 0, 32, 27, 34, 10, 35, 7, 0, 32, 7, 0, 0, 0, 25, 0, 6, 0, 0, 14, 0, 7, 0, 6, 8, 6, 0, 6, 0, 0, 6, 0, 0, 0, 0, 14, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'meta_idx': {'know_name': [0, 0, 0]}, 'content': ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '='], 'meta': {'know_name': ['代数', '集合', '集合的相等']}}

{'content_idx': [[0, 0, 0, 14, 21, 0, 32, 0, 32, 27, 34, 10, 35, 7, 0, 32, 7, 0, 0, 0, 25, 0, 6, 0, 0, 14, 0, 7, 0, 6, 8, 6, 0, 6, 0, 0, 6, 0, 0, 0, 0, 14, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [0, 0, 14, 8, 5, 10, 30, 5, 30, 27, 34, 0, 35, 0, 0, 0, 14, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [56, 81, 88, 48, 37, 63, 57, 64, 82, 72, 71, 40, 51, 71, 91, 86, 73, 89, 79, 51, 71, 42, 36, 89, 42, 36, 61, 90, 62, 73, 86, 74, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]], 'meta_idx': [{'know_name': [0, 0, 0]}, {'know_name': [0, 0, 0]}, {'know_name': [8, 7, 5]}]}

4.2 使用训练好的QuesNet模型

[10]:
# Point the I2V wrapper's tokenizer at the same directory as the model weights.
tokenizer_kwargs = {
    'tokenizer_config_dir': pretrain_dir,
}
# Build the QuesNet item-to-vector wrapper on CPU from pretrain_dir.
# NOTE(review): the two leading 'quesnet' arguments presumably select the
# tokenizer and the t2v model type -- confirm against the EduNLP I2V signature.
i2v = QuesNet('quesnet', 'quesnet', pretrain_dir,
              tokenizer_kwargs=tokenizer_kwargs, device="cpu")
[11]:
# Representation of a single question: item vector and token vectors at once
i_vec, t_vec = i2v(raw_data[0], key=lambda item: item["ques_content"])
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")

# The token-level and item-level representations can also be obtained separately
t_vec = i2v.infer_token_vector(raw_data[0], key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(raw_data[0], key=lambda item: item["ques_content"])
print(t_vec.shape, i_vec.shape, sep="\n", end="\n\n")

# Representations for a list of questions
t_vec = i2v.infer_token_vector(raw_data[:2], key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(raw_data[:2], key=lambda item: item["ques_content"])
print(t_vec.shape, i_vec.shape, sep="\n")
torch.Size([1, 256])
torch.Size([1, 43, 256])

torch.Size([1, 43, 256])
torch.Size([1, 256])

torch.Size([2, 43, 256])
torch.Size([2, 256])

4.3 使用EduNLP中公开的预训练模型

[12]:
# Fetch the publicly released pretrained model "quesnet_test_256", using
# output_dir as the local model directory.
i2v = get_pretrained_i2v("quesnet_test_256", model_dir=output_dir)
EduNLP, INFO model_path: ..\..\examples\test_model\quesnet\quesnet_test_256
EduNLP, INFO Use pretrained t2v model quesnet_test_256
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/quesnet_pub/1/quesnet_test_256.zip is saved as ..\..\examples\test_model\quesnet\quesnet_test_256.zip
downloader, INFO file existed, skipped
[13]:
# Usage is the same as with the I2V object built earlier

# Representation of a single question: item vector and token vectors at once
i_vec, t_vec = i2v(raw_data[0], key=lambda item: item["ques_content"])
print(i_vec.shape, t_vec.shape, sep="\n", end="\n\n")

# The token-level and item-level representations can also be obtained separately
t_vec = i2v.infer_token_vector(raw_data[0], key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(raw_data[0], key=lambda item: item["ques_content"])
print(t_vec.shape, i_vec.shape, sep="\n", end="\n\n")

# Representations for a list of questions
t_vec = i2v.infer_token_vector(raw_data[:2], key=lambda item: item["ques_content"])
i_vec = i2v.infer_item_vector(raw_data[:2], key=lambda item: item["ques_content"])
print(t_vec.shape, i_vec.shape, sep="\n")
torch.Size([1, 256])
torch.Size([1, 43, 256])

torch.Size([1, 43, 256])
torch.Size([1, 256])

torch.Size([2, 43, 256])
torch.Size([2, 256])
[ ]: