[1]:
from EduNLP.Pretrain import PretrainedEduTokenizer, EduDataset
import os
import json
# Root directory that the data and output paths below are built from.
BASE_DIR = "../.."
# Directory that stem_data() reads "standard_luna_data.json" from.
data_dir = f"{BASE_DIR}/static/test_data"
# Directory the tokenizer (and, later, the preprocessed dataset) is written to.
output_dir = f"{BASE_DIR}/data/pretrain_test_models/pretrain/"
def stem_data(data_path=None):
    """Load the question items used throughout this notebook.

    The file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and becomes one item of the result.

    Parameters
    ----------
    data_path : str, optional
        File to read. When omitted, ``standard_luna_data.json`` inside the
        notebook-level ``data_dir`` is used.

    Returns
    -------
    list
        The parsed items, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "standard_luna_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate the handle lazily instead of materialising readlines(), and
        # skip blank lines (e.g. an empty line at the end of the file), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]
# Training corpus: the items parsed from the sample data file by stem_data().
train_items = stem_data()
# Two hand-written demo questions. Each is a dict with a single
# 'ques_content' string containing markers such as \FormFigureID,
# \FormFigureBase64, \FigureID, \SIFSep and \SIFBlank.
# The trailing backslashes continue the string literal onto the next source
# line, so each question is one string without embedded newlines.
test_items = [
# Question with two figure-formula markers followed by a figure reference.
{'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
# Same question without the leading figure-formula part.
{'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}
]
/data/qlh/anaconda3/envs/py36/lib/python3.6/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
1. PretrainedEduTokenizer¶
该类主要用于处理预训练模型的输入语料,主要成分包括词表(vocab)和基础令牌化容器,负责将输入语料处理为适合模型的输入格式。
1.1 构造令牌化容器¶
[2]:
# The vocabulary is built from the training items plus the demo items.
corpus_items = [*train_items, *test_items]

# Tokenizer options.
tokenizer_params = dict(add_specials=True, tokenize_method="pure_text")

# Options for the pure_text method; see Tokenizer/PureTextTokenizer.
text_params = dict(granularity="char", stopwords=None)

tokenizer = PretrainedEduTokenizer(text_params=text_params, **tokenizer_params)
# Tokenizer length before any corpus has been supplied.
print(len(tokenizer))

# Supply the pre-training corpus so the tokenizer can build its vocabulary.
tokenizer.set_vocab(corpus_items, key=lambda item: item['ques_content'])
# Tokenizer length after set_vocab.
print(len(tokenizer))

# Save the tokenizer.
pretrained_tokenizer_dir = output_dir
tokenizer.save_pretrained(pretrained_tokenizer_dir)
14
Dump cache file failed.
Traceback (most recent call last):
File "/data/qlh/anaconda3/envs/py36/lib/python3.6/site-packages/jieba/__init__.py", line 154, in initialize
_replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpk245c2ok' -> '/tmp/jieba.cache'
379
1.2 使用令牌化容器¶
[3]:
# Load the tokenizer back from the directory it was saved to
tokenizer = PretrainedEduTokenizer.from_pretrained(pretrained_tokenizer_dir)
# Pad per batch (no explicit padding arguments)
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])
print(list(encodes.keys()))
print(encodes["seq_idx"].shape)
print()
# Pad every sequence to max_length
encodes = tokenizer(test_items, key=lambda x: x['ques_content'], padding="max_length", max_length=100)
print(list(encodes.keys()))
print(encodes["seq_idx"].shape)
print()
# Do not return tensors
encodes = tokenizer(test_items, key=lambda x: x['ques_content'], padding="max_length", max_length=100, return_tensors=False)
print(encodes["seq_idx"])
print()
# Keep the tokens in the result as well
encodes = tokenizer(test_items, key=lambda x: x['ques_content'], padding="max_length", max_length=100, return_text=True)
print(list(encodes.keys()))
print()
['seq_idx', 'seq_len']
torch.Size([2, 17])
['seq_idx', 'seq_len']
torch.Size([2, 100])
[[1, 1, 1, 6, 22, 35, 130, 1, 9, 45, 19, 22, 46, 211, 130, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 6, 22, 35, 130, 1, 9, 45, 19, 22, 46, 211, 130, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
['seq_idx', 'seq_len', 'seq_token']
1.3 其他操作¶
扩充词表
[4]:
# Show the special tokens currently held by the vocab.
# NOTE(review): _special_tokens is a private attribute — prefer a public accessor if the library offers one.
print(tokenizer.vocab._special_tokens)
print()
# Add a special token
tokenizer.add_specials(["[special]"])
print(tokenizer.tokenize("[special]"))
print(tokenizer.vocab._special_tokens)
print()
# Add an ordinary token
tokenizer.add_tokens(["[token]"])
print(tokenizer.tokenize("[token]"))
['[PAD]', '[UNK]', '[BOS]', '[EOS]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]']
['special']
['[PAD]', '[UNK]', '[BOS]', '[EOS]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[special]']
['token']
编码/解码 句子
[5]:
# Encode a sentence into a list of indices
encode_idxs = tokenizer.encode('公式 公 式')
print(encode_idxs)
# Decode the indices back into tokens
encode_tokens = tokenizer.decode(encode_idxs)
print(encode_tokens)
[1, 370, 371]
['[UNK]', '公', '式']
修改基础令牌化容器
[6]:
# Customisable parameters, passed below as formula_params
formula_params = {
"skip_figure_formula": True,
"symbolize_figure_formula": False
}
# Replace the basic tokenizer with the "ast_formula" one.
# NOTE(review): _set_basic_tokenizer is underscore-prefixed (private by convention) — confirm it is meant to be called from user code.
tokenizer._set_basic_tokenizer("ast_formula", formula_params=formula_params)
保存与加载
[7]:
# Save the tokenizer into save_dir
save_dir = "./tmp"
tokenizer.save_pretrained(save_dir)
# Load it back from the same directory
tokenizer = PretrainedEduTokenizer.from_pretrained(save_dir)
EduDataset¶
直接使用¶
[8]:
# Use EduDataset: build it from the raw items with the tokenizer,
# taking the text from each item's "ques_content" field (stem_key).
dataset = EduDataset(tokenizer, items=train_items,
stem_key="ques_content")
# Inspect which fields the first sample carries.
print(dataset[0].keys())
dict_keys(['seq_idx', 'seq_len'])
[9]:
# Same construction, additionally passing label_key="difficulty".
dataset = EduDataset(tokenizer, items=train_items,
stem_key="ques_content", label_key="difficulty")
# Inspect which fields the first sample carries.
print(dataset[0].keys())
dict_keys(['labels', 'seq_idx', 'seq_len'])
[10]:
# Same construction, additionally passing feature_keys=["know_list"].
dataset = EduDataset(tokenizer, items=train_items,
stem_key="ques_content", label_key="difficulty", feature_keys=["know_list"])
# Inspect which fields the first sample carries.
print(dataset[0].keys())
dict_keys(['know_list', 'labels', 'seq_idx', 'seq_len'])
保存与加载¶
考虑到预处理耗时久,若希望下次能直接使用处理后的数据,可将预处理后的数据保存在本地。
[11]:
# Write the preprocessed dataset to output_dir so it can be reloaded later.
dataset.to_disk(output_dir)
[12]:
# Save
dataset.to_disk(output_dir)
# Load: rebuild the dataset from the saved directory instead of raw items
dataset1 = EduDataset(tokenizer, ds_disk_path=output_dir)
print(dataset1[0].keys())
# Load again, this time also passing label_key and feature_keys
dataset2 = EduDataset(tokenizer, ds_disk_path=output_dir, label_key="difficulty", feature_keys=["know_list"])
print(dataset2[0].keys())
dict_keys(['seq_idx', 'seq_len'])
dict_keys(['know_list', 'labels', 'seq_idx', 'seq_len'])
并行预处理¶
在题目数据量过大时,令牌化等预处理操作耗时较长,可通过并行处理加速。
[13]:
import time

# Durations are measured with time.perf_counter(), the clock intended for
# timing intervals: unlike time.time() it is monotonic, so a system clock
# adjustment during the run cannot distort the result.

# With parallel acceleration (num_processor=4)
s = time.perf_counter()
dataset = EduDataset(tokenizer, items=train_items*100,
                     stem_key="ques_content",
                     num_processor=4)
print(dataset[0].keys())
e = time.perf_counter()
print(f"spand time: {(e - s):.4}s")

# Same workload without parallel acceleration, for comparison
s = time.perf_counter()
dataset = EduDataset(tokenizer, items=train_items*100,
                     stem_key="ques_content",)
print(dataset[0].keys())
e = time.perf_counter()
print(f"spand time: {(e - s):.4}s")
dict_keys(['seq_idx', 'seq_len'])
spand time: 1.641s
dict_keys(['seq_idx', 'seq_len'])
spand time: 4.484s