[9]:
from EduNLP.Tokenizer import PureTextTokenizer, TextTokenizer, get_tokenizer
TextTokenizer and PureTextTokenizer¶
‘text’ Tokenizer symbolizes FormulaFigures as [FORMULA] and tokenizes LaTeX formulas as text
‘pure_text’ Tokenizer ignores (skips) FormulaFigures and tokenizes LaTeX formulas as text
TextTokenizer¶
[12]:
# A single item supplied as a dict; this cell tokenizes its "stem" field.
items = [
    {
        "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "options": ["1", "2"],
    }
]

# Look the tokenizer up by name (alternative: tokenizer = TextTokenizer()).
tokenizer = get_tokenizer("text")

# `key` extracts the text to tokenize from each item.
tokens = tokenizer(items, key=lambda item: item["stem"])

# `tokens` is consumed with next(); show the token list of the first item.
print(next(tokens))
['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '=']
[14]:
# One raw SIF string containing the special markers \FormFigureID, \FigureID,
# \FormFigureBase64, \SIFSep and \SIFBlank next to ordinary text and formulas.
items = ["有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
tokenizer = get_tokenizer("text") # alternative: tokenizer = TextTokenizer()
# No `key` is passed here because each item is already a plain string.
tokens = tokenizer(items)
# Show the tokens of the first item. In the recorded output the formula figures
# appear as '[FORMULA]', the figure as '[FIGURE]', the separator as '[SEP]' and
# the blank as '[MARK]'.
print(next(tokens))
['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
PureTextTokenizer¶
[13]:
# Same kind of raw SIF string as used for the 'text' tokenizer, so the two
# recorded outputs can be compared directly.
items = ["有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"]
tokenizer = get_tokenizer("pure_text") # alternative: tokenizer = PureTextTokenizer()
tokens = tokenizer(items)
# Show the tokens of the first item. In the recorded output the formula figures
# produce no token at all, while '[FIGURE]', '[SEP]' and '[MARK]' are kept.
print(next(tokens))
['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
GensimWordTokenizer and GensimSegTokenizer¶
GensimWordTokenizer is the standard basic Tokenizer for SIF items; it produces a flat sequence of tokens
GensimSegTokenizer is the standard basic Tokenizer for SIF items; it produces tokens grouped into segments
[6]:
from EduNLP.Pretrain import GensimWordTokenizer, GensimSegTokenizer
GensimWordTokenizer¶
[7]:
# SIF item mixing plain text, formula figures, a figure, a separator and a blank.
item = "已知有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$, 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"

# Default settings: in the recorded output, formula symbols show up as node
# types such as 'mathord' / 'textord'.
tokenizer = GensimWordTokenizer(symbol="gmas")
token_item = tokenizer(item)
print(token_item.tokens, end="\n\n")  # trailing blank line separates the two results

# general=True: in the recorded output, formula symbols show up as their
# literal text ('x', 'y', '7', ...).
tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
token_item = tokenizer(item)
print(token_item.tokens)
['已知', '公式', \FormFigureID{1}, '如图', '[FIGURE]', 'mathord', ',', 'mathord', '约束条件', '公式', [FORMULA], '[SEP]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '最大值', '[MARK]']
['已知', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
GensimSegTokenizer¶
[8]:
# SIF item mixing plain text, formula figures, a figure, a separator and a blank.
item = "已知有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$, 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$则$z=x+7 y$的最大值为$\\SIFBlank$"

# Default settings: the result is a sequence of segments, each a list of tokens
# (10 segments in the recorded output).
tokenizer = GensimSegTokenizer(symbol="gmas")
token_item = tokenizer(item)
print(len(token_item), token_item)
print()

# flatten=True: the segments are merged into one flat token sequence
# (19 tokens in the recorded output).
tokenizer = GensimSegTokenizer(symbol="gmas", flatten=True)
token_item = tokenizer(item)
# Materialize the result as a plain list so len() and print() show the tokens.
token_item = list(token_item)
print(len(token_item), token_item)
print()

# depth=2: coarser segmentation. In the recorded output the item is split into
# 2 big segments around '[SEP]', and each text / formula run inside a segment
# is introduced by a '[TEXT_BEGIN]' / '[FORMULA_BEGIN]' tag.
tokenizer = GensimSegTokenizer(symbol="gmas", depth=2)
token_item = tokenizer(item)
print(len(token_item), token_item)
print()

# depth=2 with add_seg_mode="delimiter": in the recorded output each text /
# formula run is additionally closed by a '[TEXT_END]' / '[FORMULA_END]' tag.
tokenizer = GensimSegTokenizer(symbol="gmas", depth=2, add_seg_mode="delimiter")
token_item = tokenizer(item)
print(len(token_item), token_item)
10 [['已知', '公式'], [\FormFigureID{1}], ['如图'], ['[FIGURE]'], ['mathord', ',', 'mathord'], ['约束条件', '公式'], [[FORMULA]], ['mathord', '=', 'mathord', '+', 'textord', 'mathord'], ['最大值'], ['[MARK]']]
19 ['已知', '公式', \FormFigureID{1}, '如图', '[FIGURE]', 'mathord', ',', 'mathord', '约束条件', '公式', [FORMULA], 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '最大值', '[MARK]']
2 [['[TEXT_BEGIN]', '已知', '公式', '[FORMULA_BEGIN]', \FormFigureID{1}, '[TEXT_BEGIN]', '如图', '[FIGURE]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[TEXT_BEGIN]', '约束条件', '公式', '[FORMULA_BEGIN]', [FORMULA], '[SEP]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '[TEXT_BEGIN]', '最大值', '[MARK]']]
2 [['[TEXT_BEGIN]', '已知', '公式', '[TEXT_END]', '[FORMULA_BEGIN]', \FormFigureID{1}, '[FORMULA_END]', '[TEXT_BEGIN]', '如图', '[TEXT_END]', '[FIGURE]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[FORMULA_END]', '[TEXT_BEGIN]', '约束条件', '公式', '[TEXT_END]', '[FORMULA_BEGIN]', [FORMULA], '[FORMULA_END]', '[SEP]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '[FORMULA_END]', '[TEXT_BEGIN]', '最大值', '[TEXT_END]', '[MARK]']]