[2]:
from EduNLP.Tokenizer import CharTokenizer, SpaceTokenizer, CustomTokenizer, PureTextTokenizer, AstFormulaTokenizer, get_tokenizer

Basic Tokenizers

The basic tokenizers currently available are:

  • CharTokenizer

  • SpaceTokenizer

  • CustomTokenizer

  • PureTextTokenizer

  • AstFormulaTokenizer

Here are more examples for each of them.

CustomTokenizer

[3]:
# One item given as a dict; `key` below selects its "stem" text for tokenization.
items = [
    {
        "stem": "文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?",
        "options": ["1", "2"],
    }
]

# NOTE(review): symbol='f' presumably swaps formulas for a placeholder token — confirm in the EduNLP docs.
tokenizer = get_tokenizer("custom", symbol='f')

# The call returns an iterator; next() pulls the token list of the first item.
tokens = tokenizer(items, key=lambda entry: entry['stem'])
print(next(tokens))

['文具店', '[FORMULA]', '练习本', '卖出', '剩', '[FORMULA]', '包', '每包', '[FORMULA]', '卖出']
[4]:
# Same tokenizer settings as the previous example, applied to a formula-heavy stem.
items = [
    {
        "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "options": ["1", "2"],
    }
]

tokenizer = get_tokenizer("custom", symbol='f')

# `key` extracts the text to tokenize from each item dict.
tokens = tokenizer(items, key=lambda entry: entry["stem"])
print(next(tokens))
['已知', '集合', '[FORMULA]', '[FORMULA]']

CharTokenizer

[5]:
# One item given as a dict; only the "stem" field is tokenized.
items = [
    {
        "stem": "文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?",
        "options": ["1", "2"],
    }
]

# set(",?") builds the stop-word set {",", "?"} that is handed to the tokenizer.
tokenizer = get_tokenizer("char", stop_words=set(",?"))

# next() pulls the token list of the first (and only) item.
tokens = tokenizer(items, key=lambda entry: entry['stem'])
print(next(tokens))

['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本']
[6]:
# Formula-heavy stem, tokenized with the "char" tokenizer and no extra options.
items = [
    {
        "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "options": ["1", "2"],
    }
]

tokenizer = get_tokenizer("char")
# `key` extracts the text to tokenize from each item dict.
tokens = tokenizer(items, key=lambda entry: entry["stem"])
print(next(tokens))
['已', '知', '集', '合', '$', 'A', '=', '\\', 'left', '\\', '{', 'x', '\\', 'mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\', 'right', '\\', '}', ',', '\\', 'quad', 'B', '=', '\\', '{', '-', '4', ',', '1', ',', '3', ',', '5', '\\', '}', ',', '\\', 'quad', '$', '则', '$', 'A', '\\', 'cap', 'B', '=', '$']

SpaceTokenizer

[7]:
# Items are plain strings here, so no `key` function is passed to the tokenizer.
items = [
    '文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?',
]

# An empty stop-word list is passed explicitly.
tokenizer = get_tokenizer("space", stop_words=[])
tokens = tokenizer(items)

# next() pulls the token list of the first item.
print(next(tokens))

['文具店有', '$600$', '本练习本,卖出一些后,还剩', '$4$', '包,每包', '$25$', '本,卖出多少本?']
[8]:
# Dict item: the "stem" text is selected through `key` when tokenizing.
items = [
    {
        "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "options": ["1", "2"],
    }
]

# An empty stop-word list is passed explicitly.
tokenizer = get_tokenizer("space", stop_words=[])
tokens = tokenizer(items, key=lambda entry: entry["stem"])
print(next(tokens))
['已知集合$A=\\left\\{x', '\\mid', 'x^{2}-3', 'x-4<0\\right\\},', '\\quad', 'B=\\{-4,1,3,5\\},', '\\quad$', '则', '$A', '\\cap', 'B=$']

PureTextTokenizer

[9]:
# Items are plain strings here, so no `key` function is passed to the tokenizer.
items = [
    '文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?',
]

# An empty stop-word list is passed explicitly.
tokenizer = get_tokenizer("pure_text", stop_words=[])
tokens = tokenizer(items)

# next() pulls the token list of the first item.
print(next(tokens))
['文具店', '600', '练习本', '卖出', '剩', '4', '包', '每包', '25', '卖出']
[10]:
# Dict item: the "stem" text is selected through `key` when tokenizing.
items = [
    {
        "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "options": ["1", "2"],
    }
]

# An empty stop-word list is passed explicitly.
tokenizer = get_tokenizer("pure_text", stop_words=[])
tokens = tokenizer(items, key=lambda entry: entry["stem"])
print(next(tokens))
['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '=']
[11]:
# A single string mixing text, formulas, and the markers \FormFigureID, \FigureID,
# \FormFigureBase64, \SIFSep and \SIFBlank.
items = [
    "有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
]

# No stop_words argument is given here, unlike the two previous pure_text examples.
tokenizer = get_tokenizer("pure_text")
tokens = tokenizer(items)
print(next(tokens))
['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']

AstFormulaTokenizer

[12]:
# Items are plain strings here, so no `key` function is passed to the tokenizer.
items = [
    '文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?',
]

tokenizer = get_tokenizer("ast_formula")
tokens = tokenizer(items)

# next() pulls the token list of the first item.
print(next(tokens))
['文具店', 'textord', 'textord', 'textord', '练习本', '卖出', '剩', 'textord', '包', '每包', 'textord', 'textord', '卖出']
[13]:
# Dict item: the "stem" text is selected through `key` when tokenizing.
items = [
    {
        "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
        "options": ["1", "2"],
    }
]

tokenizer = get_tokenizer("ast_formula")
tokens = tokenizer(items, key=lambda entry: entry["stem"])
print(next(tokens))
['已知', '集合', 'mathord_0', '=', 'mathord_1', '\\mid', 'mathord_1', 'textord', '{ }', '\\supsub', '-', 'textord', 'mathord_1', '-', 'textord', '<', 'textord', '\\{', ',', 'mathord_2', '=', '\\{', '-', 'textord', ',', 'textord', ',', 'textord', ',', 'textord', '\\}', ',', 'mathord_0', '\\cap', 'mathord_2', '=']
[14]:
# A single string mixing text, formulas, and the markers \FormFigureID, \FigureID,
# \FormFigureBase64, \SIFSep and \SIFBlank.
items = [
    "有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
]

tokenizer = get_tokenizer("ast_formula")
tokens = tokenizer(items)
print(next(tokens))
['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '公式', '[FORMULA]', '[SEP]', 'mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1', '最大值', '[MARK]']

GensimWordTokenizer and GensimSegTokenizer

  • GensimWordTokenizer is the standard basic Tokenizer for SIF items

  • GensimSegTokenizer is the standard basic Tokenizer for SIF items; by default it groups the tokens by segment

[15]:
from EduNLP.Pretrain import GensimWordTokenizer, GensimSegTokenizer

GensimWordTokenizer

[16]:
# A single raw string (not a list): the tokenizer is called on one item at a time.
item = "已知有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$, 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"

# First pass: only `symbol` is set; the result exposes its tokens via `.tokens`.
tokenizer = GensimWordTokenizer(symbol="gmas")
token_item = tokenizer(item)
print(token_item.tokens)
print()  # blank line between the two token lists

# Second pass: same symbol setting, with general=True.
# NOTE(review): the recorded output shows literal formula symbols (x, y, 7) in this mode — confirm the exact semantics.
tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
token_item = tokenizer(item)
print(token_item.tokens)
['已知', '公式', \FormFigureID{1}, '如图', '[FIGURE]', 'mathord', ',', 'mathord', '约束条件', '公式', [FORMULA], '[SEP]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '最大值', '[MARK]']

['已知', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']

GensimSegTokenizer

[17]:
# A single raw string (not a list): the tokenizer is called on one item at a time.
item = "已知有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$, 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$则$z=x+7 y$的最大值为$\\SIFBlank$"

# 1) Only `symbol` set: print how many elements come back, then the elements.
tokenizer = GensimSegTokenizer(symbol="gmas")
token_item = tokenizer(item)
print(len(token_item), token_item)
print()

# 2) flatten=True: the result is materialized into a list before len() is taken.
tokenizer = GensimSegTokenizer(symbol="gmas", flatten=True)
token_item = tokenizer(item)
token_item = list(token_item)
print(len(token_item), token_item)
print()

# 3) depth=2: segment at Tag and Sep.
tokenizer = GensimSegTokenizer(symbol="gmas", depth=2)
token_item = tokenizer(item)
print(len(token_item), token_item)
print()

# 4) depth=2 with add_seg_mode="delimiter": texts and formulas inside each
#    big segment are tagged when a depth is set.
tokenizer = GensimSegTokenizer(symbol="gmas", depth=2, add_seg_mode="delimiter")
token_item = tokenizer(item)
print(len(token_item), token_item)
10 [['已知', '公式'], [\FormFigureID{1}], ['如图'], ['[FIGURE]'], ['mathord', ',', 'mathord'], ['约束条件', '公式'], [[FORMULA]], ['mathord', '=', 'mathord', '+', 'textord', 'mathord'], ['最大值'], ['[MARK]']]

19 ['已知', '公式', \FormFigureID{1}, '如图', '[FIGURE]', 'mathord', ',', 'mathord', '约束条件', '公式', [FORMULA], 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '最大值', '[MARK]']

5 [['[TEXT_BEGIN]', '已知', '公式', '[FORMULA_BEGIN]', \FormFigureID{1}, '[TEXT_BEGIN]', '如图', '[FIGURE]', '[TEXT_BEGIN]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[TEXT_BEGIN]', '约束条件', '公式', '[FORMULA_BEGIN]', [FORMULA], '[TEXT_BEGIN]', '[SEP]'], ['[TEXT_BEGIN]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord'], ['[TEXT_BEGIN]', '最大值'], ['[MARK]']]

5 [['[TEXT_BEGIN]', '已知', '公式', '[TEXT_END]', '[FORMULA_BEGIN]', \FormFigureID{1}, '[FORMULA_END]', '[TEXT_BEGIN]', '如图', '[TEXT_END]', '[FIGURE]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[FORMULA_END]', '[TEXT_BEGIN]', '约束条件', '公式', '[TEXT_END]', '[FORMULA_BEGIN]', [FORMULA], '[FORMULA_END]', '[TEXT_BEGIN]', '[TEXT_END]', '[SEP]'], ['[TEXT_BEGIN]', '[TEXT_END]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '[FORMULA_END]'], ['[TEXT_BEGIN]', '最大值', '[TEXT_END]'], ['[MARK]']]