Source code for EduNLP.Tokenizer.tokenizer

# coding: utf-8
# 2021/8/1 @ tongshiwei

from typing import Iterable
from ..SIF.segment import seg
from ..SIF.tokenization import tokenize

__all__ = ["TOKENIZER", "Tokenizer", "PureTextTokenizer", "TextTokenizer", "get_tokenizer"]


[docs]class Tokenizer(object):
    def __call__(self, *args, **kwargs):
        raise NotImplementedError


[docs]class PureTextTokenizer(Tokenizer):
    r"""
    Duel with text and plain text formula.
    And filting special formula like $\\FormFigureID{…}$ and $\\FormFigureBase64{…}.

    Parameters
    ----------
    items: str
    key
    args
    kwargs

    Returns
    -------
    token

    Examples
    --------
    >>> tokenizer = PureTextTokenizer()
    >>> items = ["有公式$\\FormFigureID{1}$，如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$"]
    >>> tokens = tokenizer(items)
    >>> next(tokens)[:10]
    ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z']
    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
    >>> tokens = tokenizer(items)
    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
    '\\quad', 'A', '\\cap', 'B', '=']
    >>> items = [{
    ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
    ... "options": ["1", "2"]
    ... }]
    >>> tokens = tokenizer(items, key=lambda x: x["stem"])
    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
    '\\quad', 'A', '\\cap', 'B', '=']
    """

    def __init__(self, *args, **kwargs):
        self.tokenization_params = {
            "formula_params": {
                "method": "linear",
                "skip_figure_formula": True
            }
        }

    def __call__(self, items: Iterable, key=lambda x: x, *args, **kwargs):
        for item in items:
            yield tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params).tokens


[docs]class TextTokenizer(Tokenizer):
    r"""
    Duel with text and formula including special formula.

    Parameters
    ----------
    items: str
    key
    args
    kwargs

    Returns
    -------
    token

    Examples
    ----------
    >>> tokenizer = TextTokenizer()
    >>> items = ["有公式$\\FormFigureID{1}$，如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$"]
    >>> tokens = tokenizer(items)
    >>> next(tokens)[:10]
    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
    >>> items = ["$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$，则$|z|=$$\\SIFTag{stem_end}$\
    ... $\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$\
    ... $\\SIFTag{list_3}$2$\\SIFTag{options_end}$"]
    >>> tokens = tokenizer(items)
    >>> next(tokens)[:10]
    ['[TAG]', '复数', 'z', '=', '1', '+', '2', 'i', '+', 'i']
    """

    def __init__(self, *args, **kwargs):
        self.tokenization_params = {
            "formula_params": {
                "method": "linear",
                "symbolize_figure_formula": True
            }
        }

    def __call__(self, items: Iterable, key=lambda x: x, *args, **kwargs):
        for item in items:
            yield tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params).tokens


TOKENIZER = {
    "pure_text": PureTextTokenizer,
    "text": TextTokenizer
}


[docs]def get_tokenizer(name, *args, **kwargs):
    r"""
    It is a total interface to use difference tokenizer.
    Parameters
    ----------
    name: str
        the name of tokenizer, e.g. text, pure_text.
    args:
        the parameters passed to tokenizer
    kwargs:
        the parameters passed to tokenizer
    Returns
    -------
    tokenizer: Tokenizer
    Examples
    --------
    >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
    >>> tokenizer = get_tokenizer("text")
    >>> tokens = tokenizer(items)
    >>> next(tokens)  # doctest: +NORMALIZE_WHITESPACE
    ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
    '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
    '\\quad', 'A', '\\cap', 'B', '=']
    """
    if name not in TOKENIZER:
        raise KeyError(
            "Unknown tokenizer %s, use one of the provided tokenizers: %s" % (name, ", ".join(TOKENIZER.keys()))
        )
    return TOKENIZER[name](*args, **kwargs)