# Source code for EduNLP.SIF.tokenization.formula.ast_token

# coding: utf-8
# 2021/5/20 @ tongshiwei
import networkx as nx
from EduNLP.Formula import Formula


# def inorder_traversal(ast: nx.DiGraph):
#     visit = set()
#     nodes = []
#
#     def _inorder_traversal(_node):
#         if _node in visit:
#             return
#         successors = list(ast.successors(_node))
#         if successors:
#             if len(successors) <= 2:
#                 _inorder_traversal(successors[0])
#                 nodes.append(_node)
#                 visit.add(_node)
#                 if len(successors) == 2:
#                     _inorder_traversal(successors[1])
#             else:
#                 nodes.append(_node)
#                 for successor in successors:
#                     if successor in visit:
#                         continue
#                     _inorder_traversal(successor)
#         else:
#             nodes.append(_node)
#
#     for node in ast.nodes:
#         if node in visit or list(ast.predecessors(node)):
#             continue
#         _inorder_traversal(node)
#     return nodes

def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post", *args, **kwargs):
    """
    Flatten a formula AST graph into a list of string tokens.

    Nodes are visited in the order selected by ``strategy`` and every node
    whose ``"type"`` attribute is missing or equal to ``"ignore"`` is skipped.

    Parameters
    ----------
    ast
        Graph whose ``ast.nodes[i]`` yields a dict-like attribute mapping for
        node ``i``. With ``strategy="post"`` it is handed to
        ``networkx.dfs_postorder_nodes``.
    ord2token
        When exactly ``True``, nodes of type ``"mathord"``, ``"textord"`` or
        ``"text"`` are emitted as their type name instead of their text.
    var_numbering
        Only consulted when ``ord2token`` is ``True``. When exactly ``True``,
        ``"mathord"`` nodes are emitted as ``"mathord_<var>"``, where ``<var>``
        is the node's ``"var"`` attribute, or ``"con"`` if it has none.
    strategy
        ``"post"`` visits nodes in depth-first post-order; ``"linear"`` visits
        them in the iteration order of ``ast.nodes``.
    args
        Accepted but unused.
    kwargs
        Accepted but unused.

    Returns
    -------
    list
        One token per visited (non-ignored) node, in traversal order.

    Raises
    ------
    ValueError
        If ``strategy`` is neither ``"post"`` nor ``"linear"``.
    """
    tokens = []
    if strategy == "post":
        order = nx.dfs_postorder_nodes(ast)
    elif strategy == "linear":  # pragma: no cover
        order = ast.nodes
    else:  # pragma: no cover
        raise ValueError("Unknown traversal strategy: %s" % strategy)

    for i in order:
        node = ast.nodes[i]
        # Untyped nodes are treated exactly like explicitly ignored ones.
        if node.get("type", "ignore") == "ignore":
            continue
        # The flags are compared by identity on purpose: truthy non-bool
        # values (e.g. 1) do not switch the type-name substitution on.
        if ord2token is True and node["type"] in ["mathord", "textord", "text"]:
            if var_numbering is True and node["type"] == "mathord":
                # "con" is the fallback label for mathord nodes without a "var" attribute.
                tokens.append("%s_%s" % (node["type"], node.get("var", "con")))
            else:
                tokens.append(node["type"])
        else:
            tokens.append(node["text"])
    return tokens
def ast_tokenize(formula, ord2token=False, var_numbering=False, return_type="formula", *args, **kwargs):
    """
    Tokenize a formula, producing the representation selected by ``return_type``.

    Parameters
    ----------
    formula
        The formula to tokenize; it is passed straight to ``Formula``
        (the examples below use LaTeX source strings).
    ord2token
        Forwarded to ``traversal_formula``; only used when
        ``return_type="list"``.
    var_numbering
        Forwarded to ``traversal_formula``; only used when
        ``return_type="list"``.
    return_type
        ``"list"``: build the formula with ``variable_standardization=True``
        and return the token list produced by ``traversal_formula``.
        ``"formula"``: return the ``Formula`` object itself.
        ``"ast"``: return the formula's ``ast_graph``.
    args
        Accepted but unused.
    kwargs
        Accepted but unused.

    Returns
    -------
    list or Formula or graph
        Depends on ``return_type`` as described above.

    Raises
    ------
    ValueError
        If ``return_type`` is not one of ``"list"``, ``"formula"``, ``"ast"``.

    Examples
    --------
    >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list")
    ['x', '+', 'y', '{ }', '\\\\pi', '{ }', '2', '{ }', '\\\\frac', '\\\\supsub', '+', '1', '=', 'x']
    >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list", ord2token=True)
    ... # doctest: +NORMALIZE_WHITESPACE
    ['mathord', '+', 'mathord', '{ }', 'mathord', '{ }', 'textord', '{ }', '\\\\frac', '\\\\supsub', '+', 'textord',
     '=', 'mathord']
    >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list", ord2token=True, var_numbering=True)
    ... # doctest: +NORMALIZE_WHITESPACE
    ['mathord_0', '+', 'mathord_1', '{ }', 'mathord_con', '{ }', 'textord', '{ }', '\\\\frac', '\\\\supsub',
     '+', 'textord', '=', 'mathord_0']
    >>> len(ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="ast").nodes)
    14
    >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x")
    <Formula: {x + y}^\\frac{\\pi}{2} + 1 = x>
    """
    if return_type == "list":
        # Variables are standardized for the token list so that
        # var_numbering can label mathord nodes via their "var" attribute.
        ast = Formula(formula, variable_standardization=True).ast_graph
        return traversal_formula(ast, ord2token=ord2token, var_numbering=var_numbering)
    elif return_type == "formula":
        return Formula(formula)
    elif return_type == "ast":
        return Formula(formula).ast_graph
    else:
        # Name the offending value, mirroring traversal_formula's error style.
        raise ValueError("Unknown return type: %s" % return_type)
if __name__ == "__main__":
    # Manual smoke run: print the type-level token list, with numbered
    # variables, for a sample formula.
    print(
        ast_tokenize(
            r"{x + y}^\frac{\pi}{2} + 1 = x",
            return_type="list",
            ord2token=True,
            var_numbering=True,
        )
    )