from EduNLP.Formula.ast import str2ast, katex_parse
import re
class Parser:
    """
    Parse an item (exercise text) into the standard format.

    The parser walks ``self.text`` one token at a time and rewrites it in
    place: bare runs of letters/digits between Chinese text or punctuation
    are wrapped in ``$...$``, empty brackets become ``$\\SIFChoice$`` and
    runs of underscores become ``$\\SIFBlank$``.  Problems are reported
    through ``error_flag`` / ``error_postion`` / ``error_message`` and,
    for formulas, ``fomula_illegal_flag`` / ``fomula_illegal_message``.
    """

    def __init__(self, data, check_formula=True):
        """
        Parameters
        ----------
        data: str
            The raw item text.
        check_formula: bool
            When True, every ``$...$`` formula is validated with
            ``katex_parse``.
        """
        self.lookahead = 0
        self.head = 0
        self.text = data
        self.error_message = ''
        self.error_postion = 0
        self.error_flag = 0
        self.modify_flag = 0
        self.warnning = 0
        self.fomula_illegal_flag = 0
        self.fomula_illegal_message = ''
        self.check_formula = check_formula

        # Lengths of the special markers inserted into the text.
        self.len_bracket = len('$\\SIFChoice$')
        self.len_underline = len('$\\SIFBlank$')

        # Token ids returned by get_token().
        self.error = -1
        self.character = 1
        self.en_pun = 2
        self.ch_pun = 3
        self.latex = 4
        self.end = 5
        self.empty = 6
        self.modify = 7
        self.blank = 8

        self.en_pun_list = [',', '.', '?', '!',
                            ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '\\', '<', '>', '[', ']',
                            '-']  # add some other chars
        # NOTE(review): several entries below duplicate en_pun_list; they were
        # presumably full-width (Chinese) punctuation originally - verify the
        # literals against the canonical file before merging.
        self.ch_pun_list = [',', '。', '!', '?', ':',
                            ';', '‘', '’', '“', '”', '(', ')', ' ', '、', '《', '》', '—', '.']
        # Characters allowed inside a bare letter/digit run.
        self.in_list = [',', '_', '-', '%']
        # Characters that may delimit a bare letter/digit run on either side.
        self.flag_list = [',', '。', '!', '?', ':',
                          ';', '‘', '’', '“', '”', '(', ')', ' ', '、', '《', '》',
                          '$', ',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '<', '>', '-',
                          '[', ']', '—']

    def is_number(self, uchar):
        """Return True if the unicode character is an ASCII digit."""
        return u'\u0030' <= uchar <= u'\u0039'

    def is_alphabet(self, uchar):
        """Return True if the unicode character is an ASCII letter."""
        return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')

    def is_chinese(self, uchar):
        """Return True if the unicode character is in the range U+4E00..U+9FA5."""
        return u'\u4e00' <= uchar <= u'\u9fa5'

    def _is_formula_legal(self, formula_str):
        r"""
        Judge whether the current formula meets our specification or not.

        Parameters
        ----------
        formula_str: str
            The formula body, without the surrounding ``$``.

        Returns
        -------
        bool
            True when the formula contains one of the special SIF tags or
            is accepted by ``katex_parse``; False otherwise (in which case
            ``fomula_illegal_flag`` and ``fomula_illegal_message`` are set).
        """
        legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64',
                      'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline', 'textf']
        if any(tag in formula_str for tag in legal_tags):
            return True
        try:
            katex_parse(formula_str)
        except Exception as e:
            # Only a KaTeX parse failure means "illegal formula"; anything
            # else is unexpected and is propagated unchanged (an ``assert``
            # here would be stripped under ``python -O``).
            if 'ParseError' not in str(e):
                raise
            self.fomula_illegal_message = "[FormulaError] " + str(e)
            self.fomula_illegal_flag = 1
            return False
        return True

    def call_error(self):
        """Record the current position as a syntax error and raise error_flag."""
        self.error_postion = self.head
        self.error_message = self.text[:self.head + 1]
        self.error_flag = 1

    def _wrap_plain_run(self, in_run):
        """
        Wrap the bare letter/digit run starting at ``head`` in ``$...$``.

        ``in_run`` is the character predicate of the run (``is_alphabet`` or
        ``is_number``).  The run is only rewritten when it is delimited on
        both sides by the text boundary, a Chinese character or a character
        of ``flag_list``; any other situation is reported as an error.
        Returns the token id (``modify`` or ``error``).
        """
        text = self.text
        left = self.head
        if left > 0:
            forward = text[left - 1]
            if not (self.is_chinese(forward) or forward in self.flag_list):
                self.call_error()
                return self.error
        right = left
        while right < len(text) and (in_run(text[right]) or text[right] in self.in_list):
            right += 1
        if right == len(text) or self.is_chinese(text[right]) or text[right] in self.flag_list:
            self.text = text[:left] + "$" + text[left:right] + "$" + text[right:]
            # Two '$' were inserted, so the first unread char moved by 2.
            self.head = right + 2
            self.modify_flag = 1
            return self.modify
        # The run touches something we cannot classify: treat it as a
        # formula that was not entered with LaTeX syntax.
        self.call_error()
        return self.error

    def _replace_choice(self, left, closing):
        """
        Try to turn an empty bracket pair opened at ``left`` into SIFChoice.

        ``head`` must point just past the opening bracket.  Blanks (space or
        non-breaking space) are skipped; when ``closing`` follows, the whole
        span is replaced and True is returned.  Otherwise ``head`` is left
        after the skipped blanks and False is returned.
        """
        text = self.text
        pos = self.head
        while pos < len(text) and (text[pos] == ' ' or text[pos] == '\xa0'):
            pos += 1
        if pos < len(text) and text[pos] == closing:
            self.text = text[:left] + '$\\SIFChoice$' + text[pos + 1:]
            # Continue right after the inserted marker.
            self.head = left + self.len_bracket
            self.modify_flag = 1
            return True
        self.head = pos
        return False

    def _replace_blank(self, left):
        """Replace the run of underscores/spaces starting at ``left`` by SIFBlank."""
        text = self.text
        pos = self.head
        while pos < len(text) and (text[pos] == '_' or text[pos] == ' '):
            pos += 1
        self.text = text[:left] + '$\\SIFBlank$' + text[pos:]
        # Continue right after the inserted marker.
        self.head = left + self.len_underline
        self.modify_flag = 1

    def get_token(self):
        r"""
        Read the next element of the item, rewriting ``self.text`` if needed.

        Parameters
        ----------

        Returns
        -------
        int
            One of the token ids defined in ``__init__``: ``character``
            (Chinese character), ``modify`` (the text was rewritten),
            ``end`` (newline), ``ch_pun`` / ``en_pun`` (punctuation),
            ``latex`` (a ``$...$`` formula), ``empty`` (end of text) or
            ``error``.
        """
        if self.head >= len(self.text):
            return self.empty
        ch = self.text[self.head]

        if self.is_chinese(ch):
            self.head += 1
            return self.character

        if self.is_alphabet(ch):
            # Letters outside a formula: only runs sitting between Chinese
            # text / punctuation are repaired, everything else is an error.
            return self._wrap_plain_run(self.is_alphabet)

        if self.is_number(ch):
            # Digits outside a formula: same policy as for letters.
            return self._wrap_plain_run(self.is_number)

        if ch == '\n':
            self.head += 1
            return self.end

        if ch in self.ch_pun_list:
            left = self.head
            self.head += 1
            if ch == '(' and self._replace_choice(left, ')'):
                return self.modify
            return self.ch_pun

        if ch in self.en_pun_list:
            left = self.head
            self.head += 1
            if ch == '(' and self._replace_choice(left, ')'):
                return self.modify
            if ch == '_':
                self._replace_blank(left)
                return self.modify
            return self.en_pun

        if ch == '$':
            text = self.text
            formula_start = self.head + 1
            formula_end = text.find('$', formula_start)
            if formula_end == -1:
                formula = text[formula_start:]
            else:
                formula = text[formula_start:formula_end]
            if any(self.is_chinese(c) for c in formula):
                # Chinese characters inside a formula: warn once per formula.
                print("Warning: there is some chinese characters in formula!")
                self.warnning = 1
            if formula_end == -1:
                # Unterminated formula: report the error at end of text.
                self.head = len(text)
                self.call_error()
                return self.error
            self.head = formula_end
            # Check that the formula is complete and parsable.
            if self.check_formula and not self._is_formula_legal(formula):
                self.call_error()
                return self.error
            self.head = formula_end + 1
            return self.latex

        self.call_error()
        return self.error

    def next_token(self):
        """Advance ``lookahead`` to the next token."""
        self.lookahead = self.get_token()

    def match(self, terminal):
        """Consume the lookahead token if it equals ``terminal``."""
        if self.lookahead == terminal:
            self.next_token()

    def txt(self):
        """Read one token and, if it is plain content, read the one after it."""
        self.lookahead = self.get_token()
        if self.error_flag:
            return
        if self.lookahead in (self.character, self.en_pun, self.ch_pun, self.latex):
            self.match(self.lookahead)

    def txt_list(self):
        """Consume tokens until the text is exhausted or an error occurs."""
        # Iterative on purpose: one stack frame per token would overflow the
        # recursion limit on long items.
        while True:
            self.txt()
            if self.error_flag or self.lookahead == self.empty:
                return

    def description(self):
        """Parse one description (a token list followed by end of text)."""
        self.txt_list()
        if self.error_flag:
            return
        if self.lookahead == self.empty:
            self.match(self.lookahead)

    def description_list(self):
        r"""
        use Parser to process and describe the txt

        Parameters
        ----------

        Returns
        ----------

        Examples
        --------
        >>> text = '生产某种零件的A工厂25名工人的日加工零件数_ _'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.text
        '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$'
        >>> text = 'X的分布列为( )'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.text
        '$X$的分布列为$\\SIFChoice$'
        >>> text = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.error_flag
        1
        >>> text = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.fomula_illegal_flag
        1
        """
        self.description()
        if self.error_flag:
            return
        if self.lookahead != self.empty:
            self.description_list()  # pragma: no cover
        else:
            self.error_flag = 0