Source code for EduNLP.SIF.tokenization.text.tokenization

# coding: utf-8
# 2021/5/18 @ tongshiwei
import logging
import jieba
from .stopwords import DEFAULT_STOPWORDS

# Set jieba's logger threshold to INFO at import time.
# NOTE(review): presumably this hides jieba's DEBUG-level start-up messages — confirm.
jieba.setLogLevel(logging.INFO)


def is_chinese(word):
    """Return True when every character of ``word`` is a Chinese character.

    A character counts as Chinese when it lies in the inclusive range
    U+4E00..U+9FA5. ``word`` may be a single character or a longer string;
    an empty string yields True because there is no character to reject.
    """
    return all(u'\u4e00' <= ch <= u'\u9fa5' for ch in word)


def tokenize(text, granularity="word", stopwords="default"):
    """
    Using jieba library to tokenize item by word or char.

    Parameters
    ----------
    text: str
        The text to segment.
    granularity: str
        "word" returns the filtered jieba tokens as they are; "char"
        additionally splits every token made up only of Chinese characters
        into single characters and keeps all other tokens whole.
    stopwords: str, None or set
        "default" selects DEFAULT_STOPWORDS, None disables stopword
        filtering, and any other value is used directly as the container
        that tokens are tested against with ``in``.

    Returns
    -------
    list of str
        The tokens in their original order, without stopwords and without
        whitespace-only tokens.

    Raises
    ------
    TypeError
        If granularity is neither "word" nor "char".

    Examples
    --------
    >>> tokenize("三角函数是基本初等函数之一")
    ['三角函数', '初等', '函数']
    >>> tokenize("三角函数是基本初等函数之一", granularity="char")
    ['三', '角', '函', '数', '初', '等', '函', '数']
    """
    # Reject an unsupported granularity before doing any segmentation work.
    if granularity not in ("word", "char"):
        raise TypeError("Unknown granularity %s" % granularity)

    if stopwords == "default":
        stopwords = DEFAULT_STOPWORDS
    elif stopwords is None:
        stopwords = {}

    # Both granularities start from the same filtered jieba segmentation:
    # drop stopwords and tokens that are empty after stripping whitespace.
    tokens = [
        token for token in jieba.cut(text)
        if token not in stopwords and token.strip()
    ]
    if granularity == "word":
        return tokens

    # Splitting the jieba tokens (instead of the raw text) handles sentences
    # with mixed Chinese and English: only all-Chinese tokens are broken into
    # characters, anything else stays one token.
    split_tokens = []
    for token in tokens:
        if is_chinese(token):
            split_tokens.extend(token)
        else:
            split_tokens.append(token)
    return split_tokens