Source code for EduNLP.Vector.t2v

# coding: utf-8
# 2021/7/13 @ tongshiwei

import os
import json
import requests
from longling import path_append
from EduData import get_data
from .rnn import RNNModel
from .gensim_vec import W2V, D2V
from .bert_vec import BertModel
from .quesnet import QuesNetModel
from .elmo_vec import ElmoModel
from .meta import Vector
from EduNLP.constant import MODEL_DIR
from .disenqnet import DisenQModel


# Registry of the supported model types: each lower-case key maps to the
# vectorizer class that T2V instantiates for it.
MODELS = {
    "w2v": W2V,
    "d2v": D2V,
    # The recurrent variants are all served by the same class.
    **dict.fromkeys(("rnn", "lstm", "gru"), RNNModel),
    "elmo": ElmoModel,
    "bert": BertModel,
    "quesnet": QuesNetModel,
    "disenq": DisenQModel,
}


# Base URL of the model hub HTTP API. Endpoint names are appended to it by
# plain string concatenation, so the trailing slash is required.
MODELHUB_URL = 'https://modelhub-backend-269-production.env.bdaa.pro/v1/api/'


class T2V(object):
    """
    Uniform wrapper that turns token lists into vectors with a selectable model.

    If you already have a model, you can use T2V directly.
    Otherwise, calling get_pretrained_t2v is a better way to get a vectorizer,
    since it fetches a pretrained model for you.

    Parameters
    ----------
    model: str
        select the model type (matched case-insensitively against the keys of MODELS)
        e.g.: d2v, rnn, lstm, gru, elmo, etc.
    args, kwargs:
        forwarded unchanged to the constructor of the selected model class

    Raises
    ------
    KeyError
        if ``model`` is not a key of MODELS; the message lists the supported types

    Examples
    --------
    >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
    ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}]
    >>> model_dir = "examples/test_model/d2v"
    >>> url, model_name, *args = get_pretrained_model_info('d2v_test_256')
    >>> (); path = get_data(url, model_dir); () # doctest: +ELLIPSIS
    (...)
    >>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True)
    >>> t2v = T2V('d2v',filepath=path)
    >>> print(t2v(item))
    [array([...dtype=float32)]
    """

    def __init__(self, model: str, *args, **kwargs):
        model = model.lower()
        self.model_type = model
        try:
            model_cls = MODELS[model]
        except KeyError:
            # Same exception type as the plain lookup, but with an actionable
            # message instead of just the offending key.
            raise KeyError(
                "Unknown model type %s, use one of the supported model types: %s" % (
                    model, ", ".join(MODELS))
            ) from None
        if model in {"rnn", "lstm", "gru"}:
            # The recurrent variants share one class, which receives the
            # variant name as its first positional argument.
            self.i2v: Vector = model_cls(model, *args, **kwargs)
        else:
            self.i2v: Vector = model_cls(*args, **kwargs)

    def __call__(self, items, *args, **kwargs):
        """Shorthand for the wrapped model's ``infer_vector``."""
        return self.i2v.infer_vector(items, *args, **kwargs)

    def infer_vector(self, items, *args, **kwargs):
        """Delegate to the wrapped model's ``infer_vector`` with the same arguments."""
        return self.i2v.infer_vector(items, *args, **kwargs)

    def infer_tokens(self, items, *args, **kwargs):
        """Delegate to the wrapped model's ``infer_tokens`` with the same arguments."""
        return self.i2v.infer_tokens(items, *args, **kwargs)

    @property
    def vector_size(self) -> int:
        """The ``vector_size`` reported by the wrapped model."""
        return self.i2v.vector_size
def get_pretrained_model_info(name):
    """
    Ask the model hub for the download information of one pretrained model.

    Parameters
    ----------
    name: str
        name of the pretrained model, sent as the ``name`` query parameter

    Returns
    -------
    info: list
        ``[url, t2v_name]``, read from the ``url`` and ``t2v_name`` fields of the JSON response

    Raises
    ------
    AssertionError
        if the response status code is not 200; the status code is the exception argument
    """
    endpoint = MODELHUB_URL + 'getPretrainedModel'
    response = requests.get(endpoint, params={'name': name})
    # Checked with an explicit raise rather than ``assert`` so the validation
    # is not stripped under ``python -O``; AssertionError is kept so callers
    # see the same exception type as before.
    if response.status_code != 200:
        raise AssertionError(response.status_code)
    payload = json.loads(response.content)
    return [payload['url'], payload['t2v_name']]
def get_all_pretrained_models():
    """
    Ask the model hub for the pretrained models it provides.

    Returns
    -------
    names:
        the value of the ``name`` field of the JSON response
        (presumably a list of model names; get_pretrained_t2v uses it for
        membership tests and joins it into an error message)

    Raises
    ------
    AssertionError
        if the response status code is not 200; the status code is the exception argument
    """
    endpoint = MODELHUB_URL + 'getPretrainedModelList'
    response = requests.get(endpoint)
    # Checked with an explicit raise rather than ``assert`` so the validation
    # is not stripped under ``python -O``; AssertionError is kept so callers
    # see the same exception type as before.
    if response.status_code != 200:
        raise AssertionError(response.status_code)
    payload = json.loads(response.content)
    return payload['name']
def get_pretrained_t2v(name, model_dir=MODEL_DIR, **kwargs):
    """
    Fetch a pretrained model by name and return it wrapped in a T2V.

    This is the easy way to turn token lists into vectors when you do not
    have a model of your own.

    Parameters
    ----------
    name:str
        select the pretrained model
        e.g.:
        d2v_math_300
        w2v_math_300
        elmo_math_2048
        bert_math_768
        bert_taledu_768
        disenq_math_256
        quesnet_math_512
    model_dir:str
        the directory the model is fetched into, default: MODEL_DIR = '~/.EduNLP/model'
    kwargs:
        extra keyword arguments forwarded to T2V

    Returns
    -------
    t2v model: T2V

    Raises
    ------
    KeyError
        if ``name`` is not among the models reported by get_all_pretrained_models

    Examples
    --------
    >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
    ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}]
    >>> i2v = get_pretrained_t2v("d2v_test_256", "examples/test_model/d2v") # doctest: +SKIP
    >>> print(i2v(item)) # doctest: +SKIP
    [array([...dtype=float32)]
    """
    available = get_all_pretrained_models()
    if name not in available:
        message = "Unknown pretrained model %s, use one of the provided pretrained models: %s" % (
            name, ", ".join(available))
        raise KeyError(message)

    download_url, model_name, *extra_args = get_pretrained_model_info(name)
    model_path = get_data(download_url, model_dir)

    # Only the gensim-style models get a file path appended: the file sits
    # inside the fetched directory and is named after it plus a postfix.
    if model_name == "d2v":
        postfix = ".bin"
    elif model_name == "w2v":
        postfix = ".kv"
    else:
        return T2V(model_name, model_path, *extra_args, **kwargs)

    weights_path = path_append(model_path, os.path.basename(model_path) + postfix, to_str=True)
    return T2V(model_name, weights_path, *extra_args, **kwargs)