Commit f82a5e1d by 张敏捷

init

parents
# 文件夹
/data/
/.idea/
\ No newline at end of file
from anago.tagger import Tagger
from anago.trainer import Trainer
from anago.wrapper import Sequence
"""
Custom callbacks.
"""
import numpy as np
from keras.callbacks import Callback
from seqeval.metrics import f1_score, classification_report
class F1score(Callback):
    """Keras callback that reports the entity-level F1 score after each epoch.

    Attributes:
        seq: indexable batch sequence; ``seq[i]`` yields ``(x, y)`` for batch i.
        p: preprocessor whose ``inverse_transform`` turns label id matrices
            back into label strings.
    """

    def __init__(self, seq, preprocessor=None):
        super(F1score, self).__init__()
        self.seq = seq
        self.p = preprocessor

    def get_lengths(self, y_true):
        """Return the unpadded length of every sequence in a batch.

        Args:
            y_true: one-hot label matrix for a batch.

        Returns:
            list: for each row, the index of the first label id 0 (treated as
            padding here), or the full row length if no 0 is present.
        """
        lengths = []
        for y in np.argmax(y_true, -1):
            try:
                i = list(y).index(0)
            except ValueError:
                # No padding id in this row: the sequence fills the whole row.
                i = len(y)
            lengths.append(i)
        return lengths

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the model on ``self.seq`` and print/store the F1 score.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: dict of metrics for this epoch; the score is stored under
                the key ``'f1'``. A fresh dict is used when omitted, so no
                state is shared between calls.
        """
        if logs is None:
            logs = {}
        label_true = []
        label_pred = []
        # Index explicitly: the sequence is only required to support
        # ``len`` and ``__getitem__``.
        for i in range(len(self.seq)):
            x_true, y_true = self.seq[i]
            lengths = self.get_lengths(y_true)
            y_pred = self.model.predict_on_batch(x_true)
            y_true = self.p.inverse_transform(y_true, lengths)
            y_pred = self.p.inverse_transform(y_pred, lengths)
            label_true.extend(y_true)
            label_pred.extend(y_pred)
        score = f1_score(label_true, label_pred)
        print(' - f1: {:04.2f}'.format(score * 100))
        print(classification_report(label_true, label_pred))
        logs['f1'] = score
This diff is collapsed. Click to expand it.
import os
import anago
import sys
import jieba
from gensim.models.keyedvectors import KeyedVectors
from anago.utils import load_data_and_labels
if __name__ == '__main__':
    # Example entry point: `train` fits and saves a model, `predict` loads the
    # saved model and tags a sample sentence.
    weights_file = "weights.h5"
    params_file = "params.json"
    preprocessor_file = "preprocessor.pickle"

    raw_string = "张敏捷的四年本科学习生活已经结束"
    # Segment with jieba and join with spaces so that the default
    # whitespace tokenizer of `analyze` recovers the words.
    segmented_text = " ".join(jieba.lcut(raw_string, cut_all=False))
    print(segmented_text)

    # Read the mode defensively so a missing argument gives a usage message
    # instead of an IndexError.
    mode = sys.argv[1] if len(sys.argv) > 1 else None

    if mode == 'train':
        DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/chinese/')
        EMBEDDING_PATH = os.path.join(DATA_ROOT, 'newsblogbbs.vec')
        train_path = os.path.join(DATA_ROOT, 'example.train')

        print('Loading data...')
        x_train, y_train = load_data_and_labels(train_path)
        print(len(x_train), 'train sequences')

        embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=False, unicode_errors="ignore").wv

        # Use pre-trained word embeddings
        model = anago.Sequence(embeddings=embeddings, word_embedding_dim=200, word_lstm_size=200)
        model.fit(x_train, y_train)
        model.save(weights_file=weights_file, params_file=params_file, preprocessor_file=preprocessor_file)
    elif mode == 'predict':
        model = anago.Sequence.load(weights_file=weights_file, params_file=params_file, preprocessor_file=preprocessor_file)
        print(model.analyze(text=segmented_text))
    else:
        sys.exit('usage: {} (train|predict)'.format(sys.argv[0]))
"""
Model definition.
"""
import json
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Input, Dropout, TimeDistributed
from keras.layers.merge import Concatenate
from keras.models import Model, model_from_json
from anago.layers import CRF
def save_model(model, weights_file, params_file):
    """Persist a model as a pretty-printed JSON architecture plus a weights file.

    Args:
        model: object exposing ``to_json()`` and ``save_weights(path)``.
        weights_file (str): destination path for the weights.
        params_file (str): destination path for the JSON architecture.
    """
    with open(params_file, 'w') as fp:
        # Round-trip through json so the file is written sorted and indented.
        architecture = json.loads(model.to_json())
        fp.write(json.dumps(architecture, sort_keys=True, indent=4))
        model.save_weights(weights_file)
def load_model(weights_file, params_file):
    """Rebuild a model from its JSON architecture and load its weights.

    Args:
        weights_file (str): path of the saved weights.
        params_file (str): path of the JSON architecture file.

    Returns:
        The model returned by ``model_from_json`` with weights loaded.
    """
    with open(params_file) as fp:
        architecture = fp.read()
        # The CRF layer is custom, so it must be registered for deserialization.
        model = model_from_json(architecture, custom_objects={'CRF': CRF})
        model.load_weights(weights_file)
    return model
class BiLSTMCRF(object):
    """A Keras implementation of BiLSTM-CRF for sequence labeling.

    References
    --
    Guillaume Lample, Miguel Ballesteros, Sandeep Subramanian, Kazuya Kawakami, Chris Dyer.
    "Neural Architectures for Named Entity Recognition". Proceedings of NAACL 2016.
    https://arxiv.org/abs/1603.01360
    """

    def __init__(self,
                 num_labels,
                 word_vocab_size,
                 char_vocab_size=None,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True):
        """Build a Bi-LSTM CRF model.

        Args:
            num_labels (int): number of entity labels.
            word_vocab_size (int): word vocabulary size.
            char_vocab_size (int): character vocabulary size.
            word_embedding_dim (int): word embedding dimensions.
            char_embedding_dim (int): character embedding dimensions.
            word_lstm_size (int): word tagger LSTM output dimensions.
            char_lstm_size (int): character LSTM feature extractor output dimensions.
            fc_dim (int): output fully-connected layer size.
            dropout (float): dropout rate.
            embeddings (numpy array): word embedding matrix.
            use_char (boolean): add char feature.
            use_crf (boolean): use crf as last layer.
        """
        # Two-argument form: the one-argument `super(BiLSTMCRF)` is unbound
        # and would not call the parent initializer.
        super(BiLSTMCRF, self).__init__()
        self._char_embedding_dim = char_embedding_dim
        self._word_embedding_dim = word_embedding_dim
        self._char_lstm_size = char_lstm_size
        self._word_lstm_size = word_lstm_size
        self._char_vocab_size = char_vocab_size
        self._word_vocab_size = word_vocab_size
        self._fc_dim = fc_dim
        self._dropout = dropout
        self._use_char = use_char
        self._use_crf = use_crf
        self._embeddings = embeddings
        self._num_labels = num_labels

    def build(self):
        """Assemble the network.

        Returns:
            tuple: ``(model, loss)`` where ``loss`` is the CRF loss function
            when ``use_crf`` is set and ``'categorical_crossentropy'`` otherwise.

        Raises:
            ValueError: if ``use_char`` is set but no ``char_vocab_size`` was given.
        """
        if self._use_char and self._char_vocab_size is None:
            raise ValueError('char_vocab_size is required when use_char=True.')

        # build word embedding
        word_ids = Input(batch_shape=(None, None), dtype='int32', name='word_input')
        inputs = [word_ids]
        if self._embeddings is None:
            word_embeddings = Embedding(input_dim=self._word_vocab_size,
                                        output_dim=self._word_embedding_dim,
                                        mask_zero=True,
                                        name='word_embedding')(word_ids)
        else:
            # Pre-trained matrix: its shape decides vocabulary size and dimension.
            word_embeddings = Embedding(input_dim=self._embeddings.shape[0],
                                        output_dim=self._embeddings.shape[1],
                                        mask_zero=True,
                                        weights=[self._embeddings],
                                        name='word_embedding')(word_ids)

        # build character based word embedding
        if self._use_char:
            char_ids = Input(batch_shape=(None, None, None), dtype='int32', name='char_input')
            inputs.append(char_ids)
            char_embeddings = Embedding(input_dim=self._char_vocab_size,
                                        output_dim=self._char_embedding_dim,
                                        mask_zero=True,
                                        name='char_embedding')(char_ids)
            char_embeddings = TimeDistributed(Bidirectional(LSTM(self._char_lstm_size)))(char_embeddings)
            word_embeddings = Concatenate()([word_embeddings, char_embeddings])

        word_embeddings = Dropout(self._dropout)(word_embeddings)
        z = Bidirectional(LSTM(units=self._word_lstm_size, return_sequences=True))(word_embeddings)
        z = Dense(self._fc_dim, activation='tanh')(z)

        if self._use_crf:
            crf = CRF(self._num_labels, sparse_target=False)
            loss = crf.loss_function
            pred = crf(z)
        else:
            loss = 'categorical_crossentropy'
            pred = Dense(self._num_labels, activation='softmax')(z)

        model = Model(inputs=inputs, outputs=pred)

        return model, loss
# -*- coding: utf-8 -*-
"""
Preprocessors.
"""
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from anago.utils import Vocabulary
def normalize_number(text):
    """Replace every ASCII or full-width digit in ``text`` with ``'0'``."""
    digit_pattern = re.compile(r'[0-90123456789]')
    return digit_pattern.sub(r'0', text)
class IndexTransformer(BaseEstimator, TransformerMixin):
    """Convert a collection of raw documents to a document id matrix.

    Attributes:
        _use_char: boolean. Whether to use char feature.
        _num_norm: boolean. Whether to normalize text.
        _word_vocab: Vocabulary. A mapping of words to feature indices.
        _char_vocab: Vocabulary. A mapping of chars to feature indices.
        _label_vocab: Vocabulary. A mapping of labels to feature indices.
    """

    def __init__(self, lower=True, num_norm=True,
                 use_char=True, initial_vocab=None):
        """Create a preprocessor object.

        Args:
            lower: boolean. Whether to convert the texts to lowercase.
            use_char: boolean. Whether to use char feature.
            num_norm: boolean. Whether to normalize text.
            initial_vocab: Iterable. Initial vocabulary for expanding word_vocab.
        """
        self._num_norm = num_norm
        self._use_char = use_char
        self._word_vocab = Vocabulary(lower=lower)
        self._char_vocab = Vocabulary(lower=False)
        self._label_vocab = Vocabulary(lower=False, unk_token=False)

        if initial_vocab:
            # Words form one "document"; each word is itself a document of chars.
            self._word_vocab.add_documents([initial_vocab])
            self._char_vocab.add_documents(initial_vocab)

    def fit(self, X, y):
        """Learn vocabulary from training set.

        Args:
            X : iterable of documents, each a list of word strings.
            y : iterable of label sequences, one per document.

        Returns:
            self : IndexTransformer.
        """
        self._word_vocab.add_documents(X)
        self._label_vocab.add_documents(y)
        if self._use_char:
            for doc in X:
                self._char_vocab.add_documents(doc)

        self._word_vocab.build()
        self._char_vocab.build()
        self._label_vocab.build()

        return self

    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable of documents, each a list of word strings.
            y : iterable, label strings.

        Returns:
            features: document id matrix, or ``[word_ids, char_ids]`` when
                char features are enabled.
            y: one-hot label id matrix (only returned when ``y`` is given).
        """
        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        word_ids = pad_sequences(word_ids, padding='post')

        if self._use_char:
            char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
            char_ids = pad_nested_sequences(char_ids)
            features = [word_ids, char_ids]
        else:
            features = word_ids

        if y is not None:
            y = [self._label_vocab.doc2id(doc) for doc in y]
            y = pad_sequences(y, padding='post')
            y = to_categorical(y, self.label_size).astype(int)
            # In 2018/06/01, to_categorical is a bit strange.
            # >>> to_categorical([[1,3]], num_classes=4).shape
            # (1, 2, 4)
            # >>> to_categorical([[1]], num_classes=4).shape
            # (1, 4)
            # So, I expand dimensions when len(y.shape) == 2.
            y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
            return features, y
        else:
            return features

    def fit_transform(self, X, y=None, **params):
        """Learn vocabulary and return document id matrix.

        This is equivalent to fit followed by transform.

        Args:
            X : iterable of documents, each a list of word strings.
            y : iterable of label sequences, one per document.

        Returns:
            list : document id matrix.
            list: label id matrix.
        """
        return self.fit(X, y).transform(X, y)

    def inverse_transform(self, y, lengths=None):
        """Return label strings.

        Args:
            y: label id matrix (scores or one-hot along the last axis).
            lengths: sentences length; when given, each decoded sequence is
                truncated to its length.

        Returns:
            list: list of list of strings.
        """
        y = np.argmax(y, -1)
        inverse_y = [self._label_vocab.id2doc(ids) for ids in y]
        if lengths is not None:
            inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]

        return inverse_y

    @property
    def word_vocab_size(self):
        """Number of entries in the word vocabulary."""
        return len(self._word_vocab)

    @property
    def char_vocab_size(self):
        """Number of entries in the character vocabulary."""
        return len(self._char_vocab)

    @property
    def label_size(self):
        """Number of entries in the label vocabulary."""
        return len(self._label_vocab)

    def save(self, file_path):
        """Serialize this transformer to ``file_path`` with joblib."""
        joblib.dump(self, file_path)

    @classmethod
    def load(cls, file_path):
        """Load a transformer previously written by ``save``.

        NOTE(security): joblib uses pickle underneath; only load files from
        trusted sources.
        """
        p = joblib.load(file_path)

        return p
def pad_nested_sequences(sequences, dtype='int32'):
    """Zero-pad nested sequences into one dense array.

    A list of sentences, each a list of words, each a list of ids, becomes a
    3D Numpy array of shape `(num_samples, max_sent_len, max_word_len)`.

    Args:
        sequences: List of lists of lists.
        dtype: Type of the output sequences.

    Returns:
        x: Numpy array, zero wherever a sentence or word was shorter.
    """
    sent_lens = [len(sent) for sent in sequences]
    word_lens = [len(word) for sent in sequences for word in sent]
    shape = (len(sequences), max(sent_lens, default=0), max(word_lens, default=0))

    padded = np.zeros(shape).astype(dtype)
    for row, sent in enumerate(sequences):
        for col, word in enumerate(sent):
            padded[row, col, :len(word)] = word

    return padded
"""
Model API.
"""
import numpy as np
from seqeval.metrics.sequence_labeling import get_entities
class Tagger(object):
    """A model API that tags input sentence.

    Attributes:
        model: Model.
        preprocessor: Transformer. Preprocessing data for feature extraction.
        tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.
    """

    # NOTE: for Chinese input this should be switched to jieba segmentation,
    # because only English-like text is separated by whitespace.
    def __init__(self, model, preprocessor, tokenizer=str.split):
        self.model = model
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

    def predict_proba(self, text):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Args:
            text : string, the input text.

        Returns:
            y : array-like, shape = [num_words, num_classes]
            Returns the probability of the word for each class in the model,
        """
        assert isinstance(text, str)

        tokens = self.tokenizer(text)
        features = self.preprocessor.transform([tokens])
        batch_pred = self.model.predict(features)

        # A single sentence was submitted, so drop the batch dimension.
        return batch_pred[0]

    def _get_prob(self, pred):
        """Return the highest class score of every word."""
        return np.max(pred, -1)

    def _get_tags(self, pred):
        """Decode one sentence of class scores into tag strings."""
        decoded = self.preprocessor.inverse_transform([pred])

        # Drop the batch dimension added for inverse_transform.
        return decoded[0]

    def _build_response(self, sent, tags, prob):
        """Assemble the `analyze` result from words, tags and per-word scores."""
        tokens = self.tokenizer(sent)
        entities = []

        for chunk_type, start, last in get_entities(tags):
            # get_entities reports an inclusive end; make it exclusive.
            stop = last + 1
            entities.append({
                'text': ' '.join(tokens[start: stop]),
                'type': chunk_type,
                'score': float(np.average(prob[start: stop])),
                'beginOffset': start,
                'endOffset': stop
            })

        return {
            'words': tokens,
            'entities': entities
        }

    def analyze(self, text):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.

        Returns:
            res: dict.

        Examples:
            >>> text = 'President Obama is speaking at the White House.'
            >>> model.analyze(text)
            {
                "words": [
                    "President",
                    "Obama",
                    "is",
                    "speaking",
                    "at",
                    "the",
                    "White",
                    "House."
                ],
                "entities": [
                    {
                        "beginOffset": 1,
                        "endOffset": 2,
                        "score": 1,
                        "text": "Obama",
                        "type": "PER"
                    },
                    {
                        "beginOffset": 6,
                        "endOffset": 8,
                        "score": 1,
                        "text": "White House.",
                        "type": "ORG"
                    }
                ]
            }
        """
        scores = self.predict_proba(text)
        tags = self._get_tags(scores)
        prob = self._get_prob(scores)

        return self._build_response(text, tags, prob)

    def predict(self, text):
        """Predict using the model.

        Args:
            text: string, the input text.

        Returns:
            tags: list, shape = (num_words,)
            Returns predicted values.
        """
        return self._get_tags(self.predict_proba(text))
"""Training-related module.
"""
from anago.callbacks import F1score
from anago.utils import NERSequence
class Trainer(object):
    """Drives model training over batched, preprocessed data.

    Attributes:
        _model: Model.
        _preprocessor: Transformer. Preprocessing data for feature extraction.
    """

    def __init__(self, model, preprocessor=None):
        self._model = model
        self._preprocessor = preprocessor

    def train(self, x_train, y_train, x_valid=None, y_valid=None,
              epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Trains the model for a fixed number of epochs (iterations on a dataset).

        When both validation inputs are provided (and non-empty), an F1score
        callback evaluating them is placed in front of the given callbacks.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        to_features = self._preprocessor.transform
        train_seq = NERSequence(x_train, y_train, batch_size, to_features)

        if x_valid and y_valid:
            valid_seq = NERSequence(x_valid, y_valid, batch_size, self._preprocessor.transform)
            f1 = F1score(valid_seq, preprocessor=self._preprocessor)
            if callbacks:
                callbacks = [f1] + callbacks
            else:
                callbacks = [f1]

        self._model.fit_generator(generator=train_seq,
                                  epochs=epochs,
                                  callbacks=callbacks,
                                  verbose=verbose,
                                  shuffle=shuffle)
"""
Utility functions.
"""
import math
import os
from collections import Counter
import numpy as np
from keras.utils import Sequence, get_file
def download(url):
    """Fetch and extract an archive of trained weights, config and preprocessor.

    Args:
        url (str): target url.

    Returns:
        tuple: paths ``(weights_file, params_file, preprocessor_file)`` located
        next to the downloaded archive.
    """
    archive_path = get_file(fname='tmp.zip', origin=url, extract=True)
    folder = os.path.dirname(archive_path)
    names = ('weights.h5', 'params.json', 'preprocessor.pickle')
    weights_file, params_file, preprocessor_file = (
        os.path.join(folder, name) for name in names
    )

    return weights_file, params_file, preprocessor_file
def load_data_and_labels(filename):
    """Loads data and label from a file.

    Args:
        filename (str): path to the file.

        Each non-blank line holds a word and its tag separated by a single
        space. Sentences are separated by a blank line; a final sentence that
        is not followed by a blank line is still returned.

        For example:
        ```
        EU B-ORG
        rejects O
        German B-MISC
        call O
        to O
        boycott O
        British B-MISC
        lamb O
        . O

        Peter B-PER
        Blackburn I-PER
        ...
        ```

    Returns:
        tuple(list, list): sentences (lists of words) and their label lists.

    Raises:
        ValueError: if a non-blank line does not contain exactly two
            space-separated fields.

    Example:
        >>> filename = 'conll2003/en/ner/train.txt'
        >>> data, labels = load_data_and_labels(filename)
    """
    sents, labels = [], []
    words, tags = [], []
    with open(filename, encoding="UTF-8") as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, tag = line.split(' ')
                words.append(word)
                tags.append(tag)
            else:
                sents.append(words)
                labels.append(tags)
                words, tags = [], []

    # Flush the last sentence when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    if words:
        sents.append(words)
        labels.append(tags)

    return sents, labels
#
# def load_data(file_path):
# if
class NERSequence(Sequence):
    """Batch provider that slices ``x``/``y`` and preprocesses each batch.

    Attributes:
        x: indexable collection of input documents.
        y: indexable collection of label sequences, aligned with ``x``.
        batch_size: number of samples per batch.
        preprocess: callable applied as ``preprocess(batch_x, batch_y)``.
    """

    def __init__(self, x, y, batch_size=1, preprocess=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.preprocess = preprocess

    def __getitem__(self, idx):
        """Return the preprocessed batch number ``idx``."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.preprocess(self.x[window], self.y[window])

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        batches = len(self.x) / self.batch_size
        return math.ceil(batches)
class Vocabulary(object):
    """A vocabulary that maps tokens to ints (storing a vocabulary).

    Attributes:
        _token_count: A collections.Counter object holding the frequencies of tokens
            in the data used to build the Vocabulary.
        _token2id: A dict mapping token strings to numerical identifiers.
        _id2token: A list of token strings indexed by their numerical identifiers.
    """

    def __init__(self, max_size=None, lower=True, unk_token=True, specials=('<pad>',)):
        """Create a Vocabulary object.

        Args:
            max_size: The maximum size of the vocabulary, or None for no
                maximum. Default: None.
            lower: boolean. Whether to convert the texts to lowercase.
            unk_token: boolean. Whether to add unknown token.
            specials: The list of special tokens (e.g., padding or eos) that
                will be prepended to the vocabulary. Default: ('<pad>',)
        """
        self._max_size = max_size
        self._lower = lower
        self._unk = unk_token
        self._token2id = {token: i for i, token in enumerate(specials)}
        self._id2token = list(specials)
        self._token_count = Counter()

    def __len__(self):
        return len(self._token2id)

    def add_token(self, token):
        """Add token to vocabulary.

        Args:
            token (str): token to add.
        """
        token = self.process_token(token)
        self._token_count.update([token])

    def add_documents(self, docs):
        """Update dictionary from a collection of documents. Each document is a list
        of tokens.

        Args:
            docs (list): documents to add.
        """
        for sent in docs:
            self._token_count.update(map(self.process_token, sent))

    def doc2id(self, doc):
        """Get the list of token_id given doc.

        Args:
            doc (list): document.

        Returns:
            list: int id of doc.
        """
        # token_to_id already normalizes each token.
        return [self.token_to_id(token) for token in doc]

    def id2doc(self, ids):
        """Get the token list.

        Args:
            ids (list): token ids.

        Returns:
            list: token list.
        """
        return [self.id_to_token(idx) for idx in ids]

    def build(self):
        """Build vocabulary.

        Assigns ids to the most frequent counted tokens and, when enabled,
        to the unknown token. Tokens that already have an id are skipped, so
        calling this more than once keeps ids dense and ``_id2token`` in sync
        with ``_token2id``.
        """
        token_freq = self._token_count.most_common(self._max_size)
        idx = len(self._id2token)
        for token, _ in token_freq:
            if token in self._token2id:
                continue
            self._token2id[token] = idx
            self._id2token.append(token)
            idx += 1
        if self._unk:
            unk = '<unk>'
            if unk not in self._token2id:
                self._token2id[unk] = idx
                self._id2token.append(unk)

    def process_token(self, token):
        """Process token before following methods:
        * add_token
        * add_documents
        * doc2id
        * token_to_id

        Args:
            token (str): token to process.

        Returns:
            str: processed token string.
        """
        if self._lower:
            token = token.lower()

        return token

    def token_to_id(self, token):
        """Get the token_id of given token.

        Args:
            token (str): token from vocabulary.

        Returns:
            int: int id of token. Unknown tokens map to the id of '<unk>'
            when the unknown token is enabled and built; otherwise they fall
            back to the last id of the vocabulary.
        """
        token = self.process_token(token)
        fallback = len(self._token2id) - 1
        if self._unk:
            # Look '<unk>' up explicitly rather than assuming it is last.
            fallback = self._token2id.get('<unk>', fallback)
        return self._token2id.get(token, fallback)

    def id_to_token(self, idx):
        """token-id to token (string).

        Args:
            idx (int): token id.

        Returns:
            str: string of given token id.
        """
        return self._id2token[idx]

    @property
    def vocab(self):
        """Return the vocabulary.

        Returns:
            dict: get the dict object of the vocabulary.
        """
        return self._token2id

    @property
    def reverse_vocab(self):
        """Return the id-to-token lookup.

        Returns:
            list: token strings indexed by token id.
        """
        return self._id2token
def filter_embeddings(embeddings, vocab, dim):
    """Build an embedding matrix restricted to the given vocabulary.

    Args:
        embeddings: mapping from word to vector supporting ``in`` and item
            access (e.g. a dict of numpy arrays), or None.
        vocab (dict): word_index lookup table.
        dim (int): dimensionality of the word vectors.

    Returns:
        numpy array: an array of shape ``(len(vocab), dim)`` whose rows are
        the vectors of known words and zeros otherwise; None when no
        embeddings were given.
    """
    # No pre-trained vectors: signal the caller to use a trainable embedding
    # instead of failing on the membership test below.
    if embeddings is None:
        return None

    _embeddings = np.zeros([len(vocab), dim])
    for word in vocab:
        if word in embeddings:
            word_idx = vocab[word]
            _embeddings[word_idx] = embeddings[word]

    return _embeddings
def load_glove(file):
    """Loads GloVe vectors in numpy array.

    Each line is expected to hold a word followed by its space-separated
    vector components. Blank lines are ignored.

    Args:
        file (str): a path to a glove file.

    Return:
        dict: a dict of numpy arrays.
    """
    model = {}
    # Read as UTF-8 explicitly so non-ASCII words do not depend on the
    # platform's default encoding.
    with open(file, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            line = line.split(' ')
            word = line[0]
            vector = np.array([float(val) for val in line[1:]])
            model[word] = vector

    return model
"""
Wrapper class.
"""
from seqeval.metrics import f1_score
from anago.models import BiLSTMCRF, save_model, load_model
from anago.preprocessing import IndexTransformer
from anago.tagger import Tagger
from anago.trainer import Trainer
from anago.utils import filter_embeddings
class Sequence(object):
    """High-level wrapper that trains, scores, applies, saves and loads a tagger.

    Attributes:
        model: the compiled model, set by ``fit`` or ``load``.
        p: the fitted IndexTransformer, set by ``fit`` or ``load``.
        tagger: Tagger created by ``analyze``.
    """

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True,
                 initial_vocab=None,
                 optimizer='adam'):
        self.model = None
        self.p = None
        self.tagger = None

        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.initial_vocab = initial_vocab
        self.optimizer = optimizer

    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train)

        # Only filter when pre-trained vectors were supplied; with the default
        # (None) the model builds a trainable embedding from scratch.
        embeddings = None
        if self.embeddings is not None:
            embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab, self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf)
        model, loss = model.build()
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def score(self, x_test, y_test):
        """Returns the f1-micro score on the given test data and labels.

        Args:
            x_test : array-like, shape = (n_samples, sent_length)
            Test samples.

            y_test : array-like, shape = (n_samples, sent_length)
            True labels for x.

        Returns:
            score : float, f1-micro score.

        Raises:
            OSError: if no model has been fitted or loaded.
        """
        if not self.model:
            raise OSError('Could not find a model. Call load(dir_path).')

        x_test = self.p.transform(x_test)
        # Consumed once by inverse_transform to trim padding per sentence.
        lengths = map(len, y_test)
        y_pred = self.model.predict(x_test)
        y_pred = self.p.inverse_transform(y_pred, lengths)
        score = f1_score(y_test, y_pred)
        return score

    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        # Rebuild the cached tagger when a different tokenizer is requested;
        # otherwise the tokenizer of the first call would be used forever.
        if not self.tagger or self.tagger.tokenizer != tokenizer:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)

    def save(self, weights_file, params_file, preprocessor_file):
        """Write the preprocessor, model architecture and weights to disk."""
        self.p.save(preprocessor_file)
        save_model(self.model, weights_file, params_file)

    @classmethod
    def load(cls, weights_file, params_file, preprocessor_file):
        """Create an instance from files previously written by ``save``."""
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)

        return self
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment