Documentation service: http://47.92.0.57:3000/  Weekly report index: http://47.92.0.57:3000/s/NruNXRYmV

Commit f82a5e1d by 张敏捷

init

# folders
/data/
/.idea/
from anago.tagger import Tagger
from anago.trainer import Trainer
from anago.wrapper import Sequence
"""
Custom callbacks.
"""
import numpy as np
from keras.callbacks import Callback
from seqeval.metrics import f1_score, classification_report


class F1score(Callback):

    def __init__(self, seq, preprocessor=None):
        super(F1score, self).__init__()
        self.seq = seq
        self.p = preprocessor

    def get_lengths(self, y_true):
        lengths = []
        for y in np.argmax(y_true, -1):
            try:
                i = list(y).index(0)
            except ValueError:
                i = len(y)
            lengths.append(i)

        return lengths

    def on_epoch_end(self, epoch, logs={}):
        label_true = []
        label_pred = []
        for i in range(len(self.seq)):
            x_true, y_true = self.seq[i]
            lengths = self.get_lengths(y_true)
            y_pred = self.model.predict_on_batch(x_true)

            y_true = self.p.inverse_transform(y_true, lengths)
            y_pred = self.p.inverse_transform(y_pred, lengths)

            label_true.extend(y_true)
            label_pred.extend(y_pred)

        score = f1_score(label_true, label_pred)
        print(' - f1: {:04.2f}'.format(score * 100))
        print(classification_report(label_true, label_pred))
        logs['f1'] = score
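
# A minimal sketch of how this callback is typically wired up (it mirrors
# Trainer.train further below; the variable names here are illustrative only):
#
#   valid_seq = NERSequence(x_valid, y_valid, batch_size, preprocessor.transform)
#   f1 = F1score(valid_seq, preprocessor=preprocessor)
#   model.fit_generator(train_seq, epochs=10, callbacks=[f1])
#
# After each epoch the callback prints the span-level F1 and stores it in
# logs['f1'], so it can be monitored by e.g. keras.callbacks.ModelCheckpoint.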
import os
import anago
import sys
import jieba
from gensim.models.keyedvectors import KeyedVectors
from anago.utils import load_data_and_labels


if __name__ == '__main__':
    weights_file = "weights.h5"
    params_file = "params.json"
    preprocessor_file = "preprocessor.pickle"

    raw_string = "张敏捷的四年本科学习生活已经结束"
    # Segment the Chinese sentence with jieba so that tokens are space-separated.
    segmented = " ".join(jieba.lcut(raw_string, cut_all=False))
    print(segmented)

    if sys.argv[1] == 'train':
        DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/chinese/')
        EMBEDDING_PATH = os.path.join(DATA_ROOT, 'newsblogbbs.vec')
        train_path = os.path.join(DATA_ROOT, 'example.train')

        print('Loading data...')
        x_train, y_train = load_data_and_labels(train_path)
        print(len(x_train), 'train sequences')

        embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=False, unicode_errors="ignore").wv

        # Use pre-trained word embeddings
        model = anago.Sequence(embeddings=embeddings, word_embedding_dim=200, word_lstm_size=200)
        model.fit(x_train, y_train)
        model.save(weights_file=weights_file, params_file=params_file, preprocessor_file=preprocessor_file)
        # print(model.analyze(text=segmented))
    elif sys.argv[1] == 'predict':
        model = anago.Sequence.load(weights_file=weights_file, params_file=params_file, preprocessor_file=preprocessor_file)
        print(model.analyze(text=segmented))
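
# How this script is meant to be invoked (the file name is not recorded in the
# commit; "train.py" below is just a placeholder):
#
#   python train.py train     # load example.train, fit the model, save weights/params/preprocessor
#   python train.py predict   # reload the saved model and tag the jieba-segmented sample sentence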
"""
Model definition.
"""
import json
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Input, Dropout, TimeDistributed
from keras.layers.merge import Concatenate
from keras.models import Model, model_from_json
from anago.layers import CRF


def save_model(model, weights_file, params_file):
    with open(params_file, 'w') as f:
        params = model.to_json()
        json.dump(json.loads(params), f, sort_keys=True, indent=4)
        model.save_weights(weights_file)


def load_model(weights_file, params_file):
    with open(params_file) as f:
        model = model_from_json(f.read(), custom_objects={'CRF': CRF})
        model.load_weights(weights_file)

    return model


class BiLSTMCRF(object):
    """A Keras implementation of BiLSTM-CRF for sequence labeling.

    References
    --
    Guillaume Lample, Miguel Ballesteros, Sandeep Subramanian, Kazuya Kawakami, Chris Dyer.
    "Neural Architectures for Named Entity Recognition". Proceedings of NAACL 2016.
    https://arxiv.org/abs/1603.01360
    """

    def __init__(self,
                 num_labels,
                 word_vocab_size,
                 char_vocab_size=None,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True):
        """Build a Bi-LSTM CRF model.

        Args:
            word_vocab_size (int): word vocabulary size.
            char_vocab_size (int): character vocabulary size.
            num_labels (int): number of entity labels.
            word_embedding_dim (int): word embedding dimensions.
            char_embedding_dim (int): character embedding dimensions.
            word_lstm_size (int): word-level (tagger) LSTM output dimensions.
            char_lstm_size (int): character LSTM feature extractor output dimensions.
            fc_dim (int): output fully-connected layer size.
            dropout (float): dropout rate.
            embeddings (numpy array): word embedding matrix.
            use_char (boolean): add char feature.
            use_crf (boolean): use crf as last layer.
        """
        super(BiLSTMCRF, self).__init__()
        self._char_embedding_dim = char_embedding_dim
        self._word_embedding_dim = word_embedding_dim
        self._char_lstm_size = char_lstm_size
        self._word_lstm_size = word_lstm_size
        self._char_vocab_size = char_vocab_size
        self._word_vocab_size = word_vocab_size
        self._fc_dim = fc_dim
        self._dropout = dropout
        self._use_char = use_char
        self._use_crf = use_crf
        self._embeddings = embeddings
        self._num_labels = num_labels

    def build(self):
        # build word embedding
        word_ids = Input(batch_shape=(None, None), dtype='int32', name='word_input')
        inputs = [word_ids]
        if self._embeddings is None:
            word_embeddings = Embedding(input_dim=self._word_vocab_size,
                                        output_dim=self._word_embedding_dim,
                                        mask_zero=True,
                                        name='word_embedding')(word_ids)
        else:
            word_embeddings = Embedding(input_dim=self._embeddings.shape[0],
                                        output_dim=self._embeddings.shape[1],
                                        mask_zero=True,
                                        weights=[self._embeddings],
                                        name='word_embedding')(word_ids)

        # build character based word embedding
        if self._use_char:
            char_ids = Input(batch_shape=(None, None, None), dtype='int32', name='char_input')
            inputs.append(char_ids)
            char_embeddings = Embedding(input_dim=self._char_vocab_size,
                                        output_dim=self._char_embedding_dim,
                                        mask_zero=True,
                                        name='char_embedding')(char_ids)
            char_embeddings = TimeDistributed(Bidirectional(LSTM(self._char_lstm_size)))(char_embeddings)
            word_embeddings = Concatenate()([word_embeddings, char_embeddings])

        word_embeddings = Dropout(self._dropout)(word_embeddings)
        z = Bidirectional(LSTM(units=self._word_lstm_size, return_sequences=True))(word_embeddings)
        z = Dense(self._fc_dim, activation='tanh')(z)

        if self._use_crf:
            crf = CRF(self._num_labels, sparse_target=False)
            loss = crf.loss_function
            pred = crf(z)
        else:
            loss = 'categorical_crossentropy'
            pred = Dense(self._num_labels, activation='softmax')(z)

        model = Model(inputs=inputs, outputs=pred)

        return model, loss
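
# For reference, a minimal sketch of how build() is consumed (it mirrors
# Sequence.fit in the wrapper below; the sizes are made-up examples):
#
#   model, loss = BiLSTMCRF(num_labels=10, word_vocab_size=10000,
#                           char_vocab_size=100).build()
#   model.compile(loss=loss, optimizer='adam')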
# -*- coding: utf-8 -*-
"""
Preprocessors.
"""
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from anago.utils import Vocabulary


def normalize_number(text):
    return re.sub(r'[0-90123456789]', r'0', text)
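
# A quick illustration of the digit normalization above:
#   >>> normalize_number("2018年06月01日")
#   '0000年00月00日'
# Both ASCII and full-width digits are mapped to the literal character '0'.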


class IndexTransformer(BaseEstimator, TransformerMixin):
    """Convert a collection of raw documents to a document id matrix.

    Attributes:
        _use_char: boolean. Whether to use char feature.
        _num_norm: boolean. Whether to normalize text.
        _word_vocab: dict. A mapping of words to feature indices.
        _char_vocab: dict. A mapping of chars to feature indices.
        _label_vocab: dict. A mapping of labels to feature indices.
    """

    def __init__(self, lower=True, num_norm=True,
                 use_char=True, initial_vocab=None):
        """Create a preprocessor object.

        Args:
            lower: boolean. Whether to convert the texts to lowercase.
            use_char: boolean. Whether to use char feature.
            num_norm: boolean. Whether to normalize text.
            initial_vocab: Iterable. Initial vocabulary for expanding word_vocab.
        """
        self._num_norm = num_norm
        self._use_char = use_char
        self._word_vocab = Vocabulary(lower=lower)
        self._char_vocab = Vocabulary(lower=False)
        self._label_vocab = Vocabulary(lower=False, unk_token=False)

        if initial_vocab:
            self._word_vocab.add_documents([initial_vocab])
            self._char_vocab.add_documents(initial_vocab)

    def fit(self, X, y):
        """Learn vocabulary from training set.

        Args:
            X : iterable. An iterable which yields either str, unicode or file objects.

        Returns:
            self : IndexTransformer.
        """
        self._word_vocab.add_documents(X)
        self._label_vocab.add_documents(y)
        if self._use_char:
            for doc in X:
                self._char_vocab.add_documents(doc)

        self._word_vocab.build()
        self._char_vocab.build()
        self._label_vocab.build()

        return self

    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable
                an iterable which yields either str, unicode or file objects.
            y : iterable, label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        word_ids = pad_sequences(word_ids, padding='post')

        if self._use_char:
            char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
            char_ids = pad_nested_sequences(char_ids)
            features = [word_ids, char_ids]
        else:
            features = word_ids

        if y is not None:
            y = [self._label_vocab.doc2id(doc) for doc in y]
            y = pad_sequences(y, padding='post')
            y = to_categorical(y, self.label_size).astype(int)
            # In 2018/06/01, to_categorical is a bit strange.
            # >>> to_categorical([[1,3]], num_classes=4).shape
            # (1, 2, 4)
            # >>> to_categorical([[1]], num_classes=4).shape
            # (1, 4)
            # So, I expand dimensions when len(y.shape) == 2.
            y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
            return features, y
        else:
            print(features)
            return features

    def fit_transform(self, X, y=None, **params):
        """Learn vocabulary and return document id matrix.

        This is equivalent to fit followed by transform.

        Args:
            X : iterable
                an iterable which yields either str, unicode or file objects.

        Returns:
            list : document id matrix.
            list: label id matrix.
        """
        return self.fit(X, y).transform(X, y)

    def inverse_transform(self, y, lengths=None):
        """Return label strings.

        Args:
            y: label id matrix.
            lengths: sentences length.

        Returns:
            list: list of list of strings.
        """
        y = np.argmax(y, -1)
        inverse_y = [self._label_vocab.id2doc(ids) for ids in y]
        if lengths is not None:
            inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]

        return inverse_y

    @property
    def word_vocab_size(self):
        return len(self._word_vocab)

    @property
    def char_vocab_size(self):
        return len(self._char_vocab)

    @property
    def label_size(self):
        return len(self._label_vocab)

    def save(self, file_path):
        joblib.dump(self, file_path)

    @classmethod
    def load(cls, file_path):
        p = joblib.load(file_path)

        return p
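
# A typical round trip through the transformer, with toy CoNLL-style data
# (illustrative only, not part of the commit):
#
#   >>> p = IndexTransformer()
#   >>> X = [['EU', 'rejects', 'German', 'call']]
#   >>> y = [['B-ORG', 'O', 'B-MISC', 'O']]
#   >>> features, labels = p.fit_transform(X, y)   # [word_ids, char_ids], one-hot labels
#   >>> p.inverse_transform(labels)
#   [['B-ORG', 'O', 'B-MISC', 'O']]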


def pad_nested_sequences(sequences, dtype='int32'):
    """Pads nested sequences to the same length.

    This function transforms a list of list sequences
    into a 3D Numpy array of shape `(num_samples, max_sent_len, max_word_len)`.

    Args:
        sequences: List of lists of lists.
        dtype: Type of the output sequences.

    # Returns
        x: Numpy array.
    """
    max_sent_len = 0
    max_word_len = 0
    for sent in sequences:
        max_sent_len = max(len(sent), max_sent_len)
        for word in sent:
            max_word_len = max(len(word), max_word_len)

    x = np.zeros((len(sequences), max_sent_len, max_word_len)).astype(dtype)
    for i, sent in enumerate(sequences):
        for j, word in enumerate(sent):
            x[i, j, :len(word)] = word

    return x
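
# Shape example for the helper above (two sentences, padded to 2 words of 3 chars each):
#
#   >>> pad_nested_sequences([[[1, 2], [3]], [[4, 5, 6]]])
#   array([[[1, 2, 0],
#           [3, 0, 0]],
#          [[4, 5, 6],
#           [0, 0, 0]]], dtype=int32)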
"""
Model API.
"""
import numpy as np
from seqeval.metrics.sequence_labeling import get_entities


class Tagger(object):
    """A model API that tags input sentence.

    Attributes:
        model: Model.
        preprocessor: Transformer. Preprocessing data for feature extraction.
        tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.
    """

    # Note: because the input here is Chinese, the tokenizer should be changed to jieba
    # segmentation; the default `str.split` only suits English, which is whitespace-separated.
    # (See the sketch at the end of this file.)
    def __init__(self, model, preprocessor, tokenizer=str.split):
        self.model = model
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

    def predict_proba(self, text):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Args:
            text : string, the input text.

        Returns:
            y : array-like, shape = [num_words, num_classes]
                Returns the probability of the word for each class in the model.
        """
        assert isinstance(text, str)

        words = self.tokenizer(text)
        X = self.preprocessor.transform([words])
        y = self.model.predict(X)
        y = y[0]  # reduce batch dimension.

        return y

    def _get_prob(self, pred):
        prob = np.max(pred, -1)

        return prob

    def _get_tags(self, pred):
        tags = self.preprocessor.inverse_transform([pred])
        tags = tags[0]  # reduce batch dimension

        return tags

    def _build_response(self, sent, tags, prob):
        words = self.tokenizer(sent)
        res = {
            'words': words,
            'entities': []
        }
        chunks = get_entities(tags)

        for chunk_type, chunk_start, chunk_end in chunks:
            chunk_end += 1
            entity = {
                'text': ' '.join(words[chunk_start: chunk_end]),
                'type': chunk_type,
                'score': float(np.average(prob[chunk_start: chunk_end])),
                'beginOffset': chunk_start,
                'endOffset': chunk_end
            }
            res['entities'].append(entity)

        return res

    def analyze(self, text):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.

        Returns:
            res: dict.

        Examples:
            >>> text = 'President Obama is speaking at the White House.'
            >>> model.analyze(text)
            {
                "words": [
                    "President",
                    "Obama",
                    "is",
                    "speaking",
                    "at",
                    "the",
                    "White",
                    "House."
                ],
                "entities": [
                    {
                        "beginOffset": 1,
                        "endOffset": 2,
                        "score": 1,
                        "text": "Obama",
                        "type": "PER"
                    },
                    {
                        "beginOffset": 6,
                        "endOffset": 8,
                        "score": 1,
                        "text": "White House.",
                        "type": "ORG"
                    }
                ]
            }
        """
        pred = self.predict_proba(text)
        tags = self._get_tags(pred)
        prob = self._get_prob(pred)
        res = self._build_response(text, tags, prob)

        return res

    def predict(self, text):
        """Predict using the model.

        Args:
            text: string, the input text.

        Returns:
            tags: list, shape = (num_words,)
                Returns predicted values.
        """
        pred = self.predict_proba(text)
        tags = self._get_tags(pred)

        return tags
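
# A sketch of the jieba-based tokenizer mentioned in the comment above __init__
# (hypothetical helper, not part of this commit). Passing it in place of the default
# `str.split` lets the tagger accept raw, unsegmented Chinese text:
#
#   import jieba
#
#   def jieba_tokenize(text):
#       return jieba.lcut(text, cut_all=False)
#
#   tagger = Tagger(model, preprocessor, tokenizer=jieba_tokenize)
#   tagger.analyze("张敏捷的四年本科学习生活已经结束")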
"""Training-related module.
"""
from anago.callbacks import F1score
from anago.utils import NERSequence


class Trainer(object):
    """A trainer that trains the model.

    Attributes:
        _model: Model.
        _preprocessor: Transformer. Preprocessing data for feature extraction.
    """

    def __init__(self, model, preprocessor=None):
        self._model = model
        self._preprocessor = preprocessor

    def train(self, x_train, y_train, x_valid=None, y_valid=None,
              epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Trains the model for a fixed number of epochs (iterations on a dataset).

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        train_seq = NERSequence(x_train, y_train, batch_size, self._preprocessor.transform)

        if x_valid and y_valid:
            valid_seq = NERSequence(x_valid, y_valid, batch_size, self._preprocessor.transform)
            f1 = F1score(valid_seq, preprocessor=self._preprocessor)
            callbacks = [f1] + callbacks if callbacks else [f1]

        self._model.fit_generator(generator=train_seq,
                                  epochs=epochs,
                                  callbacks=callbacks,
                                  verbose=verbose,
                                  shuffle=shuffle)
"""
Utility functions.
"""
import math
import os
from collections import Counter
import numpy as np
from keras.utils import Sequence, get_file


def download(url):
    """Download a trained weights, config and preprocessor.

    Args:
        url (str): target url.
    """
    filepath = get_file(fname='tmp.zip', origin=url, extract=True)
    base_dir = os.path.dirname(filepath)
    weights_file = os.path.join(base_dir, 'weights.h5')
    params_file = os.path.join(base_dir, 'params.json')
    preprocessor_file = os.path.join(base_dir, 'preprocessor.pickle')

    return weights_file, params_file, preprocessor_file


def load_data_and_labels(filename):
    """Loads data and label from a file.

    Args:
        filename (str): path to the file.

        The file format is space-separated values, one token and its tag per line.
        A blank line is required at the end of a sentence.

        For example:
        ```
        EU B-ORG
        rejects O
        German B-MISC
        call O
        to O
        boycott O
        British B-MISC
        lamb O
        . O

        Peter B-PER
        Blackburn I-PER
        ...
        ```

    Returns:
        tuple(list, list): data and labels.

    Example:
        >>> filename = 'conll2003/en/ner/train.txt'
        >>> data, labels = load_data_and_labels(filename)
    """
    sents, labels = [], []
    words, tags = [], []
    with open(filename, encoding="UTF-8") as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, tag = line.split(' ')
                words.append(word)
                tags.append(tag)
            else:
                sents.append(words)
                labels.append(tags)
                words, tags = [], []

    return sents, labels
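
# With a file containing exactly the docstring example above, the return value is:
#
#   >>> sents, labels = load_data_and_labels(filename)
#   >>> sents[0]
#   ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
#   >>> labels[0]
#   ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']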


class NERSequence(Sequence):

    def __init__(self, x, y, batch_size=1, preprocess=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.preprocess = preprocess

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size: (idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size: (idx + 1) * self.batch_size]

        return self.preprocess(batch_x, batch_y)

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)


class Vocabulary(object):
    """A vocabulary that maps tokens to ints (storing a vocabulary).

    Attributes:
        _token_count: A collections.Counter object holding the frequencies of tokens
            in the data used to build the Vocabulary.
        _token2id: A collections.defaultdict instance mapping token strings to
            numerical identifiers.
        _id2token: A list of token strings indexed by their numerical identifiers.
    """

    def __init__(self, max_size=None, lower=True, unk_token=True, specials=('<pad>',)):
        """Create a Vocabulary object.

        Args:
            max_size: The maximum size of the vocabulary, or None for no
                maximum. Default: None.
            lower: boolean. Whether to convert the texts to lowercase.
            unk_token: boolean. Whether to add unknown token.
            specials: The list of special tokens (e.g., padding or eos) that
                will be prepended to the vocabulary. Default: ('<pad>',)
        """
        self._max_size = max_size
        self._lower = lower
        self._unk = unk_token
        self._token2id = {token: i for i, token in enumerate(specials)}
        self._id2token = list(specials)
        self._token_count = Counter()

    def __len__(self):
        return len(self._token2id)

    def add_token(self, token):
        """Add token to vocabulary.

        Args:
            token (str): token to add.
        """
        token = self.process_token(token)
        self._token_count.update([token])

    def add_documents(self, docs):
        """Update dictionary from a collection of documents. Each document is a list
        of tokens.

        Args:
            docs (list): documents to add.
        """
        for sent in docs:
            sent = map(self.process_token, sent)
            self._token_count.update(sent)

    def doc2id(self, doc):
        """Get the list of token_id given doc.

        Args:
            doc (list): document.

        Returns:
            list: int id of doc.
        """
        doc = map(self.process_token, doc)
        return [self.token_to_id(token) for token in doc]

    def id2doc(self, ids):
        """Get the token list.

        Args:
            ids (list): token ids.

        Returns:
            list: token list.
        """
        return [self.id_to_token(idx) for idx in ids]

    def build(self):
        """
        Build vocabulary.
        """
        token_freq = self._token_count.most_common(self._max_size)
        idx = len(self.vocab)
        for token, _ in token_freq:
            self._token2id[token] = idx
            self._id2token.append(token)
            idx += 1
        if self._unk:
            unk = '<unk>'
            self._token2id[unk] = idx
            self._id2token.append(unk)

    def process_token(self, token):
        """Process token before following methods:
        * add_token
        * add_documents
        * doc2id
        * token_to_id

        Args:
            token (str): token to process.

        Returns:
            str: processed token string.
        """
        if self._lower:
            token = token.lower()

        return token

    def token_to_id(self, token):
        """Get the token_id of given token.

        Args:
            token (str): token from vocabulary.

        Returns:
            int: int id of token.
        """
        token = self.process_token(token)
        return self._token2id.get(token, len(self._token2id) - 1)

    def id_to_token(self, idx):
        """token-id to token (string).

        Args:
            idx (int): token id.

        Returns:
            str: string of given token id.
        """
        return self._id2token[idx]

    @property
    def vocab(self):
        """Return the vocabulary.

        Returns:
            dict: get the dict object of the vocabulary.
        """
        return self._token2id

    @property
    def reverse_vocab(self):
        """Return the vocabulary as a reversed dict object.

        Returns:
            dict: reversed vocabulary object.
        """
        return self._id2token


def filter_embeddings(embeddings, vocab, dim):
    """Loads word vectors in numpy array.

    Args:
        embeddings (dict): a dictionary of numpy array.
        vocab (dict): word_index lookup table.

    Returns:
        numpy array: an array of word embeddings.
    """
    # if not isinstance(embeddings, dict):
    #     return
    _embeddings = np.zeros([len(vocab), dim])
    for word in vocab:
        if word in embeddings:
            word_idx = vocab[word]
            _embeddings[word_idx] = embeddings[word]

    return _embeddings


def load_glove(file):
    """Loads GloVe vectors in numpy array.

    Args:
        file (str): a path to a glove file.

    Return:
        dict: a dict of numpy arrays.
    """
    model = {}
    with open(file) as f:
        for line in f:
            line = line.split(' ')
            word = line[0]
            vector = np.array([float(val) for val in line[1:]])
            model[word] = vector

    return model
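
# A toy illustration of filter_embeddings (all values made up):
#
#   >>> vocab = {'<pad>': 0, 'eu': 1, 'lamb': 2, '<unk>': 3}
#   >>> embeddings = {'eu': np.ones(4), 'lamb': np.full(4, 2.0)}
#   >>> filter_embeddings(embeddings, vocab, dim=4)[1]
#   array([1., 1., 1., 1.])
#
# Rows for '<pad>' and '<unk>' stay zero because those tokens are not in `embeddings`;
# the same lookup also works with gensim KeyedVectors, which supports `in` and `[]`.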
"""
Wrapper class.
"""
from seqeval.metrics import f1_score
from anago.models import BiLSTMCRF, save_model, load_model
from anago.preprocessing import IndexTransformer
from anago.tagger import Tagger
from anago.trainer import Trainer
from anago.utils import filter_embeddings


class Sequence(object):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True,
                 initial_vocab=None,
                 optimizer='adam'):

        self.model = None
        self.p = None
        self.tagger = None

        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.initial_vocab = initial_vocab
        self.optimizer = optimizer

    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab, self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf)
        model, loss = model.build()
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def score(self, x_test, y_test):
        """Returns the f1-micro score on the given test data and labels.

        Args:
            x_test : array-like, shape = (n_samples, sent_length)
                Test samples.
            y_test : array-like, shape = (n_samples, sent_length)
                True labels for x.

        Returns:
            score : float, f1-micro score.
        """
        if self.model:
            x_test = self.p.transform(x_test)
            lengths = map(len, y_test)
            y_pred = self.model.predict(x_test)
            y_pred = self.p.inverse_transform(y_pred, lengths)
            score = f1_score(y_test, y_pred)
            return score
        else:
            raise OSError('Could not find a model. Call load(dir_path).')

    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)

    def save(self, weights_file, params_file, preprocessor_file):
        self.p.save(preprocessor_file)
        save_model(self.model, weights_file, params_file)

    @classmethod
    def load(cls, weights_file, params_file, preprocessor_file):
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)

        return self