PaddleNLP/lexical_analysis/predict.py

# -*- coding: UTF-8 -*-
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import time
import sys

import paddle.fluid as fluid
import paddle

import utils
import reader
import creator
sys.path.append('../shared_modules/models/')
from model_check import check_cuda
from model_check import check_version

parser = argparse.ArgumentParser(__doc__)
# 1. model parameters
model_g = utils.ArgumentGroup(parser, "model", "model configuration")
model_g.add_arg("word_emb_dim", int, 128,
                "The dimension in which a word is embedded.")
model_g.add_arg("grnn_hidden_dim", int, 256,
                "The number of hidden nodes in the GRNN layer.")
model_g.add_arg("bigru_num", int, 2,
                "The number of bi_gru layers in the network.")
model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")

# 2. data parameters
data_g = utils.ArgumentGroup(parser, "data", "data paths")
data_g.add_arg("word_dict_path", str, "./conf/word.dic",
               "The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
               "The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
               "The path of the word replacement Dictionary.")
data_g.add_arg("infer_data", str, "./data/infer.tsv",
               "The folder where the training data is located.")
data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
data_g.add_arg(
    "batch_size", int, 200,
    "The number of sequences contained in a mini-batch, "
    "or the maximum number of tokens (include paddings) contained in a mini-batch."
)


def do_infer(args):
    dataset = reader.Dataset(args)

    infer_program = fluid.Program()
    with fluid.program_guard(infer_program, fluid.default_startup_program()):
        with fluid.unique_name.guard():

            infer_ret = creator.create_model(
                args, dataset.vocab_size, dataset.num_labels, mode='infer')
    infer_program = infer_program.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()

    pyreader = creator.create_pyreader(
        args,
        file_name=args.infer_data,
        feed_list=infer_ret['feed_list'],
        place=place,
        model='lac',
        reader=dataset,
        mode='infer')

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load model
    utils.init_checkpoint(exe, args.init_checkpoint, infer_program)

    result = infer_process(
        exe=exe,
        program=infer_program,
        reader=pyreader,
        fetch_vars=[infer_ret['words'], infer_ret['crf_decode']],
        dataset=dataset)
    for sent, tags in result:
        result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        print(''.join(result_list))


def infer_process(exe, program, reader, fetch_vars, dataset):
    """
    the function to execute the infer process
    :param exe: the fluid Executor
    :param program: the infer_program
    :param reader: data reader
    :return: the list of prediction result
    """

    def input_check(data):
        if data[0]['words'].lod()[0][-1] == 0:
            return data[0]['words']
        return None

    results = []
    for data in reader():
        crf_decode = input_check(data)
        if crf_decode:
            results += utils.parse_result(crf_decode, crf_decode, dataset)
            continue

        words, crf_decode = exe.run(
            program,
            fetch_list=fetch_vars,
            feed=data,
            return_numpy=False,
            use_program_cache=True, )
        results += utils.parse_result(words, crf_decode, dataset)
    return results


if __name__ == "__main__":
    args = parser.parse_args()
    check_cuda(args.use_cuda)
    check_version()
    do_infer(args)