seg_lstm.py

# -*- coding: UTF-8 -*-
import math
import time

import numpy as np
import tensorflow as tf

import constant
from seg_base import SegBase
from transform_data_lstm import TransformDataLSTM


class SegLSTM(SegBase):
  """LSTM sequence tagger with a learned tag-transition matrix.

  Each position of a sentence is represented by a window of
  ``window_size`` vocabulary indices. The window embeddings are
  concatenated, fed through an LSTM (one time step per position) and
  projected to ``tag_count`` scores per position. Decoding combines these
  scores with the transition matrix ``A`` and the initial-tag scores
  ``init_A`` through ``self.viterbi`` (not defined in this class;
  presumably inherited from SegBase -- confirm).

  Training is mistake-driven: for every position where the decoded tag
  differs from the gold tag, the score of the decoded tag is pushed down
  and the score of the gold tag is pushed up.
  """

  def __init__(self):
    """Load the training data, build the TF graph and initialise variables."""
    SegBase.__init__(self)
    self.dtype = tf.float32
    # Hyper-parameters.
    self.skip_window_left = constant.LSTM_SKIP_WINDOW_LEFT
    self.skip_window_right = constant.LSTM_SKIP_WINDOW_RIGHT
    self.window_size = self.skip_window_left + self.skip_window_right + 1
    self.embed_size = 100
    self.hidden_units = 150
    self.tag_count = 4
    self.concat_embed_size = self.window_size * self.embed_size
    self.vocab_size = constant.VOCAB_SIZE
    self.alpha = 0.1  # learning rate (optimizer and transition updates)
    self.lam = 0.0001  # L2 regularisation / weight-decay coefficient
    self.eta = 0.02  # not referenced anywhere else in this class
    self.dropout_rate = 0.2  # not referenced anywhere else in this class
    # Training data.
    trans = TransformDataLSTM()
    self.words_batch = trans.words_batch
    self.tags_batch = trans.labels_batch
    self.dictionary = trans.dictionary
    # Model definition.
    self.sess = tf.Session()
    # Plain SGD keeps no slot variables; tf.train.AdamOptimizer(self.alpha)
    # was the alternative tried by the original author.
    self.optimizer = tf.train.GradientDescentOptimizer(self.alpha)
    # Legacy input placeholder for pre-computed window embeddings. The LSTM
    # below is fed from `lookup_op` instead, so `x` is not part of the
    # scoring graph.
    self.x = tf.placeholder(self.dtype, shape=[1, None, self.concat_embed_size])
    embed_bound = 1.0 / math.sqrt(self.embed_size)
    self.embeddings = tf.Variable(
      tf.random_uniform([self.vocab_size, self.embed_size], -embed_bound, embed_bound, dtype=self.dtype),
      name='embeddings')
    # Output projection: hidden state -> per-tag score.
    # Fixed: the original used `self.tags_count`, which this class never
    # defines (every other use is `self.tag_count`).
    self.w = tf.Variable(
      tf.truncated_normal([self.tag_count, self.hidden_units], stddev=1.0 / math.sqrt(self.concat_embed_size),
                          dtype=self.dtype), name='w')
    self.b = tf.Variable(tf.zeros([self.tag_count, 1], dtype=self.dtype), name='b')
    # Tag-transition scores A[prev, cur] and initial-tag scores init_A[tag].
    # They are updated by hand (see train_sentence) through the assign ops
    # below: decay by (1 - alpha * lam), then add alpha * fed update.
    self.A = tf.Variable(tf.random_uniform([self.tag_count, self.tag_count], -0.05, 0.05, dtype=self.dtype), name='A')
    self.Ap = tf.placeholder(self.dtype, shape=self.A.get_shape())
    self.init_A = tf.Variable(tf.random_uniform([self.tag_count], -0.05, 0.05, dtype=self.dtype), name='init_A')
    self.init_Ap = tf.placeholder(self.dtype, shape=self.init_A.get_shape())
    decay = 1 - self.alpha * self.lam
    self.update_A_op = self.A.assign(tf.add(decay * self.A, self.alpha * self.Ap))
    self.update_init_A_op = self.init_A.assign(tf.add(decay * self.init_A, self.alpha * self.init_Ap))
    # One row of `window_size` vocabulary indices per sentence position.
    self.sentence_holder = tf.placeholder(tf.int32, shape=[None, self.window_size])
    # Shape [positions, 1, concat_embed_size]: time-major with batch size 1.
    self.lookup_op = tf.reshape(tf.nn.embedding_lookup(self.embeddings, self.sentence_holder),
                                [-1, 1, self.concat_embed_size])
    # Sparse -> dense helper used to build the per-sentence update mask.
    self.indices = tf.placeholder(tf.int32, shape=[None, 2])
    self.shape = tf.placeholder(tf.int32, shape=[2])
    self.values = tf.placeholder(self.dtype, shape=[None])
    self.map_matrix_op = tf.sparse_to_dense(self.indices, self.shape, self.values, validate_indices=False)
    # Mask of shape [tag_count, positions]; train_sentence fills it with
    # -1 at (gold tag, pos) and +1 at (decoded tag, pos) for wrong positions.
    self.map_matrix = tf.placeholder(self.dtype, shape=[self.tag_count, None])
    self.lstm = tf.contrib.rnn.LSTMCell(self.hidden_units)
    self.lstm_output, self.lstm_out_state = tf.nn.dynamic_rnn(self.lstm, self.lookup_op, dtype=self.dtype,
                                                              time_major=True)
    # lstm_output is [positions, 1, hidden]; index -1 on the size-1 batch
    # axis selects the only batch entry, giving [positions, hidden].
    # word_scores therefore has shape [tag_count, positions].
    self.word_scores = tf.matmul(self.w, tf.transpose(self.lstm_output[:, -1, :])) + self.b
    self.loss_scores = tf.reduce_sum(tf.multiply(self.map_matrix, self.word_scores), 0)
    self.lstm_variable = [v for v in tf.global_variables() if v.name.startswith('rnn')]
    self.params = [self.w, self.b] + self.lstm_variable
    self.loss = tf.reduce_sum(self.loss_scores) + tf.contrib.layers.apply_regularization(
      tf.contrib.layers.l2_regularizer(self.lam), self.params + [self.embeddings])
    # Explicit weight-decay ops; not run by any method in this class.
    self.regularization = [tf.assign_sub(p, self.lam * p) for p in self.params]
    self.train = self.optimizer.minimize(self.loss, var_list=self.params + [self.embeddings])
    # Legacy ops for a manual embedding update path that train_sentence no
    # longer uses; kept so the attributes remain available.
    self.embedp = tf.placeholder(self.dtype, shape=[None, self.embed_size])
    self.embed_index = tf.placeholder(tf.int32, shape=[None])
    self.update_embed_op = tf.scatter_update(self.embeddings, self.embed_index, self.embedp)
    self.sentence_index = 0
    # NOTE(review): `self.x` is not connected to `loss_scores` (the LSTM reads
    # `lookup_op`), so this gradient is most likely [None] -- confirm before
    # relying on it.
    self.grad_embed = tf.gradients(self.loss_scores[self.sentence_index], self.x)
    self.saver = tf.train.Saver(self.params + [self.embeddings, self.A, self.init_A], max_to_keep=100)
    # Initialise only after the whole graph exists, so that variables created
    # late (e.g. optimizer slots if the optimizer is swapped) are covered too.
    tf.global_variables_initializer().run(session=self.sess)

  def model(self, sentence):
    """Decode the tag sequence for one sentence.

    Args:
      sentence: window indices fed to `sentence_holder`, i.e. one row of
        `window_size` vocabulary indices per position.

    Returns:
      Whatever `self.viterbi` returns for the position scores and the
      current transition parameters (presumably one tag per position).
    """
    # Fetch scores and transition parameters in a single session call.
    scores, trans, init_trans = self.sess.run(
      [self.word_scores, self.A, self.init_A], feed_dict={self.sentence_holder: sentence})
    return self.viterbi(scores, trans, init_trans)

  def train_exe(self, epochs=10):
    """Train over the whole corpus and save a checkpoint after every epoch.

    Args:
      epochs: number of passes over the training data (default 10, the
        value that used to be hard-coded).

    Note:
      The graph is finalized here, so no new ops may be created afterwards.
    """
    self.sess.graph.finalize()
    last_time = time.time()
    for epoch in range(epochs):
      for sentence_index, (sentence, tags) in enumerate(zip(self.words_batch, self.tags_batch)):
        self.train_sentence(sentence, tags, len(tags))
        # Progress report: sentence count and seconds per 1000 sentences.
        if sentence_index > 0 and sentence_index % 1000 == 0:
          print(sentence_index)
          print(time.time() - last_time)
          last_time = time.time()
      print(self.sess.run(self.init_A))
      self.saver.save(self.sess, 'tmp/lstm-model%d.ckpt' % epoch)

  def train_sentence(self, sentence, tags, length):
    """Run one mistake-driven update for a single sentence.

    Args:
      sentence: window indices for every position of the sentence.
      tags: gold tag index for every position.
      length: number of positions in the sentence.

    Nothing is updated when the decoded tags already match `tags`.
    """
    # Coerce to arrays so the fancy indexing below works for lists as well.
    tags = np.asarray(tags)
    current_tags = np.asarray(self.model(sentence))
    diff_tags = np.subtract(tags, current_tags)
    update_index = np.where(diff_tags != 0)[0]
    update_length = len(update_index)

    if update_length == 0:
      return

    update_tags_pos = tags[update_index]
    update_tags_neg = current_tags[update_index]

    # (tag, position) pairs: first the gold tags, then the decoded tags.
    sparse_indices = np.stack(
      [np.concatenate([update_tags_pos, update_tags_neg], axis=-1), np.tile(update_index, [2])], axis=-1)
    # -1 for gold tags (raise their score), +1 for decoded tags (lower it).
    sparse_values = np.concatenate([-1 * np.ones(update_length), np.ones(update_length)])
    output_shape = [self.tag_count, length]
    sentence_matrix = self.sess.run(self.map_matrix_op,
                                    feed_dict={self.indices: sparse_indices, self.shape: output_shape,
                                               self.values: sparse_values})

    # Update network parameters and embeddings with one optimizer step.
    # (A previous manual per-position embedding update via grad_embed /
    # update_embed_op is disabled; the optimizer handles embeddings now.)
    self.sess.run(self.train, feed_dict={self.sentence_holder: sentence, self.map_matrix: sentence_matrix})

    # Update the transition matrix and, if the first tag was wrong, the
    # initial-tag scores.
    A_update, init_A_update, update_init = self.gen_update_A(tags, current_tags)
    if update_init:
      self.sess.run(self.update_init_A_op, feed_dict={self.init_Ap: init_A_update})
    self.sess.run(self.update_A_op, {self.Ap: A_update})

  def gen_update_A(self, correct_tags, current_tags):
    """Build the additive updates for the transition parameters.

    Args:
      correct_tags: gold tag sequence.
      current_tags: decoded tag sequence of the same length.

    Returns:
      Tuple `(A_update, init_A_update, update_init)`:
        A_update: [tag_count, tag_count] float32 array, +1 for each gold
          transition and -1 for each decoded transition wherever the two
          transitions differ.
        init_A_update: [tag_count] float32 array, +1 at the gold first tag
          and -1 at the decoded first tag when they differ.
        update_init: True when the first tags differ.
    """
    A_update = np.zeros([self.tag_count, self.tag_count], dtype=np.float32)
    init_A_update = np.zeros([self.tag_count], dtype=np.float32)
    update_init = False

    # Empty sequences carry no transitions; return all-zero updates.
    if len(correct_tags) == 0 or len(current_tags) == 0:
      return A_update, init_A_update, update_init

    before_corr = correct_tags[0]
    before_curr = current_tags[0]

    if before_corr != before_curr:
      init_A_update[before_corr] += 1
      init_A_update[before_curr] -= 1
      update_init = True

    for corr_tag, curr_tag in zip(correct_tags[1:], current_tags[1:]):
      # A transition differs if either of its endpoints differs.
      if corr_tag != curr_tag or before_corr != before_curr:
        A_update[before_corr, corr_tag] += 1
        A_update[before_curr, curr_tag] -= 1
      before_corr = corr_tag
      before_curr = curr_tag

    return A_update, init_A_update, update_init

  def cal_loss(self, start, end):
    """Sum the per-sentence loss over training sentences `start:end`.

    The per-sentence loss comes from `self.cal_sentence_loss` (not defined
    in this class; presumably inherited from SegBase -- confirm).
    """
    loss = 0.0
    A = self.A.eval(session=self.sess)
    init_A = self.init_A.eval(session=self.sess)
    for sentence, tags in zip(self.words_batch[start:end], self.tags_batch[start:end]):
      sentence_score = self.sess.run(self.word_scores, feed_dict={self.sentence_holder: sentence})
      loss += self.cal_sentence_loss(tags, sentence_score, A, init_A)
    return loss

  def seg(self, sentence, model_path='tmp/lstm-model0.ckpt', debug=False):
    """Segment a raw sentence with a saved model.

    Args:
      sentence: the text to segment.
      model_path: checkpoint to restore before decoding.
      debug: when True, print the transition matrix and position scores.

    Returns:
      Tuple `(words, tags)`: the result of `self.tags2words` and the
      decoded tag sequence.
    """
    self.saver.restore(self.sess, model_path)
    seq = self.index2seq(self.sentence2index(sentence))
    # Only pre-built ops are used here: creating new ops per call (as the
    # old embedding lookup did) grows the graph and fails once train_exe has
    # finalized it.
    sentence_scores, A_val, init_A_val = self.sess.run(
      [self.word_scores, self.A, self.init_A], feed_dict={self.sentence_holder: seq})
    if debug:
      print(A_val)
      print(sentence_scores.T)
    current_tags = self.viterbi(sentence_scores, A_val, init_A_val)
    return self.tags2words(sentence, current_tags), current_tags


if __name__ == '__main__':
  # Script entry point: build the segmenter and run the full training loop.
  SegLSTM().train_exe()