rec_att_head.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np


class AttentionHead(nn.Layer):
    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionHead, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels
        self.attention_cell = AttentionGRUCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        input_one_hot = F.one_hot(input_char, onehot_dim)
        return input_one_hot

    def forward(self, inputs, targets=None, batch_max_length=25):
        batch_size = inputs.shape[0]
        num_steps = batch_max_length
        hidden = paddle.zeros((batch_size, self.hidden_size))
        output_hiddens = []

        if targets is not None:
            # Training: teacher forcing with the ground-truth character at each step.
            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                (outputs, hidden), alpha = self.attention_cell(hidden, inputs,
                                                               char_onehots)
                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)
        else:
            # Inference: greedy decoding, feeding back the previous prediction.
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None
            char_onehots = None
            outputs = None
            alpha = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets, onehot_dim=self.num_classes)
                (outputs, hidden), alpha = self.attention_cell(hidden, inputs,
                                                               char_onehots)
                probs_step = self.generator(outputs)
                if probs is None:
                    probs = paddle.unsqueeze(probs_step, axis=1)
                else:
                    probs = paddle.concat(
                        [probs, paddle.unsqueeze(
                            probs_step, axis=1)], axis=1)
                next_input = probs_step.argmax(axis=1)
                targets = next_input

        return probs


class AttentionGRUCell(nn.Layer):
    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionGRUCell, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
        self.rnn = nn.GRUCell(
            input_size=input_size + num_embeddings, hidden_size=hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # Additive attention over the encoder sequence batch_H: [B, T, C].
        batch_H_proj = self.i2h(batch_H)
        prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1)
        res = paddle.add(batch_H_proj, prev_hidden_proj)
        res = paddle.tanh(res)
        e = self.score(res)

        alpha = F.softmax(e, axis=1)
        alpha = paddle.transpose(alpha, [0, 2, 1])
        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
        concat_context = paddle.concat([context, char_onehots], 1)

        cur_hidden = self.rnn(concat_context, prev_hidden)
        return cur_hidden, alpha
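

# ---------------------------------------------------------------------------
# Illustrative note (not part of the original module): AttentionGRUCell scores
# every encoder step with additive (Bahdanau-style) attention,
#     e_t = w^T tanh(W_i H_t + W_h h_prev),   alpha = softmax(e),
#     context = sum_t alpha_t * H_t,
# then feeds [context; char_onehot] into the GRU cell. The helper below is a
# minimal numpy sketch of just the scoring step; all names here are made up.
def _additive_attention_sketch(H, h_prev, W_i, W_h, w):
    """H: [T, D] encoder states, h_prev: [S] decoder hidden; returns alpha: [T]."""
    e = np.tanh(H @ W_i + h_prev @ W_h) @ w   # unnormalised scores, shape [T]
    alpha = np.exp(e - e.max())
    return alpha / alpha.sum()                # softmax over the T time steps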


class AttentionLSTM(nn.Layer):
    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionLSTM, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels
        self.attention_cell = AttentionLSTMCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        input_one_hot = F.one_hot(input_char, onehot_dim)
        return input_one_hot

    def forward(self, inputs, targets=None, batch_max_length=25):
        batch_size = inputs.shape[0]
        num_steps = batch_max_length
        hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros(
            (batch_size, self.hidden_size)))
        output_hiddens = []

        if targets is not None:
            # Training: teacher forcing with the ground-truth character at each step.
            for i in range(num_steps):
                # One-hot vector for the i-th ground-truth character.
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)
                hidden = (hidden[1][0], hidden[1][1])
                output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)
        else:
            # Inference: greedy decoding, feeding back the previous prediction.
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets, onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)
                probs_step = self.generator(hidden[0])
                hidden = (hidden[1][0], hidden[1][1])
                if probs is None:
                    probs = paddle.unsqueeze(probs_step, axis=1)
                else:
                    probs = paddle.concat(
                        [probs, paddle.unsqueeze(
                            probs_step, axis=1)], axis=1)
                next_input = probs_step.argmax(axis=1)
                targets = next_input

        return probs


class AttentionLSTMCell(nn.Layer):
    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionLSTMCell, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
        if not use_gru:
            self.rnn = nn.LSTMCell(
                input_size=input_size + num_embeddings, hidden_size=hidden_size)
        else:
            self.rnn = nn.GRUCell(
                input_size=input_size + num_embeddings, hidden_size=hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # prev_hidden is an (h, c) tuple; only h drives the attention scores.
        batch_H_proj = self.i2h(batch_H)
        prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1)
        res = paddle.add(batch_H_proj, prev_hidden_proj)
        res = paddle.tanh(res)
        e = self.score(res)

        alpha = F.softmax(e, axis=1)
        alpha = paddle.transpose(alpha, [0, 2, 1])
        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
        concat_context = paddle.concat([context, char_onehots], 1)

        cur_hidden = self.rnn(concat_context, prev_hidden)
        return cur_hidden, alpha
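

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# The feature shape [B, T, C], class count and hidden size below are assumed
# values, picked just to exercise both heads end to end.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    batch_size, seq_len, in_channels = 2, 32, 64   # assumed encoder output shape
    num_classes, hidden_size = 38, 96              # assumed vocabulary / hidden dims
    max_len = 25

    feats = paddle.rand([batch_size, seq_len, in_channels])
    labels = paddle.randint(0, num_classes, shape=[batch_size, max_len])

    gru_head = AttentionHead(in_channels, num_classes, hidden_size)
    lstm_head = AttentionLSTM(in_channels, num_classes, hidden_size)

    # Training-style call: teacher forcing with ground-truth characters.
    train_probs = gru_head(feats, targets=labels, batch_max_length=max_len)
    # Inference-style call: greedy decoding, each step feeds back its argmax.
    infer_probs = lstm_head(feats, batch_max_length=max_len)

    print(train_probs.shape)   # expected: [2, 25, 38]
    print(infer_probs.shape)   # expected: [2, 25, 38]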