Source code for espnet.nets.chainer_backend.transformer.encoder

# encoding: utf-8

import logging

import chainer
import numpy as np

from espnet.nets.chainer_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.subsampling import Conv2dSubsampling
from espnet.nets.chainer_backend.transformer.subsampling import LinearSampling


class Encoder(chainer.Chain):
    """Encoder.

    Args:
        input_type (str): Sampling type. `input_type` must be
            `conv2d` or `linear` currently.
        idim (int): Dimension of inputs.
        n_layers (int): Number of encoder layers.
        n_units (int): Number of input/output dimensions of a FeedForward layer.
        d_units (int): Number of units of the hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate.

    """

    def __init__(self, idim, args, initialW=None, initial_bias=None):
        super(Encoder, self).__init__()
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = chainer.initializers.Uniform if initial_bias is None else initial_bias
        with self.init_scope():
            channels = 64  # based on the paper
            if args.transformer_input_layer == 'conv2d':
                # Two stride-2 convolutions quarter the feature dimension;
                # the channel dimension is then flattened into the feature axis.
                idim = int(np.ceil(np.ceil(idim / 2) / 2)) * channels
                self.input_layer = Conv2dSubsampling(channels, idim, args.adim,
                                                     dropout=args.dropout_rate,
                                                     initialW=initialW,
                                                     initial_bias=initial_bias)
            elif args.transformer_input_layer == 'linear':
                self.input_layer = LinearSampling(idim, args.adim,
                                                  initialW=initialW,
                                                  initial_bias=initial_bias)
            else:
                raise ValueError('Incorrect type of input layer')
            self.norm = LayerNorm(args.adim)
            for i in range(args.elayers):
                name = 'encoders.' + str(i)
                layer = EncoderLayer(args.adim, d_units=args.eunits,
                                     h=args.aheads, dropout=args.dropout_rate,
                                     initialW=initialW,
                                     initial_bias=initial_bias)
                self.add_link(name, layer)
        self.n_layers = args.elayers

    def __call__(self, e, ilens):
        """Compute encoder layers.

        Args:
            e (chainer.Variable): Batch of padded input features. (B, Tmax, idim)
            ilens (chainer.Variable): Batch of lengths of each input sequence. (B,)

        Returns:
            chainer.Variable: Computed variable of encoder.
            numpy.array: Mask.
            chainer.Variable: Batch of lengths of each encoder output sequence.

        """
        e, ilens = self.input_layer(e, ilens)
        batch, length, dims = e.shape
        # Mark valid frames with 1 and padded frames with -1.
        x_mask = np.ones([batch, length])
        for j in range(batch):
            x_mask[j, ilens[j]:] = -1
        # Pairwise mask: attention is allowed only where both the query
        # and the key positions are valid (non-padded) frames.
        xx_mask = (x_mask[:, None, :] >= 0) * (x_mask[:, :, None] >= 0)
        xx_mask = self.xp.array(xx_mask)
        logging.debug('encoders size: ' + str(e.shape))
        # Flatten to (B * T, dims); the encoder layers recover the batch
        # structure from `batch`.
        e = e.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self['encoders.' + str(i)](e, xx_mask, batch)
        return self.norm(e), x_mask, ilens
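
# The padding-mask construction in ``__call__`` is terse, so here is a
# standalone sketch of it (an illustration added for this listing, not part
# of the original module). For two sequences of lengths [3, 2] padded to
# length 4, ``x_mask`` marks padded frames with -1, and ``xx_mask`` keeps
# exactly the query/key pairs where both positions are valid.
def _mask_demo():
    ilens = [3, 2]
    batch, length = 2, 4
    x_mask = np.ones([batch, length])
    for j in range(batch):
        x_mask[j, ilens[j]:] = -1  # -1 marks padded frames
    # Outer product of validity flags: True only where query AND key are valid.
    xx_mask = (x_mask[:, None, :] >= 0) * (x_mask[:, :, None] >= 0)
    assert xx_mask.shape == (batch, length, length)
    assert xx_mask[0].sum() == 3 * 3  # first sample: 3 valid frames
    assert xx_mask[1].sum() == 2 * 2  # second sample: 2 valid frames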
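
# A minimal usage sketch (hypothetical, added for this listing). The
# hyperparameter values are placeholders; any argparse.Namespace carrying
# the attributes read in ``__init__`` (transformer_input_layer, adim,
# aheads, elayers, eunits, dropout_rate) should work. It assumes the input
# layer accepts a padded float32 array of shape (B, Tmax, idim), as the
# chainer backend's E2E model supplies.
if __name__ == '__main__':
    from argparse import Namespace

    args = Namespace(transformer_input_layer='linear', adim=256, aheads=4,
                     elayers=6, eunits=1024, dropout_rate=0.1)
    idim = 83  # e.g. 80-dim fbank + 3-dim pitch features
    encoder = Encoder(idim, args)

    xs = np.random.randn(2, 100, idim).astype(np.float32)  # (B, Tmax, idim)
    ilens = np.array([100, 80])  # true (unpadded) lengths
    with chainer.using_config('train', False):
        hs, x_mask, ilens = encoder(xs, ilens)
    # hs is flattened to (B * Tmax', adim) inside __call__.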