Blocks
Building blocks that can be shared across all models.
Masked (Module)
Wrapper to mix normal modules with others that take 2 inputs
Source code in thunder/blocks.py
class Masked(nn.Module):
    """Wrapper to mix normal modules with others that take 2 inputs.

    Wraps single-input layers so they fit an interface where
    (audio, audio_lengths) pairs flow through the network: the wrapped
    layers only ever see the audio, and the lengths are forwarded untouched.
    """

    def __init__(self, *layers):
        super().__init__()
        # Collapse all wrapped layers into one sequential container.
        self.layer = nn.Sequential(*layers)

    def forward(
        self, audio: torch.Tensor, audio_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply the wrapped layers to audio; pass lengths through unchanged."""
        transformed = self.layer(audio)
        return transformed, audio_lengths
forward(self, audio, audio_lengths)
Defines the computation performed at every call.
Should be overridden by all subclasses.
.. note::
Although the recipe for forward pass needs to be defined within
this function, one should call the ``Module`` instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.
Source code in thunder/blocks.py
def forward(
    self, audio: torch.Tensor, audio_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run the wrapped layers on the audio and forward the lengths as-is."""
    processed = self.layer(audio)
    return processed, audio_lengths
MultiSequential (Sequential)
nn.Sequential equivalent with 2 inputs/outputs
Source code in thunder/blocks.py
class MultiSequential(nn.Sequential):
    """nn.Sequential equivalent with 2 inputs/outputs.

    Every child module must accept and return an (audio, audio_lengths)
    pair; the pair is threaded through the children in registration order.
    """

    def forward(
        self, audio: torch.Tensor, audio_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Pipe the (audio, lengths) pair sequentially through each child."""
        for child in self.children():
            audio, audio_lengths = child(audio, audio_lengths)
        return audio, audio_lengths
forward(self, audio, audio_lengths)
Defines the computation performed at every call.
Should be overridden by all subclasses.
.. note::
Although the recipe for forward pass needs to be defined within
this function, one should call the ``Module`` instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.
Source code in thunder/blocks.py
def forward(
    self, audio: torch.Tensor, audio_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Thread the (audio, lengths) pair through every child module in order."""
    for child in self.children():
        audio, audio_lengths = child(audio, audio_lengths)
    return audio, audio_lengths
SwapLastDimension (Module)
Layer that swaps the last two dimensions of the data.
Source code in thunder/blocks.py
class SwapLastDimension(nn.Module):
    """Layer that swaps the last two dimensions of the data."""

    def forward(self, x: Tensor) -> Tensor:
        """Return a view of ``x`` with the final two dimensions exchanged."""
        return torch.transpose(x, -2, -1)
forward(self, x)
Defines the computation performed at every call.
Should be overridden by all subclasses.
.. note::
Although the recipe for forward pass needs to be defined within
this function, one should call the ``Module`` instance afterwards
instead of this since the former takes care of running the
registered hooks while the latter silently ignores them.
Source code in thunder/blocks.py
def forward(self, x: Tensor) -> Tensor:
    """Exchange the last two dimensions of the input tensor."""
    return torch.transpose(x, -2, -1)
conv1d_decoder(decoder_input_channels, num_classes)
Decoder that uses one conv1d layer
Parameters:
Name | Type | Description | Default |
---|---|---|---|
num_classes |
int |
Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol. |
required |
decoder_input_channels |
int |
Number of input channels of the decoder. That is the number of channels of the features created by the encoder. |
required |
Returns:
Type | Description |
---|---|
Module |
Pytorch model of the decoder |
Source code in thunder/blocks.py
def conv1d_decoder(decoder_input_channels: int, num_classes: int) -> nn.Module:
    """Decoder that uses one conv1d layer

    Args:
        decoder_input_channels: Number of input channels of the decoder. That is the number of channels of the features created by the encoder.
        num_classes: Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol.

    Returns:
        Pytorch model of the decoder
    """
    # A kernel of size 1 makes this a per-timestep linear projection
    # from encoder channels to vocabulary logits.
    conv = nn.Conv1d(decoder_input_channels, num_classes, kernel_size=1, bias=True)
    nn.init.xavier_uniform_(conv.weight, gain=1.0)
    return conv
convolution_stft(input_data, n_fft=1024, hop_length=512, win_length=1024, window=tensor([0.0000e+00, 9.4175e-06, 3.7730e-05, ..., 3.7730e-05, 9.4175e-06,0.0000e+00]), center=True, return_complex=False)
Implements the stft operation using the convolution method. This is one alternative to make it possible to export code using this operation to onnx and arm based environments. The signature should follow the same as torch.stft, making it possible to just swap the two. The code is based on https://github.com/pseeth/torch-stft
Source code in thunder/blocks.py
def convolution_stft(
    input_data: torch.Tensor,
    n_fft: int = 1024,
    hop_length: int = 512,
    win_length: int = 1024,
    window: torch.Tensor = torch.hann_window(1024, periodic=False),
    center: bool = True,
    return_complex: bool = False,
) -> torch.Tensor:
    """Implements the stft operation using the convolution method. This is one alternative
    to make possible to export code using this operation to onnx and arm based environments.
    The signature should follow the same as torch.stft, making it possible to just swap the two.
    The code is based on https://github.com/pseeth/torch-stft

    NOTE(review): `center` and `return_complex` are accepted for signature
    compatibility with torch.stft but are never referenced in this body.
    NOTE(review): the `window` default tensor is created once at import time;
    it is not mutated here (`.to` returns a new tensor), so sharing it across
    calls looks safe.
    """
    assert n_fft >= win_length
    # Half the FFT size of padding on each side, mimicking centered framing.
    pad_amount = int(n_fft / 2)
    window = window.to(input_data.device)
    # Complex DFT matrix from a sibling helper (not visible in this chunk);
    # real/imag parts are split below so the transform runs as real conv1d.
    fourier_basis = _fourier_matrix(n_fft, device=input_data.device)
    # Keep only the non-redundant half of the spectrum: n_fft / 2 + 1 bins.
    cutoff = int((n_fft / 2 + 1))
    fourier_basis = torch.stack(
        [torch.real(fourier_basis[:cutoff, :]), torch.imag(fourier_basis[:cutoff, :])]
    ).reshape(-1, n_fft)
    # Shape (2 * cutoff, 1, n_fft): one conv filter per real/imag frequency bin.
    forward_basis = fourier_basis[:, None, :].float()
    # Center the win_length-sized analysis window inside the n_fft-long filters.
    window_pad = (n_fft - win_length) // 2
    window_pad2 = n_fft - (window_pad + win_length)
    fft_window = torch.nn.functional.pad(window, [window_pad, window_pad2])
    # window the bases
    forward_basis *= fft_window
    forward_basis = forward_basis.float()
    num_batches = input_data.shape[0]
    num_samples = input_data.shape[-1]
    # similar to librosa, reflect-pad the input
    input_data = input_data.view(num_batches, 1, num_samples)
    # Temporary extra dim because F.pad with mode="reflect" and a 4-element
    # pad spec expects a 4-D input.
    input_data = F.pad(
        input_data.unsqueeze(1),
        (pad_amount, pad_amount, 0, 0),
        mode="reflect",
    )
    input_data = input_data.squeeze(1)
    # Each output channel is the dot product of one windowed basis row with a
    # frame of the signal, i.e. the STFT expressed as a strided convolution.
    forward_transform = F.conv1d(
        input_data, forward_basis, stride=hop_length, padding=0
    )
    cutoff = int((n_fft / 2) + 1)
    # First half of the channels holds the real part, second half the imaginary.
    real_part = forward_transform[:, :cutoff, :]
    imag_part = forward_transform[:, cutoff:, :]
    # Output shape (batch, freq, time, 2) — real/imag stacked on the last axis.
    return torch.stack((real_part, imag_part), dim=-1)
get_same_padding(kernel_size, stride, dilation)
Calculates the padding size to obtain same padding. Same padding means that the output will have the shape input_shape / stride. That means, for stride = 1 the output shape is the same as the input, and stride = 2 gives an output that is half of the input shape.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
kernel_size |
int |
convolution kernel size. Only tested to be correct with odd values. |
required |
stride |
int |
convolution stride |
required |
dilation |
int |
convolution dilation |
required |
Exceptions:
Type | Description |
---|---|
ValueError |
Only stride or dilation may be greater than 1 |
Returns:
Type | Description |
---|---|
int |
padding value to obtain same padding. |
Source code in thunder/blocks.py
def get_same_padding(kernel_size: int, stride: int, dilation: int) -> int:
    """Calculates the padding size to obtain same padding.

    Same padding means that the output will have the shape
    input_shape / stride. That means, for stride = 1 the output
    shape is the same as the input, and stride = 2 gives an output
    that is half of the input shape.

    Args:
        kernel_size: convolution kernel size. Only tested to be correct with odd values.
        stride: convolution stride
        dilation: convolution dilation

    Raises:
        ValueError: Only stride or dilation may be greater than 1

    Returns:
        padding value to obtain same padding.
    """
    if stride > 1 and dilation > 1:
        raise ValueError("Only stride OR dilation may be greater than 1")
    # The dilated kernel spans dilation * (k - 1) + 1 input positions;
    # for dilation == 1 this reduces to kernel_size, so one formula covers both.
    effective_kernel = dilation * (kernel_size - 1) + 1
    return effective_kernel // 2
lengths_to_mask(lengths, max_length)
Convert from integer lengths of each element to mask representation
Parameters:
Name | Type | Description | Default |
---|---|---|---|
lengths |
Tensor |
lengths of each element in the batch |
required |
max_length |
int |
maximum length expected. Can be greater than lengths.max() |
required |
Returns:
Type | Description |
---|---|
Tensor |
Corresponding boolean mask indicating the valid region of the tensor. |
Source code in thunder/blocks.py
def lengths_to_mask(lengths: torch.Tensor, max_length: int) -> torch.Tensor:
    """Convert from integer lengths of each element to mask representation

    Args:
        lengths: lengths of each element in the batch
        max_length: maximum length expected. Can be greater than lengths.max()

    Returns:
        Corresponding boolean mask indicating the valid region of the tensor.
    """
    lengths = lengths.long()
    # Position indices 0..max_length-1, broadcast against (batch, 1) lengths:
    # position < length marks the valid region of each row.
    positions = torch.arange(max_length, device=lengths.device)
    return positions.unsqueeze(0) < lengths.unsqueeze(1)
linear_decoder(decoder_input_channels, num_classes, decoder_dropout)
Decoder that uses a linear layer with dropout
Parameters:
Name | Type | Description | Default |
---|---|---|---|
decoder_dropout |
float |
Amount of dropout to be used in the decoder |
required |
decoder_input_channels |
int |
Number of input channels of the decoder. That is the number of channels of the features created by the encoder. |
required |
num_classes |
int |
Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol. |
required |
Returns:
Type | Description |
---|---|
Module |
Module that represents the decoder. |
Source code in thunder/blocks.py
def linear_decoder(
    decoder_input_channels: int, num_classes: int, decoder_dropout: float
) -> nn.Module:
    """Decoder that uses a linear layer with dropout

    Args:
        decoder_input_channels: Number of input channels of the decoder. That is the number of channels of the features created by the encoder.
        num_classes: Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol.
        decoder_dropout: Amount of dropout to be used in the decoder

    Returns:
        Module that represents the decoder.
    """
    # The rest of the library expects (batch, #vocab, time), while nn.Linear
    # projects the last dimension — hence the swap before and after.
    stages = [
        SwapLastDimension(),
        nn.Dropout(decoder_dropout),
        nn.Linear(decoder_input_channels, num_classes),
        SwapLastDimension(),
    ]
    return nn.Sequential(*stages)
normalize_tensor(input_values, mask=None, div_guard=1e-07, dim=-1)
Normalize tensor values, optionally using some mask to define the valid region.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
input_values |
Tensor |
input tensor to be normalized |
required |
mask |
Optional[torch.Tensor] |
Optional mask describing the valid elements. |
None |
div_guard |
float |
value used to prevent division by zero when normalizing. |
1e-07 |
dim |
int |
dimension used to calculate the mean and variance. |
-1 |
Returns:
Type | Description |
---|---|
Tensor |
Normalized tensor |
Source code in thunder/blocks.py
def normalize_tensor(
    input_values: torch.Tensor,
    mask: Optional[torch.Tensor] = None,
    div_guard: float = 1e-7,
    dim: int = -1,
) -> torch.Tensor:
    """Normalize tensor values, optionally using some mask to define the valid region.

    Args:
        input_values: input tensor to be normalized
        mask: Optional mask describing the valid elements.
        div_guard: value used to prevent division by zero when normalizing.
        dim: dimension used to calculate the mean and variance.

    Returns:
        Normalized tensor
    """
    if mask is None:
        # Whole-tensor statistics along dim.
        # NOTE(review): torch.var defaults to the unbiased estimator here,
        # while the masked branch divides by the raw element count — a
        # preexisting asymmetry kept for behavioral compatibility.
        mean = input_values.mean(dim=dim, keepdim=True).detach()
        std = (input_values.var(dim=dim, keepdim=True).detach() + div_guard).sqrt()
        return (input_values - mean) / std

    bool_mask = mask.type(torch.bool)
    # Zero the invalid positions so they don't contribute to the sums.
    masked_values = torch.masked_fill(input_values, ~bool_mask, 0.0)
    # Number of valid elements per slice along dim.
    valid_count = mask.sum(dim=dim, keepdim=True).detach()
    mean = masked_values.sum(dim=dim, keepdim=True).detach() / valid_count
    # NOTE(review): the squared-difference sum runs over ALL positions, so the
    # zeroed-out invalid ones still add (0 - mean)^2 each — preserved as-is.
    squared_diff = (masked_values - mean).pow(2).sum(dim=dim, keepdim=True).detach()
    std = (squared_diff / valid_count).sqrt()
    # div_guard prevents division by zero for constant slices.
    normalized = (masked_values - mean) / (std + div_guard)
    # Re-zero positions outside the valid mask after normalization.
    return torch.masked_fill(normalized, ~bool_mask, 0.0)