Skip to content

Blocks

Building blocks that can be shared across all models.

Masked (Module)

Wrapper to mix normal modules with others that take 2 inputs

Source code in thunder/blocks.py
class Masked(nn.Module):
    """Adapter that lets single-input layers sit in a two-input pipeline.

    The given layers are chained with ``nn.Sequential`` and applied to the
    audio tensor only; the lengths tensor is handed back as received.
    """

    def __init__(self, *layers):
        """Chain ``layers``, in the order given, into one sequential module."""
        super().__init__()
        self.layer = nn.Sequential(*layers)

    def forward(
        self, audio: torch.Tensor, audio_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply the wrapped layers to ``audio``.

        Args:
            audio: tensor fed through the wrapped layers.
            audio_lengths: tensor passed through without modification.

        Returns:
            Tuple with the transformed audio and the same ``audio_lengths`` object.
        """
        transformed = self.layer(audio)
        return transformed, audio_lengths

forward(self, audio, audio_lengths)

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note: Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in thunder/blocks.py
def forward(
    self, audio: torch.Tensor, audio_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run ``self.layer`` on ``audio`` and hand ``audio_lengths`` back untouched.

    Args:
        audio: tensor fed through ``self.layer``.
        audio_lengths: tensor returned as received.

    Returns:
        Tuple with the transformed audio and the same ``audio_lengths`` object.
    """
    transformed = self.layer(audio)
    return transformed, audio_lengths

MultiSequential (Sequential)

nn.Sequential equivalent with 2 inputs/outputs

Source code in thunder/blocks.py
class MultiSequential(nn.Sequential):
    """nn.Sequential equivalent with 2 inputs/outputs

    Every contained module is called as ``module(audio, audio_lengths)`` and
    must return the updated ``(audio, audio_lengths)`` pair, which is then fed
    to the next module.
    """

    def forward(
        self, audio: torch.Tensor, audio_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Thread ``(audio, audio_lengths)`` through every module in order.

        Args:
            audio: tensor given to the first module.
            audio_lengths: lengths tensor given to the first module.

        Returns:
            The ``(audio, audio_lengths)`` pair produced by the last module,
            or the inputs unchanged when the container is empty.
        """
        # Iterate the container itself instead of ``self.children()``:
        # ``children()`` yields each distinct module object only once, so a
        # layer registered in several positions would be applied a single time.
        for module in self:
            audio, audio_lengths = module(audio, audio_lengths)
        return audio, audio_lengths

forward(self, audio, audio_lengths)

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note: Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in thunder/blocks.py
def forward(
    self, audio: torch.Tensor, audio_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Thread ``(audio, audio_lengths)`` through every contained module in order.

    Args:
        audio: tensor given to the first module.
        audio_lengths: lengths tensor given to the first module.

    Returns:
        The ``(audio, audio_lengths)`` pair produced by the last module,
        or the inputs unchanged when the container is empty.
    """
    # Iterate the container itself instead of ``self.children()``:
    # ``children()`` yields each distinct module object only once, so a
    # layer registered in several positions would be applied a single time.
    for module in self:
        audio, audio_lengths = module(audio, audio_lengths)
    return audio, audio_lengths

SwapLastDimension (Module)

Layer that swaps the last two dimensions of the data.

Source code in thunder/blocks.py
class SwapLastDimension(nn.Module):
    """Module that exchanges the two trailing dimensions of its input."""

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x`` with its last and second-to-last dimensions swapped."""
        swapped = x.transpose(-2, -1)
        return swapped

forward(self, x)

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note: Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in thunder/blocks.py
def forward(self, x: Tensor) -> Tensor:
    """Return ``x`` with its last and second-to-last dimensions swapped."""
    swapped = x.transpose(-2, -1)
    return swapped

conv1d_decoder(decoder_input_channels, num_classes)

Decoder that uses one conv1d layer

Parameters:

Name Type Description Default
num_classes int

Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol.

required
decoder_input_channels int

Number of input channels of the decoder. That is the number of channels of the features created by the encoder.

required

Returns:

Type Description
Module

Pytorch model of the decoder

Source code in thunder/blocks.py
def conv1d_decoder(decoder_input_channels: int, num_classes: int) -> nn.Module:
    """Build a decoder consisting of a single 1-D convolution with kernel size 1.

    Args:
        num_classes: Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol.
        decoder_input_channels: Number of input channels of the decoder. That is the number of channels of the features created by the encoder.

    Returns:
        Pytorch model of the decoder
    """
    pointwise_conv = nn.Conv1d(
        in_channels=decoder_input_channels,
        out_channels=num_classes,
        kernel_size=1,
        bias=True,
    )
    # Only the weight is re-initialised (Xavier uniform, gain 1.0);
    # the bias keeps whatever nn.Conv1d set up on construction.
    nn.init.xavier_uniform_(pointwise_conv.weight, gain=1.0)
    return pointwise_conv

convolution_stft(input_data, n_fft=1024, hop_length=512, win_length=1024, window=tensor([0.0000e+00, 9.4175e-06, 3.7730e-05, ..., 3.7730e-05, 9.4175e-06,0.0000e+00]), center=True, return_complex=False)

Implements the stft operation using the convolution method. This is an alternative that makes it possible to export code using this operation to ONNX and ARM-based environments. The signature should follow that of torch.stft, making it possible to just swap the two. The code is based on https://github.com/pseeth/torch-stft

Source code in thunder/blocks.py
def convolution_stft(
    input_data: torch.Tensor,
    n_fft: int = 1024,
    hop_length: int = 512,
    win_length: int = 1024,
    window: torch.Tensor = torch.hann_window(1024, periodic=False),
    center: bool = True,
    return_complex: bool = False,
) -> torch.Tensor:
    """Implements the stft operation using the convolution method. This is an alternative
    that makes it possible to export code using this operation to onnx and arm based
    environments. The signature should follow the same as torch.stft, making it possible
    to just swap the two.
    The code is based on https://github.com/pseeth/torch-stft

    Args:
        input_data: batch of signals. ``shape[0]`` is taken as the batch size and
            ``shape[-1]`` as the number of samples.
        n_fft: size of the Fourier basis. Must be at least ``win_length``.
        hop_length: stride, in samples, between consecutive frames.
        win_length: length of the analysis window before it is zero-padded to ``n_fft``.
        window: window tensor, moved to the device of ``input_data`` and zero-padded
            on both sides up to ``n_fft``. Presumably expected to hold ``win_length``
            elements; this is not checked here.
        center: if True, the signal is reflect-padded with ``n_fft // 2`` samples on
            both sides before framing. If False, no padding is applied.
        return_complex: if True, a complex tensor is returned. If False, the real and
            imaginary parts are stacked in a trailing dimension of size 2.

    Returns:
        Tensor with ``n_fft // 2 + 1`` frequency bins per frame: shape
        (batch, bins, frames, 2) when ``return_complex`` is False, or a complex
        tensor of shape (batch, bins, frames) otherwise.
    """
    assert n_fft >= win_length
    window = window.to(input_data.device)

    # Number of frequency bins kept from the basis (the first n_fft // 2 + 1 rows).
    cutoff = n_fft // 2 + 1

    # NOTE(review): _fourier_matrix is defined elsewhere; presumably it returns the
    # complex (n_fft x n_fft) DFT matrix - confirm against its definition.
    fourier_basis = _fourier_matrix(n_fft, device=input_data.device)
    # Real rows first, imaginary rows second: (2 * cutoff, n_fft)
    fourier_basis = torch.stack(
        [torch.real(fourier_basis[:cutoff, :]), torch.imag(fourier_basis[:cutoff, :])]
    ).reshape(-1, n_fft)
    # conv1d expects kernels shaped (out_channels, in_channels, kernel_size)
    forward_basis = fourier_basis[:, None, :].float()

    window_pad = (n_fft - win_length) // 2
    window_pad2 = n_fft - (window_pad + win_length)
    fft_window = torch.nn.functional.pad(window, [window_pad, window_pad2])
    # window the bases
    forward_basis *= fft_window

    num_batches = input_data.shape[0]
    num_samples = input_data.shape[-1]
    input_data = input_data.view(num_batches, 1, num_samples)

    if center:
        # similar to librosa, reflect-pad the input
        pad_amount = n_fft // 2
        input_data = F.pad(
            input_data.unsqueeze(1),
            (pad_amount, pad_amount, 0, 0),
            mode="reflect",
        ).squeeze(1)

    forward_transform = F.conv1d(
        input_data, forward_basis, stride=hop_length, padding=0
    )

    # First half of the output channels holds the real part, second half the imaginary
    real_part = forward_transform[:, :cutoff, :]
    imag_part = forward_transform[:, cutoff:, :]
    if return_complex:
        return torch.complex(real_part, imag_part)
    return torch.stack((real_part, imag_part), dim=-1)

get_same_padding(kernel_size, stride, dilation)

Calculates the padding size to obtain same padding. Same padding means that the output will have the shape input_shape / stride. That means, for stride = 1 the output shape is the same as the input, and stride = 2 gives an output that is half of the input shape.

Parameters:

Name Type Description Default
kernel_size int

convolution kernel size. Only tested to be correct with odd values.

required
stride int

convolution stride

required
dilation int

convolution dilation

required

Exceptions:

Type Description
ValueError

Only stride or dilation may be greater than 1

Returns:

Type Description
int

padding value to obtain same padding.

Source code in thunder/blocks.py
def get_same_padding(kernel_size: int, stride: int, dilation: int) -> int:
    """Compute the padding that yields "same" padding for a convolution.

    With same padding the output has the shape input_shape / stride:
    stride = 1 keeps the input shape, stride = 2 halves it.

    Args:
        kernel_size: convolution kernel size. Only tested to be correct with odd values.
        stride: convolution stride
        dilation: convolution dilation

    Raises:
        ValueError: Only stride or dilation may be greater than 1

    Returns:
        padding value to obtain same padding.
    """
    strided = stride > 1
    dilated = dilation > 1
    if strided and dilated:
        raise ValueError("Only stride OR dilation may be greater than 1")
    if not dilated:
        return kernel_size // 2
    # Span covered by the kernel once the gaps introduced by dilation are counted.
    effective_kernel = dilation * (kernel_size - 1) + 1
    return effective_kernel // 2

lengths_to_mask(lengths, max_length)

Convert from integer lengths of each element to mask representation

Parameters:

Name Type Description Default
lengths Tensor

lengths of each element in the batch

required
max_length int

maximum length expected. Can be greater than lengths.max()

required

Returns:

Type Description
Tensor

Corresponding boolean mask indicating the valid region of the tensor.

Source code in thunder/blocks.py
def lengths_to_mask(lengths: torch.Tensor, max_length: int) -> torch.Tensor:
    """Turn per-element integer lengths into a boolean validity mask.

    Args:
        lengths: lengths of each element in the batch
        max_length: maximum length expected. Can be greater than lengths.max()

    Returns:
        Corresponding boolean mask indicating the valid region of the tensor.
    """
    batch_size = lengths.shape[0]
    lengths = lengths.type(torch.long)
    # One row of positions 0..max_length-1 per batch element
    positions = torch.arange(max_length, device=lengths.device)
    position_grid = positions.expand(batch_size, max_length)
    # A position is valid while it is strictly below that element's length
    return position_grid < lengths.unsqueeze(1)

linear_decoder(decoder_input_channels, num_classes, decoder_dropout)

Decoder that uses a linear layer with dropout

Parameters:

Name Type Description Default
decoder_dropout float

Amount of dropout to be used in the decoder

required
decoder_input_channels int

Number of input channels of the decoder. That is the number of channels of the features created by the encoder.

required
num_classes int

Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol.

required

Returns:

Type Description
Module

Module that represents the decoder.

Source code in thunder/blocks.py
def linear_decoder(
    decoder_input_channels: int, num_classes: int, decoder_dropout: float
) -> nn.Module:
    """Build a decoder made of dropout followed by a linear projection.

    Args:
        decoder_dropout: Amount of dropout to be used in the decoder
        decoder_input_channels: Number of input channels of the decoder. That is the number of channels of the features created by the encoder.
        num_classes: Number of output classes of the model. It's the size of the vocabulary, excluding the blank symbol.

    Returns:
        Module that represents the decoder.
    """
    # nn.Linear projects the last dimension, so the last two dimensions are
    # swapped before the projection and swapped back afterwards. The final
    # swap changes (batch, time, #vocab) into the (batch, #vocab, time)
    # layout that is expected by the rest of the library.
    swap_in = SwapLastDimension()
    dropout = nn.Dropout(decoder_dropout)
    projection = nn.Linear(decoder_input_channels, num_classes)
    swap_out = SwapLastDimension()
    return nn.Sequential(swap_in, dropout, projection, swap_out)

normalize_tensor(input_values, mask=None, div_guard=1e-07, dim=-1)

Normalize tensor values, optionally using some mask to define the valid region.

Parameters:

Name Type Description Default
input_values Tensor

input tensor to be normalized

required
mask Optional[torch.Tensor]

Optional mask describing the valid elements.

None
div_guard float

value used to prevent division by zero when normalizing.

1e-07
dim int

dimension used to calculate the mean and variance.

-1

Returns:

Type Description
Tensor

Normalized tensor

Source code in thunder/blocks.py
def normalize_tensor(
    input_values: torch.Tensor,
    mask: Optional[torch.Tensor] = None,
    div_guard: float = 1e-7,
    dim: int = -1,
) -> torch.Tensor:
    """Normalize tensor values, optionally using some mask to define the valid region.

    The mean and standard deviation are detached, so no gradient flows through
    the statistics themselves.

    With a mask, the statistics are computed over the valid elements only: the
    variance divides by the number of valid elements and ``div_guard`` is added
    to the standard deviation. Without a mask, ``Tensor.var`` is used with its
    default settings and ``div_guard`` is added to the variance before the
    square root.

    Args:
        input_values: input tensor to be normalized
        mask: Optional mask describing the valid elements.
        div_guard: value used to prevent division by zero when normalizing.
        dim: dimension used to calculate the mean and variance.

    Returns:
        Normalized tensor. When a mask is given, elements outside of it are zero.
    """
    # Vectorized implementation of (x - x.mean()) / x.std() considering only the valid mask
    if mask is not None:
        invalid = ~mask.type(torch.bool)
        # Making sure the elements outside the mask are zero, to have the correct mean
        input_values = torch.masked_fill(input_values, invalid, 0.0)
        # Number of valid elements
        num_elements = mask.sum(dim=dim, keepdim=True).detach()
        # Mean is sum over number of elements
        x_mean = input_values.sum(dim=dim, keepdim=True).detach() / num_elements
        # std numerator: sum of squared differences to the mean. The invalid
        # positions hold 0, so their difference to the mean is -mean; they must
        # be zeroed again or each one would add mean**2 to the numerator.
        squared_diff = torch.masked_fill(
            (input_values - x_mean).pow(2), invalid, 0.0
        )
        numerator = squared_diff.sum(dim=dim, keepdim=True).detach()
        x_std = (numerator / num_elements).sqrt()
        # using the div_guard to prevent division by zero
        normalized = (input_values - x_mean) / (x_std + div_guard)
        # Cleaning elements outside of valid mask
        return torch.masked_fill(normalized, invalid, 0.0)

    mean = input_values.mean(dim=dim, keepdim=True).detach()
    std = (input_values.var(dim=dim, keepdim=True).detach() + div_guard).sqrt()
    return (input_values - mean) / std