Blocks

Basic building blocks to create the Quartznet model

InitMode (str, Enum)

Weight init methods. Used by init_weights.

Note

Possible values are xavier_uniform, xavier_normal, kaiming_uniform and kaiming_normal.

Source code in thunder/quartznet/blocks.py
class InitMode(str, Enum):
    """Weight init methods. Used by [`init_weights`][thunder.quartznet.blocks.init_weights].

    Note:
        Possible values are `xavier_uniform`, `xavier_normal`, `kaiming_uniform` and `kaiming_normal`
    """

    xavier_uniform = "xavier_uniform"
    xavier_normal = "xavier_normal"
    kaiming_uniform = "kaiming_uniform"
    kaiming_normal = "kaiming_normal"

MaskedConv1d (Module)

Source code in thunder/quartznet/blocks.py
class MaskedConv1d(nn.Module):
    __constants__ = ["use_mask", "padding", "dilation", "kernel_size", "stride"]

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: _size_1_t,
        stride: _size_1_t = 1,
        padding: _size_1_t = 0,
        dilation: _size_1_t = 1,
        groups: int = 1,
        bias: bool = False,
        use_mask: bool = True,
    ):
        """Masked Convolution.
        This module correspond to a 1d convolution with input masking. Arguments to create are the
        same as nn.Conv1d, but with the addition of use_mask for special behaviour.
        Args:
            in_channels: Same as nn.Conv1d
            out_channels: Same as nn.Conv1d
            kernel_size: Same as nn.Conv1d
            stride: Same as nn.Conv1d
            padding: Same as nn.Conv1d
            dilation: Same as nn.Conv1d
            groups: Same as nn.Conv1d
            bias: Same as nn.Conv1d
            use_mask: Controls the masking of input before the convolution during the forward.
        """
        super().__init__()

        self.use_mask = use_mask

        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

        self.padding = self.conv.padding[0]
        self.dilation = self.conv.dilation[0]
        self.kernel_size = self.conv.kernel_size[0]
        self.stride = self.conv.stride[0]

    def get_seq_len(self, lengths: torch.Tensor) -> torch.Tensor:
        """Get the lengths of the inputs after the convolution operation is applied.
        Args:
            lengths: Original lengths of the inputs
        Returns:
            Resulting lengths after the convolution
        """
        return (
            torch.div(
                lengths + 2 * self.padding - self.dilation * (self.kernel_size - 1) - 1,
                self.stride,
                rounding_mode="floor",
            )
            + 1
        )

    def mask_fill(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        """Mask the input based on it's respective lengths.
        Args:
            x: Signal to be processed, of shape (batch, features, time)
            lengths: Lengths of each element in the batch of x, with shape (batch)
        Returns:
            The masked signal
        """
        mask = lengths_to_mask(lengths, x.shape[-1])
        return x.masked_fill(~mask.unsqueeze(1), 0)

    def forward(
        self, x: torch.Tensor, lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward method
        Args:
            x: Signal to be processed, of shape (batch, features, time)
            lengths: Lengths of each element in the batch of x, with shape (batch)
        Returns:
            Both the signal processed by the convolution and the resulting lengths
        """
        if self.use_mask:
            x = self.mask_fill(x, lengths)
        out = self.conv(x)
        return out, self.get_seq_len(lengths)
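
A minimal usage sketch, with made-up shapes for illustration: with stride 1 and symmetric padding of (kernel_size - 1) // 2, both the time dimension and the lengths pass through unchanged.

import torch

from thunder.quartznet.blocks import MaskedConv1d

conv = MaskedConv1d(in_channels=64, out_channels=128, kernel_size=33, padding=16)

x = torch.randn(2, 64, 100)        # (batch, features, time)
lengths = torch.tensor([100, 73])  # valid frames in each batch element

out, out_lengths = conv(x, lengths)
print(out.shape)    # torch.Size([2, 128, 100])
print(out_lengths)  # tensor([100, 73])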

__init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False, use_mask=True) special

Masked Convolution. This module corresponds to a 1d convolution with input masking. The arguments are the same as nn.Conv1d, with the addition of use_mask to control the masking behaviour.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| in_channels | int | Same as nn.Conv1d | required |
| out_channels | int | Same as nn.Conv1d | required |
| kernel_size | Union[int, Tuple[int]] | Same as nn.Conv1d | required |
| stride | Union[int, Tuple[int]] | Same as nn.Conv1d | 1 |
| padding | Union[int, Tuple[int]] | Same as nn.Conv1d | 0 |
| dilation | Union[int, Tuple[int]] | Same as nn.Conv1d | 1 |
| groups | int | Same as nn.Conv1d | 1 |
| bias | bool | Same as nn.Conv1d | False |
| use_mask | bool | Controls the masking of input before the convolution during the forward. | True |
Source code in thunder/quartznet/blocks.py
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    kernel_size: _size_1_t,
    stride: _size_1_t = 1,
    padding: _size_1_t = 0,
    dilation: _size_1_t = 1,
    groups: int = 1,
    bias: bool = False,
    use_mask: bool = True,
):
    """Masked Convolution.
    This module correspond to a 1d convolution with input masking. Arguments to create are the
    same as nn.Conv1d, but with the addition of use_mask for special behaviour.
    Args:
        in_channels: Same as nn.Conv1d
        out_channels: Same as nn.Conv1d
        kernel_size: Same as nn.Conv1d
        stride: Same as nn.Conv1d
        padding: Same as nn.Conv1d
        dilation: Same as nn.Conv1d
        groups: Same as nn.Conv1d
        bias: Same as nn.Conv1d
        use_mask: Controls the masking of input before the convolution during the forward.
    """
    super().__init__()

    self.use_mask = use_mask

    self.conv = nn.Conv1d(
        in_channels,
        out_channels,
        kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
    )

    self.padding = self.conv.padding[0]
    self.dilation = self.conv.dilation[0]
    self.kernel_size = self.conv.kernel_size[0]
    self.stride = self.conv.stride[0]

forward(self, x, lengths)

Forward method

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | Tensor | Signal to be processed, of shape (batch, features, time) | required |
| lengths | Tensor | Lengths of each element in the batch of x, with shape (batch) | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[torch.Tensor, torch.Tensor] | Both the signal processed by the convolution and the resulting lengths |

Source code in thunder/quartznet/blocks.py
def forward(
    self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Forward method
    Args:
        x: Signal to be processed, of shape (batch, features, time)
        lengths: Lengths of each element in the batch of x, with shape (batch)
    Returns:
        Both the signal processed by the convolution and the resulting lengths
    """
    if self.use_mask:
        x = self.mask_fill(x, lengths)
    out = self.conv(x)
    return out, self.get_seq_len(lengths)

get_seq_len(self, lengths)

Get the lengths of the inputs after the convolution operation is applied.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| lengths | Tensor | Original lengths of the inputs | required |

Returns:

| Type | Description |
| --- | --- |
| Tensor | Resulting lengths after the convolution |
Source code in thunder/quartznet/blocks.py
def get_seq_len(self, lengths: torch.Tensor) -> torch.Tensor:
    """Get the lengths of the inputs after the convolution operation is applied.
    Args:
        lengths: Original lengths of the inputs
    Returns:
        Resulting lengths after the convolution
    """
    return (
        torch.div(
            lengths + 2 * self.padding - self.dilation * (self.kernel_size - 1) - 1,
            self.stride,
            rounding_mode="floor",
        )
        + 1
    )
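
This mirrors the standard nn.Conv1d output-length formula. A worked example, assuming stem-like settings (kernel_size=33, stride=2, padding=16) and made-up lengths:

import torch

from thunder.quartznet.blocks import MaskedConv1d

conv = MaskedConv1d(64, 256, kernel_size=33, stride=2, padding=16)
# floor((100 + 2*16 - 1*(33 - 1) - 1) / 2) + 1 = floor(99 / 2) + 1 = 50
print(conv.get_seq_len(torch.tensor([100, 73])))  # tensor([50, 37])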

mask_fill(self, x, lengths)

Mask the input based on its respective lengths.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | Tensor | Signal to be processed, of shape (batch, features, time) | required |
| lengths | Tensor | Lengths of each element in the batch of x, with shape (batch) | required |

Returns:

| Type | Description |
| --- | --- |
| Tensor | The masked signal |
Source code in thunder/quartznet/blocks.py
def mask_fill(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """Mask the input based on it's respective lengths.
    Args:
        x: Signal to be processed, of shape (batch, features, time)
        lengths: Lengths of each element in the batch of x, with shape (batch)
    Returns:
        The masked signal
    """
    mask = lengths_to_mask(lengths, x.shape[-1])
    return x.masked_fill(~mask.unsqueeze(1), 0)
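
To make the masking concrete, here is a small sketch with toy values: every position past an element's length is zeroed before the convolution sees it.

import torch

from thunder.quartznet.blocks import MaskedConv1d

conv = MaskedConv1d(1, 1, kernel_size=3, padding=1)
x = torch.ones(1, 1, 5)
print(conv.mask_fill(x, torch.tensor([3])))
# tensor([[[1., 1., 1., 0., 0.]]])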

QuartznetBlock (Module)

Source code in thunder/quartznet/blocks.py
class QuartznetBlock(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        repeat: int = 5,
        kernel_size: _size_1_t = (11,),
        stride: _size_1_t = (1,),
        dilation: _size_1_t = (1,),
        dropout: float = 0.0,
        residual: bool = True,
        separable: bool = False,
    ):
        """Quartznet block. This is a refactoring of the Jasperblock present on the NeMo toolkit,
        but simplified to only support the new quartznet model. Biggest change is that
        dense residual used on Jasper is not supported here, and masked convolutions were also removed.

        Args:
            in_channels: Number of input channels
            out_channels: Number of output channels
            repeat: Repetitions inside block.
            kernel_size: Kernel size.
            stride: Stride of each repetition.
            dilation: Dilation of each repetition.
            dropout: Dropout used before each activation.
            residual: Controls the use of residual connection.
            separable: Controls the use of separable convolutions.
        """
        super().__init__()

        padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0])

        inplanes_loop = in_channels
        conv = []

        for _ in range(repeat - 1):

            conv.extend(
                _get_conv_bn_layer(
                    inplanes_loop,
                    out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    dilation=dilation,
                    padding=padding_val,
                    separable=separable,
                    bias=False,
                )
            )

            conv.extend(_get_act_dropout_layer(drop_prob=dropout))

            inplanes_loop = out_channels

        conv.extend(
            _get_conv_bn_layer(
                inplanes_loop,
                out_channels,
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding_val,
                separable=separable,
                bias=False,
            )
        )

        self.mconv = MultiSequential(*conv)

        if residual:
            stride_residual = stride if stride[0] == 1 else stride[0] ** repeat

            self.res = MultiSequential(
                *_get_conv_bn_layer(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride_residual,
                    bias=False,
                )
            )
        else:
            self.res = None

        self.mout = MultiSequential(*_get_act_dropout_layer(drop_prob=dropout))

    def forward(
        self, x: torch.Tensor, lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: Tensor of shape (batch, features, time) where #features == inplanes
            lengths: Lengths of each element in the batch of x, with shape (batch)

        Returns:
            Result of applying the block on the input, and corresponding output lengths
        """

        # compute forward convolutions
        out, lengths_out = self.mconv(x, lengths)

        # compute the residuals
        if self.res is not None:
            res_out, _ = self.res(x, lengths)
            out = out + res_out

        # compute the output
        out, lengths_out = self.mout(out, lengths_out)
        return out, lengths_out
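
A minimal sketch of a block in isolation, with made-up shapes. With the default stride of 1, the same padding keeps the time dimension, so the lengths come back unchanged:

import torch

from thunder.quartznet.blocks import QuartznetBlock

block = QuartznetBlock(256, 256, kernel_size=(33,), separable=True)

x = torch.randn(2, 256, 50)
out, out_lengths = block(x, torch.tensor([50, 42]))
print(out.shape)    # torch.Size([2, 256, 50])
print(out_lengths)  # tensor([50, 42])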

__init__(self, in_channels, out_channels, repeat=5, kernel_size=(11,), stride=(1,), dilation=(1,), dropout=0.0, residual=True, separable=False) special

Quartznet block. This is a refactoring of the Jasperblock present in the NeMo toolkit, but simplified to only support the new quartznet model. The biggest change is that the dense residual used in Jasper is not supported here, and masked convolutions were also removed.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| in_channels | int | Number of input channels | required |
| out_channels | int | Number of output channels | required |
| repeat | int | Repetitions inside block. | 5 |
| kernel_size | Union[int, Tuple[int]] | Kernel size. | (11,) |
| stride | Union[int, Tuple[int]] | Stride of each repetition. | (1,) |
| dilation | Union[int, Tuple[int]] | Dilation of each repetition. | (1,) |
| dropout | float | Dropout used before each activation. | 0.0 |
| residual | bool | Controls the use of residual connection. | True |
| separable | bool | Controls the use of separable convolutions. | False |
Source code in thunder/quartznet/blocks.py
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    repeat: int = 5,
    kernel_size: _size_1_t = (11,),
    stride: _size_1_t = (1,),
    dilation: _size_1_t = (1,),
    dropout: float = 0.0,
    residual: bool = True,
    separable: bool = False,
):
    """Quartznet block. This is a refactoring of the Jasperblock present on the NeMo toolkit,
    but simplified to only support the new quartznet model. Biggest change is that
    dense residual used on Jasper is not supported here, and masked convolutions were also removed.

    Args:
        in_channels: Number of input channels
        out_channels: Number of output channels
        repeat: Repetitions inside block.
        kernel_size: Kernel size.
        stride: Stride of each repetition.
        dilation: Dilation of each repetition.
        dropout: Dropout used before each activation.
        residual: Controls the use of residual connection.
        separable: Controls the use of separable convolutions.
    """
    super().__init__()

    padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0])

    inplanes_loop = in_channels
    conv = []

    for _ in range(repeat - 1):

        conv.extend(
            _get_conv_bn_layer(
                inplanes_loop,
                out_channels,
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding_val,
                separable=separable,
                bias=False,
            )
        )

        conv.extend(_get_act_dropout_layer(drop_prob=dropout))

        inplanes_loop = out_channels

    conv.extend(
        _get_conv_bn_layer(
            inplanes_loop,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding_val,
            separable=separable,
            bias=False,
        )
    )

    self.mconv = MultiSequential(*conv)

    if residual:
        stride_residual = stride if stride[0] == 1 else stride[0] ** repeat

        self.res = MultiSequential(
            *_get_conv_bn_layer(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride_residual,
                bias=False,
            )
        )
    else:
        self.res = None

    self.mout = MultiSequential(*_get_act_dropout_layer(drop_prob=dropout))

forward(self, x, lengths)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | Tensor | Tensor of shape (batch, features, time) where #features == inplanes | required |
| lengths | Tensor | Lengths of each element in the batch of x, with shape (batch) | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[torch.Tensor, torch.Tensor] | Result of applying the block on the input, and corresponding output lengths |

Source code in thunder/quartznet/blocks.py
def forward(
    self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Args:
        x: Tensor of shape (batch, features, time) where #features == inplanes
        lengths: Lengths of each element in the batch of x, with shape (batch)

    Returns:
        Result of applying the block on the input, and corresponding output lengths
    """

    # compute forward convolutions
    out, lengths_out = self.mconv(x, lengths)

    # compute the residuals
    if self.res is not None:
        res_out, _ = self.res(x, lengths)
        out = out + res_out

    # compute the output
    out, lengths_out = self.mout(out, lengths_out)
    return out, lengths_out

QuartznetEncoder(feat_in=64, filters=[256, 256, 512, 512, 512], kernel_sizes=[33, 39, 51, 63, 75], repeat_blocks=1, dropout=0.0)

Basic Quartznet encoder setup. Can be used to build either Quartznet5x5 (repeat_blocks=1) or Quartznet15x5 (repeat_blocks=3).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| feat_in | int | Number of input features to the model. | 64 |
| filters | List[int] | List of filter sizes used to create the encoder blocks. | [256, 256, 512, 512, 512] |
| kernel_sizes | List[int] | List of kernel sizes corresponding to each filter size. | [33, 39, 51, 63, 75] |
| repeat_blocks | int | Number of repetitions of each block. | 1 |
| dropout | float | Dropout used inside each block. | 0.0 |

Returns:

| Type | Description |
| --- | --- |
| Module | Pytorch model corresponding to the encoder. |

Source code in thunder/quartznet/blocks.py
def QuartznetEncoder(
    feat_in: int = 64,
    filters: List[int] = [256, 256, 512, 512, 512],
    kernel_sizes: List[int] = [33, 39, 51, 63, 75],
    repeat_blocks: int = 1,
    dropout: float = 0.0,
) -> nn.Module:
    """Basic Quartznet encoder setup.
    Can be used to build either Quartznet5x5 (repeat_blocks=1) or Quartznet15x5 (repeat_blocks=3)

    Args:
        feat_in: Number of input features to the model.
        filters: List of filter sizes used to create the encoder blocks.
        kernel_sizes: List of kernel sizes corresponding to each filter size.
        repeat_blocks: Number of repetitions of each block.
        dropout: Dropout used inside each block.
    Returns:
        Pytorch model corresponding to the encoder.
    """
    return MultiSequential(
        stem(feat_in),
        *body(filters, kernel_sizes, repeat_blocks, dropout),
    )
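
A usage sketch with made-up shapes; the only strided convolution is in the stem, so the time dimension is halved once, and the final block leaves 1024 channels:

import torch

from thunder.quartznet.blocks import QuartznetEncoder

encoder = QuartznetEncoder(feat_in=64)  # repeat_blocks=3 for Quartznet15x5

x = torch.randn(2, 64, 200)  # (batch, mel features, time)
out, out_lengths = encoder(x, torch.tensor([200, 160]))
print(out.shape)    # torch.Size([2, 1024, 100])
print(out_lengths)  # tensor([100, 80])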

body(filters, kernel_size, repeat_blocks=1, dropout=0.0)

Creates the body of the Quartznet model, that is, the middle part of the network.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| filters | List[int] | List of filters inside each block in the body. | required |
| kernel_size | List[int] | Corresponding list of kernel sizes for each block. Should have the same length as the first argument. | required |
| repeat_blocks | int | Number of repetitions of each block inside the body. | 1 |
| dropout | float | Dropout used inside each block. | 0.0 |

Returns:

| Type | Description |
| --- | --- |
| List[thunder.quartznet.blocks.QuartznetBlock] | List of layers that form the body of the network. |

Source code in thunder/quartznet/blocks.py
def body(
    filters: List[int],
    kernel_size: List[int],
    repeat_blocks: int = 1,
    dropout: float = 0.0,
) -> List[QuartznetBlock]:
    """Creates the body of the Quartznet model. That is the middle part.

    Args:
        filters: List of filters inside each block in the body.
        kernel_size: Corresponding list of kernel sizes for each block. Should have the same length as the first argument.
        repeat_blocks: Number of repetitions of each block inside the body.
        dropout: Dropout used inside each block.

    Returns:
        List of layers that form the body of the network.
    """
    layers = []
    f_in = 256
    for f, k in zip(filters, kernel_size):
        for _ in range(repeat_blocks):
            layers.append(
                QuartznetBlock(
                    f_in, f, kernel_size=(k,), separable=True, dropout=dropout
                )
            )
            f_in = f
    layers.extend(
        [
            QuartznetBlock(
                f_in,
                512,
                repeat=1,
                dilation=(2,),
                kernel_size=(87,),
                residual=False,
                separable=True,
                dropout=dropout,
            ),
            QuartznetBlock(
                512,
                1024,
                repeat=1,
                kernel_size=(1,),
                residual=False,
                separable=False,
                dropout=dropout,
            ),
        ]
    )
    return layers
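
As a quick sanity check on the structure, the body holds one QuartznetBlock per (filter, kernel) pair, repeated repeat_blocks times, plus the two final blocks:

from thunder.quartznet.blocks import body

layers = body([256, 256, 512, 512, 512], [33, 39, 51, 63, 75])
print(len(layers))  # 5 * 1 + 2 = 7 blocks (Quartznet5x5)

layers = body([256, 256, 512, 512, 512], [33, 39, 51, 63, 75], repeat_blocks=3)
print(len(layers))  # 5 * 3 + 2 = 17 blocks (Quartznet15x5)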

init_weights(m, mode=InitMode.xavier_uniform)

Initialize Linear, Conv1d or BatchNorm1d weights. There is no return value; the operation occurs in place.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| m | Module | The layer to be initialized | required |
| mode | InitMode | Weight initialization mode. Only applicable to linear and conv layers. | InitMode.xavier_uniform |

Exceptions:

| Type | Description |
| --- | --- |
| ValueError | Raised when the initialization mode is not one of the possible options. |

Source code in thunder/quartznet/blocks.py
def init_weights(m: nn.Module, mode: InitMode = InitMode.xavier_uniform):
    """Initialize Linear, Conv1d or BatchNorm1d weights.
    There's no return, the operation occurs inplace.

    Args:
        m: The layer to be initialized
        mode: Weight initialization mode. Only applicable to linear and conv layers.

    Raises:
        ValueError: Raised when the initialization mode is not one of the possible options.
    """
    if isinstance(m, MaskedConv1d):
        init_weights(m.conv, mode)
    if isinstance(m, (nn.Conv1d, nn.Linear)):
        if mode == InitMode.xavier_uniform:
            nn.init.xavier_uniform_(m.weight, gain=1.0)
        elif mode == InitMode.xavier_normal:
            nn.init.xavier_normal_(m.weight, gain=1.0)
        elif mode == InitMode.kaiming_uniform:
            nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
        elif mode == InitMode.kaiming_normal:
            nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
        else:
            raise ValueError(f"Unknown Initialization mode: {mode}")
    elif isinstance(m, nn.BatchNorm1d):
        if m.track_running_stats:
            m.running_mean.zero_()
            m.running_var.fill_(1)
            m.num_batches_tracked.zero_()
        if m.affine:
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
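
Since the function operates on a single layer, it is usually applied recursively through nn.Module.apply; a short sketch:

from functools import partial

from thunder.quartznet.blocks import InitMode, QuartznetEncoder, init_weights

encoder = QuartznetEncoder()
# Reinitialize every conv, linear and batchnorm layer in place.
encoder.apply(partial(init_weights, mode=InitMode.kaiming_normal))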

stem(feat_in)

Creates the Quartznet stem, that is, the first block of the model, which processes the input directly.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| feat_in | int | Number of input features | required |

Returns:

| Type | Description |
| --- | --- |
| QuartznetBlock | Quartznet stem block |

Source code in thunder/quartznet/blocks.py
def stem(feat_in: int) -> QuartznetBlock:
    """Creates the Quartznet stem. That is the first block of the model, that process the input directly.

    Args:
        feat_in: Number of input features

    Returns:
        Quartznet stem block
    """
    return QuartznetBlock(
        feat_in,
        256,
        repeat=1,
        stride=(2,),
        kernel_size=(33,),
        residual=False,
        separable=True,
    )
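
Because of the stride of 2, the stem halves both the time dimension and the lengths; a quick sketch with made-up shapes:

import torch

from thunder.quartznet.blocks import stem

first_block = stem(64)
x = torch.randn(2, 64, 100)
out, out_lengths = first_block(x, torch.tensor([100, 80]))
print(out.shape)    # torch.Size([2, 256, 50])
print(out_lengths)  # tensor([50, 40])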