Blocks
Basic building blocks to create the Quartznet model.
InitMode (str, Enum)
Weight init methods. Used by init_weights.

Note
Possible values are xavier_uniform, xavier_normal, kaiming_uniform and kaiming_normal.
Source code in thunder/quartznet/blocks.py

```python
class InitMode(str, Enum):
"""Weight init methods. Used by [`init_weights`][thunder.quartznet.blocks.init_weights].
Note:
        Possible values are `xavier_uniform`, `xavier_normal`, `kaiming_uniform` and `kaiming_normal`
"""
xavier_uniform = "xavier_uniform"
xavier_normal = "xavier_normal"
kaiming_uniform = "kaiming_uniform"
    kaiming_normal = "kaiming_normal"
```
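Because InitMode subclasses str, its members compare equal to plain strings, which is convenient when the init mode comes from an untyped config. A minimal sketch, assuming the module path shown in the source listing above:

```python
from thunder.quartznet.blocks import InitMode

# str subclass: members compare equal to plain strings
assert InitMode.kaiming_normal == "kaiming_normal"

# Validate a value coming from a config file; unknown values raise ValueError
mode = InitMode("xavier_uniform")
```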
MaskedConv1d (Module)
Source code in thunder/quartznet/blocks.py

```python
class MaskedConv1d(nn.Module):
__constants__ = ["use_mask", "padding", "dilation", "kernel_size", "stride"]
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: _size_1_t,
stride: _size_1_t = 1,
padding: _size_1_t = 0,
dilation: _size_1_t = 1,
groups: int = 1,
bias: bool = False,
use_mask: bool = True,
):
"""Masked Convolution.
        This module corresponds to a 1d convolution with input masking. Arguments to create are the
        same as nn.Conv1d, but with the addition of use_mask for the special masking behaviour.
Args:
in_channels: Same as nn.Conv1d
out_channels: Same as nn.Conv1d
kernel_size: Same as nn.Conv1d
stride: Same as nn.Conv1d
padding: Same as nn.Conv1d
dilation: Same as nn.Conv1d
groups: Same as nn.Conv1d
bias: Same as nn.Conv1d
use_mask: Controls the masking of input before the convolution during the forward.
"""
super().__init__()
self.use_mask = use_mask
self.conv = nn.Conv1d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
self.padding = self.conv.padding[0]
self.dilation = self.conv.dilation[0]
self.kernel_size = self.conv.kernel_size[0]
self.stride = self.conv.stride[0]
def get_seq_len(self, lengths: torch.Tensor) -> torch.Tensor:
"""Get the lengths of the inputs after the convolution operation is applied.
Args:
lengths: Original lengths of the inputs
Returns:
Resulting lengths after the convolution
"""
return (
torch.div(
lengths + 2 * self.padding - self.dilation * (self.kernel_size - 1) - 1,
self.stride,
rounding_mode="floor",
)
+ 1
)
def mask_fill(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
"""Mask the input based on it's respective lengths.
Args:
x: Signal to be processed, of shape (batch, features, time)
            lengths: Lengths of each element in the batch of x, with shape (batch)
Returns:
The masked signal
"""
mask = lengths_to_mask(lengths, x.shape[-1])
return x.masked_fill(~mask.unsqueeze(1), 0)
def forward(
self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Forward method
Args:
x: Signal to be processed, of shape (batch, features, time)
            lengths: Lengths of each element in the batch of x, with shape (batch)
Returns:
Both the signal processed by the convolution and the resulting lengths
"""
if self.use_mask:
x = self.mask_fill(x, lengths)
out = self.conv(x)
        return out, self.get_seq_len(lengths)
```
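A usage sketch with illustrative shapes and hyperparameters (not library defaults): the module zeroes the padded time steps before convolving (when use_mask=True) and returns the convolved signal together with the updated lengths.

```python
import torch
from thunder.quartznet.blocks import MaskedConv1d

# Illustrative hyperparameters, chosen so that time is halved
conv = MaskedConv1d(64, 128, kernel_size=33, stride=2, padding=16)

x = torch.randn(4, 64, 100)                # (batch, features, time)
lengths = torch.tensor([100, 80, 50, 30])  # valid time steps per batch element

out, out_lengths = conv(x, lengths)
print(out.shape)    # torch.Size([4, 128, 50])
print(out_lengths)  # tensor([50, 40, 25, 15])
```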
__init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False, use_mask=True)
special
Masked Convolution. This module corresponds to a 1d convolution with input masking. Arguments to create are the same as nn.Conv1d, but with the addition of use_mask for the special masking behaviour.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
in_channels | int | Same as nn.Conv1d | required |
out_channels | int | Same as nn.Conv1d | required |
kernel_size | Union[int, Tuple[int]] | Same as nn.Conv1d | required |
stride | Union[int, Tuple[int]] | Same as nn.Conv1d | 1 |
padding | Union[int, Tuple[int]] | Same as nn.Conv1d | 0 |
dilation | Union[int, Tuple[int]] | Same as nn.Conv1d | 1 |
groups | int | Same as nn.Conv1d | 1 |
bias | bool | Same as nn.Conv1d | False |
use_mask | bool | Controls the masking of input before the convolution during the forward. | True |
Source code in thunder/quartznet/blocks.py

```python
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: _size_1_t,
stride: _size_1_t = 1,
padding: _size_1_t = 0,
dilation: _size_1_t = 1,
groups: int = 1,
bias: bool = False,
use_mask: bool = True,
):
"""Masked Convolution.
    This module corresponds to a 1d convolution with input masking. Arguments to create are the
    same as nn.Conv1d, but with the addition of use_mask for the special masking behaviour.
Args:
in_channels: Same as nn.Conv1d
out_channels: Same as nn.Conv1d
kernel_size: Same as nn.Conv1d
stride: Same as nn.Conv1d
padding: Same as nn.Conv1d
dilation: Same as nn.Conv1d
groups: Same as nn.Conv1d
bias: Same as nn.Conv1d
use_mask: Controls the masking of input before the convolution during the forward.
"""
super().__init__()
self.use_mask = use_mask
self.conv = nn.Conv1d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
self.padding = self.conv.padding[0]
self.dilation = self.conv.dilation[0]
self.kernel_size = self.conv.kernel_size[0]
    self.stride = self.conv.stride[0]
```
forward(self, x, lengths)
Forward method
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | Signal to be processed, of shape (batch, features, time) | required |
lengths | Tensor | Lengths of each element in the batch of x, with shape (batch) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor] | Both the signal processed by the convolution and the resulting lengths |
Source code in thunder/quartznet/blocks.py

```python
def forward(
self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Forward method
Args:
x: Signal to be processed, of shape (batch, features, time)
        lengths: Lengths of each element in the batch of x, with shape (batch)
Returns:
Both the signal processed by the convolution and the resulting lengths
"""
if self.use_mask:
x = self.mask_fill(x, lengths)
out = self.conv(x)
    return out, self.get_seq_len(lengths)
```
get_seq_len(self, lengths)
Get the lengths of the inputs after the convolution operation is applied.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
lengths | Tensor | Original lengths of the inputs | required |

Returns:

Type | Description |
---|---|
Tensor | Resulting lengths after the convolution |
Source code in thunder/quartznet/blocks.py

```python
def get_seq_len(self, lengths: torch.Tensor) -> torch.Tensor:
"""Get the lengths of the inputs after the convolution operation is applied.
Args:
lengths: Original lengths of the inputs
Returns:
Resulting lengths after the convolution
"""
return (
torch.div(
lengths + 2 * self.padding - self.dilation * (self.kernel_size - 1) - 1,
self.stride,
rounding_mode="floor",
)
+ 1
    )
```
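This is the standard nn.Conv1d output-length formula applied elementwise to the batch. For example, with kernel_size=33, stride=2, padding=16 and dilation=1 (illustrative values), an input length of 100 gives floor((100 + 32 - 32 - 1) / 2) + 1 = 50, i.e. the valid length is roughly halved.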
mask_fill(self, x, lengths)
Mask the input based on its respective lengths.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | Signal to be processed, of shape (batch, features, time) | required |
lengths | Tensor | Lengths of each element in the batch of x, with shape (batch) | required |

Returns:

Type | Description |
---|---|
Tensor | The masked signal |
Source code in thunder/quartznet/blocks.py

```python
def mask_fill(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
"""Mask the input based on it's respective lengths.
Args:
x: Signal to be processed, of shape (batch, features, time)
        lengths: Lengths of each element in the batch of x, with shape (batch)
Returns:
The masked signal
"""
mask = lengths_to_mask(lengths, x.shape[-1])
    return x.masked_fill(~mask.unsqueeze(1), 0)
```
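The helper lengths_to_mask builds a boolean (batch, time) mask that is True for valid positions; mask_fill then zeroes everything past each sequence's length. A plain-torch sketch of the same masking step (not the library helper itself):

```python
import torch

x = torch.randn(2, 3, 5)        # (batch, features, time)
lengths = torch.tensor([5, 2])

# True where the time index is inside the valid length
mask = torch.arange(x.shape[-1]) < lengths.unsqueeze(1)  # (batch, time)
masked = x.masked_fill(~mask.unsqueeze(1), 0)

assert (masked[1, :, 2:] == 0).all()  # steps past length 2 are zeroed
```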
QuartznetBlock (Module)
Source code in thunder/quartznet/blocks.py

```python
class QuartznetBlock(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
repeat: int = 5,
kernel_size: _size_1_t = (11,),
stride: _size_1_t = (1,),
dilation: _size_1_t = (1,),
dropout: float = 0.0,
residual: bool = True,
separable: bool = False,
):
"""Quartznet block. This is a refactoring of the Jasperblock present on the NeMo toolkit,
but simplified to only support the new quartznet model. Biggest change is that
dense residual used on Jasper is not supported here, and masked convolutions were also removed.
Args:
in_channels: Number of input channels
out_channels: Number of output channels
repeat: Repetitions inside block.
kernel_size: Kernel size.
stride: Stride of each repetition.
dilation: Dilation of each repetition.
dropout: Dropout used before each activation.
residual: Controls the use of residual connection.
separable: Controls the use of separable convolutions.
"""
super().__init__()
padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0])
inplanes_loop = in_channels
conv = []
for _ in range(repeat - 1):
conv.extend(
_get_conv_bn_layer(
inplanes_loop,
out_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding_val,
separable=separable,
bias=False,
)
)
conv.extend(_get_act_dropout_layer(drop_prob=dropout))
inplanes_loop = out_channels
conv.extend(
_get_conv_bn_layer(
inplanes_loop,
out_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding_val,
separable=separable,
bias=False,
)
)
self.mconv = MultiSequential(*conv)
if residual:
stride_residual = stride if stride[0] == 1 else stride[0] ** repeat
self.res = MultiSequential(
*_get_conv_bn_layer(
in_channels,
out_channels,
kernel_size=1,
stride=stride_residual,
bias=False,
)
)
else:
self.res = None
self.mout = MultiSequential(*_get_act_dropout_layer(drop_prob=dropout))
def forward(
self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
            x: Tensor of shape (batch, features, time) where #features == inplanes
            lengths: Lengths of each element in the batch of x, with shape (batch)
Returns:
Result of applying the block on the input, and corresponding output lengths
"""
# compute forward convolutions
out, lengths_out = self.mconv(x, lengths)
# compute the residuals
if self.res is not None:
res_out, _ = self.res(x, lengths)
out = out + res_out
# compute the output
out, lengths_out = self.mout(out, lengths_out)
        return out, lengths_out
```
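A usage sketch with illustrative sizes: with the default stride of 1 and the "same" padding computed by get_same_padding, the time dimension and the lengths pass through unchanged, while the channel count changes from in_channels to out_channels.

```python
import torch
from thunder.quartznet.blocks import QuartznetBlock

block = QuartznetBlock(256, 512, repeat=5, kernel_size=(33,), separable=True)

x = torch.randn(4, 256, 150)               # (batch, features, time)
lengths = torch.tensor([150, 120, 90, 60])

out, out_lengths = block(x, lengths)
print(out.shape)  # torch.Size([4, 512, 150]); lengths unchanged at stride 1
```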
__init__(self, in_channels, out_channels, repeat=5, kernel_size=(11,), stride=(1,), dilation=(1,), dropout=0.0, residual=True, separable=False)
special
Quartznet block. This is a refactoring of the Jasperblock present in the NeMo toolkit, but simplified to only support the new quartznet model. The biggest change is that the dense residual used in Jasper is not supported here, and masked convolutions were also removed.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
in_channels | int | Number of input channels | required |
out_channels | int | Number of output channels | required |
repeat | int | Repetitions inside block. | 5 |
kernel_size | Union[int, Tuple[int]] | Kernel size. | (11,) |
stride | Union[int, Tuple[int]] | Stride of each repetition. | (1,) |
dilation | Union[int, Tuple[int]] | Dilation of each repetition. | (1,) |
dropout | float | Dropout used before each activation. | 0.0 |
residual | bool | Controls the use of residual connection. | True |
separable | bool | Controls the use of separable convolutions. | False |
Source code in thunder/quartznet/blocks.py

```python
def __init__(
self,
in_channels: int,
out_channels: int,
repeat: int = 5,
kernel_size: _size_1_t = (11,),
stride: _size_1_t = (1,),
dilation: _size_1_t = (1,),
dropout: float = 0.0,
residual: bool = True,
separable: bool = False,
):
"""Quartznet block. This is a refactoring of the Jasperblock present on the NeMo toolkit,
but simplified to only support the new quartznet model. Biggest change is that
dense residual used on Jasper is not supported here, and masked convolutions were also removed.
Args:
in_channels: Number of input channels
out_channels: Number of output channels
repeat: Repetitions inside block.
kernel_size: Kernel size.
stride: Stride of each repetition.
dilation: Dilation of each repetition.
dropout: Dropout used before each activation.
residual: Controls the use of residual connection.
separable: Controls the use of separable convolutions.
"""
super().__init__()
padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0])
inplanes_loop = in_channels
conv = []
for _ in range(repeat - 1):
conv.extend(
_get_conv_bn_layer(
inplanes_loop,
out_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding_val,
separable=separable,
bias=False,
)
)
conv.extend(_get_act_dropout_layer(drop_prob=dropout))
inplanes_loop = out_channels
conv.extend(
_get_conv_bn_layer(
inplanes_loop,
out_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding_val,
separable=separable,
bias=False,
)
)
self.mconv = MultiSequential(*conv)
if residual:
stride_residual = stride if stride[0] == 1 else stride[0] ** repeat
self.res = MultiSequential(
*_get_conv_bn_layer(
in_channels,
out_channels,
kernel_size=1,
stride=stride_residual,
bias=False,
)
)
else:
self.res = None
    self.mout = MultiSequential(*_get_act_dropout_layer(drop_prob=dropout))
```
forward(self, x, lengths)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | Tensor of shape (batch, features, time) where #features == inplanes | required |
lengths | Tensor | Lengths of each element in the batch of x, with shape (batch) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor] | Result of applying the block on the input, and corresponding output lengths |
Source code in thunder/quartznet/blocks.py

```python
def forward(
self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
        x: Tensor of shape (batch, features, time) where #features == inplanes
        lengths: Lengths of each element in the batch of x, with shape (batch)
Returns:
Result of applying the block on the input, and corresponding output lengths
"""
# compute forward convolutions
out, lengths_out = self.mconv(x, lengths)
# compute the residuals
if self.res is not None:
res_out, _ = self.res(x, lengths)
out = out + res_out
# compute the output
out, lengths_out = self.mout(out, lengths_out)
    return out, lengths_out
```
QuartznetEncoder(feat_in=64, filters=[256, 256, 512, 512, 512], kernel_sizes=[33, 39, 51, 63, 75], repeat_blocks=1, dropout=0.0)
Basic Quartznet encoder setup. Can be used to build either Quartznet5x5 (repeat_blocks=1) or Quartznet15x5 (repeat_blocks=3).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
feat_in | int | Number of input features to the model. | 64 |
filters | List[int] | List of filter sizes used to create the encoder blocks. | [256, 256, 512, 512, 512] |
kernel_sizes | List[int] | List of kernel sizes corresponding to each filter size. | [33, 39, 51, 63, 75] |
repeat_blocks | int | Number of repetitions of each block. | 1 |
dropout | float | Dropout probability used inside each block. | 0.0 |

Returns:

Type | Description |
---|---|
Module | Pytorch model corresponding to the encoder. |
Source code in thunder/quartznet/blocks.py

```python
def QuartznetEncoder(
feat_in: int = 64,
filters: List[int] = [256, 256, 512, 512, 512],
kernel_sizes: List[int] = [33, 39, 51, 63, 75],
repeat_blocks: int = 1,
dropout: float = 0.0,
) -> nn.Module:
"""Basic Quartznet encoder setup.
Can be used to build either Quartznet5x5 (repeat_blocks=1) or Quartznet15x5 (repeat_blocks=3)
Args:
feat_in: Number of input features to the model.
filters: List of filter sizes used to create the encoder blocks.
kernel_sizes: List of kernel sizes corresponding to each filter size.
        repeat_blocks: Number of repetitions of each block.
        dropout: Dropout probability used inside each block.
Returns:
Pytorch model corresponding to the encoder.
"""
return MultiSequential(
stem(feat_in),
*body(filters, kernel_sizes, repeat_blocks, dropout),
    )
```
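End to end, the default encoder takes 64 input features and ends at 1024 output channels, with the stem (stride 2) roughly halving the time dimension and the lengths. A sketch with illustrative shapes:

```python
import torch
from thunder.quartznet.blocks import QuartznetEncoder

encoder = QuartznetEncoder()                   # Quartznet5x5 layout
# encoder = QuartznetEncoder(repeat_blocks=3)  # Quartznet15x5 layout

feats = torch.randn(2, 64, 200)     # (batch, feat_in, time), e.g. mel features
lengths = torch.tensor([200, 140])

out, out_lengths = encoder(feats, lengths)
print(out.shape)    # torch.Size([2, 1024, 100]); stem stride 2 halves time
print(out_lengths)  # tensor([100, 70])
```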
body(filters, kernel_size, repeat_blocks=1, dropout=0.0)
Creates the body of the Quartznet model, that is, the middle part of the network.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
filters | List[int] | List of filters inside each block in the body. | required |
kernel_size | List[int] | Corresponding list of kernel sizes for each block. Should have the same length as the first argument. | required |
repeat_blocks | int | Number of repetitions of each block inside the body. | 1 |
dropout | float | Dropout probability used inside each block. | 0.0 |

Returns:

Type | Description |
---|---|
List[thunder.quartznet.blocks.QuartznetBlock] | List of layers that form the body of the network. |
Source code in thunder/quartznet/blocks.py

```python
def body(
filters: List[int],
kernel_size: List[int],
repeat_blocks: int = 1,
dropout: float = 0.0,
) -> List[QuartznetBlock]:
"""Creates the body of the Quartznet model. That is the middle part.
Args:
filters: List of filters inside each block in the body.
kernel_size: Corresponding list of kernel sizes for each block. Should have the same length as the first argument.
        repeat_blocks: Number of repetitions of each block inside the body.
        dropout: Dropout probability used inside each block.
Returns:
List of layers that form the body of the network.
"""
layers = []
f_in = 256
for f, k in zip(filters, kernel_size):
for _ in range(repeat_blocks):
layers.append(
QuartznetBlock(
f_in, f, kernel_size=(k,), separable=True, dropout=dropout
)
)
f_in = f
layers.extend(
[
QuartznetBlock(
f_in,
512,
repeat=1,
dilation=(2,),
kernel_size=(87,),
residual=False,
separable=True,
dropout=dropout,
),
QuartznetBlock(
512,
1024,
repeat=1,
kernel_size=(1,),
residual=False,
separable=False,
dropout=dropout,
),
]
)
    return layers
```
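Note that the first block hardcodes its input at 256 channels (the stem output), and the two fixed final blocks always end the body at 1024 channels. A small illustration of the block count:

```python
from thunder.quartznet.blocks import body

layers = body(filters=[256, 256], kernel_size=[33, 39], repeat_blocks=2)
# 2 filters x 2 repeats each, plus the 2 fixed final blocks
print(len(layers))  # 6
```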
init_weights(m, mode=InitMode.xavier_uniform)
Initialize Linear, Conv1d or BatchNorm1d weights. There is no return value; the operation occurs in place.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
m | Module | The layer to be initialized | required |
mode | InitMode | Weight initialization mode. Only applicable to linear and conv layers. | InitMode.xavier_uniform |

Exceptions:

Type | Description |
---|---|
ValueError | Raised when the initialization mode is not one of the possible options. |
Source code in thunder/quartznet/blocks.py

```python
def init_weights(m: nn.Module, mode: InitMode = InitMode.xavier_uniform):
"""Initialize Linear, Conv1d or BatchNorm1d weights.
    There is no return value; the operation occurs in place.
Args:
m: The layer to be initialized
mode: Weight initialization mode. Only applicable to linear and conv layers.
Raises:
        ValueError: Raised when the initialization mode is not one of the possible options.
"""
if isinstance(m, MaskedConv1d):
init_weights(m.conv, mode)
if isinstance(m, (nn.Conv1d, nn.Linear)):
if mode == InitMode.xavier_uniform:
nn.init.xavier_uniform_(m.weight, gain=1.0)
elif mode == InitMode.xavier_normal:
nn.init.xavier_normal_(m.weight, gain=1.0)
elif mode == InitMode.kaiming_uniform:
nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
elif mode == InitMode.kaiming_normal:
nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
else:
raise ValueError(f"Unknown Initialization mode: {mode}")
elif isinstance(m, nn.BatchNorm1d):
if m.track_running_stats:
m.running_mean.zero_()
m.running_var.fill_(1)
m.num_batches_tracked.zero_()
if m.affine:
nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
```
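A usage sketch: nn.Module.apply forwards only the module itself, so a non-default mode can be bound with functools.partial.

```python
from functools import partial

from thunder.quartznet.blocks import InitMode, QuartznetEncoder, init_weights

encoder = QuartznetEncoder()

# Default xavier_uniform initialization for every supported layer
encoder.apply(init_weights)

# Bind a different mode, since apply() passes only the module
encoder.apply(partial(init_weights, mode=InitMode.kaiming_normal))
```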
stem(feat_in)
Creates the Quartznet stem, that is, the first block of the model, which processes the input directly.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
feat_in | int | Number of input features | required |

Returns:

Type | Description |
---|---|
QuartznetBlock | Quartznet stem block |
Source code in thunder/quartznet/blocks.py

```python
def stem(feat_in: int) -> QuartznetBlock:
"""Creates the Quartznet stem. That is the first block of the model, that process the input directly.
Args:
feat_in: Number of input features
Returns:
Quartznet stem block
"""
return QuartznetBlock(
feat_in,
256,
repeat=1,
stride=(2,),
kernel_size=(33,),
residual=False,
separable=True,
    )
```
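Since the stem is the only block created with stride 2, it is where the encoder reduces the time resolution. A quick check with illustrative shapes:

```python
import torch
from thunder.quartznet.blocks import stem

first_block = stem(64)
x = torch.randn(1, 64, 100)  # (batch, feat_in, time)

out, out_lengths = first_block(x, torch.tensor([100]))
print(out.shape)  # torch.Size([1, 256, 50]); stride 2 halves the time axis
```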