Skip to content

Transform

Implementation of a data preprocessing transform compatible with the preprocessing used by the Hugging Face wav2vec2 models

Wav2Vec2Preprocess (Module)

Source code in thunder/huggingface/transform.py
class Wav2Vec2Preprocess(nn.Module):
    """Audio preprocessing step for Wav2Vec2-style models.

    Normalizes the input audio with ``normalize_tensor``. When ``mask_input``
    is enabled, a mask built from the audio lengths is handed to the
    normalization as well.
    """

    def __init__(
        self,
        div_guard: float = 1e-7,
        mask_input: bool = False,
    ):
        """Wav2Vec model preprocessing. It consists of normalizing the audio,
        optionally using a mask built from the audio lengths.

        Args:
            div_guard: Guard value to prevent division by zero.
            mask_input: controls the use of masking in the input tensor.
        """
        super().__init__()
        self.div_guard = div_guard
        self.mask_input = mask_input

    def forward(
        self, audio: torch.Tensor, audio_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Applies the normalization

        Args:
            audio: Audio tensor of shape [batch_size, time]
            audio_lengths: corresponding length of each element in the input tensor.

        Returns:
            A tuple with the normalized audio tensor (same shape as the input)
            and ``audio_lengths``, passed through unchanged. The mask is only
            used internally for the normalization; it is not returned.
        """
        # Explicit Optional annotation kept from the original; presumably needed
        # so the variable can later hold a tensor under TorchScript -- confirm.
        attention_mask: Optional[torch.Tensor] = None
        if self.mask_input:
            # One mask entry per time step of the (padded) batch, cast to int.
            attention_mask = lengths_to_mask(
                audio_lengths, max_length=audio.size(-1)
            ).int()

        normalized_audio = normalize_tensor(
            audio, attention_mask, div_guard=self.div_guard
        )
        # The lengths (not the mask) are what gets forwarded to the caller.
        return normalized_audio, audio_lengths

__init__(self, div_guard=1e-07, mask_input=False) special

Wav2Vec model preprocessing. It consists of normalizing the audio, optionally using a mask built from the audio lengths.

Parameters:

Name Type Description Default
div_guard float

Guard value to prevent division by zero.

1e-07
mask_input bool

controls the use of masking in the input tensor.

False
Source code in thunder/huggingface/transform.py
def __init__(
    self,
    div_guard: float = 1e-7,
    mask_input: bool = False,
):
    """Set up the Wav2Vec preprocessing module.

    The module normalizes the audio it receives and can optionally make use
    of an input mask while doing so.

    Args:
        div_guard: Guard value to prevent division by zero.
        mask_input: Whether masking of the input tensor is used.
    """
    super().__init__()
    # Plain configuration values; both are read later by ``forward``.
    self.mask_input = mask_input
    self.div_guard = div_guard

forward(self, audio, audio_lengths)

Applies the normalization

Parameters:

Name Type Description Default
audio Tensor

Audio tensor of shape [batch_size, time]

required
audio_lengths Tensor

corresponding length of each element in the input tensor.

required

Returns:

Type Description
Tuple[torch.Tensor, Optional[torch.Tensor]]

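Normalized audio tensor with the same shape as the input, together with the audio_lengths tensor passed through unchanged. The mask, when enabled, is only used for the normalization and is not returned.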

Source code in thunder/huggingface/transform.py
def forward(
    self, audio: torch.Tensor, audio_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Applies the normalization

    Args:
        audio: Audio tensor of shape [batch_size, time]
        audio_lengths: corresponding length of each element in the input tensor.

    Returns:
        A tuple with the normalized audio tensor (same shape as the input)
        and ``audio_lengths``, passed through unchanged. The mask is only
        used internally for the normalization; it is not returned.
    """
    # Explicit Optional annotation kept from the original; presumably needed
    # so the variable can later hold a tensor under TorchScript -- confirm.
    attention_mask: Optional[torch.Tensor] = None
    if self.mask_input:
        # One mask entry per time step of the (padded) batch, cast to int.
        attention_mask = lengths_to_mask(
            audio_lengths, max_length=audio.size(-1)
        ).int()

    normalized_audio = normalize_tensor(
        audio, attention_mask, div_guard=self.div_guard
    )
    # The lengths (not the mask) are what gets forwarded to the caller.
    return normalized_audio, audio_lengths