Transform
Implementation of a data preprocessing transform compatible with the Hugging Face wav2vec2 preprocessing.
Wav2Vec2Preprocess (Module)
Source code in thunder/huggingface/transform.py
class Wav2Vec2Preprocess(nn.Module):
    def __init__(
        self,
        div_guard: float = 1e-7,
        mask_input: bool = False,
    ):
        """Wav2Vec model preprocessing. It normalizes the audio, optionally using a mask.

        Args:
            div_guard: Guard value to prevent division by zero.
            mask_input: controls the use of masking in the input tensor.
        """
        super().__init__()
        self.div_guard = div_guard
        self.mask_input = mask_input

    def forward(
        self, audio: torch.Tensor, audio_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Applies the normalization

        Args:
            audio: Audio tensor of shape [batch_size, time]
            audio_lengths: corresponding length of each element in the input tensor.

        Returns:
            Normalized audio tensor with same shape as input, followed by
            ``audio_lengths`` passed through unchanged.
        """
        # The mask is only an internal input to the normalization; it is never
        # returned. The second output is always the original lengths tensor.
        attention_mask: Optional[torch.Tensor] = None
        if self.mask_input:
            # One mask entry per sample of the last (time) dimension of `audio`.
            attention_mask = lengths_to_mask(
                audio_lengths, max_length=audio.size(-1)
            ).int()

        return (
            normalize_tensor(audio, attention_mask, div_guard=self.div_guard),
            audio_lengths,
        )
__init__(self, div_guard=1e-07, mask_input=False)
special
Wav2Vec model preprocessing. It normalizes the audio, optionally using a mask built from the audio lengths.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
div_guard |
float |
Guard value to prevent division by zero. |
1e-07 |
mask_input |
bool |
controls the use of masking in the input tensor. |
False |
Source code in thunder/huggingface/transform.py
def __init__(
self,
div_guard: float = 1e-7,
mask_input: bool = False,
):
"""Wav2Vec model preprocessing. It consists of normalizing the audio and optional mask.
Args:
div_guard: Guard value to prevent division by zero.
mask_input: controls the use of masking in the input tensor.
"""
super().__init__()
self.div_guard = div_guard
self.mask_input = mask_input
forward(self, audio, audio_lengths)
Applies the normalization
Parameters:
Name | Type | Description | Default |
---|---|---|---|
audio |
Tensor |
Audio tensor of shape [batch_size, time] |
required |
audio_lengths |
Tensor |
corresponding length of each element in the input tensor. |
required |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, Optional[torch.Tensor]] |
Normalized audio tensor with the same shape as the input, followed by the audio_lengths tensor returned unchanged |
Source code in thunder/huggingface/transform.py
def forward(
    self, audio: torch.Tensor, audio_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Applies the normalization

    Args:
        audio: Audio tensor of shape [batch_size, time]
        audio_lengths: corresponding length of each element in the input tensor.

    Returns:
        Normalized audio tensor with same shape as input, followed by
        ``audio_lengths`` passed through unchanged.
    """
    # The mask is only an internal input to the normalization; it is never
    # returned. The second output is always the original lengths tensor.
    attention_mask: Optional[torch.Tensor] = None
    if self.mask_input:
        # One mask entry per sample of the last (time) dimension of `audio`.
        attention_mask = lengths_to_mask(
            audio_lengths, max_length=audio.size(-1)
        ).int()

    return (
        normalize_tensor(audio, attention_mask, div_guard=self.div_guard),
        audio_lengths,
    )