Dataset
Speech recognition datasets
AudioFileLoader (Module)
Source code in thunder/data/dataset.py
class AudioFileLoader(nn.Module):
    def __init__(self, force_mono: bool = True, sample_rate: int = 16000):
        """Module containing the data loading and basic preprocessing.
        It's used internally by the datasets, but can be exported so
        that during inference time there's no code dependency.

        Args:
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        super().__init__()
        self.force_mono = force_mono
        self.sample_rate = sample_rate

    @torch.jit.export
    def open_audio(self, item: str) -> Tuple[Tensor, int]:
        """Uses the data returned by get_item to open the audio

        Args:
            item: Data returned by get_item(index)

        Returns:
            Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
        """
        return torchaudio.load(item)

    @torch.jit.export
    def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
        """Apply some base transforms to the audio, that fix silent problems.
        It transforms all the audios to mono (depending on class creation parameter),
        remove the possible DC bias present and then resamples the audios to a common
        sample rate.

        Args:
            audio: Audio tensor
            sample_rate: Sample rate

        Returns:
            Audio tensor after the transforms.
        """
        if self.force_mono and (audio.shape[0] > 1):
            audio = audio.mean(0, keepdim=True)
        # Removing the dc component from the audio
        # It happens when a faulty capture device introduce
        # an offset into the recorded waveform, and this can
        # cause problems with later transforms.
        # https://en.wikipedia.org/wiki/DC_bias
        audio = audio - audio.mean(1)
        if self.sample_rate != sample_rate:
            audio = resample(
                audio, orig_freq=int(sample_rate), new_freq=int(self.sample_rate)
            )
        return audio

    def forward(self, item: str) -> Tensor:
        """Opens audio item and do basic preprocessing

        Args:
            item: Path to the audio to be opened

        Returns:
            Audio tensor after preprocessing
        """
        audio, sample_rate = self.open_audio(item)
        return self.preprocess_audio(audio, sample_rate)
__init__(self, force_mono=True, sample_rate=16000)
special
Module containing the data loading and basic preprocessing. It's used internally by the datasets, but can be exported so that during inference time there's no code dependency.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
force_mono | bool | If true, convert all the loaded samples to mono. | True |
sample_rate | int | Sample rate used by the dataset. All of the samples that have different rate will be resampled. | 16000 |
Source code in thunder/data/dataset.py
def __init__(self, force_mono: bool = True, sample_rate: int = 16000):
    """Module containing the data loading and basic preprocessing.
    It's used internally by the datasets, but can be exported so
    that during inference time there's no code dependency.

    Args:
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    super().__init__()
    self.force_mono = force_mono
    self.sample_rate = sample_rate
forward(self, item)
Opens the audio item and does basic preprocessing.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | str | Path to the audio to be opened | required |

Returns:

Type | Description |
---|---|
Tensor | Audio tensor after preprocessing |
Source code in thunder/data/dataset.py
def forward(self, item: str) -> Tensor:
    """Opens audio item and do basic preprocessing

    Args:
        item: Path to the audio to be opened

    Returns:
        Audio tensor after preprocessing
    """
    audio, sample_rate = self.open_audio(item)
    return self.preprocess_audio(audio, sample_rate)
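A minimal usage sketch: calling the module directly runs open_audio followed by preprocess_audio. The file path below is a placeholder; any format supported by torchaudio should work.

from thunder.data.dataset import AudioFileLoader

loader = AudioFileLoader(force_mono=True, sample_rate=16000)
# "example.wav" is a placeholder path
audio = loader("example.wav")
print(audio.shape)  # (1, time): mono, resampled to 16000 Hz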
open_audio(self, item)
Uses the data returned by get_item to open the audio
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | str | Data returned by get_item(index) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, int] | Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate. |
Source code in thunder/data/dataset.py
@torch.jit.export
def open_audio(self, item: str) -> Tuple[Tensor, int]:
    """Uses the data returned by get_item to open the audio

    Args:
        item: Data returned by get_item(index)

    Returns:
        Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
    """
    return torchaudio.load(item)
preprocess_audio(self, audio, sample_rate)
Applies some base transforms to the audio to fix problems that would otherwise fail silently. It converts the audio to mono (depending on the class creation parameter), removes any DC bias present, and then resamples the audio to a common sample rate.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
audio | Tensor | Audio tensor | required |
sample_rate | int | Sample rate | required |

Returns:

Type | Description |
---|---|
Tensor | Audio tensor after the transforms. |
Source code in thunder/data/dataset.py
@torch.jit.export
def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
    """Apply some base transforms to the audio, that fix silent problems.
    It transforms all the audios to mono (depending on class creation parameter),
    remove the possible DC bias present and then resamples the audios to a common
    sample rate.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate

    Returns:
        Audio tensor after the transforms.
    """
    if self.force_mono and (audio.shape[0] > 1):
        audio = audio.mean(0, keepdim=True)
    # Removing the dc component from the audio
    # It happens when a faulty capture device introduce
    # an offset into the recorded waveform, and this can
    # cause problems with later transforms.
    # https://en.wikipedia.org/wiki/DC_bias
    audio = audio - audio.mean(1)
    if self.sample_rate != sample_rate:
        audio = resample(
            audio, orig_freq=int(sample_rate), new_freq=int(self.sample_rate)
        )
    return audio
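The sketch below illustrates the effect of preprocess_audio on a synthetic stereo clip recorded at 44.1 kHz with a constant DC offset, and then exports the loader with TorchScript, which is what the "no code dependency" remark in the class docstring refers to. The waveform is synthetic and the output file name is a placeholder; scripting assumes the installed torchaudio version supports TorchScript for its load/resample functions.

import torch
from thunder.data.dataset import AudioFileLoader

loader = AudioFileLoader(force_mono=True, sample_rate=16000)

# Synthetic 1-second stereo signal at 44.1 kHz with a constant DC offset of 0.5
waveform = torch.sin(torch.linspace(0, 100, 44100)).repeat(2, 1) + 0.5
processed = loader.preprocess_audio(waveform, sample_rate=44100)

print(processed.shape)         # torch.Size([1, 16000]): mono, resampled to 16 kHz
print(processed.mean().abs())  # close to zero after DC bias removal

# Export for inference without depending on the thunder codebase
scripted = torch.jit.script(loader)
scripted.save("audio_loader.pt")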
BaseSpeechDataset (Dataset)
Source code in thunder/data/dataset.py
class BaseSpeechDataset(Dataset):
    def __init__(
        self, items: Sequence, force_mono: bool = True, sample_rate: int = 16000
    ):
        """This is the base class that implements the minimal functionality to have a compatible
        speech dataset, in a way that can be easily customized by subclassing.

        Args:
            items: Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data.
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        super().__init__()
        self.items = items
        self.loader = AudioFileLoader(force_mono, sample_rate)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index: int) -> Tuple[Tensor, str]:
        item = self.get_item(index)
        # Dealing with input
        audio, sr = self.open_audio(item)
        audio = self.preprocess_audio(audio, sr)
        # Dealing with output
        text = self.open_text(item)
        text = self.preprocess_text(text)
        return audio, text

    def all_outputs(self) -> List[str]:
        """Return a list with just the outputs for the whole dataset.
        Useful when creating the initial vocab tokens, or to train a
        language model.

        Returns:
            All of the outputs of the dataset, with the corresponding preprocessing applied.
        """
        outputs = []
        for index in range(len(self)):
            item = self.get_item(index)
            text = self.open_text(item)
            text = self.preprocess_text(text)
            outputs.append(text)
        return outputs

    def get_item(self, index: int) -> Any:
        """Get the item source specified by the index.

        Args:
            index: Indicates what item it needs to return information about.

        Returns:
            Whatever data necessary to open the audio and text corresponding to this index.
        """
        return self.items[index]

    def open_audio(self, item: Any) -> Tuple[Tensor, int]:
        """Uses the data returned by get_item to open the audio

        Args:
            item: Data returned by get_item(index)

        Returns:
            Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
        """
        return self.loader.open_audio(item)

    def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
        """Apply some base transforms to the audio, that fix silent problems.
        It transforms all the audios to mono (depending on class creation parameter),
        remove the possible DC bias present and then resamples the audios to a common
        sample rate.

        Args:
            audio: Audio tensor
            sample_rate: Sample rate

        Returns:
            Audio tensor after the transforms.
        """
        return self.loader.preprocess_audio(audio, sample_rate)

    def open_text(self, item: Any) -> str:
        """Opens the transcription based on the data returned by get_item(index)

        Args:
            item: The data returned by get_item.

        Returns:
            The transcription corresponding to the item.
        """
        raise NotImplementedError()

    def preprocess_text(self, text: str) -> str:
        """Add here preprocessing steps to remove some common problems in the text.

        Args:
            text: Label text

        Returns:
            Label text after processing
        """
        return text
__init__(self, items, force_mono=True, sample_rate=16000)
special
This is the base class that implements the minimal functionality to have a compatible speech dataset, in a way that can be easily customized by subclassing.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
items | Sequence | Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data. | required |
force_mono | bool | If true, convert all the loaded samples to mono. | True |
sample_rate | int | Sample rate used by the dataset. All of the samples that have different rate will be resampled. | 16000 |
Source code in thunder/data/dataset.py
def __init__(
    self, items: Sequence, force_mono: bool = True, sample_rate: int = 16000
):
    """This is the base class that implements the minimal functionality to have a compatible
    speech dataset, in a way that can be easily customized by subclassing.

    Args:
        items: Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data.
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    super().__init__()
    self.items = items
    self.loader = AudioFileLoader(force_mono, sample_rate)
all_outputs(self)
Returns a list with just the outputs (transcriptions) for the whole dataset. Useful when creating the initial vocab tokens or when training a language model.
Returns:

Type | Description |
---|---|
List[str] | All of the outputs of the dataset, with the corresponding preprocessing applied. |
Source code in thunder/data/dataset.py
def all_outputs(self) -> List[str]:
    """Return a list with just the outputs for the whole dataset.
    Useful when creating the initial vocab tokens, or to train a
    language model.

    Returns:
        All of the outputs of the dataset, with the corresponding preprocessing applied.
    """
    outputs = []
    for index in range(len(self)):
        item = self.get_item(index)
        text = self.open_text(item)
        text = self.preprocess_text(text)
        outputs.append(text)
    return outputs
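A small sketch of the vocabulary use case mentioned above. Here dataset is assumed to be an instance of any BaseSpeechDataset subclass:

# `dataset` is assumed to be an instance of a BaseSpeechDataset subclass
transcriptions = dataset.all_outputs()

# Build the initial set of character-level vocab tokens
vocab = sorted(set("".join(transcriptions)))
print(vocab)  # e.g. [' ', 'a', 'b', 'c', ...]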
get_item(self, index)
Get the item source specified by the index.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
index | int | Indicates what item it needs to return information about. | required |

Returns:

Type | Description |
---|---|
Any | Whatever data necessary to open the audio and text corresponding to this index. |
Source code in thunder/data/dataset.py
def get_item(self, index: int) -> Any:
    """Get the item source specified by the index.

    Args:
        index: Indicates what item it needs to return information about.

    Returns:
        Whatever data necessary to open the audio and text corresponding to this index.
    """
    return self.items[index]
open_audio(self, item)
Uses the data returned by get_item to open the audio
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | Any | Data returned by get_item(index) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, int] | Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate. |
Source code in thunder/data/dataset.py
def open_audio(self, item: Any) -> Tuple[Tensor, int]:
    """Uses the data returned by get_item to open the audio

    Args:
        item: Data returned by get_item(index)

    Returns:
        Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
    """
    return self.loader.open_audio(item)
open_text(self, item)
Opens the transcription based on the data returned by get_item(index)
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | Any | The data returned by get_item. | required |

Returns:

Type | Description |
---|---|
str | The transcription corresponding to the item. |
Source code in thunder/data/dataset.py
def open_text(self, item: Any) -> str:
    """Opens the transcription based on the data returned by get_item(index)

    Args:
        item: The data returned by get_item.

    Returns:
        The transcription corresponding to the item.
    """
    raise NotImplementedError()
preprocess_audio(self, audio, sample_rate)
Applies some base transforms to the audio to fix problems that would otherwise fail silently. It converts the audio to mono (depending on the class creation parameter), removes any DC bias present, and then resamples the audio to a common sample rate.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
audio | Tensor | Audio tensor | required |
sample_rate | int | Sample rate | required |

Returns:

Type | Description |
---|---|
Tensor | Audio tensor after the transforms. |
Source code in thunder/data/dataset.py
def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
    """Apply some base transforms to the audio, that fix silent problems.
    It transforms all the audios to mono (depending on class creation parameter),
    remove the possible DC bias present and then resamples the audios to a common
    sample rate.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate

    Returns:
        Audio tensor after the transforms.
    """
    return self.loader.preprocess_audio(audio, sample_rate)
preprocess_text(self, text)
Override this to add preprocessing steps that remove some common problems in the text.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
text | str | Label text | required |

Returns:

Type | Description |
---|---|
str | Label text after processing |
Source code in thunder/data/dataset.py
def preprocess_text(self, text: str) -> str:
    """Add here preprocessing steps to remove some common problems in the text.

    Args:
        text: Label text

    Returns:
        Label text after processing
    """
    return text
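To make the subclassing workflow concrete, here is a hypothetical sketch of a dataset backed by a list of (audio_path, transcription) tuples. Only open_audio and open_text are filled in; __getitem__, the audio preprocessing and all_outputs come from the base class. The paths and transcriptions are placeholders.

from typing import Any, Tuple
from torch import Tensor
from thunder.data.dataset import BaseSpeechDataset

class TupleSpeechDataset(BaseSpeechDataset):
    """Hypothetical dataset where each item is an (audio_path, transcription) tuple."""

    def open_audio(self, item: Any) -> Tuple[Tensor, int]:
        return self.loader.open_audio(item[0])

    def open_text(self, item: Any) -> str:
        return item[1]

items = [
    ("clip_0001.wav", "hello world"),          # placeholder paths and transcriptions
    ("clip_0002.wav", "speech recognition"),
]
dataset = TupleSpeechDataset(items, force_mono=True, sample_rate=16000)
audio, text = dataset[0]  # preprocessed waveform tensor and transcription string

Text cleanup (lowercasing, punctuation removal, etc.) can be added by also overriding preprocess_text.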
ManifestSpeechDataset (BaseSpeechDataset)
Source code in thunder/data/dataset.py
class ManifestSpeechDataset(BaseSpeechDataset):
    def __init__(self, file: Union[str, Path], force_mono: bool, sample_rate: int):
        """Dataset that loads from nemo manifest files.

        Args:
            file: Nemo manifest file.
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        file = Path(file)
        # Reading from the manifest file
        items = [json.loads(line) for line in file.read_text().strip().splitlines()]
        super().__init__(items, force_mono=force_mono, sample_rate=sample_rate)

    def open_audio(self, item: dict) -> Tuple[Tensor, int]:
        return self.loader.open_audio(item["audio_filepath"])

    def open_text(self, item: dict) -> str:
        return item["text"]
__init__(self, file, force_mono, sample_rate)
special
Dataset that loads from NeMo manifest files.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
file | Union[str, pathlib.Path] | Nemo manifest file. | required |
force_mono | bool | If true, convert all the loaded samples to mono. | required |
sample_rate | int | Sample rate used by the dataset. All of the samples that have different rate will be resampled. | required |
Source code in thunder/data/dataset.py
def __init__(self, file: Union[str, Path], force_mono: bool, sample_rate: int):
    """Dataset that loads from nemo manifest files.

    Args:
        file: Nemo manifest file.
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    file = Path(file)
    # Reading from the manifest file
    items = [json.loads(line) for line in file.read_text().strip().splitlines()]
    super().__init__(items, force_mono=force_mono, sample_rate=sample_rate)
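Each line of the manifest is parsed with json.loads, so the file is expected to be in JSON-lines format. The keys actually read by this class are audio_filepath and text; other fields (such as the duration field commonly present in NeMo manifests) are kept in the item dict but not used here. A sketch of two manifest lines with placeholder values:

{"audio_filepath": "/data/clips/sample_0001.wav", "duration": 3.21, "text": "hello world"}
{"audio_filepath": "/data/clips/sample_0002.wav", "duration": 1.87, "text": "thunder speech"}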
open_audio(self, item)
Uses the data returned by get_item to open the audio
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | dict | Data returned by get_item(index) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, int] | Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate. |
Source code in thunder/data/dataset.py
def open_audio(self, item: dict) -> Tuple[Tensor, int]:
    return self.loader.open_audio(item["audio_filepath"])
open_text(self, item)
Opens the transcription based on the data returned by get_item(index)
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | dict | The data returned by get_item. | required |

Returns:

Type | Description |
---|---|
str | The transcription corresponding to the item. |
Source code in thunder/data/dataset.py
def open_text(self, item: dict) -> str:
    return item["text"]
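Putting it together, a minimal usage sketch. The manifest path is a placeholder, and note that the dataset returns variable-length audio tensors, so batching with a DataLoader typically needs a padding collate function, which is not shown here.

from thunder.data.dataset import ManifestSpeechDataset

dataset = ManifestSpeechDataset(
    "train_manifest.json",  # placeholder path to a NeMo-style manifest
    force_mono=True,
    sample_rate=16000,
)

audio, text = dataset[0]           # preprocessed waveform (1, time) and transcription
all_texts = dataset.all_outputs()  # e.g. to build the initial vocab tokens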