Skip to content


Speech recognition datasets

AudioFileLoader (Module)

Source code in thunder/data/
class AudioFileLoader(nn.Module):
    """Loads audio files and applies the basic waveform preprocessing."""

    def __init__(self, force_mono: bool = True, sample_rate: int = 16000):
        """Module containing the data loading and basic preprocessing.

        It's used internally by the datasets, but can be exported so
        that during inference time there's no code dependency.

        Args:
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples
                that have a different rate will be resampled.
        """
        # Initialise the nn.Module base state; PyTorch requires the parent
        # constructor to run for the module to be usable as a Module.
        super().__init__()
        self.force_mono = force_mono
        self.sample_rate = sample_rate

    def open_audio(self, item: str) -> Tuple[Tensor, int]:
        """Uses the data returned by get_item to open the audio.

        Args:
            item: Data returned by get_item(index)

        Returns:
            Tuple containing the audio tensor with shape (channels, time),
            and the corresponding sample rate.
        """
        return torchaudio.load(item)

    def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
        """Apply some base transforms to the audio, that fix silent problems.

        It transforms all the audios to mono (depending on class creation
        parameter), removes the possible DC bias present and then resamples
        the audios to a common sample rate.

        Args:
            audio: Audio tensor
            sample_rate: Sample rate

        Returns:
            Audio tensor after the transforms.
        """
        if self.force_mono and (audio.shape[0] > 1):
            audio = audio.mean(0, keepdim=True)

        # Removing the dc component from the audio
        # It happens when a faulty capture device introduces
        # an offset into the recorded waveform, and this can
        # cause problems with later transforms.
        # keepdim=True keeps the per-channel mean as (channels, 1) so the
        # subtraction broadcasts over time even with more than one channel.
        audio = audio - audio.mean(1, keepdim=True)

        if self.sample_rate != sample_rate:
            audio = resample(
                audio, orig_freq=int(sample_rate), new_freq=int(self.sample_rate)
            )
        return audio

    def forward(self, item: str) -> Tensor:
        """Opens the audio item and does basic preprocessing.

        Args:
            item: Path to the audio to be opened

        Returns:
            Audio tensor after preprocessing
        """
        audio, sample_rate = self.open_audio(item)
        return self.preprocess_audio(audio, sample_rate)

__init__(self, force_mono=True, sample_rate=16000) special

Module containing the data loading and basic preprocessing. It's used internally by the datasets, but can be exported so that during inference time there's no code dependency.


Name Type Description Default
force_mono bool

If true, convert all the loaded samples to mono.

sample_rate int

Sample rate used by the dataset. All of the samples that have different rate will be resampled.

Source code in thunder/data/
def __init__(self, force_mono: bool = True, sample_rate: int = 16000):
    """Module containing the data loading and basic preprocessing.

    It's used internally by the datasets, but can be exported so
    that during inference time there's no code dependency.

    Args:
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples
            that have a different rate will be resampled.
    """
    # Initialise the nn.Module base state; PyTorch requires the parent
    # constructor to run for the module to be usable as a Module.
    super().__init__()
    self.force_mono = force_mono
    self.sample_rate = sample_rate

forward(self, item)

Opens the audio item and does basic preprocessing


Name Type Description Default
item str

Path to the audio to be opened



Type Description

Audio tensor after preprocessing

Source code in thunder/data/
def forward(self, item: str) -> Tensor:
    """Opens the audio item and does basic preprocessing.

    Args:
        item: Path to the audio to be opened

    Returns:
        Audio tensor after preprocessing
    """
    audio, sample_rate = self.open_audio(item)
    return self.preprocess_audio(audio, sample_rate)

open_audio(self, item)

Uses the data returned by get_item to open the audio


Name Type Description Default
item str

Data returned by get_item(index)



Type Description
Tuple[torch.Tensor, int]

Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.

Source code in thunder/data/
def open_audio(self, item: str) -> Tuple[Tensor, int]:
    """Uses the data returned by get_item to open the audio.

    Args:
        item: Data returned by get_item(index)

    Returns:
        Tuple containing the audio tensor with shape (channels, time),
        and the corresponding sample rate.
    """
    return torchaudio.load(item)

preprocess_audio(self, audio, sample_rate)

Apply some base transforms to the audio that fix problems which would otherwise go unnoticed. It converts the audio to mono (depending on the class creation parameter), removes any DC bias present, and then resamples the audio to a common sample rate.


Name Type Description Default
audio Tensor

Audio tensor

sample_rate int

Sample rate



Type Description

Audio tensor after the transforms.

Source code in thunder/data/
def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
    """Apply some base transforms to the audio, that fix silent problems.

    It transforms all the audios to mono (depending on class creation
    parameter), removes the possible DC bias present and then resamples
    the audios to a common sample rate.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate

    Returns:
        Audio tensor after the transforms.
    """
    if self.force_mono and (audio.shape[0] > 1):
        audio = audio.mean(0, keepdim=True)

    # Removing the dc component from the audio
    # It happens when a faulty capture device introduces
    # an offset into the recorded waveform, and this can
    # cause problems with later transforms.
    # keepdim=True keeps the mean over dim 1 as a column so the
    # subtraction broadcasts correctly when there are several channels.
    audio = audio - audio.mean(1, keepdim=True)

    if self.sample_rate != sample_rate:
        audio = resample(
            audio, orig_freq=int(sample_rate), new_freq=int(self.sample_rate)
        )
    return audio

BaseSpeechDataset (Dataset)

Source code in thunder/data/
class BaseSpeechDataset(Dataset):
    """Minimal speech dataset yielding (audio, text) pairs; customise by subclassing."""

    def __init__(
        self, items: Sequence, force_mono: bool = True, sample_rate: int = 16000
    ):
        """This is the base class that implements the minimal functionality to have a compatible
        speech dataset, in a way that can be easily customized by subclassing.

        Args:
            items: Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data.
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        self.items = items
        self.loader = AudioFileLoader(force_mono, sample_rate)

    def __len__(self):
        """Return the number of items in the dataset."""
        return len(self.items)

    def __getitem__(self, index: int) -> Tuple[Tensor, str]:
        """Load the preprocessed audio and text for the item at ``index``.

        Args:
            index: Position of the item to load.

        Returns:
            Tuple with the preprocessed audio tensor and the preprocessed text.
        """
        item = self.get_item(index)
        # Dealing with input
        audio, sr = self.open_audio(item)
        audio = self.preprocess_audio(audio, sr)
        # Dealing with output
        text = self.open_text(item)
        text = self.preprocess_text(text)

        return audio, text

    def all_outputs(self) -> List[str]:
        """Return a list with just the outputs for the whole dataset.

        Useful when creating the initial vocab tokens, or to train a
        language model.

        Returns:
            All of the outputs of the dataset, with the corresponding preprocessing applied.
        """
        # Go through get_item/open_text/preprocess_text so that subclass
        # overrides are honoured; the audio is never opened here.
        return [
            self.preprocess_text(self.open_text(self.get_item(index)))
            for index in range(len(self))
        ]

    def get_item(self, index: int) -> Any:
        """Get the item source specified by the index.

        Args:
            index: Indicates what item it needs to return information about.

        Returns:
            Whatever data necessary to open the audio and text corresponding to this index.
        """
        return self.items[index]

    def open_audio(self, item: Any) -> Tuple[Tensor, int]:
        """Uses the data returned by get_item to open the audio.

        Args:
            item: Data returned by get_item(index)

        Returns:
            Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
        """
        return self.loader.open_audio(item)

    def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
        """Apply some base transforms to the audio, that fix silent problems.

        It transforms all the audios to mono (depending on class creation parameter),
        removes the possible DC bias present and then resamples the audios to a common
        sample rate.

        Args:
            audio: Audio tensor
            sample_rate: Sample rate

        Returns:
            Audio tensor after the transforms.
        """
        return self.loader.preprocess_audio(audio, sample_rate)

    def open_text(self, item: Any) -> str:
        """Opens the transcription based on the data returned by get_item(index).

        Args:
            item: The data returned by get_item.

        Returns:
            The transcription corresponding to the item.

        Raises:
            NotImplementedError: Always, in this base class; subclasses must override it.
        """
        raise NotImplementedError()

    def preprocess_text(self, text: str) -> str:
        """Add here preprocessing steps to remove some common problems in the text.

        Args:
            text: Label text

        Returns:
            Label text after processing
        """
        return text

__init__(self, items, force_mono=True, sample_rate=16000) special

This is the base class that implements the minimal functionality to have a compatible speech dataset, in a way that can be easily customized by subclassing.


Name Type Description Default
items Sequence

Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data.

force_mono bool

If true, convert all the loaded samples to mono.

sample_rate int

Sample rate used by the dataset. All of the samples that have different rate will be resampled.

Source code in thunder/data/
def __init__(
    self, items: Sequence, force_mono: bool = True, sample_rate: int = 16000
):
    """This is the base class that implements the minimal functionality to have a compatible
    speech dataset, in a way that can be easily customized by subclassing.

    Args:
        items: Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data.
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    self.items = items
    self.loader = AudioFileLoader(force_mono, sample_rate)


all_outputs(self)

Return a list with just the outputs for the whole dataset. Useful when creating the initial vocab tokens, or to train a language model.


Type Description

All of the outputs of the dataset, with the corresponding preprocessing applied.

Source code in thunder/data/
def all_outputs(self) -> List[str]:
    """Return a list with just the outputs for the whole dataset.

    Useful when creating the initial vocab tokens, or to train a
    language model.

    Returns:
        All of the outputs of the dataset, with the corresponding preprocessing applied.
    """
    # Go through get_item/open_text/preprocess_text so that overridden
    # versions of those hooks are honoured; the audio is never opened here.
    return [
        self.preprocess_text(self.open_text(self.get_item(index)))
        for index in range(len(self))
    ]

get_item(self, index)

Get the item source specified by the index.


Name Type Description Default
index int

Indicates what item it needs to return information about.



Type Description

Whatever data necessary to open the audio and text corresponding to this index.

Source code in thunder/data/
def get_item(self, index: int) -> Any:
    """Get the item source specified by the index.

    Args:
        index: Indicates what item it needs to return information about.

    Returns:
        Whatever data necessary to open the audio and text corresponding to this index.
    """
    return self.items[index]

open_audio(self, item)

Uses the data returned by get_item to open the audio


Name Type Description Default
item Any

Data returned by get_item(index)



Type Description
Tuple[torch.Tensor, int]

Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.

Source code in thunder/data/
def open_audio(self, item: Any) -> Tuple[Tensor, int]:
    """Uses the data returned by get_item to open the audio.

    Args:
        item: Data returned by get_item(index)

    Returns:
        Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
    """
    return self.loader.open_audio(item)

open_text(self, item)

Opens the transcription based on the data returned by get_item(index)


Name Type Description Default
item Any

The data returned by get_item.



Type Description

The transcription corresponding to the item.

Source code in thunder/data/
def open_text(self, item: Any) -> str:
    """Opens the transcription based on the data returned by get_item(index).

    Args:
        item: The data returned by get_item.

    Returns:
        The transcription corresponding to the item.

    Raises:
        NotImplementedError: Always; this version has no implementation.
    """
    raise NotImplementedError()

preprocess_audio(self, audio, sample_rate)

Apply some base transforms to the audio that fix problems which would otherwise go unnoticed. It converts the audio to mono (depending on the class creation parameter), removes any DC bias present, and then resamples the audio to a common sample rate.


Name Type Description Default
audio Tensor

Audio tensor

sample_rate int

Sample rate



Type Description

Audio tensor after the transforms.

Source code in thunder/data/
def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
    """Apply some base transforms to the audio, that fix silent problems.

    It transforms all the audios to mono (depending on class creation parameter),
    removes the possible DC bias present and then resamples the audios to a common
    sample rate. The work is delegated to ``self.loader.preprocess_audio``.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate

    Returns:
        Audio tensor after the transforms.
    """
    return self.loader.preprocess_audio(audio, sample_rate)

preprocess_text(self, text)

Add here preprocessing steps to remove some common problems in the text.


Name Type Description Default
text str

Label text



Type Description

Label text after processing

Source code in thunder/data/
def preprocess_text(self, text: str) -> str:
    """Add here preprocessing steps to remove some common problems in the text.

    This version returns the text unchanged.

    Args:
        text: Label text

    Returns:
        Label text after processing
    """
    return text

ManifestSpeechDataset (BaseSpeechDataset)

Source code in thunder/data/
class ManifestSpeechDataset(BaseSpeechDataset):
    """Speech dataset whose items are the JSON records of a nemo manifest file."""

    def __init__(self, file: Union[str, Path], force_mono: bool, sample_rate: int):
        """Dataset that loads from nemo manifest files.

        Args:
            file: Nemo manifest file.
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        file = Path(file)
        # Reading from the manifest file: one JSON object per line.
        # Blank lines are skipped so a stray empty line in the middle of
        # the file does not make json.loads fail.
        items = [
            json.loads(line)
            for line in file.read_text().splitlines()
            if line.strip()
        ]
        super().__init__(items, force_mono=force_mono, sample_rate=sample_rate)

    def open_audio(self, item: dict) -> Tuple[Tensor, int]:
        """Open the audio file referenced by the item's "audio_filepath" key.

        Args:
            item: Data returned by get_item(index)

        Returns:
            Whatever ``self.loader.open_audio`` returns for that path.
        """
        return self.loader.open_audio(item["audio_filepath"])

    def open_text(self, item: dict) -> str:
        """Return the transcription stored under the item's "text" key.

        Args:
            item: The data returned by get_item.

        Returns:
            The transcription corresponding to the item.
        """
        return item["text"]

__init__(self, file, force_mono, sample_rate) special

Dataset that loads from nemo manifest files.


Name Type Description Default
file Union[str, pathlib.Path]

Nemo manifest file.

force_mono bool

If true, convert all the loaded samples to mono.

sample_rate int

Sample rate used by the dataset. All of the samples that have different rate will be resampled.

Source code in thunder/data/
def __init__(self, file: Union[str, Path], force_mono: bool, sample_rate: int):
    """Dataset that loads from nemo manifest files.

    Args:
        file: Nemo manifest file.
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    file = Path(file)
    # Reading from the manifest file: one JSON object per line.
    # Blank lines are skipped so a stray empty line in the middle of
    # the file does not make json.loads fail.
    items = [
        json.loads(line)
        for line in file.read_text().splitlines()
        if line.strip()
    ]
    super().__init__(items, force_mono=force_mono, sample_rate=sample_rate)

open_audio(self, item)

Uses the data returned by get_item to open the audio


Name Type Description Default
item dict

Data returned by get_item(index)



Type Description
Tuple[torch.Tensor, int]

Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.

Source code in thunder/data/
def open_audio(self, item: dict) -> Tuple[Tensor, int]:
    """Open the audio referenced by the item's "audio_filepath" key.

    Args:
        item: Data returned by get_item(index)

    Returns:
        Whatever ``self.loader.open_audio`` returns for that path.
    """
    audio_path = item["audio_filepath"]
    return self.loader.open_audio(audio_path)

open_text(self, item)

Opens the transcription based on the data returned by get_item(index)


Name Type Description Default
item dict

The data returned by get_item.



Type Description

The transcription corresponding to the item.

Source code in thunder/data/
def open_text(self, item: dict) -> str:
    """Return the transcription stored under the item's "text" key.

    Args:
        item: The data returned by get_item.

    Returns:
        The transcription corresponding to the item.
    """
    transcription = item["text"]
    return transcription