Dataset
Speech recognition datasets
AudioFileLoader (Module)
Source code in thunder/data/dataset.py
class AudioFileLoader(nn.Module):
    def __init__(self, force_mono: bool = True, sample_rate: int = 16000):
        """Module containing the data loading and basic preprocessing.
        It's used internally by the datasets, but can be exported so
        that during inference time there's no code dependency.

        Args:
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        super().__init__()
        self.force_mono = force_mono
        self.sample_rate = sample_rate

    @torch.jit.export
    def open_audio(self, item: str) -> Tuple[Tensor, int]:
        """Uses the data returned by get_item to open the audio

        Args:
            item: Data returned by get_item(index)

        Returns:
            Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
        """
        return torchaudio.load(item)

    @torch.jit.export
    def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
        """Apply some base transforms to the audio, that fix silent problems.
        It transforms all the audios to mono (depending on class creation parameter),
        remove the possible DC bias present and then resamples the audios to a common
        sample rate.

        Args:
            audio: Audio tensor
            sample_rate: Sample rate

        Returns:
            Audio tensor after the transforms.
        """
        if self.force_mono and (audio.shape[0] > 1):
            audio = audio.mean(0, keepdim=True)
        # Removing the dc component from the audio
        # It happens when a faulty capture device introduce
        # an offset into the recorded waveform, and this can
        # cause problems with later transforms.
        # https://en.wikipedia.org/wiki/DC_bias
        audio = audio - audio.mean(1)
        if self.sample_rate != sample_rate:
            audio = resample(
                audio, orig_freq=int(sample_rate), new_freq=int(self.sample_rate)
            )
        return audio

    def forward(self, item: str) -> Tensor:
        """Opens audio item and do basic preprocessing

        Args:
            item: Path to the audio to be opened

        Returns:
            Audio tensor after preprocessing
        """
        audio, sample_rate = self.open_audio(item)
        return self.preprocess_audio(audio, sample_rate)
__init__(self, force_mono=True, sample_rate=16000)
special
Module containing the data loading and basic preprocessing. It's used internally by the datasets, but can be exported so that during inference time there's no code dependency.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
force_mono | bool | If true, convert all the loaded samples to mono. | True |
sample_rate | int | Sample rate used by the dataset. All of the samples that have different rate will be resampled. | 16000 |
Source code in thunder/data/dataset.py
def __init__(self, force_mono: bool = True, sample_rate: int = 16000):
    """Module containing the data loading and basic preprocessing.
    It's used internally by the datasets, but can be exported so
    that during inference time there's no code dependency.

    Args:
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    super().__init__()
    self.force_mono = force_mono
    self.sample_rate = sample_rate
forward(self, item)
Opens the audio item and does basic preprocessing.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | str | Path to the audio to be opened | required |

Returns:

Type | Description |
---|---|
Tensor | Audio tensor after preprocessing |
Source code in thunder/data/dataset.py
def forward(self, item: str) -> Tensor:
    """Opens audio item and do basic preprocessing

    Args:
        item: Path to the audio to be opened

    Returns:
        Audio tensor after preprocessing
    """
    audio, sample_rate = self.open_audio(item)
    return self.preprocess_audio(audio, sample_rate)
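A minimal usage sketch: calling the module directly runs open_audio followed by preprocess_audio. The file path below is a placeholder; any format supported by torchaudio should work.

from thunder.data.dataset import AudioFileLoader

loader = AudioFileLoader(force_mono=True, sample_rate=16000)
# "example.wav" is a placeholder path
audio = loader("example.wav")
print(audio.shape)  # (1, time): mono, resampled to 16000 Hz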
open_audio(self, item)
Uses the data returned by get_item to open the audio
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | str | Data returned by get_item(index) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, int] | Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate. |
Source code in thunder/data/dataset.py
@torch.jit.export
def open_audio(self, item: str) -> Tuple[Tensor, int]:
    """Uses the data returned by get_item to open the audio

    Args:
        item: Data returned by get_item(index)

    Returns:
        Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
    """
    return torchaudio.load(item)
preprocess_audio(self, audio, sample_rate)
Applies some base transforms to the audio to fix problems that would otherwise fail silently. It converts the audio to mono (depending on the class creation parameter), removes any DC bias present, and then resamples the audio to a common sample rate.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
audio | Tensor | Audio tensor | required |
sample_rate | int | Sample rate | required |

Returns:

Type | Description |
---|---|
Tensor | Audio tensor after the transforms. |
Source code in thunder/data/dataset.py
@torch.jit.export
def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
    """Apply some base transforms to the audio, that fix silent problems.
    It transforms all the audios to mono (depending on class creation parameter),
    remove the possible DC bias present and then resamples the audios to a common
    sample rate.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate

    Returns:
        Audio tensor after the transforms.
    """
    if self.force_mono and (audio.shape[0] > 1):
        audio = audio.mean(0, keepdim=True)
    # Removing the dc component from the audio
    # It happens when a faulty capture device introduce
    # an offset into the recorded waveform, and this can
    # cause problems with later transforms.
    # https://en.wikipedia.org/wiki/DC_bias
    audio = audio - audio.mean(1)
    if self.sample_rate != sample_rate:
        audio = resample(
            audio, orig_freq=int(sample_rate), new_freq=int(self.sample_rate)
        )
    return audio
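The sketch below illustrates the effect of preprocess_audio on a synthetic stereo clip recorded at 44.1 kHz with a constant DC offset, and then exports the loader with TorchScript, which is what the "no code dependency" remark in the class docstring refers to. The waveform is synthetic and the output file name is a placeholder; scripting assumes the installed torchaudio version supports TorchScript for its load/resample functions.

import torch
from thunder.data.dataset import AudioFileLoader

loader = AudioFileLoader(force_mono=True, sample_rate=16000)

# Synthetic 1-second stereo signal at 44.1 kHz with a constant DC offset of 0.5
waveform = torch.sin(torch.linspace(0, 100, 44100)).repeat(2, 1) + 0.5
processed = loader.preprocess_audio(waveform, sample_rate=44100)

print(processed.shape)         # torch.Size([1, 16000]): mono, resampled to 16 kHz
print(processed.mean().abs())  # close to zero after DC bias removal

# Export for inference without depending on the thunder codebase
scripted = torch.jit.script(loader)
scripted.save("audio_loader.pt")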
BaseSpeechDataset (Dataset)
Source code in thunder/data/dataset.py
class BaseSpeechDataset(Dataset):
    def __init__(
        self, items: Sequence, force_mono: bool = True, sample_rate: int = 16000
    ):
        """This is the base class that implements the minimal functionality to have a compatible
        speech dataset, in a way that can be easily customized by subclassing.

        Args:
            items: Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data.
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        super().__init__()
        self.items = items
        self.loader = AudioFileLoader(force_mono, sample_rate)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index: int) -> Tuple[Tensor, str]:
        item = self.get_item(index)
        # Dealing with input
        audio, sr = self.open_audio(item)
        audio = self.preprocess_audio(audio, sr)
        # Dealing with output
        text = self.open_text(item)
        text = self.preprocess_text(text)
        return audio, text

    def all_outputs(self) -> List[str]:
        """Return a list with just the outputs for the whole dataset.
        Useful when creating the initial vocab tokens, or to train a
        language model.

        Returns:
            All of the outputs of the dataset, with the corresponding preprocessing applied.
        """
        outputs = []
        for index in range(len(self)):
            item = self.get_item(index)
            text = self.open_text(item)
            text = self.preprocess_text(text)
            outputs.append(text)
        return outputs

    def get_item(self, index: int) -> Any:
        """Get the item source specified by the index.

        Args:
            index: Indicates what item it needs to return information about.

        Returns:
            Whatever data necessary to open the audio and text corresponding to this index.
        """
        return self.items[index]

    def open_audio(self, item: Any) -> Tuple[Tensor, int]:
        """Uses the data returned by get_item to open the audio

        Args:
            item: Data returned by get_item(index)

        Returns:
            Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
        """
        return self.loader.open_audio(item)

    def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
        """Apply some base transforms to the audio, that fix silent problems.
        It transforms all the audios to mono (depending on class creation parameter),
        remove the possible DC bias present and then resamples the audios to a common
        sample rate.

        Args:
            audio: Audio tensor
            sample_rate: Sample rate

        Returns:
            Audio tensor after the transforms.
        """
        return self.loader.preprocess_audio(audio, sample_rate)

    def open_text(self, item: Any) -> str:
        """Opens the transcription based on the data returned by get_item(index)

        Args:
            item: The data returned by get_item.

        Returns:
            The transcription corresponding to the item.
        """
        raise NotImplementedError()

    def preprocess_text(self, text: str) -> str:
        """Add here preprocessing steps to remove some common problems in the text.

        Args:
            text: Label text

        Returns:
            Label text after processing
        """
        return text
__init__(self, items, force_mono=True, sample_rate=16000)
special
This is the base class that implements the minimal functionality to have a compatible speech dataset, in a way that can be easily customized by subclassing.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
items | Sequence | Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data. | required |
force_mono | bool | If true, convert all the loaded samples to mono. | True |
sample_rate | int | Sample rate used by the dataset. All of the samples that have different rate will be resampled. | 16000 |
Source code in thunder/data/dataset.py
def __init__(
    self, items: Sequence, force_mono: bool = True, sample_rate: int = 16000
):
    """This is the base class that implements the minimal functionality to have a compatible
    speech dataset, in a way that can be easily customized by subclassing.

    Args:
        items: Source of items in the dataset, sorted by audio duration. This can be a list of files, a pandas dataframe or any other iterable structure where you record your data.
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    super().__init__()
    self.items = items
    self.loader = AudioFileLoader(force_mono, sample_rate)
all_outputs(self)
Returns a list with just the outputs (transcriptions) for the whole dataset. Useful when creating the initial vocab tokens or when training a language model.
Returns:

Type | Description |
---|---|
List[str] | All of the outputs of the dataset, with the corresponding preprocessing applied. |
Source code in thunder/data/dataset.py
def all_outputs(self) -> List[str]:
    """Return a list with just the outputs for the whole dataset.
    Useful when creating the initial vocab tokens, or to train a
    language model.

    Returns:
        All of the outputs of the dataset, with the corresponding preprocessing applied.
    """
    outputs = []
    for index in range(len(self)):
        item = self.get_item(index)
        text = self.open_text(item)
        text = self.preprocess_text(text)
        outputs.append(text)
    return outputs
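A small sketch of the vocabulary use case mentioned above. Here dataset is assumed to be an instance of any BaseSpeechDataset subclass:

# `dataset` is assumed to be an instance of a BaseSpeechDataset subclass
transcriptions = dataset.all_outputs()

# Build the initial set of character-level vocab tokens
vocab = sorted(set("".join(transcriptions)))
print(vocab)  # e.g. [' ', 'a', 'b', 'c', ...]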
get_item(self, index)
Get the item source specified by the index.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
index | int | Indicates what item it needs to return information about. | required |

Returns:

Type | Description |
---|---|
Any | Whatever data necessary to open the audio and text corresponding to this index. |
Source code in thunder/data/dataset.py
def get_item(self, index: int) -> Any:
    """Get the item source specified by the index.

    Args:
        index: Indicates what item it needs to return information about.

    Returns:
        Whatever data necessary to open the audio and text corresponding to this index.
    """
    return self.items[index]
open_audio(self, item)
Uses the data returned by get_item to open the audio
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | Any | Data returned by get_item(index) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, int] | Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate. |
Source code in thunder/data/dataset.py
def open_audio(self, item: Any) -> Tuple[Tensor, int]:
    """Uses the data returned by get_item to open the audio

    Args:
        item: Data returned by get_item(index)

    Returns:
        Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate.
    """
    return self.loader.open_audio(item)
open_text(self, item)
Opens the transcription based on the data returned by get_item(index)
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | Any | The data returned by get_item. | required |

Returns:

Type | Description |
---|---|
str | The transcription corresponding to the item. |
Source code in thunder/data/dataset.py
def open_text(self, item: Any) -> str:
    """Opens the transcription based on the data returned by get_item(index)

    Args:
        item: The data returned by get_item.

    Returns:
        The transcription corresponding to the item.
    """
    raise NotImplementedError()
preprocess_audio(self, audio, sample_rate)
Applies some base transforms to the audio to fix problems that would otherwise fail silently. It converts the audio to mono (depending on the class creation parameter), removes any DC bias present, and then resamples the audio to a common sample rate.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
audio | Tensor | Audio tensor | required |
sample_rate | int | Sample rate | required |

Returns:

Type | Description |
---|---|
Tensor | Audio tensor after the transforms. |
Source code in thunder/data/dataset.py
def preprocess_audio(self, audio: Tensor, sample_rate: int) -> Tensor:
    """Apply some base transforms to the audio, that fix silent problems.
    It transforms all the audios to mono (depending on class creation parameter),
    remove the possible DC bias present and then resamples the audios to a common
    sample rate.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate

    Returns:
        Audio tensor after the transforms.
    """
    return self.loader.preprocess_audio(audio, sample_rate)
preprocess_text(self, text)
Override this to add preprocessing steps that remove some common problems in the text.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
text | str | Label text | required |

Returns:

Type | Description |
---|---|
str | Label text after processing |
Source code in thunder/data/dataset.py
def preprocess_text(self, text: str) -> str:
    """Add here preprocessing steps to remove some common problems in the text.

    Args:
        text: Label text

    Returns:
        Label text after processing
    """
    return text
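To make the subclassing workflow concrete, here is a hypothetical sketch of a dataset backed by a list of (audio_path, transcription) tuples. Only open_audio and open_text are filled in; __getitem__, the audio preprocessing and all_outputs come from the base class. The paths and transcriptions are placeholders.

from typing import Any, Tuple
from torch import Tensor
from thunder.data.dataset import BaseSpeechDataset

class TupleSpeechDataset(BaseSpeechDataset):
    """Hypothetical dataset where each item is an (audio_path, transcription) tuple."""

    def open_audio(self, item: Any) -> Tuple[Tensor, int]:
        return self.loader.open_audio(item[0])

    def open_text(self, item: Any) -> str:
        return item[1]

items = [
    ("clip_0001.wav", "hello world"),          # placeholder paths and transcriptions
    ("clip_0002.wav", "speech recognition"),
]
dataset = TupleSpeechDataset(items, force_mono=True, sample_rate=16000)
audio, text = dataset[0]  # preprocessed waveform tensor and transcription string

Text cleanup (lowercasing, punctuation removal, etc.) can be added by also overriding preprocess_text.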
ManifestSpeechDataset (BaseSpeechDataset)
Source code in thunder/data/dataset.py
class ManifestSpeechDataset(BaseSpeechDataset):
    def __init__(self, file: Union[str, Path], force_mono: bool, sample_rate: int):
        """Dataset that loads from nemo manifest files.

        Args:
            file: Nemo manifest file.
            force_mono: If true, convert all the loaded samples to mono.
            sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
        """
        file = Path(file)
        # Reading from the manifest file
        items = [json.loads(line) for line in file.read_text().strip().splitlines()]
        super().__init__(items, force_mono=force_mono, sample_rate=sample_rate)

    def open_audio(self, item: dict) -> Tuple[Tensor, int]:
        return self.loader.open_audio(item["audio_filepath"])

    def open_text(self, item: dict) -> str:
        return item["text"]
__init__(self, file, force_mono, sample_rate)
special
Dataset that loads from NeMo manifest files.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
file | Union[str, pathlib.Path] | Nemo manifest file. | required |
force_mono | bool | If true, convert all the loaded samples to mono. | required |
sample_rate | int | Sample rate used by the dataset. All of the samples that have different rate will be resampled. | required |
Source code in thunder/data/dataset.py
def __init__(self, file: Union[str, Path], force_mono: bool, sample_rate: int):
    """Dataset that loads from nemo manifest files.

    Args:
        file: Nemo manifest file.
        force_mono: If true, convert all the loaded samples to mono.
        sample_rate: Sample rate used by the dataset. All of the samples that have different rate will be resampled.
    """
    file = Path(file)
    # Reading from the manifest file
    items = [json.loads(line) for line in file.read_text().strip().splitlines()]
    super().__init__(items, force_mono=force_mono, sample_rate=sample_rate)
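Each line of the manifest is parsed with json.loads, so the file is expected to be in JSON-lines format. The keys actually read by this class are audio_filepath and text; other fields (such as the duration field commonly present in NeMo manifests) are kept in the item dict but not used here. A sketch of two manifest lines with placeholder values:

{"audio_filepath": "/data/clips/sample_0001.wav", "duration": 3.21, "text": "hello world"}
{"audio_filepath": "/data/clips/sample_0002.wav", "duration": 1.87, "text": "thunder speech"}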
open_audio(self, item)
Uses the data returned by get_item to open the audio
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | dict | Data returned by get_item(index) | required |

Returns:

Type | Description |
---|---|
Tuple[torch.Tensor, int] | Tuple containing the audio tensor with shape (channels, time), and the corresponding sample rate. |
Source code in thunder/data/dataset.py
def open_audio(self, item: dict) -> Tuple[Tensor, int]:
    return self.loader.open_audio(item["audio_filepath"])
open_text(self, item)
Opens the transcription based on the data returned by get_item(index)
Parameters:

Name | Type | Description | Default |
---|---|---|---|
item | dict | The data returned by get_item. | required |

Returns:

Type | Description |
---|---|
str | The transcription corresponding to the item. |
Source code in thunder/data/dataset.py
def open_text(self, item: dict) -> str:
    return item["text"]
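Putting it together, a minimal usage sketch. The manifest path is a placeholder, and note that the dataset returns variable-length audio tensors, so batching with a DataLoader typically needs a padding collate function, which is not shown here.

from thunder.data.dataset import ManifestSpeechDataset

dataset = ManifestSpeechDataset(
    "train_manifest.json",  # placeholder path to a NeMo-style manifest
    force_mono=True,
    sample_rate=16000,
)

audio, text = dataset[0]           # preprocessed waveform (1, time) and transcription
all_texts = dataset.all_outputs()  # e.g. to build the initial vocab tokens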