Vocab
Classes that represent the vocabulary used by the model.
Vocabulary (Module)
Source code in thunder/text_processing/vocab.py
class Vocabulary(nn.Module):
    """Vocabulary used by the model.

    Maps tokens to numeric indices (``numericalize``) and back
    (``decode_into_text``), and manages the special tokens used for the
    ctc blank, padding, unknown elements, and sequence start/end markers.
    """

    def __init__(
        self,
        tokens: List[str],
        blank_token: str = "<blank>",
        pad_token: Optional[str] = None,
        unknown_token: Optional[str] = None,
        start_token: Optional[str] = None,
        end_token: Optional[str] = None,
    ):
        """Class that represents a vocabulary, with the related methods
        to numericalize a sequence of tokens into numbers, and do the
        reverse mapping of numbers back to tokens.

        Args:
            tokens: Basic list of tokens that will be part of the vocabulary. Check [`docs`](https://scart97.github.io/thunder-speech/quick%20reference%20guide/#how-to-get-the-tokens-from-my-dataset)
            blank_token: Token that will represent the ctc blank.
            pad_token: Token that will represent padding, might also act as the ctc blank.
            unknown_token: Token that will represent unknown elements. Notice that this is different than the blank used by ctc.
            start_token: Token that will represent the beginning of the sequence.
            end_token: Token that will represent the end of the sequence.
        """
        super().__init__()
        self.unknown_token = unknown_token
        self.start_token = start_token
        self.end_token = end_token
        self.blank_token = blank_token
        # When no explicit pad token is given, the ctc blank doubles as padding.
        self.pad_token = pad_token or blank_token
        # Fix: copy the incoming list so a caller that mutates `tokens`
        # afterwards cannot desynchronize `itos` from the `stoi` mapping
        # built below (the original stored the caller's list by reference).
        self.itos = list(tokens)
        self._maybe_add_token(blank_token)
        self._maybe_add_token(pad_token)
        self._maybe_add_token(unknown_token)
        self._maybe_add_token(start_token)
        self._maybe_add_token(end_token)
        self.stoi = {token: i for i, token in enumerate(self.itos)}
        self.blank_idx = self.itos.index(self.blank_token)
        self.pad_idx = self.itos.index(self.pad_token)
        # -1 is a sentinel meaning "no unknown token configured".
        self._unk_idx = -1
        if self.unknown_token is not None:
            self._unk_idx = self.itos.index(self.unknown_token)

    def _maybe_add_token(self, token: Optional[str]):
        # Only adds tokens if they are not optional (None or empty)
        # and are not included in the vocabulary already
        if token and (token not in self.itos):
            self.itos = self.itos + [token]

    def numericalize(self, tokens: List[str]) -> torch.Tensor:
        """Function to transform a list of tokens into the corresponding numeric representation.

        Args:
            tokens: A single list of tokens to be transformed

        Returns:
            The corresponding numeric representation
        """
        if self.unknown_token is None:
            # When there's no unknown token we filter out all of the tokens
            # not in the vocab. Fix: membership against the `stoi` dict is
            # O(1) per token instead of an O(len(itos)) list scan, and its
            # keys are exactly the elements of `itos`.
            tokens = [t for t in tokens if t in self.stoi]
        return torch.tensor(
            [self.stoi.get(it, self._unk_idx) for it in tokens], dtype=torch.long
        )

    @torch.jit.export
    def decode_into_text(self, indices: torch.Tensor) -> List[str]:
        """Function to transform back a list of numbers into the corresponding
        tokens.

        Args:
            indices: Numeric representation. Usually is the result of the model, after a greedy decoding

        Returns:
            Corresponding tokens
        """
        return [self.itos[it] for it in indices]

    def add_special_tokens(self, tokens: List[str]) -> List[str]:
        """Function to add the special start and end tokens to some
        tokenized text.

        Args:
            tokens: Tokenized text

        Returns:
            Text with the special tokens added.
        """
        if self.start_token is not None:
            tokens = [self.start_token] + tokens
        if self.end_token is not None:
            tokens = tokens + [self.end_token]
        return tokens

    @torch.jit.export
    def remove_special_tokens(self, text: str) -> str:
        """Function to remove the special tokens from the prediction.

        Args:
            text: Decoded text

        Returns:
            Text with the special tokens removed.
        """
        text = text.replace(self.blank_token, "")
        text = text.replace(self.pad_token, "")
        if self.start_token is not None:
            text = text.replace(self.start_token, "")
        if self.end_token is not None:
            text = text.replace(self.end_token, "")
        return text
__init__(self, tokens, blank_token='<blank>', pad_token=None, unknown_token=None, start_token=None, end_token=None)
special
Class that represents a vocabulary, with the related methods to numericalize a sequence of tokens into numbers, and do the reverse mapping of numbers back to tokens.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tokens |
List[str] |
Basic list of tokens that will be part of the vocabulary. Check the [docs](https://scart97.github.io/thunder-speech/quick%20reference%20guide/#how-to-get-the-tokens-from-my-dataset) on how to get the tokens from your dataset. |
required |
blank_token |
str |
Token that will represent the ctc blank. |
'<blank>' |
pad_token |
Optional[str] |
Token that will represent padding, might also act as the ctc blank. |
None |
unknown_token |
Optional[str] |
Token that will represent unknown elements. Notice that this is different than the blank used by ctc. |
None |
start_token |
Optional[str] |
Token that will represent the beginning of the sequence. |
None |
end_token |
Optional[str] |
Token that will represent the end of the sequence. |
None |
Source code in thunder/text_processing/vocab.py
def __init__(
    self,
    tokens: List[str],
    blank_token: str = "<blank>",
    pad_token: Optional[str] = None,
    unknown_token: Optional[str] = None,
    start_token: Optional[str] = None,
    end_token: Optional[str] = None,
):
    """Build the vocabulary and the token <-> index mappings.

    Args:
        tokens: Base tokens that form the vocabulary. Check [`docs`](https://scart97.github.io/thunder-speech/quick%20reference%20guide/#how-to-get-the-tokens-from-my-dataset)
        blank_token: Token representing the ctc blank.
        pad_token: Token representing padding; falls back to the ctc blank.
        unknown_token: Token representing out-of-vocabulary elements (distinct from the ctc blank).
        start_token: Token marking the beginning of a sequence.
        end_token: Token marking the end of a sequence.
    """
    super().__init__()
    self.unknown_token = unknown_token
    self.start_token = start_token
    self.end_token = end_token
    self.blank_token = blank_token
    # Reuse the blank as padding when no dedicated pad token was provided.
    self.pad_token = pad_token if pad_token else blank_token
    self.itos = tokens
    # Append every configured special token that is not already present.
    for special in (blank_token, pad_token, unknown_token, start_token, end_token):
        self._maybe_add_token(special)
    self.stoi = {}
    for index, token in enumerate(self.itos):
        self.stoi[token] = index
    self.blank_idx = self.itos.index(self.blank_token)
    self.pad_idx = self.itos.index(self.pad_token)
    # -1 marks the absence of an unknown token.
    self._unk_idx = (
        self.itos.index(self.unknown_token) if self.unknown_token is not None else -1
    )
add_special_tokens(self, tokens)
Function to add the special start and end tokens to some tokenized text.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tokens |
List[str] |
Tokenized text |
required |
Returns:
Type | Description |
---|---|
List[str] |
Text with the special tokens added. |
Source code in thunder/text_processing/vocab.py
def add_special_tokens(self, tokens: List[str]) -> List[str]:
    """Wrap tokenized text with the configured start/end markers.

    Args:
        tokens: Tokenized text

    Returns:
        The token list with any configured start/end tokens added.
    """
    if self.start_token is not None:
        tokens = [self.start_token, *tokens]
    if self.end_token is not None:
        tokens = [*tokens, self.end_token]
    return tokens
decode_into_text(self, indices)
Function to transform back a list of numbers into the corresponding tokens.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
indices |
Tensor |
Numeric representation. Usually is the result of the model, after a greedy decoding |
required |
Returns:
Type | Description |
---|---|
List[str] |
Corresponding tokens |
Source code in thunder/text_processing/vocab.py
@torch.jit.export
def decode_into_text(self, indices: torch.Tensor) -> List[str]:
    """Map a tensor of numeric indices back to their tokens.

    Args:
        indices: Numeric representation, usually the model output after greedy decoding

    Returns:
        The corresponding tokens
    """
    decoded: List[str] = []
    for idx in indices:
        decoded.append(self.itos[idx])
    return decoded
numericalize(self, tokens)
Function to transform a list of tokens into the corresponding numeric representation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tokens |
List[str] |
A single list of tokens to be transformed |
required |
Returns:
Type | Description |
---|---|
Tensor |
The corresponding numeric representation |
Source code in thunder/text_processing/vocab.py
def numericalize(self, tokens: List[str]) -> torch.Tensor:
    """Convert a list of tokens into their numeric representation.

    Args:
        tokens: A single list of tokens to be transformed

    Returns:
        A long tensor with one index per (kept) token
    """
    if self.unknown_token is None:
        # No unknown token configured: silently drop anything
        # that is outside of the vocabulary.
        kept = []
        for tok in tokens:
            if tok in self.itos:
                kept.append(tok)
        tokens = kept
    indices = [self.stoi.get(tok, self._unk_idx) for tok in tokens]
    return torch.tensor(indices, dtype=torch.long)
remove_special_tokens(self, text)
Function to remove the special tokens from the prediction.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text |
str |
Decoded text |
required |
Returns:
Type | Description |
---|---|
str |
Text with the special tokens removed. |
Source code in thunder/text_processing/vocab.py
@torch.jit.export
def remove_special_tokens(self, text: str) -> str:
    """Strip the special tokens out of a prediction.

    Args:
        text: Decoded text

    Returns:
        The text with blank, pad, start and end tokens removed
    """
    cleaned = text.replace(self.blank_token, "").replace(self.pad_token, "")
    if self.start_token is not None:
        cleaned = cleaned.replace(self.start_token, "")
    if self.end_token is not None:
        cleaned = cleaned.replace(self.end_token, "")
    return cleaned