Tokenize
Text tokenization, including character, word, and sentencepiece tokenizers.
char_tokenizer(text)
Tokenize input text by splitting it into characters
Parameters:

Name | Type | Description | Default |
---|---|---|---|
text | str | Input text | required |

Returns:

Type | Description |
---|---|
List[str] | Tokenized text |
Source code in thunder/text_processing/tokenizer.py
def char_tokenizer(text: str) -> List[str]:
    """Tokenize input text splitting into characters

    Args:
        text: Input text

    Returns:
        Tokenized text
    """
    return list(text)
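Example usage (a minimal sketch, assuming the module import path shown above):

from thunder.text_processing.tokenizer import char_tokenizer

tokens = char_tokenizer("hello world")
print(tokens)  # ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'] -- the space is also a token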
get_most_frequent_tokens(corpus, tokenize_function, minimum_frequency=1, max_number_of_tokens=None)
Helper function to get the most frequent tokens from a text corpus.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
corpus | str | Text corpus to be used. This is a long string containing all of your text. | required |
tokenize_function | Callable | The same tokenizer function that will be used during training | required |
minimum_frequency | int | Remove any token whose frequency is lower than this value | 1 |
max_number_of_tokens | Optional[int] | Optionally limit the output to the K most frequent tokens | None |

Returns:

Type | Description |
---|---|
List[str] | All of the unique, most frequent tokens, ordered by frequency |
Source code in thunder/text_processing/tokenizer.py
def get_most_frequent_tokens(
    corpus: str,
    tokenize_function: Callable,
    minimum_frequency: int = 1,
    max_number_of_tokens: Optional[int] = None,
) -> List[str]:
    """Helper function to get the most frequent tokens from a text corpus.

    Args:
        corpus: Text corpus to be used, this is a long string containing all of your text
        tokenize_function: Same tokenizer function that will be used during training
        minimum_frequency: Remove any token with frequency less than that. Defaults to 1.
        max_number_of_tokens: Optionally limit to the K most frequent tokens. Defaults to None.

    Returns:
        All of the unique, most frequent tokens, ordered by frequency.
    """
    tokenized = tokenize_function(corpus)
    token_counter = Counter(tokenized)
    output_tokens = []
    for token, count in token_counter.most_common(max_number_of_tokens):
        if count >= minimum_frequency:
            output_tokens.append(token)
    return output_tokens
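Example usage (a minimal sketch; the corpus string is an illustrative placeholder, paired with word_tokenizer from this module):

from thunder.text_processing.tokenizer import get_most_frequent_tokens, word_tokenizer

corpus = "the cat sat on the mat the end"  # placeholder corpus
vocab = get_most_frequent_tokens(
    corpus,
    tokenize_function=word_tokenizer,
    minimum_frequency=2,       # drop tokens that appear only once
    max_number_of_tokens=10,   # keep at most the 10 most frequent tokens
)
print(vocab)  # ['the'] -- the only token that appears at least twice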
train_sentencepiece_model(data_file, vocab_size, output_dir, sample_size=-1, do_lower_case=True, tokenizer_type='unigram', character_coverage=1.0, train_extremely_large_corpus=False, max_sentencepiece_length=-1)
Creates a SentencePiece tokenizer model from a data file.
This is a direct port of create_spt_model from the NeMo toolkit
(nemo/collections/common/tokenizers/sentencepiece_tokenizer.py).
Parameters:

Name | Type | Description | Default |
---|---|---|---|
data_file | str | Text file containing the sentences that will be used to train the model | required |
vocab_size | int | Maximum vocabulary size | required |
output_dir | str | Folder in which to save the created tokenizer model and vocab | required |
sample_size | int | Maximum number of sentences the trainer loads. -1 means use all the data. | -1 |
do_lower_case | bool | Whether the text should be lowercased before the tokenizer model is created | True |
tokenizer_type | str | Controls the sentencepiece model type | 'unigram' |
character_coverage | float | Value between 0 and 1 (as a percentage). For languages with a vast charset it can be < 1.0, but for all other languages it should be set to 1.0. | 1.0 |
train_extremely_large_corpus | bool | If training on huge datasets, pass this flag to allow SentencePiece to build the tokenizer | False |
max_sentencepiece_length | int | Limits the maximum length of the SentencePiece subword that can be constructed. By default, no limit is placed. | -1 |
Source code in thunder/text_processing/tokenizer.py
def train_sentencepiece_model(
    data_file: str,
    vocab_size: int,
    output_dir: str,
    sample_size: int = -1,
    do_lower_case: bool = True,
    tokenizer_type: str = "unigram",
    character_coverage: float = 1.0,
    train_extremely_large_corpus: bool = False,
    max_sentencepiece_length: int = -1,
) -> str:
    """
    Creates sentence piece tokenizer model from data file.
    This is a direct port of `create_spt_model` present on the NEMO
    toolkit (nemo/collections/common/tokenizers/sentencepiece_tokenizer.py)

    Args:
        data_file: text file containing the sentences that will be used to train the model
        vocab_size: maximum vocabulary size
        output_dir: folder to save created tokenizer model and vocab
        sample_size: maximum number of sentences the trainer loads. -1 means to use all the data.
        do_lower_case: if text should be lower cased before tokenizer model is created
        tokenizer_type: controls the sentencepiece model type.
        character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set as 1.0
        train_extremely_large_corpus: If training on huge datasets, pass this flag to allow SentencePiece
            to build the tokenizer.
        max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed.
            By default, no limit is placed.
    """
    data_file = Path(data_file)
    if not data_file or not data_file.exists():
        raise ValueError(f"data_file must be valid file path, but got {data_file}")
    output_dir = Path(output_dir)
    if (output_dir / "tokenizer.model").exists():
        warn(
            "There's already a trained sentencepiece model at the output directory. Skipping train."
        )
        return str(output_dir)
    output_dir.mkdir(exist_ok=True)
    cmd = (
        f"--input={data_file} --model_prefix={output_dir}/tokenizer "
        f"--vocab_size={vocab_size} "
        f"--shuffle_input_sentence=true --hard_vocab_limit=false "
        f"--model_type={tokenizer_type} "
        f"--character_coverage={character_coverage}"
    )
    if do_lower_case:
        cmd += " --normalization_rule_name=nmt_nfkc_cf"
    if sample_size > 0:
        cmd += f" --input_sentence_size={sample_size}"
    if train_extremely_large_corpus:
        cmd += " --train_extremely_large_corpus=true"
    if max_sentencepiece_length >= 0:
        cmd += f" --max_sentencepiece_length={max_sentencepiece_length}"
    sentencepiece.SentencePieceTrainer.Train(cmd)
    return str(output_dir)
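Example usage (a sketch with hypothetical file paths; corpus.txt is assumed to contain one sentence per line). As the source above shows, the function returns the output directory as a string, with the trained tokenizer model and vocab saved inside it:

from thunder.text_processing.tokenizer import train_sentencepiece_model

tokenizer_dir = train_sentencepiece_model(
    data_file="corpus.txt",      # hypothetical training text, one sentence per line
    vocab_size=128,
    output_dir="spm_tokenizer",  # hypothetical output folder
    do_lower_case=True,
    tokenizer_type="unigram",
)
print(tokenizer_dir)  # "spm_tokenizer", now containing tokenizer.model and its vocab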
word_tokenizer(text)
Tokenize input text by splitting it into words
Parameters:

Name | Type | Description | Default |
---|---|---|---|
text | str | Input text | required |

Returns:

Type | Description |
---|---|
List[str] | Tokenized text |
Source code in thunder/text_processing/tokenizer.py
def word_tokenizer(text: str) -> List[str]:
    """Tokenize input text splitting into words

    Args:
        text: Input text

    Returns:
        Tokenized text
    """
    return text.split()
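Example usage (a minimal sketch, mirroring the character-level example above; tokens are whitespace-delimited words):

from thunder.text_processing.tokenizer import word_tokenizer

tokens = word_tokenizer("the quick brown fox")
print(tokens)  # ['the', 'quick', 'brown', 'fox']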