Preprocess
Text preprocessing functionality
expand_numbers(text, language='en')
Expand the numbers present inside the text, i.e. convert digits such as "42" into words ("forty two"). A number immediately followed by the ordinal indicator "º" (e.g. "2º") is automatically expanded as an ordinal.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text | str | Input text | required |
language | str | Language used to expand the numbers. Defaults to "en". | 'en' |
Returns:
Type | Description |
---|---|
str | Output text |
Source code in thunder/text_processing/preprocess.py
def expand_numbers(text: str, language: str = "en") -> str:
    """Expand the numbers present inside the text.

    Every run of digits is replaced by the words that ``num2words``
    produces for it. A run of digits immediately followed by the
    ordinal indicator "º" is expanded as an ordinal instead.

    Args:
        text: Input text
        language: Language used to expand the numbers. Defaults to "en".

    Returns:
        Output text
    """

    def _expand(match: "re.Match") -> str:
        # Convert one matched token (digits plus optional "º") into words.
        token = match.group(0)
        if "º" in token:
            pure_number = token.replace("º", "")
            return num2words(int(pure_number), lang=language, to="ordinal")
        return num2words(int(token), lang=language)

    # Substitute each match at its own position. A global str.replace per
    # number would also rewrite the same digits inside longer numbers that
    # have not been processed yet (e.g. the "1" inside "12" or "1º").
    return re.sub(r"\d+º*", _expand, text)
lower_text(text)
Transform all the text to lowercase.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text | str | Input text | required |
Returns:
Type | Description |
---|---|
str | Output text |
Source code in thunder/text_processing/preprocess.py
def lower_text(text: str) -> str:
    """Convert the whole text to lowercase.

    Args:
        text: Input text

    Returns:
        Output text
    """
    lowered = text.lower()
    return lowered
normalize_text(text)
Normalize the text by removing accents and dropping any characters that are not valid ASCII.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text | str | Input text | required |
Returns:
Type | Description |
---|---|
str | Output text |
Source code in thunder/text_processing/preprocess.py
def normalize_text(text: str) -> str:
    """Strip accents and drop every character that is not ASCII.

    The text is decomposed with Unicode NFKD normalization, then encoded
    to ASCII while ignoring anything that cannot be encoded.

    Args:
        text: Input text

    Returns:
        Output text
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return decomposed.encode("ASCII", "ignore").decode()