Source code for lalandre_core.utils.text_utils
"""
Text normalization utilities.
"""
import re
import unicodedata
[docs]
def strip_accents(text: str) -> str:
    """Return *text* with diacritical marks removed.

    The input is first decomposed with Unicode NFKD normalization, which
    splits an accented character into its base character followed by
    combining marks. Every character whose canonical combining class is
    non-zero is then discarded and the remaining characters are rejoined.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # unicodedata.combining() returns 0 for characters that are not
    # combining marks; only those are kept.
    base_chars = [char for char in decomposed if unicodedata.combining(char) == 0]
    return "".join(base_chars)
[docs]
def normalize_text(text: str) -> str:
    """Lowercase, strip accents, collapse whitespace, remove special chars.

    Keeps: word characters, whitespace, slashes, parens, dots, colons, hyphens.
    Every other character is replaced by a single space, runs of whitespace
    are collapsed to one space, and leading/trailing whitespace is removed.
    """
    # Lowercase a second time after accent stripping: strip_accents() relies
    # on NFKD compatibility decomposition, which can expand caseless symbols
    # into uppercase ASCII letters (e.g. the trade mark sign becomes "TM").
    # Without this, such letters would leak through in uppercase.
    folded = strip_accents(text.lower().strip()).lower()
    # Anything outside the allowed character set becomes a space so that the
    # surrounding tokens stay separated.
    folded = re.sub(r"[^\w\s/().:-]", " ", folded)
    folded = re.sub(r"\s+", " ", folded)
    return folded.strip()