# Source code for lalandre_core.utils.text_utils

"""
Text normalization utilities.
"""

import re
import unicodedata



[docs]
def strip_accents(text: str) -> str:
    """Return *text* with diacritical marks (accents) removed.

    The string is decomposed with Unicode NFKD normalization, which splits
    accented characters into a base character followed by combining marks,
    and every combining mark is then dropped. Because the decomposition is
    the *compatibility* form, compatibility characters such as ligatures are
    also expanded to their plain equivalents.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # A combining class of 0 means the character is not a combining mark.
    base_chars = [
        char for char in decomposed if unicodedata.combining(char) == 0
    ]
    return "".join(base_chars)




[docs]
def normalize_text(text: str) -> str:
    """Lowercase, strip accents, collapse whitespace, remove special chars.

    Keeps: word characters, whitespace, slashes, parens, dots, colons, hyphens.
    Every other character is replaced by a space; runs of whitespace are then
    collapsed to a single space and leading/trailing whitespace is trimmed.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string (empty if nothing survives the filtering).
    """
    # Lowercase *after* stripping accents: the NFKD decomposition done by
    # strip_accents can expand compatibility symbols into uppercase letters
    # (e.g. the trade mark sign becomes "TM"), which lowercasing first misses.
    cleaned = strip_accents(text.strip()).lower()
    # Replace anything outside the allowed character set with a space so that
    # adjacent tokens do not get glued together.
    cleaned = re.sub(r"[^\w\s/().:-]", " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()