Source code for lalandre_core.utils.text_utils

"""
Text normalization utilities.
"""

import re
import unicodedata


def strip_accents(text: str) -> str:
    """Return *text* with diacritical marks (accents) removed.

    The input is first decomposed with Unicode NFKD normalization, which
    splits an accented character into its base character followed by one
    or more combining marks. Every character whose canonical combining
    class is non-zero is then dropped and the rest are joined back
    together.

    Args:
        text: The string to process.

    Returns:
        The NFKD-decomposed string without combining characters.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # ``unicodedata.combining`` returns 0 for non-combining characters.
    base_chars = [char for char in decomposed if unicodedata.combining(char) == 0]
    return "".join(base_chars)
def normalize_text(text: str) -> str:
    """Normalize *text* into a lowercase, accent-free, single-spaced form.

    Steps, in order:

    1. Lowercase the input and strip surrounding whitespace.
    2. Remove diacritics with :func:`strip_accents`.
    3. Replace every character that is not a word character, whitespace,
       slash, parenthesis, dot, colon or hyphen with a space.
    4. Collapse each run of whitespace into a single space and strip the
       result.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    unaccented = strip_accents(text.lower().strip())
    # Anything outside the allowed character set becomes a space so that
    # neighbouring tokens stay separated.
    cleaned = re.sub(r"[^\w\s/().:-]", " ", unaccented)
    return re.sub(r"\s+", " ", cleaned).strip()