"""
Shared heuristics and regex patterns for legal entity linking.
Centralized here to keep extraction, RAG, and validation rules in sync.
Values are package-local (not config-driven) to avoid leaking concerns.
"""
import re
import unicodedata
# Explicit identifiers (CELEX, EU refs, national authority refs).
# Every pattern is case-insensitive and delimited by word boundaries rather
# than anchored, so it can locate an identifier inside longer text.
IDENTIFIER_PATTERNS: tuple[re.Pattern[str], ...] = (
    # CELEX-style number: "3", four digits, one letter, four digits
    # (e.g. 32014L0065).
    re.compile(r"\b3\d{4}[A-Z]\d{4}\b", re.IGNORECASE),
    # Slash-separated act number followed by a community suffix
    # (e.g. 2014/65/UE, 85/611/EEC). EEC is the English counterpart of CEE,
    # mirroring the CE/EC and UE/EU pairs already listed.
    re.compile(r"\b\d{2,4}/\d{2,4}/(?:CE|UE|EU|EC|CEE|EEC)\b", re.IGNORECASE),
    # Authority-prefixed reference: issuer acronym, a hyphen, then at least
    # three more characters drawn from letters, digits, "-", "/" and ".".
    re.compile(
        r"\b(?:AMF|EBA|ESMA|EIOPA)-[A-Z0-9][A-Z0-9\-/.]{2,}\b",
        re.IGNORECASE,
    ),
)
# Act-type cue words (English and French) shared by the two patterns below.
# Each entry is a regex fragment; bracketed classes accept the accented and
# unaccented spelling of the same letter.
_ACT_CUE_FRAGMENTS: tuple[str, ...] = (
    "directive",
    "regulation",
    r"r[èe]glement",
    r"d[ée]cision",
    "decision",
    "recommendation",
    "recommandation",
    "guideline",
    "guidelines",
    "communication",
    "notice",
    "loi",
    "law",
    r"d[ée]cret",
    r"arr[êe]t[ée]",
    "ordonnance",
)

# Title cues for acts referenced in free text: a cue word followed by 8 to 180
# characters that contain no ".", ";", ":" or newline.
TITLE_SPAN_PATTERN = re.compile(
    r"\b(?:" + "|".join(_ACT_CUE_FRAGMENTS) + r")\b" + r"[^.;:\n]{8,180}",
    re.IGNORECASE,
)

# A single cue word (the title cues plus "article"), captured as group 1.
LEGAL_CUE_IN_QUOTE = re.compile(
    r"\b(" + "|".join((*_ACT_CUE_FRAGMENTS, "article")) + r")\b",
    re.IGNORECASE,
)
# Bare act-type words (English and French; singular, plural, accented and
# unaccented spellings). A target equal to one of these names a category of
# act, not a specific act. Entries are lower-case with precomposed accents.
GENERIC_TARGET_VALUES = {
    "directive",
    "directives",
    "regulation",
    "regulations",
    "reglement",
    "reglements",
    "règlement",
    "règlements",
    "decision",
    "décision",
    "decisions",
    "décisions",
    "recommendation",
    "recommendations",
    "recommandation",
    "recommandations",
    "loi",
    "lois",
    "law",
    "laws",
    "decret",
    "décret",
    "decrets",
    "décrets",
    "arrete",
    "arretes",
    "arrêté",
    "arrêtés",
    "ordonnance",
    "ordonnances",
    "article",
    "articles",
}


def is_generic_target(target: str) -> bool:
    """Return whether *target* is too generic to resolve as a concrete act.

    The comparison ignores letter case, surrounding whitespace, and whether
    accented letters arrive precomposed or as base letter + combining mark.

    Args:
        target: Candidate reference text.

    Returns:
        ``True`` when the normalized text is a member of
        ``GENERIC_TARGET_VALUES``, ``False`` otherwise (including for ``""``).
    """
    # Compose to NFC first: decomposed input ("e" + combining grave) would
    # otherwise never equal the precomposed literals stored in the set.
    normalized = unicodedata.normalize("NFC", target.strip()).casefold()
    return normalized in GENERIC_TARGET_VALUES
# Issuer prefixes of authority references; compared against the upper-cased
# target so that lower- or mixed-case input is recognised too.
_AUTHORITY_PREFIXES = ("AMF-", "EBA-", "ESMA-", "EIOPA-")
# CELEX-style number: "3", four digits, one letter, four digits
# (e.g. 32014L0065).
_CELEX_RE = re.compile(r"3\d{4}[A-Z]\d{4}", re.IGNORECASE)
# Slash-separated act number with a community suffix (e.g. 2014/65/UE,
# 85/611/EEC); EEC is the English counterpart of CEE.
_ACT_NUMBER_RE = re.compile(
    r"\d{2,4}/\d{2,4}/(?:CE|UE|EU|EC|CEE|EEC)", re.IGNORECASE
)
# Generic hyphenated code (e.g. MIFID-II). Deliberately case-sensitive:
# ignoring case here would accept ordinary hyphenated words such as
# "self-employed".
_HYPHENATED_CODE_RE = re.compile(r"[A-Z]{2,12}(?:-[A-Z0-9]{1,40})+")


def looks_like_identifier(target: str) -> bool:
    """Return whether *target* resembles an explicit legal identifier.

    Surrounding whitespace is ignored. A target qualifies when it

    * starts with an authority prefix (``AMF-``, ``EBA-``, ``ESMA-``,
      ``EIOPA-``), in any letter case; or
    * is, in its entirety, a CELEX-style number or a slash-separated act
      number with a community suffix, in any letter case; or
    * is, in its entirety, an upper-case hyphenated code.

    Args:
        target: Candidate reference text.

    Returns:
        ``True`` if any of the shapes above applies, ``False`` otherwise.
    """
    candidate = target.strip()
    if candidate.upper().startswith(_AUTHORITY_PREFIXES):
        return True
    # fullmatch pins both ends of the string, so no "^...$" anchors are needed
    # and a trailing newline cannot slip past "$".
    return any(
        pattern.fullmatch(candidate)
        for pattern in (_CELEX_RE, _ACT_NUMBER_RE, _HYPHENATED_CODE_RE)
    )
# Names exported by ``from <module> import *``.
__all__ = [
    # Compiled patterns and the generic-word lookup set.
    "IDENTIFIER_PATTERNS",
    "TITLE_SPAN_PATTERN",
    "LEGAL_CUE_IN_QUOTE",
    "GENERIC_TARGET_VALUES",
    # Predicates applied to a candidate target string.
    "is_generic_target",
    "looks_like_identifier",
]