# Source code for lalandre_core.linking.heuristics

"""
Shared heuristics and regex patterns for legal entity linking.
Centralized here to keep extraction, RAG, and validation rules in sync.
Values are package-local (not config-driven) to avoid leaking concerns.
"""

import re

# Explicit identifiers (CELEX, EU refs, national authority refs).
# Every pattern is case-insensitive and bounded by \b on both sides, so it can
# be searched for inside running text.

# "3" + four digits + one letter + four digits, e.g. 32016R0679.
_CELEX_NUMBER = re.compile(r"\b3\d{4}[A-Z]\d{4}\b", flags=re.IGNORECASE)

# number/number/suffix with a CE, UE, EU, EC or CEE suffix, e.g. 2014/65/EU.
_EU_ACT_NUMBER = re.compile(
    r"\b\d{2,4}/\d{2,4}/(?:CE|UE|EU|EC|CEE)\b",
    flags=re.IGNORECASE,
)

# AMF-/EBA-/ESMA-/EIOPA- followed by at least three reference characters.
_AUTHORITY_REFERENCE = re.compile(
    r"\b(?:AMF|EBA|ESMA|EIOPA)-[A-Z0-9][A-Z0-9\-/.]{2,}\b",
    flags=re.IGNORECASE,
)

IDENTIFIER_PATTERNS: tuple[re.Pattern[str], ...] = (
    _CELEX_NUMBER,
    _EU_ACT_NUMBER,
    _AUTHORITY_REFERENCE,
)

# Title cues for acts referenced in free text.
# The French/English act-type words are written once and reused by both
# patterns below so the two alternations stay identical.
_ACT_TYPE_CUES = "|".join(
    (
        r"directive",
        r"regulation",
        r"r[èe]glement",
        r"d[ée]cision",
        r"decision",
        r"recommendation",
        r"recommandation",
        r"guideline",
        r"guidelines",
        r"communication",
        r"notice",
        r"loi",
        r"law",
        r"d[ée]cret",
        r"arr[êe]t[ée]",
        r"ordonnance",
    )
)

# A cue word followed by 8 to 180 characters, none of which is '.', ';', ':'
# or a newline.
TITLE_SPAN_PATTERN = re.compile(
    r"\b(?:" + _ACT_TYPE_CUES + r")\b" + r"[^.;:\n]{8,180}",
    flags=re.IGNORECASE,
)

# A single cue word, captured as group 1; "article" is accepted here as well.
LEGAL_CUE_IN_QUOTE = re.compile(
    r"\b(" + _ACT_TYPE_CUES + r"|article)\b",
    flags=re.IGNORECASE,
)

# Lower-case act-type words that name a category of act rather than one
# specific act. Entries are grouped by word; accented French words are listed
# both with and without accents, and each word appears in singular and plural
# so that lookups do not depend on which variant was written.
GENERIC_TARGET_VALUES = {
    # directive (same spelling in French and English)
    "directive",
    "directives",
    # regulation / règlement
    "regulation",
    "regulations",
    "reglement",
    "règlement",
    "reglements",
    "règlements",
    # decision / décision
    "decision",
    "décision",
    "decisions",
    "décisions",
    # recommendation / recommandation
    "recommendation",
    "recommendations",
    "recommandation",
    "recommandations",
    # law / loi
    "loi",
    "lois",
    "law",
    "laws",
    # décret
    "decret",
    "décret",
    "decrets",
    "décrets",
    # arrêté
    "arrete",
    "arrêté",
    "arretes",
    "arrêtés",
    # ordonnance
    "ordonnance",
    "ordonnances",
    # article
    "article",
    "articles",
}


def is_generic_target(target: str) -> bool:
    """Return whether *target* is too generic to resolve as a concrete act.

    The check lower-cases *target* and tests membership in
    ``GENERIC_TARGET_VALUES``. No whitespace stripping or accent folding is
    applied, so the caller is expected to pass an already-trimmed string.

    Args:
        target: Candidate reference text, e.g. ``"Directive"``.

    Returns:
        ``True`` when the lower-cased text is one of the bare act-type words,
        ``False`` otherwise.
    """
    return target.lower() in GENERIC_TARGET_VALUES


# Prefixes that mark a reference as an identifier regardless of what follows.
_AUTHORITY_PREFIXES = ("AMF-", "EBA-", "ESMA-", "EIOPA-")

# Shapes that must cover the whole string. They are applied with ``fullmatch``
# rather than ``^...$`` because ``$`` also matches just before a trailing
# newline, which would let "32016R0679\n" pass as an identifier.
_WHOLE_STRING_IDENTIFIER_PATTERNS = (
    # "3" + four digits + one upper-case letter + four digits.
    re.compile(r"3\d{4}[A-Z]\d{4}"),
    # number/number/suffix, suffix matched case-insensitively.
    re.compile(r"\d{2,4}/\d{2,4}/(?:CE|UE|EU|EC|CEE)", re.IGNORECASE),
    # Upper-case acronym followed by one or more "-SEGMENT" parts.
    re.compile(r"[A-Z]{2,12}(?:-[A-Z0-9]{1,40})+"),
)


def looks_like_identifier(target: str) -> bool:
    """Return whether *target* resembles an explicit legal identifier.

    A target qualifies when it starts with one of the authority prefixes
    (``AMF-``, ``EBA-``, ``ESMA-``, ``EIOPA-``; case-sensitive) or when the
    entire string matches one of the identifier shapes: a ``3``-prefixed
    nine-character number such as ``32016R0679``, a ``2014/65/EU``-style
    reference, or an upper-case hyphenated code such as ``MIFID-II``.

    Args:
        target: Candidate reference text.

    Returns:
        ``True`` if *target* has an identifier shape, ``False`` otherwise
        (including for the empty string).
    """
    if target.startswith(_AUTHORITY_PREFIXES):
        return True
    return any(
        pattern.fullmatch(target) is not None
        for pattern in _WHOLE_STRING_IDENTIFIER_PATTERNS
    )


# Public API of this module: the shared patterns and value set first, then the
# helper predicates.
__all__ = [
    "IDENTIFIER_PATTERNS",
    "TITLE_SPAN_PATTERN",
    "LEGAL_CUE_IN_QUOTE",
    "GENERIC_TARGET_VALUES",
    "is_generic_target",
    "looks_like_identifier",
]