Source code for lalandre_core.utils.celex_utils

"""
CELEX Utility Functions
"""

import re
from typing import Optional, Pattern



[docs]
def normalize_celex(celex: str) -> str:
    """
    Normalize a document identifier to its canonical CELEX(-like) form.

    Processing order:

    1. Empty (or ``None``) input is returned unchanged.
    2. AMF sanction labels containing ``SanctionAMF<year>-<n>`` become
       ``AMF-SAN-<year>-<nn>`` (number zero-padded to two digits).
    3. Identifiers already in the ``3YYYYTNNNN`` shape, or starting with
       ``AMF-``, are returned as is (outer whitespace stripped).
    4. Common EU citation formats are converted by ``_convert_eu_format``.
    5. Anything else is returned with its spaces removed.

    Args:
        celex: Raw identifier or citation text.

    Returns:
        The normalized identifier.

    Examples:
        >>> normalize_celex('32016R0679')
        '32016R0679'
        >>> normalize_celex(' 32016 R 0679 ')
        '32016R0679'
        >>> normalize_celex('(UE) 2016/679')
        '32016R0679'
        >>> normalize_celex('(CE) n° 1219/2011')
        '32011R1219'
        >>> normalize_celex('Directive 2003/41/CE')
        '32003L0041'
        >>> normalize_celex('98/26/CE')
        '31998L0026'
        >>> normalize_celex('AMF-RG-L1-20250331')
        'AMF-RG-L1-20250331'
        >>> normalize_celex('AMF-SANCTION-SanctionAMF2026-01-20260112')
        'AMF-SAN-2026-01'
    """
    if not celex:
        return celex

    celex = celex.strip()

    # Normalize AMF sanction node labels to canonical CELEX-like id
    # Example: AMF-SANCTION-SanctionAMF2026-01-20260112 -> AMF-SAN-2026-01
    sanction_match = re.search(r"SanctionAMF(\d{4})[-_/ ]?(\d{1,2})", celex, re.IGNORECASE)
    if sanction_match:
        year, number = sanction_match.groups()
        return f"AMF-SAN-{year}-{number.zfill(2)}"

    # Already in standard format (EU: 32016R0679 or AMF: AMF-XXX-XXX)
    if re.match(r"^3\d{4}[A-Z]\d{4}$", celex) or celex.startswith("AMF-"):
        return celex

    # Try to convert common EU formats
    celex_converted = _convert_eu_format(celex)
    if celex_converted:
        return celex_converted

    # Unknown format: only strip inner spaces
    return celex.replace(" ", "")


def _looks_like_year(value: str) -> bool:
    """Return True if *value* is a 4-digit year between 1900 and 2099."""
    return re.fullmatch(r"(?:19|20)\d{2}", value) is not None


def _expand_year(year: str) -> str:
    """Expand a 2-digit year to 4 digits (50-99 -> 19xx, 00-49 -> 20xx).

    Any other length is returned unchanged.
    """
    if len(year) != 2:
        return year
    year_int = int(year)
    century = 1900 if year_int >= 50 else 2000
    return str(century + year_int)


def _format_celex(year: str, act_type: str, number: str) -> str:
    """Assemble a sector-3 CELEX id, zero-padding the number to 4 digits."""
    return f"3{year}{act_type}{number.zfill(4)}"


def _convert_eu_format(text: str) -> Optional[str]:
    """
    Convert various EU legal act formats to standard CELEX format

    The rules are tried in order and the first match wins.

    Formats handled:
    - 32003L006/CE -> 32003L0006 (CELEX with suffix)
    - 78/660/EEC -> 31978L0660 (old EEC format, 2-digit year)
    - 98/26/CE -> 31998L0026 (2-digit year, French suffix)
    - 2000/12/EC -> 32000L0012 (EC format)
    - 2003/6/CE -> 32003L0006 (short year/number format)
    - 2021/338/EU -> 32021L0338 (assumes directive by default)
    - (UE) 2016/679 -> 32016R0679 (assumes regulation unless a keyword says otherwise)
    - (CE) n° 1219/2011 -> 32011R1219
    - Règlement (UE) n° 606/2023 -> 32023R0606
    - Directive (UE) 2015/2366 -> 32015L2366
    - MAR_596/2014 -> 32014R0596 (MAR format)
    - 596/2014 -> 32014R0596 (number/year, assumes regulation)
    - 2014-65 -> 32014L0065 (year-number)
    - 2016/679 -> 32016L0679 (bare year/number, assumes directive)
    - Directive 2003/41/CE -> 32003L0041
    - 3 2016 R 0679 -> 32016R0679 (CELEX with spaces)

    Args:
        text: Citation text; outer whitespace is ignored.

    Returns:
        Standard CELEX format or None if cannot be converted
    """
    text = text.strip()

    # Pattern 0: Already in CELEX format but with suffix /CE, /UE, /EU
    # Examples: 32003L006/CE -> 32003L0006, 32014R596/UE -> 32014R0596
    match = re.match(r"^(3\d{4}[A-Z])(\d+)/(?:CE|UE|EU)$", text, re.IGNORECASE)
    if match:
        prefix, number = match.groups()
        # The match is case-insensitive; the type letter must come out uppercase.
        return f"{prefix.upper()}{number.zfill(4)}"

    # Pattern 1: YEAR/NUMBER/SUFFIX with a 2- or 4-digit year and any of the
    # English/French community suffixes.
    # Examples: 78/660/EEC -> 31978L0660, 98/26/CE -> 31998L0026,
    #           2003/6/CE -> 32003L0006, 2014/65/UE -> 32014L0065
    match = re.match(r"^(\d{2,4})/(\d+)/(?:EEC|CEE|EC|CE|EU|UE)$", text, re.IGNORECASE)
    if match:
        year, number = match.groups()
        # Fully anchored: no act-type keyword can be present, so this
        # citation style defaults to directive.
        return _format_celex(_expand_year(year), "L", number)

    # Pattern 2: (UE/CE/EU/...) YEAR/NUMBER or (UE/CE/EU/...) n°/No NUMBER/YEAR
    # Examples: (UE) 2016/679, (CE) n° 1219/2011, (EU) No 575/2013
    # The number marker is captured as part of the same match so that a
    # marker belonging to a later citation in the text cannot affect this one.
    match = re.search(
        r"\((?:UE|EU|CE|EC|CEE|EEC)\)\s*((?:n[°º]|no\.?)\s*)?(\d{1,4})/(\d{1,4})",
        text,
        re.IGNORECASE,
    )
    if match:
        marker, first_part, second_part = match.groups()
        if marker:
            # Explicit "n°"/"No": NUMBER/YEAR
            number, year = first_part, second_part
        elif not _looks_like_year(first_part) and _looks_like_year(second_part):
            # Only the second part can be a year, e.g. "(CE) 1606/2002"
            number, year = first_part, second_part
        elif len(first_part) != 4 and len(second_part) == 4:
            # Fallback for NUMBER/YEAR without explicit marker
            number, year = first_part, second_part
        else:
            # Default for "(UE) 2016/679" style: YEAR/NUMBER
            year, number = first_part, second_part
        # This citation style is assumed to be a regulation unless the
        # surrounding text names another act type.
        act_type = _detect_act_type(text, default="R")
        return _format_celex(_expand_year(year), act_type, number)

    # Pattern 3: MAR, MIF, MIFID formats (with underscore, hyphen, or space)
    # Examples: MAR_596/2014 -> 32014R0596, MAR-596/2014 -> 32014R0596
    match = re.match(r"^(?:MAR|MIF(?:ID)?)[_\s-]?(\d+)/(\d{4})$", text, re.IGNORECASE)
    if match:
        number, year = match.groups()
        # MAR is typically a regulation, MIF/MIFID is a directive
        act_type = "R" if text.upper().startswith("MAR") else "L"
        return _format_celex(year, act_type, number)

    # Pattern 4: NUMBER/YEAR format (reverse order)
    # Examples: 596/2014 -> 32014R0596 (assumes regulation if no context)
    match = re.match(r"^(\d+)/(\d{4})$", text)
    if match:
        number, year = match.groups()
        # "2016/1011" is really YEAR/NUMBER: leave it to Pattern 7. When both
        # parts look like years the input is ambiguous and NUMBER/YEAR is kept.
        year_comes_first = _looks_like_year(number) and not _looks_like_year(year)
        if not year_comes_first:
            return _format_celex(year, "R", number)

    # Pattern 6: YEAR-NUMBER format (with hyphen)
    # Examples: 2014-65 -> 32014L0065, 2004-39 -> 32004L0039
    match = re.match(r"^(\d{4})-(\d+)$", text)
    if match:
        year, number = match.groups()
        return _format_celex(year, "L", number)

    # Pattern 7: YEAR/NUMBER format (standalone), defaults to directive
    # Examples: 2016/679, 2021/338
    match = re.match(r"^(\d{4})/(\d+)$", text)
    if match:
        year, number = match.groups()
        return _format_celex(year, "L", number)

    # Pattern 8: Directive/Regulation YEAR/NUMBER/SUFFIX (2- or 4-digit year)
    # Examples: Directive 2003/41/CE, Regulation 2023/606/UE, Directive 98/26/CE
    match = re.search(
        r"(Directive|Règlement|Décision|Decision|Regulation)"
        r"\s+(\d{4}|\d{2})/(\d+)/(?:EEC|CEE|EC|CE|EU|UE)",
        text,
        re.IGNORECASE,
    )
    if match:
        act_type_text, year, number = match.groups()
        act_type = _get_type_letter(act_type_text.lower())
        return _format_celex(_expand_year(year), act_type, number)

    # Pattern 9: Already in format 32016R0679 but with spaces
    # Examples: 3 2016 R 0679, 32016 R 0679
    match = re.search(r"3\s*(\d{4})\s*([A-Z])\s*(\d{4})", text, re.IGNORECASE)
    if match:
        year, type_letter, number = match.groups()
        return f"3{year}{type_letter.upper()}{number}"

    return None


def _detect_act_type(text: str, default: str = "L") -> str:
    """
    Detect the type of legal act from text context

    Args:
        text: Citation text searched (case-insensitively) for an act-type keyword.
        default: Letter returned when no keyword is found. Defaults to "L"
            (directive) for generic formats like "2016/679".

    Returns:
        Type letter: R (regulation), L (directive), D (decision)
    """
    text_lower = text.lower()

    # "réglement" is a common misspelling of "règlement"
    if any(word in text_lower for word in ("règlement", "regulation", "réglement")):
        return "R"
    if "directive" in text_lower:
        return "L"
    if any(word in text_lower for word in ("décision", "decision")):
        return "D"
    return default


def _get_type_letter(act_type: str) -> str:
    """
    Convert act type name to CELEX type letter

    Args:
        act_type: 'directive', 'règlement', 'regulation', etc. (any case)

    Returns:
        Type letter: R, L, or D; unknown names map to L
    """
    type_map = {
        "directive": "L",
        "règlement": "R",
        "regulation": "R",
        "décision": "D",
        "decision": "D",
    }
    return type_map.get(act_type.lower(), "L")


# ---------------------------------------------------------------------------
# EUR-Lex CELEX detection
# ---------------------------------------------------------------------------

# Prefix pattern for a standard EUR-Lex CELEX: one sector digit (1-9), a
# 4-digit year and one uppercase document-type letter, e.g. 32016R0679.
# Only the start of the string is checked; whatever follows the letter is
# not constrained by this pattern.
_EURLEX_CELEX_RE: Pattern[str] = re.compile(r"^[1-9][0-9]{4}[A-Z]")



[docs]
def is_eurlex_celex(celex: str) -> bool:
    """Tell whether *celex* begins like a standard EUR-Lex CELEX number.

    The check is a prefix test against ``_EURLEX_CELEX_RE``: a sector digit,
    a 4-digit year and an uppercase document-type letter (e.g.
    ``32016R0679``).  Identifiers from other sources (AMF-, EBA-, EIOPA-,
    ESMA-, LEGITEXT…) start with letters and therefore do not match.

    Args:
        celex: Identifier to test.

    Returns:
        True when the prefix matches, False otherwise.
    """
    prefix_match = _EURLEX_CELEX_RE.match(celex)
    return prefix_match is not None



# ---------------------------------------------------------------------------
# Légifrance CELEX detection
# ---------------------------------------------------------------------------



[docs]
def is_legifrance_celex(celex: str) -> bool:
    """Tell whether *celex* is a Légifrance identifier.

    Such identifiers begin with the case-sensitive prefix ``LEGITEXT``, e.g.
    ``LEGITEXT000006072026`` or
    ``LEGITEXT000006072026:LEGISCTA000006154980``.

    Args:
        celex: Identifier to test.

    Returns:
        True when *celex* starts with ``LEGITEXT``, False otherwise.
    """
    legifrance_prefix = "LEGITEXT"
    return celex.startswith(legifrance_prefix)



# ---------------------------------------------------------------------------
# CELEX validation (for gateway / user input)
# ---------------------------------------------------------------------------

# Identifier shapes accepted as "valid"; the pattern is anchored at both ends
# and compiled case-insensitively.
VALID_CELEX_RE: Pattern[str] = re.compile(
    r"^(?:"
    # "3" + 4 digits + one letter + 4 digits, e.g. 32016R0679
    r"3\d{4}[A-Z]\d{4}"
    # Two slash-separated groups of 2-4 digits plus a community suffix,
    # e.g. 2003/41/CE
    r"|\d{2,4}/\d{2,4}/(?:CE|UE|EU|EC|EEC|CEE)"
    # Authority prefix, a hyphen, then at least two characters drawn from
    # letters, digits, "-", "/" and "." (the first one alphanumeric),
    # e.g. AMF-RG-L1-20250331
    r"|(?:AMF|EBA|ESMA|EIOPA|ACPR|BCE|ECB)-[A-Z0-9][A-Z0-9\-/.]{1,}"
    r")$",
    re.IGNORECASE,
)



[docs]
def is_valid_celex(celex: str) -> bool:
    """Check whether *celex* is a recognisable CELEX identifier.

    The input is first passed through ``normalize_celex``; the normalized
    value is then matched against ``VALID_CELEX_RE``.

    Args:
        celex: Raw identifier or citation text.

    Returns:
        True when the normalized value matches, False otherwise.
    """
    normalized = normalize_celex(celex)
    return VALID_CELEX_RE.match(normalized) is not None