Source code for lalandre_core.utils.celex_utils

"""
CELEX Utility Functions
"""

import re
from typing import Optional, Pattern


def normalize_celex(celex: str) -> str:
    """
    Normalize an identifier to its canonical CELEX (or CELEX-like) form.

    Processing order:

    1. Falsy input is returned untouched.
    2. Surrounding whitespace is stripped.
    3. AMF sanction labels (``SanctionAMF<year><sep><number>``) are rewritten
       to ``AMF-SAN-<year>-<number>``, the number zero-padded to two digits.
    4. Values already in standard EU form (``3`` + 4-digit year + uppercase
       type letter + 4-digit number) or starting with ``AMF-`` are returned
       as they are.
    5. Otherwise the common EU citation formats are tried via
       ``_convert_eu_format``.
    6. If nothing matched, the value is returned with all spaces removed.

    Examples:
        >>> normalize_celex('32016R0679')
        '32016R0679'
        >>> normalize_celex(' 32016 R 0679 ')
        '32016R0679'
        >>> normalize_celex('Règlement (UE) 2016/679')
        '32016R0679'
        >>> normalize_celex('Règlement (UE) n° 606/2023')
        '32023R0606'
        >>> normalize_celex('Directive 2003/41/CE')
        '32003L0041'
        >>> normalize_celex('AMF-RG-L1-20250331')
        'AMF-RG-L1-20250331'
        >>> normalize_celex('AMF-SANCTION-SanctionAMF2026-01-20260112')
        'AMF-SAN-2026-01'

    Args:
        celex: Raw identifier or free-form citation.

    Returns:
        The normalized identifier (or the falsy input unchanged).
    """
    if not celex:
        return celex

    stripped = celex.strip()

    # AMF sanction node labels carry the decision year/number inside a longer
    # label, e.g. AMF-SANCTION-SanctionAMF2026-01-20260112 -> AMF-SAN-2026-01
    amf_sanction = re.search(r"SanctionAMF(\d{4})[-_/ ]?(\d{1,2})", stripped, re.IGNORECASE)
    if amf_sanction is not None:
        sanction_year = amf_sanction.group(1)
        sanction_no = amf_sanction.group(2)
        return f"AMF-SAN-{sanction_year}-{sanction_no.zfill(2)}"

    # Nothing to do for canonical EU CELEX (32016R0679) or AMF-prefixed ids
    is_standard_eu = re.match(r"^3\d{4}[A-Z]\d{4}$", stripped) is not None
    if is_standard_eu or stripped.startswith("AMF-"):
        return stripped

    # Known EU citation formats first; otherwise just drop the spaces
    return _convert_eu_format(stripped) or stripped.replace(" ", "")
def _expand_year(year: str) -> str:
    """
    Expand a 2-digit year to 4 digits (50-99 -> 19xx, 00-49 -> 20xx).

    Strings of any other length are returned unchanged.
    """
    if len(year) != 2:
        return year
    value = int(year)
    century = 1900 if value >= 50 else 2000
    return str(century + value)


def _convert_eu_format(text: str) -> Optional[str]:
    """
    Convert various EU legal act formats to standard CELEX format

    Formats handled:
    - (UE) 2016/679 -> 32016R0679
    - (CE) n° 1219/2011 -> 32011R1219
    - (CE) n° 40/94 -> 31994R0040 (2-digit year)
    - Directive (UE) 2015/2366 -> 32015L2366
    - Directive 2003/41/CE -> 32003L0041
    - Règlement (UE) n° 606/2023 -> 32023R0606
    - 2021/338/EU -> 32021L0338 (assumes directive by default)
    - 32003L006/CE -> 32003L0006 (CELEX with suffix)
    - MAR_596/2014 -> 32014R0596 (MAR format)
    - 2003/6/CE -> 32003L0006 (short year/number format)
    - 78/660/EEC, 78/660/CEE -> 31978L0660 (old EEC format)
    - 2000/12/EC -> 32000L0012 (EC format)
    - 596/2014 -> 32014R0596 (number/year)
    - 2014-65 -> 32014L0065 (year-number)

    When the text contains no act-type word, "(UE)/(CE)/(EU) ..." references
    and bare NUMBER/YEAR references are treated as regulations; every other
    generic format is treated as a directive.

    Args:
        text: Citation text; surrounding whitespace is ignored.

    Returns:
        Standard CELEX format or None if cannot be converted
    """
    text = text.strip()

    # Pattern 0: Already in CELEX format but with suffix /CE, /UE, /EU
    # Examples: 32003L006/CE -> 32003L0006, 32014R596/UE -> 32014R0596
    match = re.match(r"^(3\d{4}[A-Z])(\d+)/(?:CE|UE|EU)$", text, re.IGNORECASE)
    if match:
        prefix, number = match.groups()
        # The match is case-insensitive, so force the type letter to upper case
        return f"{prefix.upper()}{number.zfill(4)}"

    # Pattern 1: YEAR/NUMBER/SUFFIX with a 2- or 4-digit year and any of the
    # English/French community suffixes (this also covers the former
    # "Pattern 5", which handled 4-digit years with CE/UE/EU only).
    # Examples: 78/660/EEC -> 31978L0660, 98/26/CE -> 31998L0026,
    #           2000/12/EC -> 32000L0012, 2014/65/UE -> 32014L0065
    match = re.match(
        r"^(\d{2}|\d{4})/(\d+)/(?:EEC|CEE|EC|CE|EU|UE)$",
        text,
        re.IGNORECASE,
    )
    if match:
        year, number = match.groups()
        act_type = _detect_act_type(text)
        return f"3{_expand_year(year)}{act_type}{number.zfill(4)}"

    # Pattern 2: (UE/CE/EU) YEAR/NUMBER or (UE/CE/EU) n°/No NUMBER/YEAR
    # Examples: (UE) 2016/679, (CE) n° 1219/2011, (EU) No 575/2013
    # The optional "n°/No" marker is captured from the matched reference
    # itself so a marker elsewhere in the text cannot influence the result.
    match = re.search(
        r"\((?:UE|CE|EU)\)\s*((?:n°|no\.?)\s*)?(\d{1,4})/(\d{1,4})",
        text,
        re.IGNORECASE,
    )
    if match:
        marker, first_part, second_part = match.groups()
        first_is_year = len(first_part) == 4
        second_is_year = len(second_part) == 4

        if first_is_year and not second_is_year:
            # Only the first part can be a year, even after "n°"
            # ("(UE) n° 2016/679" is a common spelling)
            year, number = first_part, second_part
        elif second_is_year and not first_is_year:
            number, year = first_part, second_part
        elif marker:
            # Ambiguous lengths: the number marker signals NUMBER/YEAR
            number, year = first_part, second_part
        else:
            # Ambiguous lengths without marker: default to YEAR/NUMBER
            year, number = first_part, second_part

        # Without an explicit act-type word these references are taken to be
        # regulations, as documented in the format list above.
        act_type = _detect_act_type(text, default="R")
        return f"3{_expand_year(year)}{act_type}{number.zfill(4)}"

    # Pattern 3: MAR, MIF, MIFID formats (with underscore, hyphen, or space)
    # Examples: MAR_596/2014 -> 32014R0596, MAR-596/2014 -> 32014R0596
    match = re.match(r"^(?:MAR|MIF(?:ID)?)[_\s-]?(\d+)/(\d{4})$", text, re.IGNORECASE)
    if match:
        number, year = match.groups()
        # MAR is typically a regulation, MIF/MIFID is a directive
        act_type = "R" if text.upper().startswith("MAR") else "L"
        return f"3{year}{act_type}{number.zfill(4)}"

    # Pattern 4: NUMBER/YEAR format (reverse order)
    # Examples: 596/2014 -> 32014R0596 (assumes regulation if no context)
    # NOTE(review): a YEAR/NUMBER reference whose number has four digits
    # (e.g. "2019/2088") also matches here and is read as NUMBER/YEAR;
    # confirm how such inputs should be disambiguated.
    match = re.match(r"^(\d+)/(\d{4})$", text)
    if match:
        number, year = match.groups()
        act_type = _detect_act_type(text, default="R")
        return f"3{year}{act_type}{number.zfill(4)}"

    # Pattern 6: YEAR-NUMBER format (with hyphen)
    # Examples: 2014-65 -> 32014L0065, 2004-39 -> 32004L0039
    match = re.match(r"^(\d{4})-(\d+)$", text)
    if match:
        year, number = match.groups()
        act_type = _detect_act_type(text)
        return f"3{year}{act_type}{number.zfill(4)}"

    # Pattern 7: YEAR/NUMBER format (standalone)
    # Examples: 2016/679, 2021/338
    match = re.search(r"^(\d{4})/(\d+)$", text)
    if match:
        year, number = match.groups()
        act_type = _detect_act_type(text)
        return f"3{year}{act_type}{number.zfill(4)}"

    # Pattern 8: Directive/Regulation YEAR/NUMBER/CE
    # Examples: Directive 2003/41/CE, Regulation 2023/606/UE
    match = re.search(
        r"(Directive|Règlement|Décision|Decision|Regulation)"
        r"\s+(\d{4})/(\d+)/(?:CE|UE|EU)",
        text,
        re.IGNORECASE,
    )
    if match:
        act_type_text, year, number = match.groups()
        act_type = _get_type_letter(act_type_text.lower())
        return f"3{year}{act_type}{number.zfill(4)}"

    # Pattern 9: Already in format 32016R0679 but with spaces
    # Examples: 3 2016 R 0679, 32016 R 0679
    match = re.search(r"3\s*(\d{4})\s*([A-Z])\s*(\d{4})", text, re.IGNORECASE)
    if match:
        year, type_letter, number = match.groups()
        return f"3{year}{type_letter.upper()}{number}"

    return None


def _detect_act_type(text: str, default: str = "L") -> str:
    """
    Detect the type of legal act from text context

    Args:
        text: Citation text searched (case-insensitively) for an act-type word.
        default: Letter returned when no act-type word is found. Defaults to
            "L" (directive), the fallback for generic formats like "2016/679".

    Returns:
        Type letter: R (regulation), L (directive), D (decision)
    """
    text_lower = text.lower()

    # "réglement" is a frequent misspelling of "règlement"
    if any(word in text_lower for word in ("règlement", "regulation", "réglement")):
        return "R"
    if "directive" in text_lower:
        return "L"
    if any(word in text_lower for word in ("décision", "decision")):
        return "D"
    return default


def _get_type_letter(act_type: str) -> str:
    """
    Convert act type name to CELEX type letter

    Args:
        act_type: 'directive', 'règlement', 'regulation', etc.
            (matched case-insensitively)

    Returns:
        Type letter: R, L, or D ("L" for unknown names)
    """
    type_map = {
        "directive": "L",
        "règlement": "R",
        "regulation": "R",
        "décision": "D",
        "decision": "D",
    }
    return type_map.get(act_type.lower(), "L")


# ---------------------------------------------------------------------------
# EUR-Lex CELEX detection
# ---------------------------------------------------------------------------

# Standard EUR-Lex CELEX: sector digit + 4-digit year + type letter(s),
# e.g. 32016R0679
_EURLEX_CELEX_RE: Pattern[str] = re.compile(r"^[1-9][0-9]{4}[A-Z]")
def is_eurlex_celex(celex: str) -> bool:
    """Tell whether *celex* has the EUR-Lex standard shape.

    The check is a prefix match against ``_EURLEX_CELEX_RE``: a sector digit
    (1-9), a 4-digit year and an uppercase document-type letter, as in
    ``32016R0679``. Identifiers from the other sources (AMF-, EBA-, EIOPA-,
    ESMA-, LEGITEXT…) begin with letters and therefore do not match.

    Args:
        celex: Identifier to test.

    Returns:
        True when the identifier starts with the EUR-Lex pattern.
    """
    return _EURLEX_CELEX_RE.match(celex) is not None
# --------------------------------------------------------------------------- # Légifrance CELEX detection # ---------------------------------------------------------------------------
def is_legifrance_celex(celex: str) -> bool:
    """Tell whether *celex* identifies a Légifrance document.

    Légifrance identifiers begin with the literal, case-sensitive prefix
    ``LEGITEXT``, e.g. ``LEGITEXT000006072026`` or
    ``LEGITEXT000006072026:LEGISCTA000006154980``.

    Args:
        celex: Identifier to test.

    Returns:
        True when the identifier starts with ``LEGITEXT``.
    """
    legifrance_prefix = "LEGITEXT"
    return celex[: len(legifrance_prefix)] == legifrance_prefix
# ---------------------------------------------------------------------------
# CELEX validation (for gateway / user input)
# ---------------------------------------------------------------------------

# Identifier shapes accepted by VALID_CELEX_RE; the compiled pattern matches
# the whole string, case-insensitively, against any one of them.
_VALID_CELEX_ALTERNATIVES = (
    # Canonical EU CELEX: "3" + 4-digit year + type letter + 4-digit number
    r"3\d{4}[A-Z]\d{4}",
    # Slash-separated reference with a community suffix, e.g. 78/660/CEE
    r"\d{2,4}/\d{2,4}/(?:CE|UE|EU|EC|EEC|CEE)",
    # Authority-prefixed identifier, e.g. AMF-SAN-2026-01
    r"(?:AMF|EBA|ESMA|EIOPA|ACPR|BCE|ECB)-[A-Z0-9][A-Z0-9\-/.]{1,}",
)

VALID_CELEX_RE: Pattern[str] = re.compile(
    "^(?:" + "|".join(_VALID_CELEX_ALTERNATIVES) + ")$",
    re.IGNORECASE,
)
def is_valid_celex(celex: str) -> bool:
    """Tell whether *celex* looks like a recognisable CELEX identifier.

    The input is first passed through ``normalize_celex``; the normalized
    value must then match ``VALID_CELEX_RE``.

    Args:
        celex: Raw identifier, e.g. as typed by a user.

    Returns:
        True when the normalized identifier matches the validation pattern.
    """
    normalized = normalize_celex(celex)
    return VALID_CELEX_RE.match(normalized) is not None