"""
CELEX Utility Functions
"""
import re
from typing import Optional, Pattern
[docs]
def normalize_celex(celex: str) -> str:
    """Normalize an identifier to its canonical CELEX (or CELEX-like) form.

    Processing order:

    1. Falsy input (``None``, ``""``) is returned unchanged.
    2. Surrounding whitespace is stripped.
    3. Anything containing an AMF sanction label (``SanctionAMF<year><sep><n>``)
       is rewritten to ``AMF-SAN-<year>-<nn>``.
    4. Identifiers already in the ``3YYYYTNNNN`` form, or starting with
       ``AMF-``, are returned as they are.
    5. Otherwise the common EU citation formats are tried via
       ``_convert_eu_format``.
    6. As a last resort, spaces are removed and the result is returned.

    Examples:
        >>> normalize_celex('32016R0679')
        '32016R0679'
        >>> normalize_celex(' 32016 R 0679 ')
        '32016R0679'
        >>> normalize_celex('Règlement (UE) 2016/679')
        '32016R0679'
        >>> normalize_celex('Directive 2003/41/CE')
        '32003L0041'
        >>> normalize_celex('AMF-RG-L1-20250331')
        'AMF-RG-L1-20250331'
        >>> normalize_celex('AMF-SANCTION-SanctionAMF2026-01-20260112')
        'AMF-SAN-2026-01'
    """
    if not celex:
        return celex

    cleaned = celex.strip()

    # AMF sanction node labels take precedence over every other rule, e.g.
    # AMF-SANCTION-SanctionAMF2026-01-20260112 -> AMF-SAN-2026-01
    sanction = re.search(r"SanctionAMF(\d{4})[-_/ ]?(\d{1,2})", cleaned, re.IGNORECASE)
    if sanction is not None:
        year, number = sanction.group(1), sanction.group(2)
        return f"AMF-SAN-{year}-{number.zfill(2)}"

    # Nothing to do for canonical EU identifiers or AMF-prefixed ones.
    is_canonical_eu = re.match(r"^3\d{4}[A-Z]\d{4}$", cleaned) is not None
    if is_canonical_eu or cleaned.startswith("AMF-"):
        return cleaned

    # Try the known EU citation formats; fall back to stripping spaces.
    converted = _convert_eu_format(cleaned)
    return converted if converted else cleaned.replace(" ", "")
def _convert_eu_format(text: str) -> Optional[str]:
"""
Convert various EU legal act formats to standard CELEX format
Formats handled:
- (UE) 2016/679 -> 32016R0679
- (CE) n° 1219/2011 -> 32011R1219
- Directive 2003/41/CE -> 32003L0041
- Règlement (UE) n° 606/2023 -> 32023R0606
- 2021/338/EU -> 32021L0338 (assumes directive by default)
- 32003L006/CE -> 32003L0006 (CELEX with suffix)
- MAR_596/2014 -> 32014R0596 (MAR format)
- 2003/6/CE -> 32003L0006 (short year/number format)
- 78/660/EEC -> 31978L0660 (old EEC format)
- 2000/12/EC -> 32000L0012 (EC format)
- 596/2014 -> 32014R0596 (number/year)
- 2014-65 -> 32014L0065 (year-number)
Returns:
Standard CELEX format or None if cannot be converted
"""
text = text.strip()
# Pattern 0: Already in CELEX format but with suffix /CE, /UE, /EU
# Examples: 32003L006/CE -> 32003L0006, 32014R596/UE -> 32014R0596
match = re.match(r"^(3\d{4}[A-Z])(\d+)/(?:CE|UE|EU)$", text, re.IGNORECASE)
if match:
prefix, number = match.groups()
return f"{prefix}{number.zfill(4)}"
# Pattern 1: Old EEC/EC formats (2-digit year)
# Examples: 78/660/EEC -> 31978L0660, 93/22/EEC -> 31993L0022, 2000/12/EC -> 32000L0012
match = re.match(r"^(\d{2,4})/(\d+)/(?:EEC|EC)$", text, re.IGNORECASE)
if match:
year, number = match.groups()
# Convert 2-digit year to 4-digit (78 -> 1978, 00 -> 2000)
if len(year) == 2:
year_int = int(year)
year = f"{1900 + year_int}" if year_int >= 50 else f"{2000 + year_int}"
act_type = _detect_act_type(text)
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 2: (UE/CE/EU) YEAR/NUMBER or (UE/CE/EU) n°/No NUMBER/YEAR
# Examples: (UE) 2016/679, (CE) n° 1219/2011, (EU) No 575/2013
match = re.search(
r"\((?:UE|CE|EU)\)\s*(?:(?:n°|no\.?)\s*)?(\d{1,4})/(\d{1,4})",
text,
re.IGNORECASE,
)
if match:
first_part, second_part = match.groups()
has_number_marker = bool(re.search(r"\((?:UE|CE|EU)\)\s*(?:n°|no\.?)\s*", text, re.IGNORECASE))
if has_number_marker:
number, year = first_part, second_part
elif len(first_part) == 4:
# Default for "(UE) 2016/679" style: YEAR/NUMBER
year, number = first_part, second_part
elif len(second_part) == 4:
# Fallback for NUMBER/YEAR without explicit marker
number, year = first_part, second_part
else:
year, number = first_part, second_part
act_type = _detect_act_type(text)
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 3: MAR, MIF, MIFID formats (with underscore, hyphen, or space)
# Examples: MAR_596/2014 -> 32014R0596, MAR-596/2014 -> 32014R0596
match = re.match(r"^(?:MAR|MIF(?:ID)?)[_\s-]?(\d+)/(\d{4})$", text, re.IGNORECASE)
if match:
number, year = match.groups()
# MAR is typically a regulation, MIF/MIFID is a directive
act_type = "R" if text.upper().startswith("MAR") else "L"
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 4: NUMBER/YEAR format (reverse order)
# Examples: 596/2014 -> 32014R0596 (assumes regulation if no context)
match = re.match(r"^(\d+)/(\d{4})$", text)
if match:
number, year = match.groups()
act_type = _detect_act_type(text)
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 5: YEAR/NUMBER/SUFFIX format
# Examples: 2003/6/CE -> 32003L0006, 2014/65/UE -> 32014L0065
match = re.match(r"^(\d{4})/(\d+)/(?:CE|UE|EU)$", text, re.IGNORECASE)
if match:
year, number = match.groups()
act_type = _detect_act_type(text)
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 6: YEAR-NUMBER format (with hyphen)
# Examples: 2014-65 -> 32014L0065, 2004-39 -> 32004L0039
match = re.match(r"^(\d{4})-(\d+)$", text)
if match:
year, number = match.groups()
act_type = _detect_act_type(text)
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 7: YEAR/NUMBER format (standalone)
# Examples: 2016/679, 2021/338
match = re.search(r"^(\d{4})/(\d+)$", text)
if match:
year, number = match.groups()
act_type = _detect_act_type(text)
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 8: Directive/Regulation YEAR/NUMBER/CE
# Examples: Directive 2003/41/CE, Regulation 2023/606/UE
match = re.search(
r"(Directive|Règlement|Décision|Decision|Regulation)"
r"\s+(\d{4})/(\d+)/(?:CE|UE|EU)",
text,
re.IGNORECASE,
)
if match:
act_type_text, year, number = match.groups()
act_type = _get_type_letter(act_type_text.lower())
return f"3{year}{act_type}{number.zfill(4)}"
# Pattern 9: Already in format 32016R0679 but with spaces
# Examples: 3 2016 R 0679, 32016 R 0679
match = re.search(r"3\s*(\d{4})\s*([A-Z])\s*(\d{4})", text, re.IGNORECASE)
if match:
year, type_letter, number = match.groups()
return f"3{year}{type_letter.upper()}{number}"
return None
def _detect_act_type(text: str) -> str:
"""
Detect the type of legal act from text context
Returns:
Type letter: R (regulation), L (directive), D (decision)
"""
text_lower = text.lower()
if any(word in text_lower for word in ["règlement", "regulation", "réglement"]):
return "R"
elif any(word in text_lower for word in ["directive"]):
return "L"
elif any(word in text_lower for word in ["décision", "decision"]):
return "D"
else:
# Default to directive for generic formats like "2016/679"
return "L"
def _get_type_letter(act_type: str) -> str:
"""
Convert act type name to CELEX type letter
Args:
act_type: 'directive', 'règlement', 'regulation', etc.
Returns:
Type letter: R, L, or D
"""
type_map = {
"directive": "L",
"règlement": "R",
"regulation": "R",
"décision": "D",
"decision": "D",
}
return type_map.get(act_type.lower(), "L")
# ---------------------------------------------------------------------------
# EUR-Lex CELEX detection
# ---------------------------------------------------------------------------
# Standard EUR-Lex CELEX: sector digit + 4-digit year + type letter(s),
# e.g. 32016R0679
# Anchored at the start only: one digit 1-9, four more digits, one upper-case letter.
_EURLEX_CELEX_RE: Pattern[str] = re.compile(r"^[1-9][0-9]{4}[A-Z]")


def is_eurlex_celex(celex: str) -> bool:
    """Tell whether *celex* has the EUR-Lex identifier prefix.

    The check looks only at the first six characters: a non-zero sector
    digit, a 4-digit year and an upper-case document-type letter
    (e.g. ``32016R0679``). Whatever follows is not inspected. Identifiers
    that begin with letters (``AMF-``, ``EBA-``, ``LEGITEXT``...) never match.

    Returns:
        True if the prefix matches, False otherwise.
    """
    prefix_match = _EURLEX_CELEX_RE.match(celex)
    return prefix_match is not None
# ---------------------------------------------------------------------------
# Légifrance CELEX detection
# ---------------------------------------------------------------------------
[docs]
def is_legifrance_celex(celex: str) -> bool:
    """Tell whether *celex* identifies a Légifrance document.

    Such identifiers begin with the literal, case-sensitive prefix
    ``LEGITEXT``, for example ``LEGITEXT000006072026`` or
    ``LEGITEXT000006072026:LEGISCTA000006154980``.

    Returns:
        True if *celex* starts with ``LEGITEXT``, False otherwise.
    """
    legifrance_prefix = "LEGITEXT"
    return celex.startswith(legifrance_prefix)
# ---------------------------------------------------------------------------
# CELEX validation (for gateway / user input)
# ---------------------------------------------------------------------------
# Identifier shapes accepted by is_valid_celex. The pattern is anchored at
# both ends and matched case-insensitively.
VALID_CELEX_RE: Pattern[str] = re.compile(
    r"^(?:"
    # "3" + 4-digit year + one type letter + 4-digit number, e.g. 32016R0679
    r"3\d{4}[A-Z]\d{4}"
    # Two slash-separated groups of 2-4 digits plus a suffix, e.g. 2003/6x/CE shapes
    # such as 2003/41/CE or 78/660/EEC
    r"|\d{2,4}/\d{2,4}/(?:CE|UE|EU|EC|EEC|CEE)"
    # Authority prefix, a hyphen, then at least two characters from
    # letters, digits, "-", "/" and "." (the first must be a letter or digit)
    r"|(?:AMF|EBA|ESMA|EIOPA|ACPR|BCE|ECB)-[A-Z0-9][A-Z0-9\-/.]{1,}"
    r")$",
    re.IGNORECASE,
)
[docs]
def is_valid_celex(celex: str) -> bool:
    """Tell whether *celex* looks like a recognisable CELEX identifier.

    The input is first passed through ``normalize_celex``; the normalized
    value is then matched against ``VALID_CELEX_RE``.

    Returns:
        True if the normalized identifier matches, False otherwise.
    """
    normalized = normalize_celex(celex)
    return VALID_CELEX_RE.match(normalized) is not None