# Source code for lalandre_rag.citation_sanitizer

"""Normalize malformed citation tags emitted by the main LLM.

The RAG prompt instructs the LLM to use strict native tags like ``[S1]``,
``[G2, L2]``, ``[R3]``, ``[C1]``, ``[CM4]``. In practice the LLM often slips
in extra material between the brackets, e.g.::

    [S2, Annex I C(4) ; RTS 2 Annex III §13.1]
    [G9, article 25(2)]
    [G7, considérant 71]

These ad-hoc forms are not recognized by the front-end regex (which only
matches the strict format) and break the prose_rewriter's integrity check
(it counts strict tags only). This module rewrites them back to the strict
form before any post-processing runs:

- ``[S2, Annex I C(4) ; RTS 2 Annex III §13.1]`` → ``[S2]``
- ``[G9, article 25(2)]`` → ``[G9]``
- ``[S1, L1]`` → ``[S1, L1]`` (preserved — already valid)
- ``[G2, L2 ; article 25]`` → ``[G2, L2]`` (level kept, article precision dropped)

The article precision is not lost — the prompt instructs the LLM to write it
in the surrounding prose (``« L'article 25 [G9, L2] »``). The sanitizer just
strips it from inside the brackets where it doesn't belong.
"""

from __future__ import annotations

import re

# Match anything that opens with a native tag prefix plus digits, followed by
# ANY content up to the closing bracket. Group 1 is the identifier (``S2``),
# group 2 is whatever trails it inside the brackets (possibly empty).
_RAW_TAG_RE = re.compile(r"\[((?:S|G|R|C|CM)\d+)([^\]]*)\]")

# Match a standalone level suffix ``L<digits>`` somewhere inside the tag's
# "rest". The negative lookahead rejects legal references that merely *look*
# like a level because they continue with a separator and more digits:
# French code articles (``L121-1``, ``L533.10``) and Official Journal numbers
# (``L119/1``). U+2011 / U+2013 cover the typographic hyphens LLMs often emit.
_LEVEL_RE = re.compile(r"\bL\d+\b(?![-./\u2011\u2013]\d)")


def normalize_citation_tags(text: str) -> str:
    """Rewrite malformed citation tags into the strict native format.

    Every bracketed tag that starts with a native identifier (``S``, ``G``,
    ``R``, ``C`` or ``CM`` followed by digits) is reduced to ``[<id>]``, or to
    ``[<id>, L<n>]`` when a standalone level suffix is found among the extra
    material. ``L``-prefixed article or journal numbers such as ``L121-1`` or
    ``L119/1`` are not treated as levels.

    Args:
        text: The LLM answer to sanitize.

    Returns:
        The text with every matching tag rewritten. Empty or non-string input
        is returned unchanged.

    Idempotent: a text already in strict form passes through unchanged.
    Never raises.
    """
    # Non-string input would make ``re.sub`` raise TypeError; returning it
    # untouched is what keeps the "never raises" promise.
    if not isinstance(text, str) or not text:
        return text

    def _to_strict(match: "re.Match[str]") -> str:
        # Group 2 always participates in the match, so it is a str (maybe "").
        identifier, rest = match.group(1), match.group(2)
        level = _LEVEL_RE.search(rest)
        if level is None:
            return f"[{identifier}]"
        return f"[{identifier}, {level.group(0)}]"

    return _RAW_TAG_RE.sub(_to_strict, text)



# Public API: the only name exported by a star-import of this module.
__all__ = ["normalize_citation_tags"]