Source code for scripts.diag_ner_vs_regex

"""Diagnostic: compare prose linking with regex-only vs regex+NER.

For a curated list of test sentences, run prose_linker twice — once without
the NER external detector (regex+fuzzy alias only) and once with it. Diff
the outputs to show exactly what each layer adds.

Run inside the rag-service container so the DB-backed linker and the NER
client are wired the same way as production:

    docker exec -i rag-service python /tmp/diag_ner_vs_regex.py
"""

from __future__ import annotations

import os
import re
import sys
from dataclasses import dataclass
from typing import List, Tuple

from lalandre_core.config import get_config
from lalandre_core.linking import NerClient
from lalandre_db_postgres import PostgresRepository
from lalandre_rag.linker_factory import build_linker
from lalandre_rag.ner_external import build_ner_external_detector
from lalandre_rag.prose_linker import link_prose

# Curated inputs for the diagnostic. Each entry is a (category, sentence)
# pair: the category is a human-readable label shown in the report, and the
# sentence is the prose handed to the linker. The cases range from explicit
# identifiers (CELEX numbers, "directive YYYY/NN/UE" forms) through known
# aliases to identifier-free paraphrases, plus two cases that are labelled as
# ones that should be skipped or ignored (citation tags, a non-existent act).
TEST_CASES: List[Tuple[str, str]] = [
    # Column order: category label, then the sentence to link.
    ("CELEX canonique", "Le texte 32014L0065 impose une obligation."),
    ("Forme « directive YYYY/NN/UE »", "La directive 2014/65/UE concerne les marchés."),
    ("Forme « règlement YYYY/NN »", "Le règlement 2019/815 sur les rapports annuels."),
    ("Alias DB usuel", "Selon MiFID II, les conseillers doivent..."),
    ("Alias DB usuel #2", "Solvency II impose un capital de solvabilité."),
    ("Article + identifiant", "L'article 25(2) de la directive 2014/65/UE."),
    ("Paraphrase sans identifiant", "La directive sur les marchés d'instruments financiers de 2014 prévoit..."),
    ("Paraphrase doctrinale", "Le règlement européen sur les exigences de fonds propres bancaires."),
    ("Citation tag pur (skip)", "Voir [S1] et [G2] pour la preuve."),
    ("Acte hallucinaire (regex doit ignorer)", "Le règlement 9999/99/UE (inexistant) est cité."),
]


def _markdown_links(text: str) -> List[Tuple[str, str]]:
    """Return list of (anchor_text, url) markdown links found in text."""
    return re.findall(r"\[([^\]]+)\]\((/library/acts/[^)]+)\)", text)


def _diff_lines(a: List[Tuple[str, str]], b: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Return links present in b but not in a (added by NER)."""
    return [link for link in b if link not in a]



[docs]
@dataclass
class Result:
    """One row of the diagnostic table: a sentence and the links each layer found.

    Every link is an ``(anchor_text, url)`` tuple. The class only stores the
    values it is given; it performs no validation or computation of its own.
    """

    # Human-readable label of the test case, printed as the row header.
    category: str
    # The input sentence that was run through the linker.
    sentence: str
    # Links found when linking without the external NER detector.
    regex_links: List[Tuple[str, str]]
    # Links found when linking with the external NER detector enabled.
    full_links: List[Tuple[str, str]]
    # Links that appear in ``full_links`` but not in ``regex_links``.
    ner_added: List[Tuple[str, str]]




[docs]
def main() -> int:
    """Link every test sentence twice and print what the NER layer adds.

    The linker is built from the Postgres repository using the thresholds in
    the ``extraction`` section of the configuration. The NER client targets
    ``NER_SERVICE_URL`` (falling back to ``http://ner-service:7800``). Each
    sentence of ``TEST_CASES`` is linked once without and once with the
    external detector; the report goes to stdout, setup progress to stderr.

    Returns:
        Always ``0``, used as the process exit status.
    """
    config = get_config()
    repository = PostgresRepository(config.database.connection_string)

    linker_settings = config.extraction
    print("[setup] building linker from Postgres acts table...", file=sys.stderr)
    linker = build_linker(
        repository,
        fuzzy_threshold=linker_settings.entity_linker_fuzzy_threshold,
        fuzzy_min_gap=linker_settings.entity_linker_fuzzy_min_gap,
        fuzzy_limit=linker_settings.entity_linker_fuzzy_limit,
        min_alias_chars=linker_settings.entity_linker_min_alias_chars,
    )

    ner_endpoint = os.environ.get("NER_SERVICE_URL", "http://ner-service:7800")
    print(f"[setup] connecting NerClient to {ner_endpoint}", file=sys.stderr)
    ner_client = NerClient(ner_endpoint, timeout_seconds=8.0)
    ner_detector = build_ner_external_detector(ner_client, linker)

    # Phase 1: collect one row per test sentence before printing anything.
    rows: List[Result] = []
    for label, phrase in TEST_CASES:
        baseline_text = link_prose(phrase, linker, external_detector=None)
        enriched_text = link_prose(phrase, linker, external_detector=ner_detector)
        baseline_links = _markdown_links(baseline_text)
        enriched_links = _markdown_links(enriched_text)
        row = Result(
            category=label,
            sentence=phrase,
            regex_links=baseline_links,
            full_links=enriched_links,
            ner_added=_diff_lines(baseline_links, enriched_links),
        )
        rows.append(row)

    # Phase 2: render the report.
    rule = "=" * 80
    print("\n" + rule)
    print(" Diagnostic NER vs Regex — link extraction")
    print(rule + "\n")

    regex_total = 0
    ner_total = 0
    for row in rows:
        regex_total += len(row.regex_links)
        ner_total += len(row.ner_added)

        print(f"[{row.category}]")
        print(f"  Phrase   : {row.sentence}")

        if not row.regex_links:
            print("  Regex    : —")
        else:
            print(f"  Regex    : {len(row.regex_links)} lien(s)")
            for anchor, target in row.regex_links:
                print(f"             • {anchor!r:<40} -> {target}")

        if row.ner_added:
            print(f"  NER add  : {len(row.ner_added)} lien(s) en plus")
            for anchor, target in row.ner_added:
                print(f"             • {anchor!r:<40} -> {target}")
        elif row.regex_links:
            # Regex already found something and NER contributed nothing new.
            print("  NER add  : — (regex suffit ici)")
        else:
            # Neither layer produced a link for this sentence.
            print("  NER add  : — (rien trouvé non plus)")
        print()

    print(f"Total regex links       : {regex_total}")
    print(f"Total NER-added links   : {ner_total}")
    return 0



if __name__ == "__main__":
    raise SystemExit(main())