# Source code for scripts.diag_ner_vs_regex

"""Diagnostic: compare prose linking with regex-only vs regex+NER.

For a curated list of test sentences, run prose_linker twice — once without
the NER external detector (regex+fuzzy alias only) and once with it. Diff
the outputs to show exactly what each layer adds.

Run inside the rag-service container so the DB-backed linker and the NER
client are wired the same way as production:

    docker exec -i rag-service python /tmp/diag_ner_vs_regex.py
"""

from __future__ import annotations

import os
import re
import sys
from dataclasses import dataclass
from typing import List, Tuple

from lalandre_core.config import get_config
from lalandre_core.linking import NerClient
from lalandre_db_postgres import PostgresRepository
from lalandre_rag.linker_factory import build_linker
from lalandre_rag.ner_external import build_ner_external_detector
from lalandre_rag.prose_linker import link_prose

# Curated input for the diagnostic. Each entry is a (category, sentence) pair:
# the category is the label printed for that row of the report, and the
# sentence is the prose that main() runs through link_prose twice (once with
# external_detector=None, once with the NER-backed detector).
TEST_CASES: List[Tuple[str, str]] = [
    # (category label, sentence to link)
    ("CELEX canonique", "Le texte 32014L0065 impose une obligation."),
    ("Forme « directive YYYY/NN/UE »", "La directive 2014/65/UE concerne les marchés."),
    ("Forme « règlement YYYY/NN »", "Le règlement 2019/815 sur les rapports annuels."),
    ("Alias DB usuel", "Selon MiFID II, les conseillers doivent..."),
    ("Alias DB usuel #2", "Solvency II impose un capital de solvabilité."),
    ("Article + identifiant", "L'article 25(2) de la directive 2014/65/UE."),
    ("Paraphrase sans identifiant", "La directive sur les marchés d'instruments financiers de 2014 prévoit..."),
    ("Paraphrase doctrinale", "Le règlement européen sur les exigences de fonds propres bancaires."),
    ("Citation tag pur (skip)", "Voir [S1] et [G2] pour la preuve."),
    ("Acte hallucinaire (regex doit ignorer)", "Le règlement 9999/99/UE (inexistant) est cité."),
]


def _markdown_links(text: str) -> List[Tuple[str, str]]:
    """Return list of (anchor_text, url) markdown links found in text."""
    return re.findall(r"\[([^\]]+)\]\((/library/acts/[^)]+)\)", text)


def _diff_lines(a: List[Tuple[str, str]], b: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Return links present in b but not in a (added by NER)."""
    return [link for link in b if link not in a]


@dataclass
class Result:
    """One row of the diagnostic table: a sentence and the links each layer found."""

    # Label of the test case, printed as the row heading.
    category: str
    # The input sentence that was linked.
    sentence: str
    # (anchor_text, url) links found with no external detector.
    regex_links: List[Tuple[str, str]]
    # (anchor_text, url) links found with the NER external detector enabled.
    full_links: List[Tuple[str, str]]
    # Links present in full_links but absent from regex_links.
    ner_added: List[Tuple[str, str]]
def main() -> int:
    """Run the diagnostic and print a per-sentence comparison of regex vs NER.

    Builds the linker from the configured Postgres database, connects a
    ``NerClient`` to the URL in the ``NER_SERVICE_URL`` environment variable
    (default ``http://ner-service:7800``), then links every sentence of
    ``TEST_CASES`` twice: once with ``external_detector=None`` and once with
    the NER-backed detector. Setup progress goes to stderr; the report itself
    goes to stdout.

    Returns:
        Always ``0``; used as the process exit code.
    """
    cfg = get_config()
    pg = PostgresRepository(cfg.database.connection_string)
    extraction_cfg = cfg.extraction

    print("[setup] building linker from Postgres acts table...", file=sys.stderr)
    linker = build_linker(
        pg,
        fuzzy_threshold=extraction_cfg.entity_linker_fuzzy_threshold,
        fuzzy_min_gap=extraction_cfg.entity_linker_fuzzy_min_gap,
        fuzzy_limit=extraction_cfg.entity_linker_fuzzy_limit,
        min_alias_chars=extraction_cfg.entity_linker_min_alias_chars,
    )

    ner_url = os.environ.get("NER_SERVICE_URL", "http://ner-service:7800")
    print(f"[setup] connecting NerClient to {ner_url}", file=sys.stderr)
    client = NerClient(ner_url, timeout_seconds=8.0)
    detector = build_ner_external_detector(client, linker)

    # Link each sentence with and without the NER detector and keep both link
    # sets, plus the links that only show up when NER is enabled.
    results: List[Result] = []
    for category, sentence in TEST_CASES:
        regex_only = link_prose(sentence, linker, external_detector=None)
        full = link_prose(sentence, linker, external_detector=detector)
        regex_links = _markdown_links(regex_only)
        full_links = _markdown_links(full)
        results.append(
            Result(
                category=category,
                sentence=sentence,
                regex_links=regex_links,
                full_links=full_links,
                ner_added=_diff_lines(regex_links, full_links),
            )
        )

    # NOTE(review): the report labels below are reproduced with the spacing
    # visible in this copy of the file; any column-alignment padding the
    # original strings had may have been collapsed — confirm against the repo.
    print("\n" + "=" * 80)
    print(" Diagnostic NER vs Regex — link extraction")
    print("=" * 80 + "\n")

    for r in results:
        print(f"[{r.category}]")
        print(f" Phrase : {r.sentence}")
        if r.regex_links:
            print(f" Regex : {len(r.regex_links)} lien(s)")
            for txt, url in r.regex_links:
                print(f" • {txt!r:<40} -> {url}")
        else:
            print(" Regex : —")
        if r.ner_added:
            print(f" NER add : {len(r.ner_added)} lien(s) en plus")
            for txt, url in r.ner_added:
                print(f" • {txt!r:<40} -> {url}")
        elif not r.regex_links:
            # Neither layer produced a link for this sentence.
            print(" NER add : — (rien trouvé non plus)")
        else:
            # The regex layer found links and NER contributed nothing new.
            print(" NER add : — (regex suffit ici)")
        print()

    n_regex_total = sum(len(r.regex_links) for r in results)
    n_ner_added = sum(len(r.ner_added) for r in results)
    print(f"Total regex links : {n_regex_total}")
    print(f"Total NER-added links : {n_ner_added}")
    return 0
# Script entry point: run the diagnostic and exit with the status it returns.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)