"""Diagnostic: compare prose linking with regex-only vs regex+NER.
For a curated list of test sentences, run prose_linker twice — once without
the NER external detector (regex+fuzzy alias only) and once with it. Diff
the outputs to show exactly what each layer adds.
Run inside the rag-service container so the DB-backed linker and the NER
client are wired the same way as production:
docker exec -i rag-service python /tmp/diag_ner_vs_regex.py
"""
from __future__ import annotations
import os
import re
import sys
from dataclasses import dataclass
from typing import List, Tuple
from lalandre_core.config import get_config
from lalandre_core.linking import NerClient
from lalandre_db_postgres import PostgresRepository
from lalandre_rag.linker_factory import build_linker
from lalandre_rag.ner_external import build_ner_external_detector
from lalandre_rag.prose_linker import link_prose
# Curated inputs for the diagnostic. Each entry is a pair:
#   (category label shown in the report, sentence handed to the linker)
TEST_CASES: List[Tuple[str, str]] = [
    (
        "CELEX canonique",
        "Le texte 32014L0065 impose une obligation.",
    ),
    (
        "Forme « directive YYYY/NN/UE »",
        "La directive 2014/65/UE concerne les marchés.",
    ),
    (
        "Forme « règlement YYYY/NN »",
        "Le règlement 2019/815 sur les rapports annuels.",
    ),
    (
        "Alias DB usuel",
        "Selon MiFID II, les conseillers doivent...",
    ),
    (
        "Alias DB usuel #2",
        "Solvency II impose un capital de solvabilité.",
    ),
    (
        "Article + identifiant",
        "L'article 25(2) de la directive 2014/65/UE.",
    ),
    (
        "Paraphrase sans identifiant",
        "La directive sur les marchés d'instruments financiers de 2014 prévoit...",
    ),
    (
        "Paraphrase doctrinale",
        "Le règlement européen sur les exigences de fonds propres bancaires.",
    ),
    (
        "Citation tag pur (skip)",
        "Voir [S1] et [G2] pour la preuve.",
    ),
    (
        "Acte hallucinaire (regex doit ignorer)",
        "Le règlement 9999/99/UE (inexistant) est cité.",
    ),
]
def _markdown_links(text: str) -> List[Tuple[str, str]]:
"""Return list of (anchor_text, url) markdown links found in text."""
return re.findall(r"\[([^\]]+)\]\((/library/acts/[^)]+)\)", text)
def _diff_lines(a: List[Tuple[str, str]], b: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
"""Return links present in b but not in a (added by NER)."""
return [link for link in b if link not in a]
@dataclass
class Result:
    """One row of the diagnostic table: a sentence and the links each layer found."""

    # Label of the test case, printed as the entry header in the report.
    category: str
    # The sentence that was passed to the linker.
    sentence: str
    # (anchor_text, url) links found without the NER external detector.
    regex_links: List[Tuple[str, str]]
    # (anchor_text, url) links found with the NER external detector enabled.
    full_links: List[Tuple[str, str]]
    # Links present in ``full_links`` but absent from ``regex_links``.
    ner_added: List[Tuple[str, str]]
def main() -> int:
    """Run the diagnostic and print a per-sentence comparison of regex vs NER.

    Builds the linker from the configured Postgres database, connects a
    ``NerClient`` to the URL in the ``NER_SERVICE_URL`` environment variable
    (default ``http://ner-service:7800``), then links every sentence of
    ``TEST_CASES`` twice: once without and once with the NER external
    detector. Setup messages go to stderr; the report goes to stdout.

    Returns:
        ``0`` once the report has been printed.
    """
    cfg = get_config()
    pg = PostgresRepository(cfg.database.connection_string)
    extraction_cfg = cfg.extraction

    print("[setup] building linker from Postgres acts table...", file=sys.stderr)
    linker = build_linker(
        pg,
        fuzzy_threshold=extraction_cfg.entity_linker_fuzzy_threshold,
        fuzzy_min_gap=extraction_cfg.entity_linker_fuzzy_min_gap,
        fuzzy_limit=extraction_cfg.entity_linker_fuzzy_limit,
        min_alias_chars=extraction_cfg.entity_linker_min_alias_chars,
    )

    ner_url = os.environ.get("NER_SERVICE_URL", "http://ner-service:7800")
    print(f"[setup] connecting NerClient to {ner_url}", file=sys.stderr)
    client = NerClient(ner_url, timeout_seconds=8.0)
    detector = build_ner_external_detector(client, linker)

    # Link each sentence twice with the same linker; the only difference
    # between the two runs is whether the external detector is supplied.
    results: List[Result] = []
    for category, sentence in TEST_CASES:
        regex_only = link_prose(sentence, linker, external_detector=None)
        full = link_prose(sentence, linker, external_detector=detector)
        regex_links = _markdown_links(regex_only)
        full_links = _markdown_links(full)
        results.append(
            Result(
                category=category,
                sentence=sentence,
                regex_links=regex_links,
                full_links=full_links,
                ner_added=_diff_lines(regex_links, full_links),
            )
        )

    print("\n" + "=" * 80)
    print(" Diagnostic NER vs Regex — link extraction")
    print("=" * 80 + "\n")

    for r in results:
        print(f"[{r.category}]")
        print(f" Phrase : {r.sentence}")

        if r.regex_links:
            print(f" Regex : {len(r.regex_links)} lien(s)")
            for txt, url in r.regex_links:
                print(f" • {txt!r:<40} -> {url}")
        else:
            print(" Regex : —")

        if r.ner_added:
            print(f" NER add : {len(r.ner_added)} lien(s) en plus")
            for txt, url in r.ner_added:
                print(f" • {txt!r:<40} -> {url}")
        elif not r.regex_links:
            # Neither layer produced a link for this sentence.
            print(" NER add : — (rien trouvé non plus)")
        else:
            # The regex layer found links and NER contributed nothing new.
            print(" NER add : — (regex suffit ici)")

        # Blank line separating consecutive report entries.
        print()

    n_regex_total = sum(len(r.regex_links) for r in results)
    n_ner_added = sum(len(r.ner_added) for r in results)
    print(f"Total regex links : {n_regex_total}")
    print(f"Total NER-added links : {n_ner_added}")
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())