Source code for lalandre_rag.ner_external
"""Glue between the NER service and ``prose_linker``'s external_detector hook.
The NER service returns free-text spans like ``("directive 2014/65/UE", 12, 32,
"directive", 0.91)``. ``prose_linker`` needs ``ExternalDetection`` instances
already resolved to an internal ``act_id``. This module bridges the two:
1. Call the NER service (or any other zero-shot detector) to find candidate
spans the regex layer might miss (paraphrases, fuzzy mentions).
2. Run each candidate through ``LegalEntityLinker`` to resolve to an
``act_id``. Spans the linker cannot resolve (or resolves with low
confidence / fallback method) are dropped — never link a span we cannot
back with a chunk.
3. Return the resolved spans as ``ExternalDetection`` for the linker to merge.
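The factory ``build_ner_external_detector`` is what callers use. As written it
always returns a detector callable; when no NER service URL is configured the
caller is expected to skip the factory and leave the hook unset, so the rest of
the pipeline keeps the regex-only behaviour with zero overhead.
(NOTE(review): earlier wording said the factory itself returns ``None`` in
that case, which does not match its signature; confirm where that check lives.)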
"""
from __future__ import annotations
import logging
import re
from typing import List, Sequence
from lalandre_core.linking import LegalEntityLinker, NerClient
from .prose_linker import ExternalDetection, ExternalDetector
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Linker resolution methods whose results are accepted. A resolution produced
# by any method outside this set is discarded by the detector.
_TRUSTED_METHODS = frozenset(("explicit", "exact_alias", "fuzzy_alias"))

# Guard against fuzzy false positives on vague spans. A mention such as
# ``Le règlement européen`` carries no number, roman numeral or identifier,
# so a fuzzy alias match could land on any act in the catalogue. A
# ``fuzzy_alias`` resolution is therefore only accepted when the span holds a
# digit or a standalone run of two or more of the uppercase letters I, V, X
# (i.e. a roman numeral of at least II; the pattern is case-sensitive and
# does not validate that the run is a well-formed numeral). ``explicit`` and
# ``exact_alias`` resolutions are not subject to this check.
_HAS_NUMERIC_TOKEN_RE = re.compile(r"\d|\b[IVX]{2,}\b")
[docs]
def build_ner_external_detector(
    ner_client: NerClient,
    linker: LegalEntityLinker,
    *,
    min_span_score: float = 0.5,
    min_link_score: float = 0.85,
) -> ExternalDetector:
    """Return an ``ExternalDetector`` callable backed by the NER service.

    Args:
        ner_client: Configured client for the NER service. Its ``detect(text)``
            result is iterated; each item is read for ``text``, ``start``,
            ``end`` and ``score``.
        linker: Same linker used for regex-based resolution; reused here to
            translate NER text spans into internal ``act_id`` values.
        min_span_score: Drop NER spans below this confidence threshold.
        min_link_score: Drop linker resolutions below this score.

    Returns:
        A callable taking the answer text and returning the resolved spans as
        ``ExternalDetection`` instances, in the order the NER client yielded
        them.

    The returned callable is meant to be safe to invoke on every answer: a
    failing ``ner_client.detect`` call is logged and yields an empty list, and
    a failing ``linker.resolve`` call only discards the span it was processing,
    so the regex layer always wins by default.
    """

    def _detect(text: str) -> Sequence[ExternalDetection]:
        try:
            spans = ner_client.detect(text)
        except Exception:
            # Best-effort enrichment: never let the NER service break linking.
            logger.warning("NER detector failed; falling back to regex-only", exc_info=True)
            return []

        detections: List[ExternalDetection] = []
        for span in spans:
            if span.score < min_span_score:
                continue

            try:
                resolution = linker.resolve(span.text)
            except Exception:
                # Still best-effort, but leave a trace: a linker that fails on
                # every span would otherwise be indistinguishable from "no
                # mention found".
                logger.debug(
                    "Linker failed on NER span %r; skipping it",
                    span.text,
                    exc_info=True,
                )
                continue

            if resolution is None:
                continue
            if resolution.method not in _TRUSTED_METHODS:
                continue
            if resolution.score < min_link_score:
                continue
            if resolution.act_id is None:
                continue
            # Reject vague-span fuzzy hits: a span without any numeric token
            # cannot disambiguate to a specific act through fuzzy alias
            # matching alone.
            if resolution.method == "fuzzy_alias" and not _HAS_NUMERIC_TOKEN_RE.search(span.text):
                continue

            detections.append(
                ExternalDetection(
                    start=span.start,
                    end=span.end,
                    act_id=resolution.act_id,
                    eli=resolution.eli,
                )
            )
        return detections

    return _detect
# Public API of this module: only the detector factory is exported.
__all__ = ["build_ner_external_detector"]