# Source code for lalandre_rag.ner_external

"""Glue between the NER service and ``prose_linker``'s external_detector hook.

The NER service returns free-text spans like ``("directive 2014/65/UE", 12, 32,
"directive", 0.91)``. ``prose_linker`` needs ``ExternalDetection`` instances
already resolved to an internal ``act_id``. This module bridges the two:

1. Call the NER service (or any other zero-shot detector) to find candidate
   spans the regex layer might miss (paraphrases, fuzzy mentions).
2. Run each candidate through ``LegalEntityLinker`` to resolve to an
   ``act_id``. Spans the linker cannot resolve (or resolves with low
   confidence / fallback method) are dropped — never link a span we cannot
   back with a chunk.
3. Return the resolved spans as ``ExternalDetection`` for the linker to merge.

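The factory ``build_ner_external_detector`` is what callers use; it always
returns a detector callable for the ``NerClient`` it is given. When no NER
service URL is configured, callers are expected to skip the factory and leave
the external detector unset, so the rest of the pipeline keeps the regex-only
behaviour with zero overhead.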
"""

from __future__ import annotations

import logging
import re
from typing import List, Sequence

from lalandre_core.linking import LegalEntityLinker, NerClient

from .prose_linker import ExternalDetection, ExternalDetector

logger = logging.getLogger(__name__)


# Linker resolution methods whose results the detector accepts; a resolution
# produced by any other method is dropped.
_TRUSTED_METHODS = frozenset({"explicit", "exact_alias", "fuzzy_alias"})

# Defence against vague-span fuzzy false positives: a span like
# ``Le règlement européen`` (no number, no roman numeral, no identifier)
# can fuzzy-match any random act in the catalogue. We require a digit or a
# roman numeral ≥ II in the span before accepting a ``fuzzy_alias``
# resolution. ``explicit`` and ``exact_alias`` resolutions skip this check —
# they are already confident.
# NOTE: the pattern is case-sensitive and accepts any run of two or more of
# the letters I, V, X delimited by word boundaries; it does not validate that
# the run is a well-formed roman numeral.
_HAS_NUMERIC_TOKEN_RE = re.compile(r"\d|\b[IVX]{2,}\b")


def build_ner_external_detector(
    ner_client: NerClient,
    linker: LegalEntityLinker,
    *,
    min_span_score: float = 0.5,
    min_link_score: float = 0.85,
) -> ExternalDetector:
    """Return an ``ExternalDetector`` callable backed by the NER service.

    Args:
        ner_client: Configured client for the NER service.
        linker: Same linker used for regex-based resolution; reused here to
            translate NER text spans into internal ``act_id`` values.
        min_span_score: Drop NER spans below this confidence threshold.
        min_link_score: Drop linker resolutions below this score.

    Returns:
        A callable that takes the answer text and returns, as
        ``ExternalDetection`` instances, the NER spans that resolved to an
        ``act_id`` through a trusted linker method.

    The returned callable is safe to invoke on every answer: errors are
    logged and swallowed. A failing NER call yields an empty list and a
    failing linker call only drops the affected span, so the regex layer
    always wins by default.
    """

    def _detect(text: str) -> Sequence[ExternalDetection]:
        try:
            spans = ner_client.detect(text)
        except Exception:
            logger.warning("NER detector failed; falling back to regex-only", exc_info=True)
            return []

        detections: List[ExternalDetection] = []
        linker_failures = 0
        for span in spans:
            if span.score < min_span_score:
                continue
            try:
                resolution = linker.resolve(span.text)
            except Exception:
                # Best-effort by design: one unresolvable span must not break
                # the answer. Keep a trace so a broken linker is not mistaken
                # for "NER found nothing".
                linker_failures += 1
                logger.debug(
                    "Linker failed on NER span %r; dropping it",
                    span.text,
                    exc_info=True,
                )
                continue
            if resolution is None:
                continue
            if resolution.method not in _TRUSTED_METHODS:
                continue
            if resolution.score < min_link_score:
                continue
            if resolution.act_id is None:
                continue
            # Reject vague-span fuzzy hits: span without any numeric token
            # cannot disambiguate to a specific act through fuzzy alias
            # matching alone.
            if resolution.method == "fuzzy_alias" and not _HAS_NUMERIC_TOKEN_RE.search(span.text):
                continue
            detections.append(
                ExternalDetection(
                    start=span.start,
                    end=span.end,
                    act_id=resolution.act_id,
                    eli=resolution.eli,
                )
            )

        if linker_failures:
            # One summary line per call instead of one warning per span, so a
            # linker outage is visible without flooding the log.
            logger.warning(
                "Entity linker failed on %d NER span(s); those spans were dropped",
                linker_failures,
            )
        return detections

    return _detect
# Public API: only the detector factory is exported.
__all__ = ["build_ner_external_detector"]