# Source code for lalandre_rag.retrieval.overview

"""User-facing retrieval overview helpers."""

from typing import Any, Dict, Iterable, Optional


def _optional_int(value: Any) -> Optional[int]:
    return value if isinstance(value, int) else None


def _optional_float(value: Any) -> Optional[float]:
    if isinstance(value, (int, float)):
        return float(value)
    return None


def _clean_text(value: Any) -> Optional[str]:
    if isinstance(value, str):
        stripped = value.strip()
        return stripped or None
    return None


def _normalize_source_kind(value: Any, *, chunk_id: Optional[int]) -> Optional[str]:
    if chunk_id is not None:
        return "chunk"
    text = _clean_text(value)
    if text is None:
        return None
    lowered = text.lower()
    if lowered in {"chunk", "chunks"}:
        return "chunk"
    if lowered in {"subdivision", "subdivisions"}:
        return "subdivision"
    return lowered


def _extract_entry(item: Any) -> Dict[str, Any]:
    """Flatten one retrieval hit into a plain dict of overview fields.

    Two item shapes are handled:

    * objects exposing both ``act`` and ``doc`` attributes: act-level fields
      are read from ``item.act`` and location fields from ``item.doc``;
    * any other object: fields are read from the item's own attributes and
      from its ``metadata`` dict (treated as empty when it is not a dict).

    The returned dict always carries the keys ``act_id``, ``celex``,
    ``title``, ``subdivision_id``, ``chunk_id``, ``score`` and
    ``source_kind``. Values that are missing or of an unexpected type are
    reported as ``None``.
    """
    if hasattr(item, "act") and hasattr(item, "doc"):
        act, doc = item.act, item.doc
        # Computed once: it is both reported and used to infer the source kind.
        doc_chunk_id = _optional_int(getattr(doc, "chunk_id", None))
        return {
            "act_id": _optional_int(getattr(act, "act_id", None)),
            "celex": _clean_text(getattr(act, "celex", None)),
            "title": _clean_text(getattr(act, "title", None)),
            "subdivision_id": _optional_int(getattr(doc, "subdivision_id", None)),
            "chunk_id": doc_chunk_id,
            "score": _optional_float(getattr(item, "score", None)),
            "source_kind": _normalize_source_kind(
                getattr(doc, "source_kind", None),
                chunk_id=doc_chunk_id,
            ),
        }

    raw_metadata = getattr(item, "metadata", None)
    meta: Dict[str, Any] = raw_metadata if isinstance(raw_metadata, dict) else {}

    # chunk_id: the metadata value wins, the attribute is the fallback.
    chunk_id = _optional_int(meta.get("chunk_id"))
    if chunk_id is None:
        chunk_id = _optional_int(getattr(item, "chunk_id", None))

    # act_id and celex: the attribute wins, the metadata value is the fallback.
    act_id = _optional_int(getattr(item, "act_id", None))
    if act_id is None:
        act_id = _optional_int(meta.get("act_id"))

    celex = _clean_text(getattr(item, "celex", None)) or _clean_text(meta.get("celex"))

    # Title preference: "act_title", then "title", then the CELEX identifier.
    title = _clean_text(meta.get("act_title"))
    if title is None:
        title = _clean_text(meta.get("title"))
    if title is None:
        title = celex

    raw_kind = meta.get("source_kind") or meta.get("collection")
    return {
        "act_id": act_id,
        "celex": celex,
        "title": title,
        "subdivision_id": _optional_int(getattr(item, "subdivision_id", None)),
        "chunk_id": chunk_id,
        "score": _optional_float(getattr(item, "score", None)),
        "source_kind": _normalize_source_kind(raw_kind, chunk_id=chunk_id),
    }


def build_retrieval_overview(
    items: Iterable[Any],
    *,
    effective_granularity: Optional[str],
    candidate_counts: Optional[Dict[str, int]] = None,
    top_acts_limit: int = 3,
) -> Dict[str, Any]:
    """Aggregate textual evidence into a product-facing hierarchy overview.

    Args:
        items: Retrieval hits; each one is flattened with ``_extract_entry``.
        effective_granularity: Echoed unchanged into the result.
        candidate_counts: Optional counts, included under
            ``"candidate_counts"`` only when non-empty.
        top_acts_limit: Maximum number of acts listed in ``"top_acts"``;
            values below 1 are treated as 1.

    Returns:
        A dict with ``effective_granularity``, ``selected_counts`` (distinct
        acts, subdivisions and chunks seen), ``source_kind_counts`` (hits per
        source kind) and ``top_acts`` (acts ordered by hit count, then best
        score, then CELEX), plus ``candidate_counts`` when provided.
    """
    acts: Dict[int, Dict[str, Any]] = {}
    subdivision_ids: set[int] = set()
    chunk_ids: set[int] = set()
    source_kind_counts: Dict[str, int] = {}

    for item in items:
        entry = _extract_entry(item)
        act_id = entry["act_id"]
        subdivision_id = entry["subdivision_id"]
        chunk_id = entry["chunk_id"]
        score = entry["score"]
        source_kind = entry["source_kind"] or ("chunk" if chunk_id is not None else "subdivision")

        if subdivision_id is not None:
            subdivision_ids.add(subdivision_id)
        if chunk_id is not None:
            chunk_ids.add(chunk_id)
        source_kind_counts[source_kind] = source_kind_counts.get(source_kind, 0) + 1

        # Hits without an act id still count above but cannot be grouped.
        if act_id is None:
            continue

        act_entry = acts.setdefault(
            act_id,
            {
                "act_id": act_id,
                "celex": None,
                # Left empty until a real title shows up; the display fallback
                # is applied once, after every hit has been seen.
                "title": None,
                "hit_count": 0,
                "best_score": None,
            },
        )
        act_entry["hit_count"] += 1

        if score is not None and (act_entry["best_score"] is None or score > act_entry["best_score"]):
            act_entry["best_score"] = round(score, 4)

        if act_entry["celex"] is None and entry["celex"] is not None:
            act_entry["celex"] = entry["celex"]

        # A title equal to the hit's own CELEX is only a stand-in (the
        # extractor falls back to it), so it must not block a real title
        # carried by a later hit of the same act.
        hit_title = entry["title"]
        if act_entry["title"] is None and hit_title is not None and hit_title != entry["celex"]:
            act_entry["title"] = hit_title

    # Apply the display fallback now that the best available data is known.
    for act_id, act_entry in acts.items():
        act_entry["title"] = act_entry["title"] or act_entry["celex"] or f"Acte {act_id}"

    def _rank(act: Dict[str, Any]) -> Any:
        best = act["best_score"]
        # Unscored acts sort after every scored act with the same hit count,
        # including acts whose best score is negative.
        effective = float(best) if best is not None else float("-inf")
        return (-int(act["hit_count"]), -effective, str(act.get("celex") or ""))

    top_acts = sorted(acts.values(), key=_rank)[: max(int(top_acts_limit), 1)]

    overview: Dict[str, Any] = {
        "effective_granularity": effective_granularity,
        "selected_counts": {
            "acts": len(acts),
            "subdivisions": len(subdivision_ids),
            "chunks": len(chunk_ids),
        },
        "source_kind_counts": source_kind_counts,
        "top_acts": top_acts,
    }
    if candidate_counts:
        overview["candidate_counts"] = candidate_counts
    return overview