"""User-facing retrieval overview helpers."""
from typing import Any, Dict, Iterable, Optional
def _optional_int(value: Any) -> Optional[int]:
    """Return ``value`` when it is a genuine integer, otherwise ``None``.

    Args:
        value: Arbitrary object read from a retrieval hit.

    Returns:
        ``value`` unchanged when it is an ``int`` (and not a ``bool``),
        else ``None``. No coercion is attempted: numeric strings and floats
        are rejected.
    """
    # ``bool`` is a subclass of ``int``; without this guard ``True``/``False``
    # would be accepted as identifiers equal to 1/0.
    if isinstance(value, bool):
        return None
    return value if isinstance(value, int) else None
def _optional_float(value: Any) -> Optional[float]:
    """Return ``value`` as a ``float`` when it is a usable number.

    Args:
        value: Arbitrary object read from a retrieval hit.

    Returns:
        ``float(value)`` for ``int``/``float`` inputs; ``None`` for anything
        else, for booleans, and for NaN.
    """
    # ``bool`` is a subclass of ``int``; a flag must not turn into 1.0 / 0.0.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    number = float(value)
    # NaN is the only float that is not equal to itself. It is treated as
    # missing because every ordering comparison against NaN is false, which
    # would make a NaN impossible to displace in a "keep the maximum" loop.
    if number != number:
        return None
    return number
def _clean_text(value: Any) -> Optional[str]:
    """Return ``value`` stripped of surrounding whitespace, or ``None``.

    Non-string inputs and strings that are empty after stripping both
    yield ``None``.
    """
    if not isinstance(value, str):
        return None
    trimmed = value.strip()
    if trimmed:
        return trimmed
    return None
def _normalize_source_kind(value: Any, *, chunk_id: Optional[int]) -> Optional[str]:
    """Map a raw source-kind label onto its canonical lowercase form.

    Args:
        value: Raw label; only non-blank strings are considered.
        chunk_id: Chunk identifier of the hit, if any.

    Returns:
        ``"chunk"`` whenever ``chunk_id`` is not ``None`` (the label is then
        ignored). Otherwise the lowercased label with the plural forms
        ``"chunks"``/``"subdivisions"`` folded into their singular, or
        ``None`` when no usable label was given.
    """
    # A hit that carries a chunk id is a chunk regardless of its label.
    if chunk_id is not None:
        return "chunk"

    label = _clean_text(value)
    if label is None:
        return None

    canonical = label.lower()
    plural_to_singular = {"chunks": "chunk", "subdivisions": "subdivision"}
    # Unknown kinds pass through lowercased.
    return plural_to_singular.get(canonical, canonical)
def _extract_entry(item: Any) -> Dict[str, Any]:
    """Flatten one retrieval hit into a plain dictionary.

    Two hit shapes are recognised:

    * objects exposing both ``act`` and ``doc`` attributes: act fields are
      read from ``item.act``, document fields from ``item.doc`` and the
      score from ``item`` itself;
    * any other object: fields are read from attributes on ``item`` and
      from its ``metadata`` attribute when that is a ``dict``.

    Returns:
        A dict with the keys ``act_id``, ``celex``, ``title``,
        ``subdivision_id``, ``chunk_id``, ``score`` and ``source_kind``;
        each value is ``None`` when it is absent or of an unexpected type.
    """
    if hasattr(item, "act") and hasattr(item, "doc"):
        act, doc = item.act, item.doc
        doc_chunk_id = _optional_int(getattr(doc, "chunk_id", None))
        return {
            "act_id": _optional_int(getattr(act, "act_id", None)),
            "celex": _clean_text(getattr(act, "celex", None)),
            "title": _clean_text(getattr(act, "title", None)),
            "subdivision_id": _optional_int(getattr(doc, "subdivision_id", None)),
            "chunk_id": doc_chunk_id,
            "score": _optional_float(getattr(item, "score", None)),
            "source_kind": _normalize_source_kind(
                getattr(doc, "source_kind", None),
                chunk_id=doc_chunk_id,
            ),
        }

    raw_metadata = getattr(item, "metadata", None)
    meta: Dict[str, Any] = raw_metadata if isinstance(raw_metadata, dict) else {}

    # chunk_id: the metadata value wins, the attribute is the fallback.
    chunk_id = _optional_int(meta.get("chunk_id"))
    if chunk_id is None:
        chunk_id = _optional_int(getattr(item, "chunk_id", None))

    # act_id and celex: the attribute wins, metadata is the fallback.
    act_id = _optional_int(getattr(item, "act_id", None))
    if act_id is None:
        act_id = _optional_int(meta.get("act_id"))

    celex = _clean_text(getattr(item, "celex", None))
    if celex is None:
        celex = _clean_text(meta.get("celex"))

    # title: first usable metadata key, falling back to the CELEX identifier.
    for title_key in ("act_title", "title"):
        title = _clean_text(meta.get(title_key))
        if title is not None:
            break
    else:
        title = celex

    raw_kind = meta.get("source_kind") or meta.get("collection")
    return {
        "act_id": act_id,
        "celex": celex,
        "title": title,
        "subdivision_id": _optional_int(getattr(item, "subdivision_id", None)),
        "chunk_id": chunk_id,
        "score": _optional_float(getattr(item, "score", None)),
        "source_kind": _normalize_source_kind(raw_kind, chunk_id=chunk_id),
    }
# [docs]  (link-marker residue from text extraction; not part of the module)
def build_retrieval_overview(
    items: Iterable[Any],
    *,
    effective_granularity: Optional[str],
    candidate_counts: Optional[Dict[str, int]] = None,
    top_acts_limit: int = 3,
) -> Dict[str, Any]:
    """Aggregate textual evidence into a product-facing hierarchy overview.

    Args:
        items: Retrieval hits; each one is flattened with ``_extract_entry``.
        effective_granularity: Echoed unchanged into the result.
        candidate_counts: Optional counts echoed into the result under
            ``"candidate_counts"``; omitted when ``None`` or empty.
        top_acts_limit: Maximum number of acts listed in ``"top_acts"``;
            values below 1 are treated as 1.

    Returns:
        A dict with:

        * ``"effective_granularity"``: the value passed in;
        * ``"selected_counts"``: number of distinct acts, subdivisions and
          chunks seen;
        * ``"source_kind_counts"``: number of hits per source kind;
        * ``"top_acts"``: per-act summaries (``act_id``, ``celex``,
          ``title``, ``hit_count``, ``best_score`` rounded to 4 decimals),
          ordered by hit count, then best score (both descending), then
          CELEX;
        * ``"candidate_counts"``: only when a non-empty value was supplied.
    """
    acts: Dict[int, Dict[str, Any]] = {}
    subdivision_ids: set[int] = set()
    chunk_ids: set[int] = set()
    source_kind_counts: Dict[str, int] = {}

    for item in items:
        entry = _extract_entry(item)
        act_id = entry["act_id"]
        subdivision_id = entry["subdivision_id"]
        chunk_id = entry["chunk_id"]
        score = entry["score"]
        celex = entry["celex"]
        title = entry["title"]
        source_kind = entry["source_kind"] or (
            "chunk" if chunk_id is not None else "subdivision"
        )

        if subdivision_id is not None:
            subdivision_ids.add(subdivision_id)
        if chunk_id is not None:
            chunk_ids.add(chunk_id)
        source_kind_counts[source_kind] = source_kind_counts.get(source_kind, 0) + 1

        # Hits without an act id still count above, but cannot be grouped.
        if act_id is None:
            continue

        act_entry = acts.get(act_id)
        if act_entry is None:
            # celex/title start empty so that any later hit can fill them in;
            # display fallbacks are applied once, after the loop.
            act_entry = acts[act_id] = {
                "act_id": act_id,
                "celex": None,
                "title": None,
                "hit_count": 0,
                "best_score": None,
            }

        act_entry["hit_count"] += 1
        if score is not None and (
            act_entry["best_score"] is None or score > act_entry["best_score"]
        ):
            act_entry["best_score"] = score
        if act_entry["celex"] is None and celex is not None:
            act_entry["celex"] = celex
        # A title equal to the hit's own CELEX is only a stand-in for a
        # missing title; do not let it block a real title from a later hit.
        if act_entry["title"] is None and title is not None and title != celex:
            act_entry["title"] = title

    for act_entry in acts.values():
        if act_entry["title"] is None:
            act_entry["title"] = act_entry["celex"] or f"Acte {act_entry['act_id']}"
        if act_entry["best_score"] is not None:
            act_entry["best_score"] = round(act_entry["best_score"], 4)

    def _ranking_key(act_entry: Dict[str, Any]):
        best = act_entry["best_score"]
        # Acts without any score sort after every scored act with the same
        # hit count, including acts whose best score is negative.
        effective_score = float(best) if best is not None else float("-inf")
        return (
            -int(act_entry["hit_count"]),
            -effective_score,
            str(act_entry["celex"] or ""),
        )

    top_acts = sorted(acts.values(), key=_ranking_key)[: max(int(top_acts_limit), 1)]

    overview: Dict[str, Any] = {
        "effective_granularity": effective_granularity,
        "selected_counts": {
            "acts": len(acts),
            "subdivisions": len(subdivision_ids),
            "chunks": len(chunk_ids),
        },
        "source_kind_counts": source_kind_counts,
        "top_acts": top_acts,
    }
    if candidate_counts:
        overview["candidate_counts"] = candidate_counts
    return overview