Source code for lalandre_rag.summaries.generator

"""Canonical summary generation: LLM-based and deterministic fallback."""

from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional, Sequence

from lalandre_core.config import get_config
from lalandre_core.http.llm_client import JSONHTTPLLMClient

from .agent import run_summary_agent
from .models import CANONICAL_SUMMARY_PROMPT_VERSION, SummaryTraceRecorder

logger = logging.getLogger(__name__)


class CanonicalSummaryGenerator:
    """Generate stable, reusable act summaries from structured act content.

    The generator first tries to obtain a summary from an LLM client and
    falls back to a deterministic, template-based summary when no client is
    configured or when the LLM call fails for any reason.
    """

    def __init__(
        self,
        *,
        llm_client: Optional[JSONHTTPLLMClient],
        prompt_version: str = CANONICAL_SUMMARY_PROMPT_VERSION,
        model_id: Optional[str] = None,
    ) -> None:
        """Store the client and the generation settings.

        Args:
            llm_client: Client whose ``generate`` callable is handed to the
                summary agent, or ``None`` to always use the deterministic
                summary.
            prompt_version: Identifier recorded in every generation trace.
            model_id: Model name passed to the summary agent and recorded in
                traces; defaults to ``"deterministic:fallback"`` when falsy.
        """
        self.llm_client = llm_client
        self.prompt_version = prompt_version
        self.model_id = model_id or "deterministic:fallback"
        # Character budget for the structured context sent to the LLM.
        self.max_context_chars = int(get_config().generation.summarize_max_context_chars)

    def generate(
        self,
        *,
        act: Any,
        version: Any,
        subdivisions: Sequence[Any],
    ) -> Dict[str, Any]:
        """Generate a canonical summary with LLM-first and deterministic fallback.

        Args:
            act: Object exposing (optionally) ``title``, ``celex``,
                ``act_type`` and ``language`` attributes.
            version: Object exposing (optionally) ``version_number``,
                ``version_type`` and ``version_date``, or ``None``.
            subdivisions: Objects exposing (optionally)
                ``subdivision_type``, ``number``, ``title`` and ``content``.

        Returns:
            A dict with the keys ``summary_text``, ``trace`` and ``model_id``.
        """
        context = self._build_structured_context(act=act, version=version, subdivisions=subdivisions)
        if self.llm_client is None:
            return self._deterministic_result(
                mode="deterministic",
                act=act,
                version=version,
                subdivisions=subdivisions,
                context=context,
            )

        prompt = self._build_prompt(act=act, version=version, context=context)
        try:
            # Only the agent output is needed; the retry count it also
            # returns is not used here.
            output = run_summary_agent(
                prompt=prompt,
                generate_text=self.llm_client.generate,
                model_name=self.model_id,
            )[0]
            return {
                "summary_text": output.summary,
                "trace": SummaryTraceRecorder.generation(
                    mode="llm",
                    context_chars=len(context),
                    subdivisions_used=len(subdivisions),
                    model_id=self.model_id,
                    prompt_version=self.prompt_version,
                ),
                "model_id": self.model_id,
            }
        except Exception as exc:
            # Deliberately broad: any failure on the LLM path must degrade to
            # the deterministic summary instead of propagating to the caller.
            logger.warning("Canonical act summary generation failed for %s: %s", getattr(act, "celex", "?"), exc)
            return self._deterministic_result(
                mode="deterministic_fallback",
                act=act,
                version=version,
                subdivisions=subdivisions,
                context=context,
            )

    def _deterministic_result(
        self,
        *,
        mode: str,
        act: Any,
        version: Any,
        subdivisions: Sequence[Any],
        context: str,
    ) -> Dict[str, Any]:
        """Build the result dict for the deterministic (non-LLM) path."""
        return {
            "summary_text": self._build_deterministic_summary(act=act, version=version, subdivisions=subdivisions),
            "trace": SummaryTraceRecorder.generation(
                mode=mode,
                context_chars=len(context),
                subdivisions_used=len(subdivisions),
                model_id="deterministic:fallback",
                prompt_version=self.prompt_version,
            ),
            "model_id": "deterministic:fallback",
        }

    @staticmethod
    def _normalized_content(subdivision: Any) -> str:
        """Return the subdivision content with whitespace collapsed.

        A missing or ``None`` content yields an empty string rather than the
        literal text ``"None"``.
        """
        raw = getattr(subdivision, "content", "")
        if raw is None:
            return ""
        return " ".join(str(raw).split())

    def _build_prompt(self, *, act: Any, version: Any, context: str) -> str:
        """Assemble the LLM prompt: instructions, act metadata, then context."""
        version_label = (
            f"version {getattr(version, 'version_number', '?')} du {getattr(version, 'version_date', '')}"
            if version is not None
            else "version non précisée"
        )
        return (
            "Tu résumes un acte juridique pour un panneau de bibliothèque documentaire.\n"
            "Retourne uniquement un objet JSON de la forme "
            '{"summary":"..."}.\n'
            "Règles:\n"
            "- 4 à 6 phrases maximum.\n"
            "- Ton factuel, sans citations, sans markdown, sans liste.\n"
            "- Décris l'objet du texte, sa portée et sa structure si visible.\n"
            "- N'invente aucun élément absent du contenu fourni.\n\n"
            f"Acte: {getattr(act, 'title', '')}\n"
            f"CELEX: {getattr(act, 'celex', '')}\n"
            f"Type: {getattr(act, 'act_type', '')}\n"
            f"Langue: {getattr(act, 'language', '')}\n"
            f"Version: {version_label}\n\n"
            "Contenu structuré:\n"
            f"{context}"
        )

    def _build_structured_context(
        self,
        *,
        act: Any,
        version: Any,
        subdivisions: Sequence[Any],
    ) -> str:
        """Build the text context: a metadata header followed by subdivisions.

        Subdivision blocks are appended until ``self.max_context_chars`` is
        used up. The first subdivision is truncated to fit when it is too
        large; later ones are dropped whole when they do not fit. The
        newline separators added when joining blocks are not counted against
        the budget.
        """
        header_lines = [
            f"Titre: {getattr(act, 'title', '')}",
            f"CELEX: {getattr(act, 'celex', '')}",
            f"Type: {getattr(act, 'act_type', '')}",
            f"Langue: {getattr(act, 'language', '')}",
        ]
        if version is not None:
            header_lines.append(
                f"Version courante: {getattr(version, 'version_number', '?')} ({getattr(version, 'version_type', '')})"
            )

        header = "\n".join(header_lines)
        blocks: List[str] = [header]
        remaining = self.max_context_chars - len(header)

        for subdivision in subdivisions:
            # Stop once the budget is spent. This also covers a header that
            # already exceeds the budget: slicing with a negative bound would
            # keep almost the entire block instead of nothing.
            if remaining <= 0:
                break

            label_parts = [str(getattr(subdivision, "subdivision_type", "") or "").strip().lower()]
            number = getattr(subdivision, "number", None)
            title = getattr(subdivision, "title", None)
            if isinstance(number, str) and number.strip():
                label_parts.append(number.strip())
            if isinstance(title, str) and title.strip():
                label_parts.append(title.strip())
            label = " ".join(part for part in label_parts if part).strip()

            content = self._normalized_content(subdivision)
            if not content:
                continue

            block = f"\n[{label or 'contenu'}]\n{content}"
            # Only the first subdivision may be truncated; once at least one
            # has been included, an oversized block ends the context.
            if len(block) > remaining and len(blocks) > 1:
                break
            clipped = block[:remaining]
            blocks.append(clipped)
            remaining -= len(clipped)

        return "\n".join(blocks).strip()

    def _build_deterministic_summary(
        self,
        *,
        act: Any,
        version: Any,
        subdivisions: Sequence[Any],
    ) -> str:
        """Build a template-based summary without calling the LLM.

        The summary states the act title, type and language, the version it
        is based on, a lower bound on the article count, and either the
        first section titles or the opening of the first subdivision.
        """
        act_type = str(getattr(act, "act_type", "") or "texte")
        # Fall through title -> celex -> generic label so that a None value
        # never ends up rendered as the text "None".
        title = str(getattr(act, "title", None) or getattr(act, "celex", None) or "Cet acte")

        version_bits: List[str] = []
        if version is not None:
            version_number = getattr(version, "version_number", None)
            version_type = getattr(version, "version_type", None)
            if version_number is not None:
                version_bits.append(f"version {version_number}")
            if version_type:
                version_bits.append(str(version_type))

        article_count = 0
        section_titles: List[str] = []
        for subdivision in subdivisions:
            subtype = str(getattr(subdivision, "subdivision_type", "") or "").lower()
            if subtype == "article":
                article_count += 1
            title_value = getattr(subdivision, "title", None)
            if isinstance(title_value, str) and title_value.strip():
                section_titles.append(title_value.strip())
            # The scan stops after three titles, so article_count is only a
            # lower bound; the sentence below says "au moins" accordingly.
            if len(section_titles) >= 3:
                break

        sentences = [f"{title} est un {act_type} en langue {getattr(act, 'language', '')}."]
        if version_bits:
            sentences.append(f"La synthèse disponible repose sur la {' '.join(version_bits)} du texte.")
        if article_count > 0:
            article_suffix = "s" if article_count > 1 else ""
            sentences.append(f"Le document structuré couvre au moins {article_count} article{article_suffix}.")
        if section_titles:
            sentences.append(f"Les premières sections identifiables portent notamment sur {', '.join(section_titles)}.")
        if not section_titles and subdivisions:
            first = self._normalized_content(subdivisions[0])
            if first:
                sentences.append(f"Le texte ouvre sur : {first[:220].rstrip()}...")
        sentences.append(
            "Ce résumé a été produit en mode déterministe de secours et peut être raffiné lors d'un prochain recalcul."
        )
        return " ".join(sentence.strip() for sentence in sentences if sentence.strip())