# Source code for lalandre_rag.adapters.llamaindex

"""
LlamaIndex Adapter
Utilities for using LlamaIndex with context slices
"""

from typing import Any, Callable, Dict, List, Optional, Tuple, cast

from llama_index.core import response_synthesizers as _li_response_synth
from llama_index.core.llms import LLM
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.schema import NodeWithScore, TextNode

from lalandre_rag.retrieval.context import ContextSlice

from ..prompts import get_llamaindex_prompt
from ..response import format_doc_location, format_source_header

# Loose alias for llama_index's response-synthesizer factory signature:
# it accepts arbitrary keyword arguments, so a precise Callable type is
# not practical here.
ResponseSynthesizerFactory = Callable[..., Any]
# Resolve the factory via getattr and cast it to the loose alias so static
# type checkers accept the keyword arguments used below; presumably this
# works around incomplete upstream type stubs — TODO confirm.
get_response_synthesizer = cast(
    ResponseSynthesizerFactory,
    getattr(_li_response_synth, "get_response_synthesizer"),
)


class LlamaIndexAdapter:
    """
    Adapter for using LlamaIndex with context slice objects

    Provides:
    - Document to node conversion
    - TreeSummarize for long documents
    - Multi-document comparison
    """

    def __init__(self, llama_llm: LLM):
        """
        Initialize LlamaIndex adapter

        Args:
            llama_llm: LlamaIndex-compatible LLM client
        """
        self.llama_llm = llama_llm

    @staticmethod
    def context_slice_key(doc: ContextSlice) -> Tuple[str, int, Optional[int]]:
        """Return the stable lookup key used for source identifiers.

        Chunk-backed slices key on their chunk id; all others key on the
        subdivision id. The act id disambiguates across acts.
        """
        if doc.doc.chunk_id is not None:
            return ("chunk", doc.doc.chunk_id, doc.act.act_id)
        return ("subdivision", doc.doc.subdivision_id, doc.act.act_id)

    @staticmethod
    def _node_metadata(doc: ContextSlice, source_id: str) -> Dict[str, Any]:
        """Build node metadata for one slice.

        Upstream payload keys (already standardized in common builders) take
        precedence; `setdefault` only fills in values the payload lacks.
        """
        metadata: Dict[str, Any] = dict(doc.doc.payload or {})
        metadata.setdefault("celex", doc.act.celex)
        metadata.setdefault("title", doc.act.title)
        metadata.setdefault("subdivision_type", doc.doc.subdivision_type)
        metadata.setdefault("act_type", doc.act.act_type)
        metadata.setdefault("sequence_order", doc.doc.sequence_order)
        metadata.setdefault("subdivision_id", doc.doc.subdivision_id)
        metadata.setdefault("act_id", doc.act.act_id)
        metadata.setdefault("source_kind", doc.doc.source_kind)
        metadata.setdefault("source_id", source_id)
        # Chunk-level fields are optional and only set when present.
        if doc.doc.chunk_id is not None:
            metadata.setdefault("chunk_id", doc.doc.chunk_id)
        if doc.doc.chunk_index is not None:
            metadata.setdefault("chunk_index", doc.doc.chunk_index)
        if doc.doc.char_start is not None:
            metadata.setdefault("char_start", doc.doc.char_start)
        if doc.doc.char_end is not None:
            metadata.setdefault("char_end", doc.doc.char_end)
        if doc.act.url_eurlex:
            metadata.setdefault("url_eurlex", doc.act.url_eurlex)
        return metadata

    def context_slices_to_nodes(
        self,
        context_slices: List[ContextSlice],
        source_id_map: Optional[Dict[Tuple[str, int, Optional[int]], str]] = None,
    ) -> List[NodeWithScore]:
        """
        Convert ContextSlice objects to LlamaIndex NodeWithScore

        Args:
            context_slices: List of context slices
            source_id_map: Optional mapping from ``context_slice_key`` results
                to pre-assigned source identifiers. Slices missing from the
                map (or mapped to an empty value) fall back to positional ids
                ("S1", "S2", ...).

        Returns:
            List of LlamaIndex nodes with scores
        """
        nodes: List[NodeWithScore] = []
        for idx, doc in enumerate(context_slices, start=1):
            key = self.context_slice_key(doc)
            # `or` keeps the original behavior: an empty mapped value also
            # falls back to the positional id.
            source_id = (source_id_map.get(key) if source_id_map else None) or f"S{idx}"
            metadata = self._node_metadata(doc, source_id)

            # Stable node id: chunk-based when available, else subdivision-based.
            node_id = (
                f"{doc.act.celex}_{doc.doc.chunk_id}"
                if doc.doc.chunk_id is not None
                else f"{doc.act.celex}_{doc.doc.subdivision_id}"
            )
            location = format_doc_location(
                doc.doc.chunk_id,
                doc.doc.chunk_index,
                doc.doc.subdivision_type,
                doc.doc.subdivision_id,
            )
            # Inline source header in the node text so citations survive
            # synthesis steps that only see the text.
            header = format_source_header(
                source_id,
                doc.act.celex,
                location,
                doc.act.title,
                regulatory_level=doc.act.regulatory_level,
            )
            node = TextNode(text=f"{header}\n{doc.content}", id_=node_id)
            node.metadata = metadata
            # Direct pydantic constructor validates the same way as
            # model_validate on an equivalent dict.
            nodes.append(NodeWithScore(node=node, score=doc.score))
        return nodes

    def _synthesize(self, prompt_name: str, query: str, nodes: List[NodeWithScore]) -> str:
        """Run TreeSummarize synthesis with a centralized prompt template."""
        synthesizer = get_response_synthesizer(
            response_mode=ResponseMode.TREE_SUMMARIZE,
            llm=self.llama_llm,
            summary_template=get_llamaindex_prompt(prompt_name),
            use_async=False,
        )
        response = synthesizer.synthesize(query=query, nodes=nodes)
        return str(response)

    def summarize(
        self,
        topic: str,
        context_slices: List[ContextSlice],
        source_id_map: Optional[Dict[Tuple[str, int, Optional[int]], str]] = None,
    ) -> str:
        """
        Use LlamaIndex TreeSummarize for hierarchical summarization

        Better for long documents as it summarizes in chunks then combines

        Args:
            topic: Topic to summarize
            context_slices: Context slices to summarize
            source_id_map: Optional pre-assigned source identifiers
                (see :meth:`context_slices_to_nodes`)

        Returns:
            Summary text
        """
        nodes = self.context_slices_to_nodes(context_slices, source_id_map=source_id_map)
        return self._synthesize("summary", topic, nodes)

    def compare(
        self,
        comparison_question: str,
        context_slices: List[ContextSlice],
        celex_list: List[str],
        source_id_map: Optional[Dict[Tuple[str, int, Optional[int]], str]] = None,
    ) -> str:
        """
        Use LlamaIndex for intelligent multi-document comparison

        Groups documents by CELEX and compares systematically

        Args:
            comparison_question: Question for comparison
            context_slices: Context slices to compare
            celex_list: List of CELEX codes being compared (kept for interface
                compatibility; grouping is derived from the slices themselves)
            source_id_map: Optional pre-assigned source identifiers
                (see :meth:`context_slices_to_nodes`)

        Returns:
            Comparison text
        """
        # Group documents by CELEX; slices without a CELEX are dropped,
        # as before.
        docs_by_celex: Dict[str, List[ContextSlice]] = {}
        for doc in context_slices:
            if doc.act.celex:
                docs_by_celex.setdefault(doc.act.celex, []).append(doc)

        # Convert once over the grouped order. Converting per group (as the
        # previous implementation did) restarted the positional fallback
        # source ids at "S1" for every CELEX, yielding duplicate identifiers
        # when no source_id_map was supplied.
        grouped_slices: List[ContextSlice] = [
            doc for docs in docs_by_celex.values() for doc in docs
        ]
        all_nodes = self.context_slices_to_nodes(
            grouped_slices, source_id_map=source_id_map
        )

        # The comparison template is centralized in prompts.py
        return self._synthesize("comparison", comparison_question, all_nodes)