# Source code for lalandre_rag.modes.llm_mode
"""
LLM Only Mode
Pure LLM generation without retrieval
"""
import time
from typing import Any, Dict, Iterator
from langchain_core.output_parsers import StrOutputParser
from ..prompts import render_llm_only_prompt
from ..response import create_llm_only_response
class LLMMode:
    """
    MODE 2: Pure LLM (100% Generation)
    Generate answer using only LLM knowledge (no retrieval)
    """

    def __init__(self, llm: Any):
        """
        Initialize LLM only mode

        Args:
            llm: LLM client
        """
        self.llm = llm

    def query(self, question: str, include_warning: bool = True) -> Dict[str, Any]:
        """
        Generate an answer from the LLM's own knowledge, with no retrieval step.

        Args:
            question: User question
            include_warning: Whether to flag that the answer is not grounded
                in any documents

        Returns:
            Response dictionary with the LLM answer (no sources) plus
            per-phase timing metadata in milliseconds
        """
        started = time.perf_counter()
        chain = self.llm | StrOutputParser()
        answer = chain.invoke(render_llm_only_prompt(question=question))
        elapsed_ms = (time.perf_counter() - started) * 1000.0

        result = create_llm_only_response(
            query=question,
            answer=answer,
            include_warning=include_warning,
        )
        # Generation is the only phase in this mode, so it equals the total.
        result["metadata"]["phase_timings_ms"] = {
            "generation_ms": round(elapsed_ms, 1),
            "total_ms": round(elapsed_ms, 1),
        }
        return result

    def stream_query(self, question: str) -> Iterator[str]:
        """Stream LLM answer token by token."""
        rendered = render_llm_only_prompt(question=question)
        yield from (self.llm | StrOutputParser()).stream(rendered)