# Source code for lalandre_rag.modes.llm_mode
"""
LLM Only Mode
Pure LLM generation without retrieval
"""
import time
from typing import Any, Dict, Iterator
from langchain_core.output_parsers import StrOutputParser
from ..prompts import render_llm_only_prompt
from ..response import create_llm_only_response
class LLMMode:
    """
    MODE 2: Pure LLM (100% Generation)
    Generate answer using only LLM knowledge (no retrieval)
    """

    def __init__(self, llm: Any):
        """
        Initialize LLM only mode

        Args:
            llm: LLM client
        """
        self.llm = llm

    def query(self, question: str, include_warning: bool = True) -> Dict[str, Any]:
        """
        Generate an answer from the LLM's own knowledge, with no retrieval step.

        Args:
            question: User question
            include_warning: Whether to flag that the answer is not grounded
                in any documents

        Returns:
            Response dictionary with the LLM answer (no sources) plus
            per-phase timing metadata in milliseconds
        """
        started = time.perf_counter()
        chain = self.llm | StrOutputParser()
        answer = chain.invoke(render_llm_only_prompt(question=question))
        elapsed_ms = (time.perf_counter() - started) * 1000.0

        result = create_llm_only_response(
            query=question,
            answer=answer,
            include_warning=include_warning,
        )
        # Generation is the only phase in this mode, so it equals the total.
        result["metadata"]["phase_timings_ms"] = {
            "generation_ms": round(elapsed_ms, 1),
            "total_ms": round(elapsed_ms, 1),
        }
        return result

    def stream_query(self, question: str) -> Iterator[str]:
        """Stream LLM answer token by token."""
        rendered = render_llm_only_prompt(question=question)
        yield from (self.llm | StrOutputParser()).stream(rendered)