Source code for lalandre_chunking

"""
Chunking Service
"""

from __future__ import annotations

from lalandre_embedding.base import EmbeddingProvider

from .chunker import Chunker
from .pipeline import (
    ArticleLevelPlan,
    make_article_level_chunks,
    prepare_article_level_plan,
    serialize_chunk_records,
)
from .sac_chunker import SACChunker

# Public API of the package: the names exported by a star-import.
# ``EmbeddingProvider`` is re-exported from ``lalandre_embedding.base``; the
# remaining names come from this package's own submodules or are defined below.
__all__ = [
    "ArticleLevelPlan",
    "Chunker",
    "EmbeddingProvider",
    "SACChunker",
    "get_chunker",
    "make_article_level_chunks",
    "prepare_article_level_plan",
    "serialize_chunk_records",
]


def get_chunker(
    *,
    embedding_provider: EmbeddingProvider,
    min_chunk_size: int,
    max_chunk_size: int,
    chunk_overlap: int,
    chars_per_token: float = 3.3,
    breakpoint_percentile: float = 90.0,
    breakpoint_max_threshold: float = 1.0,
    sentence_window_size: int = 1,
    embedding_batch_size: int = 32,
) -> SACChunker:
    """
    Build a ``SACChunker`` from keyword-only settings.

    Every argument is handed to the ``SACChunker`` constructor unchanged and
    under the same keyword, with one exception: ``embedding_batch_size`` is
    passed on as ``batch_size``.

    Returns:
        The newly constructed ``SACChunker`` instance.
    """
    sac_chunker = SACChunker(
        embedding_provider=embedding_provider,
        min_chunk_size=min_chunk_size,
        max_chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
        chars_per_token=chars_per_token,
        breakpoint_percentile=breakpoint_percentile,
        breakpoint_max_threshold=breakpoint_max_threshold,
        sentence_window_size=sentence_window_size,
        # The constructor's keyword is ``batch_size``; the factory exposes it
        # under the more specific name ``embedding_batch_size``.
        batch_size=embedding_batch_size,
    )
    return sac_chunker