Source code for lalandre_chunking
"""
Chunking Service
"""
from __future__ import annotations
from lalandre_embedding.base import EmbeddingProvider
from .chunker import Chunker
from .pipeline import (
ArticleLevelPlan,
make_article_level_chunks,
prepare_article_level_plan,
serialize_chunk_records,
)
from .sac_chunker import SACChunker
# Public names of this module: these are what `from <package> import *`
# exposes. "EmbeddingProvider" is re-exported from `lalandre_embedding.base`;
# "get_chunker" is defined below; the remaining names come from the sibling
# `.chunker`, `.pipeline` and `.sac_chunker` modules imported above.
__all__ = [
"ArticleLevelPlan",
"Chunker",
"EmbeddingProvider",
"SACChunker",
"get_chunker",
"make_article_level_chunks",
"prepare_article_level_plan",
"serialize_chunk_records",
]
[docs]
def get_chunker(
    *,
    embedding_provider: EmbeddingProvider,
    min_chunk_size: int,
    max_chunk_size: int,
    chunk_overlap: int,
    chars_per_token: float = 3.3,
    breakpoint_percentile: float = 90.0,
    breakpoint_max_threshold: float = 1.0,
    sentence_window_size: int = 1,
    embedding_batch_size: int = 32,
) -> SACChunker:
    """Build and return a configured :class:`SACChunker`.

    All parameters are keyword-only. Each one is handed to the
    ``SACChunker`` constructor unchanged and under the same name, with a
    single exception: ``embedding_batch_size`` is passed on as the
    constructor's ``batch_size`` argument.

    ``embedding_provider``, ``min_chunk_size``, ``max_chunk_size`` and
    ``chunk_overlap`` are required; the remaining parameters fall back to
    the defaults shown in the signature. No validation is performed here;
    whatever checks exist live in ``SACChunker`` itself.

    Returns:
        The newly constructed ``SACChunker`` instance.
    """
    # Gather the constructor keywords in one mapping so the single rename
    # (embedding_batch_size -> batch_size) is easy to spot.
    chunker_options = {
        "embedding_provider": embedding_provider,
        "min_chunk_size": min_chunk_size,
        "max_chunk_size": max_chunk_size,
        "chunk_overlap": chunk_overlap,
        "chars_per_token": chars_per_token,
        "breakpoint_percentile": breakpoint_percentile,
        "breakpoint_max_threshold": breakpoint_max_threshold,
        "sentence_window_size": sentence_window_size,
        "batch_size": embedding_batch_size,
    }
    return SACChunker(**chunker_options)