# Source code for scripts.split_scope

"""Split SCOPE.md into N balanced chunks for parallel sub-agent processing.

Splits on act boundaries (## heading) so no act is truncated.
"""

from __future__ import annotations

import argparse
import os
import re
import sys

# Matches the heading line that opens an act: "## <identifier> —".
# Group 1 captures the identifier (the whitespace-free token after "## ").
ACT_DELIMITER = re.compile(r"^## (\S+) —", re.MULTILINE)


def _parse_acts(content: str) -> list[tuple[str, str]]:
    """Return ``(identifier, text)`` for every act section in *content*.

    An act runs from its ``## <identifier> —`` heading up to the next such
    heading, or to the end of *content* for the last act. Text before the
    first heading belongs to no act and is left out.
    """
    matches = list(ACT_DELIMITER.finditer(content))
    # Each act ends where the next one starts; the last one runs to the end.
    bounds = [m.start() for m in matches] + [len(content)]
    return [
        (m.group(1), content[start:end])
        for m, start, end in zip(matches, bounds, bounds[1:])
    ]


def _group_acts(
    acts: list[tuple[str, str]], target_size: int, max_groups: int
) -> list[list[tuple[str, str]]]:
    """Greedily pack consecutive acts into at most *max_groups* groups.

    A group is closed as soon as its accumulated text length reaches
    *target_size*. Once ``max_groups - 1`` groups are closed, everything left
    goes into the final group, so fewer than *max_groups* groups may result.
    """
    groups: list[list[tuple[str, str]]] = []
    current: list[tuple[str, str]] = []
    current_size = 0
    for act in acts:
        current.append(act)
        current_size += len(act[1])
        if current_size >= target_size and len(groups) < max_groups - 1:
            groups.append(current)
            current = []
            current_size = 0
    if current:
        groups.append(current)
    return groups


def _write_chunk(
    path: str, index: int, total: int, acts: list[tuple[str, str]]
) -> None:
    """Write one chunk file: a short header followed by the acts' raw text."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"# EUR-Lex Scope — Chunk {index} / {total}\n\n")
        f.write(f"Acts in this chunk: {len(acts)}\n\n---\n\n")
        f.writelines(text for _, text in acts)


def main(argv: list[str] | None = None) -> None:
    """Split ``SCOPE.md`` into balanced chunks without cutting act sections.

    Reads the ``--input`` file, splits it at act headings, packs the acts in
    order into at most ``--chunks`` groups of roughly equal character count,
    and writes each group to ``SCOPE_part_NN.md`` inside ``--output-dir``
    (created if missing). Progress is reported on stderr. Text that precedes
    the first act heading is not copied into any chunk.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 when ``--chunks`` is not a positive integer
            (or argparse rejects the arguments), and with status 1 when the
            input contains no act headings.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="SCOPE.md")
    parser.add_argument("--chunks", type=int, default=8)
    parser.add_argument("--output-dir", default="scripts/chunks")
    args = parser.parse_args(argv)
    # A zero chunk count would divide by zero below; negatives are meaningless.
    if args.chunks < 1:
        parser.error("--chunks must be a positive integer")

    with open(args.input, encoding="utf-8") as f:
        content = f.read()

    acts = _parse_acts(content)
    if not acts:
        print(f"No acts found in {args.input}", file=sys.stderr)
        sys.exit(1)

    total_chars = sum(len(text) for _, text in acts)
    target_chunk_size = total_chars // args.chunks
    print(
        f"[split_scope] {len(acts)} acts, {total_chars:,} chars total, "
        f"target {target_chunk_size:,} chars/chunk",
        file=sys.stderr,
    )

    # Group first so each chunk header can state how many chunks really exist;
    # the greedy packing may yield fewer than the number requested.
    groups = _group_acts(acts, target_chunk_size, args.chunks)

    os.makedirs(args.output_dir, exist_ok=True)
    for index, group in enumerate(groups, start=1):
        path = os.path.join(args.output_dir, f"SCOPE_part_{index:02d}.md")
        _write_chunk(path, index, len(groups), group)
        size = sum(len(text) for _, text in group)
        print(
            f"[split_scope] chunk {index}: {len(group)} acts, {size:,} chars → {path}",
            file=sys.stderr,
        )

    print(
        f"[split_scope] wrote {len(groups)} chunks to {args.output_dir}/",
        file=sys.stderr,
    )
# Run the splitter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()