Source code for scripts.split_scope
"""Split SCOPE.md into N balanced chunks for parallel sub-agent processing.
Splits on act boundaries (## heading) so no act is truncated.
"""
from __future__ import annotations
import argparse
import os
import re
import sys
# Matches an act heading line of the form "## <identifier> — <title>".
# Group 1 captures the whitespace-free identifier that follows "## ".
# MULTILINE lets "^" anchor at the start of every line, not just the file.
ACT_DELIMITER = re.compile(r"^## (\S+) —", flags=re.MULTILINE)
[docs]
def main() -> None:
    """Split ``SCOPE.md`` into balanced chunks without cutting act sections.

    Command-line options:
        --input       Markdown file to split (default ``SCOPE.md``).
        --chunks      Upper bound on the number of chunk files (default 8).
        --output-dir  Directory receiving the ``SCOPE_part_NN.md`` files
                      (default ``scripts/chunks``); created when missing.

    Acts are located with ``ACT_DELIMITER`` and packed greedily, in file
    order, into groups of roughly ``total_chars // chunks`` characters.
    Fewer than ``--chunks`` files are written when the acts run out first.
    Text preceding the first act heading is not copied into any chunk.
    Progress is reported on stderr.

    Exits with status 2 (through ``argparse``) when ``--chunks`` is below 1,
    and with status 1 when the input contains no act heading.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="SCOPE.md")
    parser.add_argument("--chunks", type=int, default=8)
    parser.add_argument("--output-dir", default="scripts/chunks")
    args = parser.parse_args()
    # Zero would divide by zero below and a negative count is meaningless.
    if args.chunks < 1:
        parser.error("--chunks must be a positive integer")

    with open(args.input, encoding="utf-8") as f:
        content = f.read()

    matches = list(ACT_DELIMITER.finditer(content))
    if not matches:
        print(f"No acts found in {args.input}", file=sys.stderr)
        sys.exit(1)

    # Each act spans from its own heading up to the next heading (or EOF).
    boundaries = [m.start() for m in matches] + [len(content)]
    bodies = [content[lo:hi] for lo, hi in zip(boundaries, boundaries[1:])]

    total_chars = sum(len(body) for body in bodies)
    target_chunk_size = total_chars // args.chunks
    print(
        f"[split_scope] {len(bodies)} acts, {total_chars:,} chars total, target {target_chunk_size:,} chars/chunk",
        file=sys.stderr,
    )

    # Group before writing so every chunk header can state the real total.
    groups = _pack_acts(bodies, target_chunk_size, args.chunks)

    os.makedirs(args.output_dir, exist_ok=True)
    for index, group in enumerate(groups, start=1):
        path = os.path.join(args.output_dir, f"SCOPE_part_{index:02d}.md")
        _write_chunk(path, index, len(groups), group)
        chunk_chars = sum(len(body) for body in group)
        print(
            f"[split_scope] chunk {index}: {len(group)} acts, {chunk_chars:,} chars → {path}",
            file=sys.stderr,
        )

    print(f"[split_scope] wrote {len(groups)} chunks to {args.output_dir}/", file=sys.stderr)


def _pack_acts(bodies: list[str], target: int, max_chunks: int) -> list[list[str]]:
    """Greedily pack consecutive act bodies into at most *max_chunks* groups."""
    groups: list[list[str]] = []
    current: list[str] = []
    current_size = 0
    for body in bodies:
        current.append(body)
        current_size += len(body)
        # Close the group once it reaches the target size, but keep the final
        # slot open so the last group absorbs whatever remains.
        if current_size >= target and len(groups) < max_chunks - 1:
            groups.append(current)
            current = []
            current_size = 0
    if current:
        groups.append(current)
    return groups


def _write_chunk(path: str, index: int, total: int, bodies: list[str]) -> None:
    """Write one chunk file: a short header followed by the act bodies."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"# EUR-Lex Scope — Chunk {index} / {total}\n\n")
        f.write(f"Acts in this chunk: {len(bodies)}\n\n---\n\n")
        f.writelines(bodies)
# Script entry point: run the splitter on direct execution, not on import.
if __name__ == "__main__":
    main()