"""Fast extraction of scope articles (Article 1-3) from EUR-Lex acts.
Strategy: single bulk SELECT joining acts + article subdivisions + their subtrees.
Caps per-article content at ~2500 chars (scope info is always in first paragraphs).
"""
from __future__ import annotations
import os
import sys
import time
import psycopg2
from psycopg2.extras import RealDictCursor
# Connection string; override via DATABASE_URL for non-dev environments.
DATABASE_URL = os.environ.get("DATABASE_URL", "postgres://dev:dev@127.0.0.1:5432/document")
# Destination Markdown file for the digest (override via SCOPE_OUTPUT).
OUTPUT = os.environ.get("SCOPE_OUTPUT", "SCOPE.md")
# Per-article rendering cap — scope information sits in the first paragraphs.
MAX_ARTICLE_CHARS = 2500
# One bulk query: pick Articles 1-3 of every act (DISTINCT ON keeps, per
# act+number, the candidate with the longest content), then join back every
# subdivision at or below each chosen article's hierarchy_path.
#
# NOTE: the doubled `%%` is the psycopg2 escape for a literal `%`. This query
# is executed WITHOUT parameters, so psycopg2 performs no client-side
# substitution and `%%` reaches the server verbatim — harmless in LIKE, since
# consecutive `%` wildcards match the same as a single one, but keep the
# doubling so the query stays safe if parameters are ever added.
BULK_QUERY = """
WITH scope_articles AS (
SELECT DISTINCT ON (s.act_id, s.number)
s.id, s.act_id, s.number, s.hierarchy_path
FROM subdivisions s
JOIN acts a ON a.id = s.act_id
WHERE a.celex NOT LIKE 'LEGI%%'
AND s.subdivision_type = 'article'
AND s.number IN ('1', '2', '3')
ORDER BY s.act_id, s.number, LENGTH(COALESCE(s.content, '')) DESC, s.id
)
SELECT
a.celex, a.title AS act_title, a.language, a.url_eurlex,
sa.number AS article_num,
sa.hierarchy_path AS article_path,
s.hierarchy_path AS sub_path,
s.content AS sub_content,
s.title AS sub_title
FROM scope_articles sa
JOIN acts a ON a.id = sa.act_id
JOIN subdivisions s ON s.act_id = sa.act_id
AND (s.hierarchy_path = sa.hierarchy_path
OR s.hierarchy_path LIKE sa.hierarchy_path || '/%%')
ORDER BY a.celex, sa.number, s.hierarchy_path;
"""
def main():
    """Extract Articles 1-3 (the "scope" articles) of each act into a Markdown digest.

    Runs one bulk query on a server-side cursor and streams the result,
    grouping rows by (celex, article number) on the fly — the query's
    ORDER BY guarantees rows for one act/article arrive contiguously.
    Each article's rendered content is capped at MAX_ARTICLE_CHARS.

    Fixes vs. previous revision: removed a stray ``[docs]`` Sphinx-viewcode
    artifact that broke the module, ensured the connection and the
    server-side cursor are always released, and guarded against NULL act
    titles.
    """
    t0 = time.time()
    conn = psycopg2.connect(DATABASE_URL)
    try:
        # Named cursor => PostgreSQL server-side cursor: rows arrive in
        # batches of `itersize` instead of one giant client-side fetch.
        # The `with` block guarantees the cursor is closed on the server.
        with conn.cursor(name="scope_bulk", cursor_factory=RealDictCursor) as cur:
            cur.itersize = 5000
            print("[extract_scope] running bulk query…", file=sys.stderr)
            cur.execute(BULK_QUERY)
            print(f"[extract_scope] query done in {time.time() - t0:.1f}s, streaming rows", file=sys.stderr)

            # Streaming group-by state (ORDER BY guarantees locality).
            acts_written = 0
            articles_written = 0
            cur_celex = None
            cur_article_num = None
            cur_article_buf: list[str] = []
            cur_article_chars = 0
            cur_act_meta: dict = {}

            with open(OUTPUT, "w", encoding="utf-8") as f:
                f.write("# EUR-Lex Scope Articles\n\n")
                f.write(
                    f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
                    f"Source: EUR-Lex acts, Article 1-3 with direct subtree (capped)\n\n---\n\n"
                )

                def flush_article():
                    # Emit the buffered article (if any) and reset the buffer.
                    nonlocal articles_written, cur_article_buf, cur_article_chars
                    if cur_article_num is None or not cur_article_buf:
                        cur_article_buf = []
                        cur_article_chars = 0
                        return
                    f.write(f"### Article {cur_article_num}\n\n")
                    f.write("".join(cur_article_buf).rstrip() + "\n\n")
                    articles_written += 1
                    cur_article_buf = []
                    cur_article_chars = 0

                def flush_act():
                    # Close out the current act: flush its last article and
                    # write a separator — but only if an act was opened.
                    nonlocal acts_written
                    flush_article()
                    if cur_act_meta:
                        f.write("---\n\n")
                        acts_written += 1

                def open_act(meta):
                    # Per-act header; act_title may be NULL in the DB.
                    f.write(f"## {meta['celex']} — {(meta['act_title'] or '')[:200]}\n\n")
                    f.write(f"**Language:** {meta['language']}\n")
                    if meta.get("url_eurlex"):
                        f.write(f"**URL:** {meta['url_eurlex']}\n")
                    f.write("\n")

                n = 0
                for row in cur:
                    n += 1
                    if n % 20000 == 0:
                        print(
                            f"[extract_scope] {n} rows, {acts_written} acts, "
                            f"{articles_written} articles, {time.time() - t0:.0f}s",
                            file=sys.stderr,
                        )
                    celex = row["celex"]
                    article_num = row["article_num"]
                    # Act transition
                    if celex != cur_celex:
                        flush_act()
                        cur_act_meta = {
                            "celex": celex,
                            "act_title": row["act_title"],
                            "language": row["language"],
                            "url_eurlex": row["url_eurlex"],
                        }
                        open_act(cur_act_meta)
                        cur_celex = celex
                        cur_article_num = None
                    # Article transition
                    if article_num != cur_article_num:
                        flush_article()
                        cur_article_num = article_num
                    # Skip if we've already hit the cap for this article
                    if cur_article_chars >= MAX_ARTICLE_CHARS:
                        continue
                    content = (row["sub_content"] or "").strip()
                    title = (row["sub_title"] or "").strip()
                    if not content and not title:
                        continue
                    # Indent proportionally to depth below the article root.
                    rel_depth = max(
                        0,
                        row["sub_path"].count("/") - row["article_path"].count("/"),
                    )
                    prefix = " " * rel_depth
                    piece = []
                    # Only print the subdivision title if the content doesn't
                    # already start with it (avoids duplicated headings).
                    if title and title not in content[:200]:
                        piece.append(f"{prefix}**{title}**\n")
                    if content:
                        # Truncate so the article stays within the cap,
                        # leaving slack for the ellipsis marker.
                        remaining = MAX_ARTICLE_CHARS - cur_article_chars
                        if len(content) > remaining:
                            content = content[: max(0, remaining - 20)] + "…"
                        for line in content.split("\n"):
                            piece.append(f"{prefix}{line}\n")
                        piece.append("\n")
                    blob = "".join(piece)
                    cur_article_buf.append(blob)
                    cur_article_chars += len(blob)

                # Flush the trailing act — the loop only flushes on transitions.
                flush_act()
    finally:
        # Always release the connection, even if the query or writing failed.
        conn.close()

    elapsed = time.time() - t0
    print(
        f"[extract_scope] DONE: {acts_written} acts, {articles_written} articles in {elapsed:.0f}s → {OUTPUT}",
        file=sys.stderr,
    )
# Script entry point: run the extraction when invoked directly.
if __name__ == "__main__":
    main()