synta 0.2.3

ASN.1 parser, decoder, and encoder library with DER/BER support and C FFI
Documentation
#!/usr/bin/env python3
"""doc-python-samples.py — extract and classify Python code blocks from Synta docs.

Called by doc-python-samples.sh with:
    python3 doc-python-samples.py <work_dir> <file1.md> [file2.md ...]

Outputs to stdout a single line:
    total_blocks<TAB>skip_blocks

Writes to <work_dir>:

  raw/NNNNN.py     — raw extracted code (no wrapper)
  src/NNNNN.py     — wrapped, ready-to-syntax-check translation unit
  manifest.tsv     — tab-separated, one row per checkable block:
      doc_file  start_line  lang  src_file  kind  raw_file

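kind values (tested in the order listed; the first match wins)
--------------------------------------------------------------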
  skip_annotated — fenced block has ``ignore`` or ``notest`` annotation
  skip_nonsynta  — no ``synta`` identifier found; skip
  stub           — contains bare ``...`` ellipsis (type-stub / pseudo-code); syntax-check only
  program        — has ``if __name__ == "__main__":``; checked as-is
  fragment       — standalone statements; wrapped to import synta at top
"""

import os
import re
import sys


# ── Regexps ──────────────────────────────────────────────────────────────────

# Only blocks that mention the ``synta`` module (in any letter case, as a
# whole word) are considered worth checking.
SYNTA_RE = re.compile(r"\bsynta\b", flags=re.I)

# A line holding nothing but ``...`` signals a type-stub / pseudo-code
# listing (e.g. class or method signatures).  Such blocks remain valid
# Python and can still be py_compile-checked; they get their own kind so
# the calling shell script can report them separately.
ELLIPSIS_RE = re.compile(r"^\s*\.\.\.\s*$", flags=re.M)

# An ``if __name__ == "__main__":`` guard identifies a complete program.
MAIN_RE = re.compile(r'if\s+__name__\s*==\s*["\']__main__["\']')

# Fence annotations that declare a block "deliberately not runnable".
SKIP_ANNOTATIONS = frozenset(("ignore", "notest"))


# ── Source wrappers ──────────────────────────────────────────────────────────

# Header placed in front of ``fragment`` blocks.  It consists of two
# explanatory comment lines followed by ``import synta``.  The import is
# purely documentary: py_compile only checks syntax and never actually
# imports anything, so the line merely keeps the intent of the wrapped
# snippet clear.
PREAMBLE = (
    "# Auto-generated wrapper for Synta Python documentation snippet.\n"
    "# Validated with: python3 -m py_compile (syntax check only; synta not imported)\n"
    "import synta  # noqa: F401\n"
)


def classify(code: str, annotation: str) -> str:
    """Classify a Python code block and return its kind string.

    Args:
        code: Text of the fenced block, without the fence lines.
        annotation: Whatever followed ``python,`` on the opening fence, or
            ``""`` when there was none.  It may hold several
            comma-separated tags (e.g. ``"ignore,no_run"``); tags are
            compared case-insensitively.

    Returns:
        The first matching kind, tested in this order:
        ``"skip_annotated"`` (any tag is in ``SKIP_ANNOTATIONS``),
        ``"skip_nonsynta"`` (``SYNTA_RE`` does not match),
        ``"stub"`` (``ELLIPSIS_RE`` matches),
        ``"program"`` (``MAIN_RE`` matches), otherwise ``"fragment"``.
    """
    # The fence regex captures everything after the first comma, so a fence
    # with several tags arrives here as one comma-joined string.  Comparing
    # that whole string against SKIP_ANNOTATIONS would miss the skip tag.
    tags = {tag.strip().lower() for tag in annotation.split(",")}
    if not SKIP_ANNOTATIONS.isdisjoint(tags):
        return "skip_annotated"
    if not SYNTA_RE.search(code):
        return "skip_nonsynta"
    if ELLIPSIS_RE.search(code):
        return "stub"
    if MAIN_RE.search(code):
        return "program"
    return "fragment"


# A ``from __future__ import ...`` statement must precede every other
# statement in a module; only comments, blank lines and the module docstring
# may come before it.
_FUTURE_IMPORT_RE = re.compile(r"^from\s+__future__\s+import\b", re.MULTILINE)


def wrap(code: str, kind: str) -> str:
    """Return a syntax-checkable source string for the given kind.

    Args:
        code: Raw text of the documentation block.
        kind: Kind string of the block; ``"program"`` and ``"stub"`` get a
            one-line comment header, every other kind is treated as a
            fragment and gets ``PREAMBLE``.

    Returns:
        The header, the code and a trailing newline.  Fragments always
        receive the same number of header lines, whether or not the
        ``import synta`` line had to be neutralised.
    """
    if kind in ("program", "stub"):
        # Complete programs and stubs are valid on their own; just prepend
        # a comment so the file origin is clear in error messages.
        return (
            "# Auto-generated wrapper for Synta Python documentation snippet.\n"
            + code
            + "\n"
        )

    preamble = PREAMBLE
    if _FUTURE_IMPORT_RE.search(code):
        # Putting ``import synta`` ahead of a future statement would make an
        # otherwise valid snippet fail to compile.  The import is purely
        # documentary, so comment it out; keeping the line (as a comment)
        # leaves the wrapped line numbers unchanged.
        preamble = "".join(
            line if line.lstrip().startswith("#") or not line.strip() else "# " + line
            for line in PREAMBLE.splitlines(keepends=True)
        )
    # fragment: prepend the preamble so the intent is clear.
    return preamble + "\n" + code + "\n"


# ── Main extraction loop ──────────────────────────────────────────────────────


# Opening fence of a Python block: ```python or ```python,<annotation>.
_FENCE_OPEN_RE = re.compile(r"^```python(?:,(\S+))?\s*$", re.IGNORECASE)


def _find_python_blocks(
    lines: list[str], md_path: str
) -> list[tuple[int, str, str]]:
    """Return ``(start_line, annotation, code)`` for each non-empty Python block.

    ``start_line`` is the 1-based line number of the first code line (the
    line after the opening fence) and ``annotation`` is the lower-cased text
    after ``python,`` on the fence, or ``""``.  A block ends at the first
    line that starts with three backticks.  Blocks with no content are
    dropped.  If the file ends inside a block, that block is not returned
    and a warning naming ``md_path`` is printed to stderr.
    """
    blocks: list[tuple[int, str, str]] = []
    in_block = False
    annotation = ""
    start_line = 0
    buf: list[str] = []

    for lineno, line in enumerate(lines, 1):
        if not in_block:
            m = _FENCE_OPEN_RE.match(line)
            if m:
                annotation = (m.group(1) or "").lower()
                in_block = True
                # +1: the opening ``` line is not part of the block.
                start_line = lineno + 1
                buf = []
        elif line.startswith("```"):
            in_block = False
            if buf:
                blocks.append((start_line, annotation, "".join(buf)))
            buf = []
        else:
            buf.append(line)

    if in_block:
        # Report the dangling fence instead of silently discarding its content.
        print(
            f"warning: {md_path}:{start_line - 1}: "
            "unterminated ```python block ignored",
            file=sys.stderr,
        )
    return blocks


def main() -> None:
    """Extract Python blocks from the Markdown files named on the command line.

    ``sys.argv[1]`` is the work directory and the remaining arguments are
    Markdown files.  With no arguments a usage message is printed to stderr
    and the process exits with status 1.

    Every non-empty block is written to ``<work_dir>/raw/NNNNN.py``.  Blocks
    whose kind does not start with ``skip`` are also written, wrapped, to
    ``<work_dir>/src/NNNNN.py`` and get a row in ``<work_dir>/manifest.tsv``.
    A single line ``total_blocks<TAB>skip_blocks`` is printed to stdout.
    Files that cannot be read or decoded as UTF-8 produce a warning on
    stderr and are skipped.
    """
    args = sys.argv[1:]
    if not args:
        print(
            f"Usage: {sys.argv[0]} <work_dir> [file.md ...]",
            file=sys.stderr,
        )
        sys.exit(1)

    work_dir = args[0]
    md_files = args[1:]

    raw_dir = os.path.join(work_dir, "raw")
    src_dir = os.path.join(work_dir, "src")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(src_dir, exist_ok=True)

    manifest_rows: list[str] = []
    block_n = 0
    skip_n = 0

    for md_path in md_files:
        try:
            with open(md_path, encoding="utf-8") as fh:
                lines = fh.readlines()
        except (OSError, UnicodeDecodeError) as exc:
            # A file that is not valid UTF-8 is treated like an unreadable
            # one rather than aborting the whole run.
            print(f"warning: cannot read {md_path}: {exc}", file=sys.stderr)
            continue

        for start_line, annotation, code in _find_python_blocks(lines, md_path):
            block_n += 1

            # The raw text is kept for every block, including skipped ones.
            raw_path = os.path.join(raw_dir, f"{block_n:05d}.py")
            with open(raw_path, "w", encoding="utf-8") as fh:
                fh.write(code)

            kind = classify(code, annotation)
            if kind.startswith("skip"):
                skip_n += 1
                continue

            src_path = os.path.join(src_dir, f"{block_n:05d}.py")
            with open(src_path, "w", encoding="utf-8") as fh:
                fh.write(wrap(code, kind))

            manifest_rows.append(
                "\t".join(
                    [
                        md_path,
                        str(start_line),
                        "python",
                        src_path,
                        kind,
                        raw_path,
                    ]
                )
            )

    manifest_path = os.path.join(work_dir, "manifest.tsv")
    with open(manifest_path, "w", encoding="utf-8") as fh:
        # One newline-terminated row per checkable block; empty when none.
        fh.writelines(row + "\n" for row in manifest_rows)

    # Single line on stdout consumed by the shell script:
    # total_blocks<TAB>skip_blocks
    print(f"{block_n}\t{skip_n}")


# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()