synta 0.2.6 - Docs.rs

#!/usr/bin/env python3
"""doc-c-samples.py — extract and wrap C/C++ code blocks from Synta docs.

Called by doc-c-samples.sh with:
    python3 doc-c-samples.py <work_dir> <file1.md> [file2.md ...]

Prints to stdout a single line of the form ``total_blocks<TAB>skipped_blocks``,
then writes to <work_dir>:

  raw/NNNNN.{c,cpp}          — raw extracted code (no wrapper)
  src/NNNNN.{c,cpp}          — wrapped, ready-to-compile translation unit
  src/NNNNN.combined.{c,cpp} — prev toplevel blocks + current, re-wrapped
                               (only when the current block may reference types
                               defined in the preceding block of the same file)
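  manifest.tsv               — tab-separated, one row per compilable
                               (non-skipped) block:
      doc_file  start_line  lang  src_file  kind  raw_file  combined_file
  skipped.tsv                — tab-separated, one row per skipped block:
      doc_file  start_line  lang  kind  raw_file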

kind values
-----------
  skip_nonsynta   — no synta_ / Synta* identifiers found; skip
  skip_foreign    — OpenSSL or libtasn1 API detected; skip
  skip_annotated  — block is annotated with ,ignore (e.g. ```c,ignore); skip
  skip_pitfall    — block mixes intentional WRONG + RIGHT examples; skip
  skip_pseudocode — block uses literal '...' as argument placeholder; skip
  skip_header_doc — block reproduces synta.h type/function declarations; skip
  program         — has int main(); compile as a complete program
  toplevel        — top-level function/struct definitions
  fragment        — standalone statements; wrapped in a harness function
"""

import os
import re
import sys


# ── Regexps ──────────────────────────────────────────────────────────────────

# Block must reference the Synta API or type names to be worth compiling.
# Matches either a `synta_` function prefix or a `Synta` + uppercase-letter
# type name anywhere in the block.
SYNTA_RE = re.compile(r"synta_|Synta[A-Z]")

# Identifiers that indicate foreign (OpenSSL / libtasn1) code.
# Each function alternative is anchored on start-of-line or a non-word
# character so that it does not fire inside a longer identifier.
FOREIGN_RE = re.compile(
    r"#include\s*<openssl/"
    r"|(?:^|\W)d2i_[A-Z]\w+"
    r"|(?:^|\W)i2d_[A-Z]\w+"
    r"|(?:^|\W)ERR_get_error\s*\("
    r"|(?:^|\W)X509_free\s*\("
    r"|(?:^|\W)BN_free\s*\("
    r"|(?:^|\W)ASN1_INTEGER_get\s*\("
    r"|(?:^|\W)asn1_read_value\s*\("
    r"|(?:^|\W)asn1_write_value\s*\("
    r"|(?:^|\W)asn1_der_decoding\s*\("
    r"|(?:^|\W)asn1_create_element\s*\(",
    re.MULTILINE,
)

# Fenced block language tag → extension mapping.
# Keys are compared against the lower-cased fence tag; only blocks whose tag
# appears here are extracted.
LANG_EXT = {"c": "c", "cpp": "cpp", "c++": "cpp"}

# Complete program: the block contains an `int main(` definition.
MAIN_RE = re.compile(r"\bint\s+main\s*\(")

# Top-level definitions: typedef struct/enum/union, or a function whose return
# type starts at column 0, immediately followed by the function name and a
# parameter list opening.  Covers both Synta API wrappers and generated stubs.
# Only the return types listed in the alternation below are recognised.
TOPLEVEL_RE = re.compile(
    r"^(?:typedef\s+(?:struct|enum|union)\b"
    r"|(?:static\s+)?(?:inline\s+)?"
    r"(?:void|int|bool|SyntaErrorCode|size_t|uintptr_t|uint\d+_t)\s+\w+\s*\()",
    re.MULTILINE,
)

# Blocks that interleave intentionally-wrong and correct usage in the same code
# block (pitfall / anti-pattern sections).  The "WRONG" half uses intentionally
# incorrect patterns (double-free, use-after-free, etc.) that the compiler would
# flag; skipping the whole block avoids false failures.
#
# Both patterns must appear in the same block.  Typical trigger lines:
#   synta_integer_free(p); synta_integer_free(p); // WRONG — double free
#   integer = NULL;                               // RIGHT — prevent reuse
PITFALL_WRONG_RE = re.compile(r"//\s*WRONG\b|/\*\s*WRONG\b")
PITFALL_RIGHT_RE = re.compile(r"//\s*RIGHT\b|/\*\s*RIGHT\b")

# Pseudocode blocks that use literal '...' as a stand-in for arguments.
# These are illustrative and cannot be compiled.
# Example trigger:  SyntaDecoder *d = synta_decoder_new(...);
# The pattern anchors on a word character before '(' to avoid matching
# C-style variadic declarations like 'void f(int n, ...)'.
#
# Note: variadic macro headers (#define FOO(...)) also superficially match
# because the macro parameter list ends with `(...)`.  classify() passes the
# block through _strip_define_lines() before applying this pattern so that
# those false-positive matches are avoided.
PSEUDOCODE_RE = re.compile(r"\w\s*\(\s*\.\.\.\s*[,)]")

# Blocks that just reproduce definitions already present in synta.h.
# Compiling these alongside #include <synta.h> would produce redeclaration
# errors that are false positives — the blocks are reference documentation,
# not user code.
#
# 1. typedef enum/struct/union Synta* — reproduces a synta.h type definition.
#    Also matches anonymous struct/union typedefs like `typedef struct { } SyntaFoo;`
#    detected by the closing `} SyntaFoo;` pattern.
HEADER_TYPEDEF_RE = re.compile(
    r"^typedef\s+(?:enum|struct|union)\s+Synta\w+"
    r"|^\}\s+Synta\w+\s*;",
    re.MULTILINE,
)
# 2. synta_* function declarations / prototypes — reproduces synta.h API.
#    Detects lines that start with a return type followed by 'synta_'.
#    Both header patterns are consulted by classify() only after TOPLEVEL_RE
#    has already matched the block.
HEADER_PROTO_RE = re.compile(
    r"^(?:enum\s+SyntaErrorCode|struct\s+Synta\w+\s*\*|uintptr_t|void|bool"
    r"|const\s+char\s*\*)\s+synta_\w+\s*\(",
    re.MULTILINE,
)


# ── Source wrappers ──────────────────────────────────────────────────────────

# Text placed by wrap_c() in front of every wrapped C block: the synta.h and
# libc includes, two no-op stub functions and an ERROR macro that doc samples
# refer to, and a diagnostic push that silences unused-entity warnings for the
# snippet that follows.
C_PREAMBLE = """\
#include <synta.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/* Stubs used by doc samples */
static void write_to_file(const uint8_t *d, uint32_t n) { (void)d; (void)n; }
static void process(const uint8_t *d, uint32_t n) { (void)d; (void)n; }

/* Convenience macro used in doc error-handling patterns */
#ifndef ERROR
#  define ERROR (-1)
#endif

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-parameter"
"""

# Closes the `#pragma GCC diagnostic push` opened by the preambles.  Appended
# by both wrap_c() and wrap_cpp().
C_EPILOGUE = "\n#pragma GCC diagnostic pop\n"

# Every identifier referenced (but not defined) in the doc snippets is declared
# here so that the type-checker sees API-level mistakes rather than noise about
# undeclared variables.
#
# wrap_c() emits this text at the top of the harness function body for
# 'fragment' blocks; the fragment itself follows in a nested `{ ... }` scope,
# so a fragment that declares one of these names again shadows the outer
# declaration instead of clashing with it.
FRAGMENT_VARS = """\
    /* ── pre-declared identifiers for doc fragment compilation ── */
    SyntaDecoder         *decoder          = NULL;
    SyntaDecoder         *cert_decoder     = NULL;
    SyntaDecoder         *cert_seq         = NULL;
    SyntaDecoder         *tbs              = NULL;
    SyntaDecoder         *outer            = NULL;
    SyntaDecoder         *inner            = NULL;
    SyntaDecoder         *seq_decoder      = NULL;
    SyntaDecoder         *sig_alg_decoder  = NULL;
    SyntaDecoder         *version_decoder  = NULL;
    SyntaDecoder         *optional         = NULL;
    SyntaEncoder         *encoder          = NULL;
    SyntaEncoder         *seq              = NULL;
    SyntaEncoder         *seq_encoder      = NULL;
    SyntaEncoder         *ctx_encoder      = NULL;
    SyntaInteger         *integer          = NULL;
    SyntaInteger         *serial           = NULL;
    SyntaInteger         *version_int      = NULL;
    SyntaInteger         *field1           = NULL;
    SyntaOctetString     *octet_string     = NULL;
    SyntaObjectIdentifier *oid             = NULL;
    SyntaObjectIdentifier *oid1            = NULL;
    SyntaObjectIdentifier *oid2            = NULL;
    SyntaObjectIdentifier *known_oid       = NULL;
    SyntaObjectIdentifier *sig_oid         = NULL;
    SyntaCertificate     *cert             = NULL;
    SyntaByteArray        array            = {NULL, 0, 0};
    SyntaByteArray        output           = {NULL, 0, 0};
    SyntaByteArray        string_data      = {NULL, 0, 0};
    SyntaByteArray        serial_bytes     = {NULL, 0, 0};
    SyntaByteArray        issuer           = {NULL, 0, 0};
    SyntaByteArray        subject          = {NULL, 0, 0};
    SyntaByteArray        bits             = {NULL, 0, 0};
    SyntaByteArray        encoded          = {NULL, 0, 0};
    SyntaByteArray        field2           = {NULL, 0, 0};
    SyntaTag              tag              = {SyntaTagClass_Universal, false, 0};
    SyntaDecoderConfig    config           = {128, 10000, 16777216};
    SyntaErrorCode        err              = SyntaErrorCode_Success;
    uint8_t               unused_bits      = 0;
    int64_t               value            = 0;
    int64_t               version          = 0;
    uint64_t              uvalue           = 0;
    bool                  bval             = false;
    bool                  equal            = false;
    const uint8_t        *buf              = NULL;
    const uint8_t        *data             = NULL;
    const uint8_t        *der              = NULL;
    const uint8_t        *cert_data        = NULL;
    const uint8_t        *plaintext        = NULL;
    size_t                len              = 0;
    uintptr_t             der_len          = 0;
    uintptr_t             cert_len         = 0;
    uintptr_t             plaintext_len    = 0;
    const uint32_t       *components       = NULL;
    uint32_t              components_buf[32];
    char                  buffer[256];
    char                  oid_str[256];
    size_t                written          = 0;
    int                   result           = 0;
    int                   i               = 0;
    uintptr_t             n               = 0;
"""

# Text placed by wrap_cpp() in front of every wrapped C++ block.  Compared
# with C_PREAMBLE it additionally includes <memory>, <stdexcept> and
# <iostream>, and it does not define the stub functions or the ERROR macro.
# The diagnostic push it opens is closed by C_EPILOGUE.
CPP_PREAMBLE = """\
#include <synta.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <memory>
#include <stdexcept>
#include <iostream>

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-parameter"
"""


def _strip_define_lines(code: str) -> str:
    """Remove #define macro header lines (including line continuations).

    Used before the pseudocode check to prevent false positives from C99
    variadic macro definitions such as ``#define ALLOC_DECODER(...)`` where
    the ``(...)`` matches the pseudocode pattern but is valid C syntax.
    """
    # Match a #define directive and any continuation lines ending with '\'.
    return re.sub(
        r"^\s*#\s*define\b[^\n]*(?:\\\n[^\n]*)*", "", code, flags=re.MULTILINE
    )


def classify(code: str) -> str:
    """Decide how a fenced code block should be handled.

    The checks are applied in priority order and the first one that applies
    determines the result, which is one of the kind strings listed in the
    module docstring (``skip_annotated`` is never produced here).
    """
    if SYNTA_RE.search(code) is None:
        # Nothing Synta-related in the block.
        verdict = "skip_nonsynta"
    elif FOREIGN_RE.search(code) is not None:
        # OpenSSL / libtasn1 comparison code.
        verdict = "skip_foreign"
    elif PITFALL_WRONG_RE.search(code) and PITFALL_RIGHT_RE.search(code):
        # Deliberately-wrong code sits next to the correct form in one block;
        # compiling it would only yield false failures.
        verdict = "skip_pitfall"
    elif PSEUDOCODE_RE.search(_strip_define_lines(code)):
        # Literal '...' used as an argument placeholder.  #define lines are
        # dropped first so that variadic macro headers do not count.
        verdict = "skip_pseudocode"
    elif MAIN_RE.search(code):
        verdict = "program"
    elif TOPLEVEL_RE.search(code) is None:
        verdict = "fragment"
    elif HEADER_TYPEDEF_RE.search(code) or HEADER_PROTO_RE.search(code):
        # File-scope code that merely repeats synta.h declarations: it would
        # collide with #include <synta.h> and is reference material rather
        # than usage.
        verdict = "skip_header_doc"
    else:
        verdict = "toplevel"
    return verdict


def wrap_c(code: str, kind: str) -> str:
    """Build the C translation unit that is compiled for *code*.

    * ``program`` blocks that already contain ``#include`` are returned
      untouched; the snippet is assumed to bring its own headers.
    * Other ``program`` blocks and all ``toplevel`` blocks are placed at file
      scope between the standard preamble and epilogue.
    * Any other kind is treated as a fragment: the statements go into a
      harness function, after the pre-declared variables, inside a nested
      scope so that the fragment may declare the same names again.
    """
    if kind == "program" and "#include" in code:
        return code

    if kind in ("program", "toplevel"):
        return "".join((C_PREAMBLE, "\n", code, "\n", C_EPILOGUE))

    # Fragment: indent every line by eight spaces to sit inside the nested
    # scope of the harness function.
    pad = " " * 8
    nested_body = "\n".join(pad + stmt for stmt in code.splitlines())
    pieces = [
        C_PREAMBLE,
        "\nstatic int _synta_doc_sample(void) {\n",
        FRAGMENT_VARS,
        "    /* ── fragment (nested scope allows re-declaration) ── */\n",
        "    {\n",
        nested_body,
        "\n",
        "    }\n",
        "    return 0;\n",
        "}\n",
        C_EPILOGUE,
    ]
    return "".join(pieces)


def wrap_cpp(code: str) -> str:
    """Build the C++ translation unit for *code*.

    The block is placed at file scope between the C++ preamble and the shared
    epilogue, separated from each by a newline.
    """
    return "".join((CPP_PREAMBLE, "\n", code, "\n", C_EPILOGUE))


def strip_local_includes(code: str) -> str:
    """Remove ``#include "..."`` lines from a code block.

    Used when building combined files: the preceding block(s) already supply
    the type definitions that would have come from those local headers, so the
    ``#include "..."`` in the current block would cause a file-not-found error
    or double-definition if left in.  System includes (``#include <...>``) are
    kept.

    Args:
        code: Raw text of a fenced code block.

    Returns:
        *code* without its local-include lines.  Each removed line is dropped
        together with its terminating newline, if it has one.
    """
    # Pattern walk-through:
    #   ^[ \t]*#[ \t]*include[ \t]*   tolerates indentation, `#  include` and
    #                                 `#include"x.h"`; horizontal whitespace
    #                                 only, so the match cannot drift onto
    #                                 the next line
    #   "[^"\n]*"                     header name; excluding '\n' stops an
    #                                 unterminated quote from swallowing the
    #                                 lines that follow
    #   [^\n]*(?:\n|\Z)               rest of the line (e.g. a trailing
    #                                 comment) and its newline, or end of
    #                                 input when the include is the last line
    #                                 and has no newline
    return re.sub(
        r'^[ \t]*#[ \t]*include[ \t]*"[^"\n]*"[^\n]*(?:\n|\Z)',
        "",
        code,
        flags=re.MULTILINE,
    )


# ── Main extraction loop ──────────────────────────────────────────────────────


# Opening fence of a code block: ```<lang> or ```<lang>,<annotation>.
# '+' is accepted inside the language tag so that ```c++ is recognised; a
# plain \w+ stops at the first '+' and the fence is then never matched, even
# though LANG_EXT lists "c++".
_FENCE_OPEN_RE = re.compile(r"^```([\w+]+)(?:,(\S+))?\s*$")


def _write_text(path: str, text: str) -> None:
    """Create or overwrite *path* with *text*, encoded as UTF-8."""
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(text)


def main() -> None:
    """Extract C/C++ blocks from the given Markdown files into <work_dir>.

    Command line: ``<work_dir> [file.md ...]``.  Exits with status 1 and a
    usage message on stderr when <work_dir> is missing.

    For every non-empty fenced block whose language tag is in ``LANG_EXT``:

    * the raw code is written to ``raw/NNNNN.<ext>``;
    * blocks annotated ``,ignore`` or classified as ``skip_*`` get a row in
      ``skipped.tsv`` and nothing else;
    * all other blocks are wrapped and written to ``src/NNNNN.<ext>``, get a
      ``src/NNNNN.combined.<ext>`` when an earlier ``toplevel`` block exists
      in the same file, and get a row in ``manifest.tsv``.

    Files that cannot be read or decoded, and fences that are never closed,
    are reported on stderr and skipped.  A final line
    ``total_blocks<TAB>skipped_blocks`` is printed to stdout.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <work_dir> [file.md ...]", file=sys.stderr)
        sys.exit(1)

    work_dir = sys.argv[1]
    md_files = sys.argv[2:]

    raw_dir = os.path.join(work_dir, "raw")
    src_dir = os.path.join(work_dir, "src")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(src_dir, exist_ok=True)

    manifest_rows: list[str] = []
    skipped_rows: list[str] = []
    block_n = 0
    skip_n = 0

    # Per-file accumulation of raw code from toplevel blocks.  When a later
    # block in the same file fails to compile standalone (e.g. it uses a
    # struct defined in an earlier block), bash retries with a combined file
    # that prepends ALL preceding toplevel blocks.  Key = md_file path,
    # value = list of raw code strings in document order.
    file_toplevel_history: dict[str, list[str]] = {}

    for md_path in md_files:
        try:
            with open(md_path, encoding="utf-8") as fh:
                lines = fh.readlines()
        except (OSError, UnicodeDecodeError) as exc:
            # UnicodeDecodeError is a ValueError, not an OSError; without it
            # a single badly-encoded doc file would abort the whole run.
            print(f"warning: cannot read {md_path}: {exc}", file=sys.stderr)
            continue

        in_block = False
        lang = ""
        annotated = False
        start_line = 0
        buf: list[str] = []

        for lineno, line in enumerate(lines, 1):
            if not in_block:
                m = _FENCE_OPEN_RE.match(line)
                if m and m.group(1).lower() in LANG_EXT:
                    lang = m.group(1).lower()
                    annotated = (m.group(2) or "").lower() == "ignore"
                    in_block = True
                    # +1: the opening ``` line is not part of the block.
                    start_line = lineno + 1
                    buf = []
                continue

            if not line.startswith("```"):
                buf.append(line)
                continue

            # Closing fence reached.
            in_block = False
            if not buf:
                # Empty blocks are not counted and produce no files.
                continue

            block_n += 1
            code = "".join(buf)
            buf = []
            ext = LANG_EXT[lang]
            is_cpp = ext == "cpp"

            raw_path = os.path.join(raw_dir, f"{block_n:05d}.{ext}")
            _write_text(raw_path, code)

            kind = "skip_annotated" if annotated else classify(code)

            if kind.startswith("skip"):
                skip_n += 1
                skipped_rows.append(
                    "\t".join([md_path, str(start_line), lang, kind, raw_path])
                )
                continue

            src_path = os.path.join(src_dir, f"{block_n:05d}.{ext}")
            _write_text(src_path, wrap_cpp(code) if is_cpp else wrap_c(code, kind))

            # Build a combined file so bash can retry when the current block
            # depends on types defined in a previous block of the same doc
            # file.
            combined_path = ""
            prev = file_toplevel_history.get(md_path, [])
            if prev:
                # Strip local #include "..." from the current block: the
                # preceding block(s) already supply those definitions inline.
                combined_code = "\n".join(prev) + "\n" + strip_local_includes(code)
                combined_path = os.path.join(
                    src_dir, f"{block_n:05d}.combined.{ext}"
                )
                # Always wrap combined C as 'toplevel' so that struct/function
                # definitions from prev blocks remain at file scope.
                _write_text(
                    combined_path,
                    wrap_cpp(combined_code)
                    if is_cpp
                    else wrap_c(combined_code, "toplevel"),
                )

            # Only toplevel blocks (struct/function definitions) are kept for
            # combining with later blocks.  Program blocks (with main()) are
            # complete standalone examples and are left out to avoid
            # conflicts.
            if kind == "toplevel":
                file_toplevel_history.setdefault(md_path, []).append(code)

            # Only compilable blocks enter the manifest.  Skipped blocks are
            # excluded to avoid empty-field issues when bash reads
            # tab-separated fields (bash IFS collapses consecutive tab
            # whitespace).
            manifest_rows.append(
                "\t".join(
                    [
                        md_path,
                        str(start_line),
                        lang,
                        src_path,
                        kind,
                        raw_path,
                        combined_path,
                    ]
                )
            )

        if in_block:
            # The file ended inside a fence; its content is dropped, so say so
            # rather than losing the block silently.
            print(
                f"warning: {md_path}:{start_line - 1}: "
                f"unterminated ```{lang} block ignored",
                file=sys.stderr,
            )

    # manifest.tsv — one row per compilable block.
    _write_text(
        os.path.join(work_dir, "manifest.tsv"),
        "".join(row + "\n" for row in manifest_rows),
    )

    # skipped.tsv — one row per skipped block, consumed by doc-c-analyze.py.
    # Columns: doc_file  start_line  lang  kind  raw_file
    _write_text(
        os.path.join(work_dir, "skipped.tsv"),
        "".join(row + "\n" for row in skipped_rows),
    )

    # Single line on stdout consumed by the shell script:
    # total_blocks<TAB>skip_blocks
    print(f"{block_n}\t{skip_n}")


if __name__ == "__main__":
    main()