import os
import re
import sys
# A sample is only of interest when it mentions the synta API: either the
# "synta_" prefix or "Synta" followed by an uppercase letter.  classify()
# returns "skip_nonsynta" when this pattern finds nothing.
SYNTA_RE = re.compile(r"synta_|Synta[A-Z]")
# Markers of code written against a different library: an <openssl/...>
# include, d2i_*/i2d_* names, and a fixed list of calls (ERR_get_error,
# X509_free, BN_free, ASN1_INTEGER_get, asn1_read_value, asn1_write_value,
# asn1_der_decoding, asn1_create_element).  classify() returns "skip_foreign"
# when a sample that passed SYNTA_RE also matches here.
FOREIGN_RE = re.compile(
r"#include\s*<openssl/"
# "(?:^|\W)" keeps the names from matching in the middle of a longer identifier.
r"|(?:^|\W)d2i_[A-Z]\w+"
r"|(?:^|\W)i2d_[A-Z]\w+"
r"|(?:^|\W)ERR_get_error\s*\("
r"|(?:^|\W)X509_free\s*\("
r"|(?:^|\W)BN_free\s*\("
r"|(?:^|\W)ASN1_INTEGER_get\s*\("
r"|(?:^|\W)asn1_read_value\s*\("
r"|(?:^|\W)asn1_write_value\s*\("
r"|(?:^|\W)asn1_der_decoding\s*\("
r"|(?:^|\W)asn1_create_element\s*\(",
re.MULTILINE,
)
# Fence language tag (lower-cased) -> file extension used for the extracted
# sample.  main() only processes fenced blocks whose tag is a key here.
LANG_EXT = {"c": "c", "cpp": "cpp", "c++": "cpp"}
# A sample defining "int main(" is classified as a complete "program".
MAIN_RE = re.compile(r"\bint\s+main\s*\(")
# A line that starts (column 0) with "typedef struct/enum/union" or with a
# function header whose return type is one of the listed types, optionally
# preceded by "static" and/or "inline".  classify() treats a match as
# file-scope code ("toplevel") rather than a statement fragment.
TOPLEVEL_RE = re.compile(
r"^(?:typedef\s+(?:struct|enum|union)\b"
r"|(?:static\s+)?(?:inline\s+)?"
r"(?:void|int|bool|SyntaErrorCode|size_t|uintptr_t|uint\d+_t)\s+\w+\s*\()",
re.MULTILINE,
)
# Comment markers "// WRONG" / "/* WRONG" and "// RIGHT" / "/* RIGHT".
# classify() returns "skip_pitfall" only when a sample contains both.
PITFALL_WRONG_RE = re.compile(r"//\s*WRONG\b|/\*\s*WRONG\b")
PITFALL_RIGHT_RE = re.compile(r"//\s*RIGHT\b|/\*\s*RIGHT\b")
# A call-like "name(" whose first argument is a literal "...", followed by
# "," or ")".  classify() applies it after removing #define lines and returns
# "skip_pseudocode" on a match.
PSEUDOCODE_RE = re.compile(r"\w\s*\(\s*\.\.\.\s*[,)]")
# Lines that look like they were copied from the synta header: either
# "typedef enum/struct/union Synta..." or a closing "} Synta...;" at column 0.
# classify() consults this only after TOPLEVEL_RE matched, and returns
# "skip_header_doc" on a match.
HEADER_TYPEDEF_RE = re.compile(
r"^typedef\s+(?:enum|struct|union)\s+Synta\w+"
r"|^\}\s+Synta\w+\s*;",
re.MULTILINE,
)
# A line at column 0 consisting of one of the listed return types followed by
# a "synta_*(" function name.  Used by classify() together with
# HEADER_TYPEDEF_RE to produce "skip_header_doc".
HEADER_PROTO_RE = re.compile(
r"^(?:enum\s+SyntaErrorCode|struct\s+Synta\w+\s*\*|uintptr_t|void|bool"
r"|const\s+char\s*\*)\s+synta_\w+\s*\(",
re.MULTILINE,
)
# Text that wrap_c() places in front of a C sample: the synta header and a
# set of standard headers, two no-op stub functions (write_to_file, process),
# a fallback ERROR macro, and a "diagnostic push" followed by pragmas that
# silence unused-variable/function/parameter warnings.
C_PREAMBLE = """\
#include <synta.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
/* Stubs used by doc samples */
static void write_to_file(const uint8_t *d, uint32_t n) { (void)d; (void)n; }
static void process(const uint8_t *d, uint32_t n) { (void)d; (void)n; }
/* Convenience macro used in doc error-handling patterns */
#ifndef ERROR
# define ERROR (-1)
#endif
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-parameter"
"""
# Appended after the sample by wrap_c() and wrap_cpp(); pops the diagnostic
# state pushed at the end of the preamble.
C_EPILOGUE = "\n#pragma GCC diagnostic pop\n"
# C declarations that wrap_c() emits at the top of the generated
# _synta_doc_sample() function when the sample is a statement fragment.  The
# fragment itself is placed in a nested "{ ... }" scope after these, so a
# fragment may declare a variable with one of these names again.
# NOTE(review): the name list presumably mirrors identifiers used by the
# documentation samples -- confirm against the docs when adding entries.
FRAGMENT_VARS = """\
/* ── pre-declared identifiers for doc fragment compilation ── */
SyntaDecoder *decoder = NULL;
SyntaDecoder *cert_decoder = NULL;
SyntaDecoder *cert_seq = NULL;
SyntaDecoder *tbs = NULL;
SyntaDecoder *outer = NULL;
SyntaDecoder *inner = NULL;
SyntaDecoder *seq_decoder = NULL;
SyntaDecoder *sig_alg_decoder = NULL;
SyntaDecoder *version_decoder = NULL;
SyntaDecoder *optional = NULL;
SyntaEncoder *encoder = NULL;
SyntaEncoder *seq = NULL;
SyntaEncoder *seq_encoder = NULL;
SyntaEncoder *ctx_encoder = NULL;
SyntaInteger *integer = NULL;
SyntaInteger *serial = NULL;
SyntaInteger *version_int = NULL;
SyntaInteger *field1 = NULL;
SyntaOctetString *octet_string = NULL;
SyntaObjectIdentifier *oid = NULL;
SyntaObjectIdentifier *oid1 = NULL;
SyntaObjectIdentifier *oid2 = NULL;
SyntaObjectIdentifier *known_oid = NULL;
SyntaObjectIdentifier *sig_oid = NULL;
SyntaCertificate *cert = NULL;
SyntaByteArray array = {NULL, 0, 0};
SyntaByteArray output = {NULL, 0, 0};
SyntaByteArray string_data = {NULL, 0, 0};
SyntaByteArray serial_bytes = {NULL, 0, 0};
SyntaByteArray issuer = {NULL, 0, 0};
SyntaByteArray subject = {NULL, 0, 0};
SyntaByteArray bits = {NULL, 0, 0};
SyntaByteArray encoded = {NULL, 0, 0};
SyntaByteArray field2 = {NULL, 0, 0};
SyntaTag tag = {SyntaTagClass_Universal, false, 0};
SyntaDecoderConfig config = {128, 10000, 16777216};
SyntaErrorCode err = SyntaErrorCode_Success;
uint8_t unused_bits = 0;
int64_t value = 0;
int64_t version = 0;
uint64_t uvalue = 0;
bool bval = false;
bool equal = false;
const uint8_t *buf = NULL;
const uint8_t *data = NULL;
const uint8_t *der = NULL;
const uint8_t *cert_data = NULL;
const uint8_t *plaintext = NULL;
size_t len = 0;
uintptr_t der_len = 0;
uintptr_t cert_len = 0;
uintptr_t plaintext_len = 0;
const uint32_t *components = NULL;
uint32_t components_buf[32];
char buffer[256];
char oid_str[256];
size_t written = 0;
int result = 0;
int i = 0;
uintptr_t n = 0;
"""
# Text that wrap_cpp() places in front of a C++ sample: the same headers as
# the C preamble plus <memory>, <stdexcept> and <iostream>, followed by the
# same "diagnostic push" / ignored-warning pragmas.  Unlike C_PREAMBLE it
# defines no stub functions and no ERROR macro.
CPP_PREAMBLE = """\
#include <synta.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <memory>
#include <stdexcept>
#include <iostream>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-parameter"
"""
def _strip_define_lines(code: str) -> str:
return re.sub(
r"^\s*#\s*define\b[^\n]*(?:\\\n[^\n]*)*", "", code, flags=re.MULTILINE
)
def classify(code: str) -> str:
    """Decide how a documentation sample should be handled.

    The checks run in a fixed order and the first one that applies wins.

    Returns:
        One of the skip reasons ``"skip_nonsynta"``, ``"skip_foreign"``,
        ``"skip_pitfall"``, ``"skip_pseudocode"``, ``"skip_header_doc"``, or
        one of the compilable kinds ``"program"`` (defines ``int main``),
        ``"toplevel"`` (file-scope definitions) or ``"fragment"`` (anything
        else).
    """
    if SYNTA_RE.search(code) is None:
        return "skip_nonsynta"
    if FOREIGN_RE.search(code) is not None:
        return "skip_foreign"

    # A WRONG/RIGHT comparison needs both markers to count as a pitfall demo.
    shows_wrong = PITFALL_WRONG_RE.search(code) is not None
    if shows_wrong and PITFALL_RIGHT_RE.search(code) is not None:
        return "skip_pitfall"

    # Macro definitions are removed first so "(...)" in a variadic macro
    # header does not count as an elided argument list.
    without_macros = _strip_define_lines(code)
    if PSEUDOCODE_RE.search(without_macros) is not None:
        return "skip_pseudocode"

    if MAIN_RE.search(code) is not None:
        return "program"
    if TOPLEVEL_RE.search(code) is None:
        return "fragment"

    copied_from_header = bool(
        HEADER_TYPEDEF_RE.search(code) or HEADER_PROTO_RE.search(code)
    )
    return "skip_header_doc" if copied_from_header else "toplevel"
def wrap_c(code: str, kind: str) -> str:
    """Turn a C sample into a complete translation unit.

    Args:
        code: The sample text.
        kind: ``"program"``, ``"toplevel"``, or anything else for a fragment.

    Returns:
        * ``"program"`` containing ``#include``: the sample unchanged.
        * ``"program"`` without includes, or ``"toplevel"``: the sample
          between ``C_PREAMBLE`` and ``C_EPILOGUE``.
        * otherwise: the sample placed in a nested scope inside a generated
          ``_synta_doc_sample()`` function that first emits
          ``FRAGMENT_VARS``.
    """
    if kind == "program" and "#include" in code:
        return code
    if kind in ("program", "toplevel"):
        return "".join((C_PREAMBLE, "\n", code, "\n", C_EPILOGUE))

    # Fragment: prefix every line and wrap it in a function body.
    body = "\n".join(f" {row}" for row in code.splitlines())
    pieces = [
        C_PREAMBLE,
        "\nstatic int _synta_doc_sample(void) {\n",
        FRAGMENT_VARS,
        " /* ── fragment (nested scope allows re-declaration) ── */\n",
        " {\n",
        body,
        "\n",
        " }\n",
        " return 0;\n",
        "}\n",
        C_EPILOGUE,
    ]
    return "".join(pieces)
def wrap_cpp(code: str) -> str:
    """Return the C++ sample *code* between ``CPP_PREAMBLE`` and ``C_EPILOGUE``.

    The sample is separated from the preamble and from the epilogue by one
    newline each; the sample text itself is not modified.
    """
    return f"{CPP_PREAMBLE}\n{code}\n{C_EPILOGUE}"
def strip_local_includes(code: str) -> str:
    """Remove ``#include "..."`` lines (quoted, i.e. local, headers) from *code*.

    Angle-bracket includes are left alone.  Each matching line is removed
    together with its line terminator; anything following the closing quote
    on the same line (for example a comment) goes with it.

    Args:
        code: C/C++ source text.

    Returns:
        The text without local include directives.
    """
    # Every piece of the pattern is confined to a single line: horizontal
    # whitespace only, and a header name that may not contain a newline.
    # That way an unterminated quote cannot swallow the lines after it.
    # "(?:\n|\Z)" also covers a directive on the last line of the text when
    # that line has no trailing newline.
    return re.sub(
        r'^[ \t]*#[ \t]*include[ \t]*"[^"\n]*"[^\n]*(?:\n|\Z)',
        "",
        code,
        flags=re.MULTILINE,
    )
# Opening fence of a code block: "```lang", optionally followed by
# ",annotation".  "+" is accepted in the language tag so that "c++" (a key of
# LANG_EXT) can actually match; "\w+" alone stops at the first "+".
_FENCE_OPEN_RE = re.compile(r"^```([\w+]+)(?:,(\S+))?\s*$")


def _iter_fenced_blocks(md_path: str, lines: list[str]):
    """Yield ``(lang, annotated, start_line, code)`` for each C/C++ code block.

    Only fences whose lower-cased language tag is a key of ``LANG_EXT`` are
    considered, and empty blocks are not yielded.  ``annotated`` is true when
    the fence carries the ``,ignore`` annotation.  ``start_line`` is the
    1-based number of the first line inside the fence.  A block still open at
    the end of *lines* is not yielded; a warning naming *md_path* is printed
    to stderr instead.
    """
    in_block = False
    lang = ""
    annotated = False
    start_line = 0
    body: list[str] = []
    for lineno, line in enumerate(lines, 1):
        if not in_block:
            m = _FENCE_OPEN_RE.match(line)
            if m and m.group(1).lower() in LANG_EXT:
                lang = m.group(1).lower()
                annotated = (m.group(2) or "").lower() == "ignore"
                in_block = True
                start_line = lineno + 1
                body = []
        elif line.startswith("```"):
            in_block = False
            if body:
                yield lang, annotated, start_line, "".join(body)
            body = []
        else:
            body.append(line)
    if in_block:
        print(
            f"warning: unterminated code block in {md_path} "
            f"(opened at line {start_line - 1})",
            file=sys.stderr,
        )


def _write_rows(path: str, rows: list[str]) -> None:
    """Write *rows* to *path*, one per line; an empty list gives an empty file."""
    with open(path, "w", encoding="utf-8") as fh:
        fh.writelines(row + "\n" for row in rows)


def main() -> None:
    """Extract C/C++ samples from Markdown files into a work directory.

    Command line: ``<work_dir> [file.md ...]``.  Exits with status 1 and a
    usage message on stderr when ``<work_dir>`` is missing.

    For every non-empty fenced block, numbered consecutively across all input
    files, the following is written:

    * ``<work_dir>/raw/NNNNN.<ext>`` -- the sample exactly as found.
    * ``<work_dir>/src/NNNNN.<ext>`` -- the wrapped sample, unless the block
      is annotated ``ignore`` or ``classify()`` returns a ``skip*`` kind.
    * ``<work_dir>/src/NNNNN.combined.<ext>`` -- for a non-skipped block that
      follows earlier "toplevel" blocks of the same file: those blocks
      followed by this sample (local includes stripped), wrapped as
      file-scope code.

    ``<work_dir>/manifest.tsv`` lists the non-skipped blocks (source file,
    start line, language, src path, kind, raw path, combined path) and
    ``<work_dir>/skipped.tsv`` the skipped ones (source file, start line,
    language, kind, raw path).  Finally ``"<blocks>\\t<skipped>"`` is printed
    to stdout.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <work_dir> [file.md ...]", file=sys.stderr)
        sys.exit(1)

    work_dir = sys.argv[1]
    md_files = sys.argv[2:]
    raw_dir = os.path.join(work_dir, "raw")
    src_dir = os.path.join(work_dir, "src")
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(src_dir, exist_ok=True)

    manifest_rows: list[str] = []
    skipped_rows: list[str] = []
    block_n = 0
    skip_n = 0
    # Markdown path -> "toplevel" samples seen so far in that file, in order.
    file_toplevel_history: dict[str, list[str]] = {}

    for md_path in md_files:
        try:
            with open(md_path, encoding="utf-8") as fh:
                lines = fh.readlines()
        except (OSError, UnicodeDecodeError) as exc:
            # A file that is missing or not valid UTF-8 is skipped, not fatal.
            print(f"warning: cannot read {md_path}: {exc}", file=sys.stderr)
            continue

        for lang, annotated, start_line, code in _iter_fenced_blocks(
            md_path, lines
        ):
            block_n += 1
            ext = LANG_EXT[lang]
            is_cpp = ext == "cpp"

            raw_path = os.path.join(raw_dir, f"{block_n:05d}.{ext}")
            with open(raw_path, "w", encoding="utf-8") as fh:
                fh.write(code)

            kind = "skip_annotated" if annotated else classify(code)
            if kind.startswith("skip"):
                skip_n += 1
                skipped_rows.append(
                    "\t".join([md_path, str(start_line), lang, kind, raw_path])
                )
                continue

            src_path = os.path.join(src_dir, f"{block_n:05d}.{ext}")
            with open(src_path, "w", encoding="utf-8") as fh:
                fh.write(wrap_cpp(code) if is_cpp else wrap_c(code, kind))

            # Variant that also contains the earlier file-scope samples of
            # the same document, for samples that build on them.
            combined_path = ""
            prev = file_toplevel_history.get(md_path, [])
            if prev:
                combined_code = (
                    "\n".join(prev) + "\n" + strip_local_includes(code)
                )
                combined_path = os.path.join(
                    src_dir, f"{block_n:05d}.combined.{ext}"
                )
                with open(combined_path, "w", encoding="utf-8") as fh:
                    if is_cpp:
                        fh.write(wrap_cpp(combined_code))
                    else:
                        fh.write(wrap_c(combined_code, "toplevel"))

            # Recorded only after the combined file is written, so a sample
            # is never combined with itself.
            if kind == "toplevel":
                file_toplevel_history.setdefault(md_path, []).append(code)

            manifest_rows.append(
                "\t".join(
                    [
                        md_path,
                        str(start_line),
                        lang,
                        src_path,
                        kind,
                        raw_path,
                        combined_path,
                    ]
                )
            )

    _write_rows(os.path.join(work_dir, "manifest.tsv"), manifest_rows)
    _write_rows(os.path.join(work_dir, "skipped.tsv"), skipped_rows)
    print(f"{block_n}\t{skip_n}")


if __name__ == "__main__":
    main()