Skip to main content

alef_e2e/codegen/
r.rs

1//! R e2e test generator using testthat.
2
3use crate::config::E2eConfig;
4use crate::escape::{escape_r, r_template_to_paste0, sanitize_filename, sanitize_ident};
5use crate::field_access::FieldResolver;
6use crate::fixture::{Assertion, CallbackAction, Fixture, FixtureGroup, TemplateReturnForm};
7use alef_core::backend::GeneratedFile;
8use alef_core::config::ResolvedCrateConfig;
9use alef_core::hash::{self, CommentStyle};
10use anyhow::Result;
11use std::fmt::Write as FmtWrite;
12use std::path::PathBuf;
13
14use super::E2eCodegen;
15
16/// R e2e code generator.
17pub struct RCodegen;
18
19impl E2eCodegen for RCodegen {
20    fn generate(
21        &self,
22        groups: &[FixtureGroup],
23        e2e_config: &E2eConfig,
24        config: &ResolvedCrateConfig,
25        _type_defs: &[alef_core::ir::TypeDef],
26        _enums: &[alef_core::ir::EnumDef],
27    ) -> Result<Vec<GeneratedFile>> {
28        let lang = self.language_name();
29        let output_base = PathBuf::from(e2e_config.effective_output()).join(lang);
30
31        let mut files = Vec::new();
32
33        // Resolve call config with overrides.
34        let call = &e2e_config.call;
35        let overrides = call.overrides.get(lang);
36        let module_path = overrides
37            .and_then(|o| o.module.as_ref())
38            .cloned()
39            .unwrap_or_else(|| call.module.clone());
40        let _function_name = overrides
41            .and_then(|o| o.function.as_ref())
42            .cloned()
43            .unwrap_or_else(|| call.function.clone());
44        let result_is_simple = call.result_is_simple || overrides.is_some_and(|o| o.result_is_simple);
45        let result_is_r_list = overrides.is_some_and(|o| o.result_is_r_list);
46        let _result_var = &call.result_var;
47
48        // Resolve package config.
49        let r_pkg = e2e_config.resolve_package("r");
50        let pkg_name = r_pkg
51            .as_ref()
52            .and_then(|p| p.name.as_ref())
53            .cloned()
54            .unwrap_or_else(|| module_path.clone());
55        let pkg_path = r_pkg
56            .as_ref()
57            .and_then(|p| p.path.as_ref())
58            .cloned()
59            .unwrap_or_else(|| "../../packages/r".to_string());
60        let pkg_version = r_pkg
61            .as_ref()
62            .and_then(|p| p.version.as_ref())
63            .cloned()
64            .or_else(|| config.resolved_version())
65            .unwrap_or_else(|| "0.1.0".to_string());
66
67        // Generate DESCRIPTION file.
68        files.push(GeneratedFile {
69            path: output_base.join("DESCRIPTION"),
70            content: render_description(&pkg_name, &pkg_version, e2e_config.dep_mode),
71            generated_header: false,
72        });
73
74        // Generate test runner script.
75        files.push(GeneratedFile {
76            path: output_base.join("run_tests.R"),
77            content: render_test_runner(&pkg_path, e2e_config.dep_mode),
78            generated_header: true,
79        });
80
81        // setup-fixtures.R — testthat sources `setup-*.R` files in the tests
82        // directory once before any tests run, with the working directory set
83        // to the tests/ folder. We use this hook to chdir into the repo's
84        // shared `test_documents/` directory so that fixture paths like
85        // `pdf/fake_memo.pdf` resolve at extraction time.
86        files.push(GeneratedFile {
87            path: output_base.join("tests").join("setup-fixtures.R"),
88            content: render_setup_fixtures(&e2e_config.test_documents_relative_from(1)),
89            generated_header: true,
90        });
91
92        // Generate test files per category.
93        for group in groups {
94            let active: Vec<&Fixture> = group
95                .fixtures
96                .iter()
97                .filter(|f| super::should_include_fixture(f, lang, e2e_config))
98                .collect();
99
100            if active.is_empty() {
101                continue;
102            }
103
104            let filename = format!("test_{}.R", sanitize_filename(&group.category));
105            let content = render_test_file(&group.category, &active, result_is_simple, result_is_r_list, e2e_config);
106            files.push(GeneratedFile {
107                path: output_base.join("tests").join(filename),
108                content,
109                generated_header: true,
110            });
111        }
112
113        Ok(files)
114    }
115
116    fn language_name(&self) -> &'static str {
117        "r"
118    }
119}
120
121fn render_description(pkg_name: &str, pkg_version: &str, dep_mode: crate::config::DependencyMode) -> String {
122    let dep_line = match dep_mode {
123        crate::config::DependencyMode::Registry => {
124            format!("Imports: {pkg_name} ({pkg_version})\n")
125        }
126        crate::config::DependencyMode::Local => String::new(),
127    };
128    format!(
129        r#"Package: e2e.r
130Title: E2E Tests for {pkg_name}
131Version: 0.1.0
132Description: End-to-end test suite.
133{dep_line}Suggests: testthat (>= 3.0.0)
134Config/testthat/edition: 3
135"#
136    )
137}
138
139fn render_setup_fixtures(test_documents_path: &str) -> String {
140    let mut out = String::new();
141    out.push_str(&hash::header(CommentStyle::Hash));
142    let _ = writeln!(out);
143    let _ = writeln!(
144        out,
145        "# Resolve fixture paths against the repo's `test_documents/` directory."
146    );
147    let _ = writeln!(
148        out,
149        "# testthat sources setup-*.R with the working directory at tests/,"
150    );
151    let _ = writeln!(
152        out,
153        "# so test_documents lives three directories up: tests/ -> e2e/r/ -> e2e/ -> repo root."
154    );
155    let _ = writeln!(
156        out,
157        "# Each `test_that()` block has its working directory reset back to tests/, so"
158    );
159    let _ = writeln!(
160        out,
161        "# fixture lookups must be performed via this helper rather than relying on `setwd`."
162    );
163    let _ = writeln!(
164        out,
165        ".alef_test_documents <- normalizePath(\"{test_documents_path}\", mustWork = FALSE)"
166    );
167    let _ = writeln!(out, ".resolve_fixture <- function(path) {{");
168    let _ = writeln!(out, "  if (dir.exists(.alef_test_documents)) {{");
169    let _ = writeln!(out, "    file.path(.alef_test_documents, path)");
170    let _ = writeln!(out, "  }} else {{");
171    let _ = writeln!(out, "    path");
172    let _ = writeln!(out, "  }}");
173    let _ = writeln!(out, "}}");
174    out
175}
176
177fn render_test_runner(pkg_path: &str, dep_mode: crate::config::DependencyMode) -> String {
178    let mut out = String::new();
179    out.push_str(&hash::header(CommentStyle::Hash));
180    let _ = writeln!(out, "library(testthat)");
181    match dep_mode {
182        crate::config::DependencyMode::Registry => {
183            // In registry mode, require the installed CRAN package directly.
184            let _ = writeln!(out, "# Package loaded via library() from CRAN install.");
185        }
186        crate::config::DependencyMode::Local => {
187            // Use devtools::load_all() to load the local R package without requiring
188            // a full install, matching the e2e test runner convention.
189            let _ = writeln!(out, "devtools::load_all(\"{pkg_path}\")");
190        }
191    }
192    let _ = writeln!(out);
193    // Surface every failure rather than aborting at the default max_fails=10 —
194    // partial pass counts are essential for triage during e2e bring-up.
195    let _ = writeln!(out, "testthat::set_max_fails(Inf)");
196    // Resolve the tests/ directory relative to this script. testthat reads
197    // setup-*.R from there before each file runs, where path resolution
198    // against test_documents/ is handled by the `.resolve_fixture` helper.
199    let _ = writeln!(
200        out,
201        ".script_dir <- tryCatch(dirname(normalizePath(sys.frame(1)$ofile)), error = function(e) getwd())"
202    );
203    let _ = writeln!(out, "test_dir(file.path(.script_dir, \"tests\"))");
204    out
205}
206
207fn render_test_file(
208    category: &str,
209    fixtures: &[&Fixture],
210    result_is_simple: bool,
211    result_is_r_list: bool,
212    e2e_config: &E2eConfig,
213) -> String {
214    let mut out = String::new();
215    out.push_str(&hash::header(CommentStyle::Hash));
216    let _ = writeln!(out, "# E2e tests for category: {category}");
217    let _ = writeln!(out);
218
219    for (i, fixture) in fixtures.iter().enumerate() {
220        render_test_case(&mut out, fixture, e2e_config, result_is_simple, result_is_r_list);
221        if i + 1 < fixtures.len() {
222            let _ = writeln!(out);
223        }
224    }
225
226    // Clean up trailing newlines.
227    while out.ends_with("\n\n") {
228        out.pop();
229    }
230    if !out.ends_with('\n') {
231        out.push('\n');
232    }
233    out
234}
235
236fn render_test_case(
237    out: &mut String,
238    fixture: &Fixture,
239    e2e_config: &E2eConfig,
240    default_result_is_simple: bool,
241    default_result_is_r_list: bool,
242) {
243    let call_config = e2e_config.resolve_call_for_fixture(
244        fixture.call.as_deref(),
245        &fixture.id,
246        &fixture.resolved_category(),
247        &fixture.tags,
248        &fixture.input,
249    );
250    let call_field_resolver = FieldResolver::new(
251        e2e_config.effective_fields(call_config),
252        e2e_config.effective_fields_optional(call_config),
253        e2e_config.effective_result_fields(call_config),
254        e2e_config.effective_fields_array(call_config),
255        &std::collections::HashSet::new(),
256    );
257    let field_resolver = &call_field_resolver;
258    let function_name = &call_config.function;
259    let result_var = &call_config.result_var;
260    // Per-fixture call configs (e.g. `list_document_extractors`) may set
261    // `result_is_simple = true` even when the default `[e2e.call]` does not.
262    // Without this lookup the registry/detection wrappers (which return scalar
263    // strings or character vectors directly) get wrapped in
264    // `jsonlite::fromJSON(...)` and the parser fails on non-JSON output.
265    let r_override = call_config.overrides.get("r");
266    let result_is_simple = if fixture.call.is_some() {
267        call_config.result_is_simple || r_override.is_some_and(|o| o.result_is_simple)
268    } else {
269        default_result_is_simple
270    };
271    // Per-fixture override: when the R binding already returns a native R list
272    // (not a JSON string), suppress `jsonlite::fromJSON` wrapping while still
273    // using field-path (`result$field`) accessors in assertions.
274    let result_is_r_list = if fixture.call.is_some() {
275        r_override.is_some_and(|o| o.result_is_r_list)
276    } else {
277        default_result_is_r_list
278    };
279
280    let test_name = sanitize_ident(&fixture.id);
281    let description = fixture.description.replace('"', "\\\"");
282
283    let expects_error = fixture.assertions.iter().any(|a| a.assertion_type == "error");
284
285    // Allow per-call R overrides to remap fixture argument names. Many calls
286    // (e.g. `extract_bytes`, `batch_extract_files`) use language-neutral
287    // fixture field names (`data`, `paths`) that the R extendr binding
288    // exposes under different identifiers (`content`, `items`).
289    let arg_name_map = r_override.map(|o| &o.arg_name_map);
290    // Resolve `options_type` for typed config args. When set (e.g. via the
291    // C#/Java override that pins the `config` arg of `embed_texts` to
292    // `EmbeddingConfig`), we use it instead of the heuristic in
293    // `r_default_for_config_arg` so the extendr binding receives the right
294    // ExternalPtr type rather than a default `ExtractionConfig`.
295    let options_type = r_override.and_then(|o| o.options_type.as_deref()).or_else(|| {
296        // Fall back to any other language's override that pins the type —
297        // R doesn't define its own override list yet for most embed calls,
298        // and the underlying Rust signature is the same regardless of
299        // binding, so reusing csharp/java/go/php options_type is safe.
300        //
301        // Skip `Js`-prefixed types from the Node/wasm bindings: those are
302        // NAPI/wasm-bindgen specific wrapper types, while extendr exposes the
303        // bare Rust type names (e.g. `ExtractionConfig`, not `JsExtractionConfig`).
304        call_config
305            .overrides
306            .values()
307            .filter_map(|o| o.options_type.as_deref())
308            .find(|name| !name.starts_with("Js"))
309    });
310    let args_str = build_args_string(&fixture.input, &call_config.args, arg_name_map, options_type);
311
312    // Build visitor setup and args if present
313    let mut setup_lines = Vec::new();
314    let final_args = if let Some(visitor_spec) = &fixture.visitor {
315        build_r_visitor(&mut setup_lines, visitor_spec);
316        // R rejects duplicated named arguments ("matched by multiple actual arguments"), so
317        // strip any existing `options = ...` arg before appending the visitor-options list.
318        // Handles `options = NULL` (when no default) and `options = ConversionOptions$default()`
319        // (when build_args_string emits a default placeholder for an optional options arg).
320        let base = strip_options_arg(&args_str);
321        let visitor_opts = "options = list(visitor = visitor)";
322        let trimmed = base.trim_matches([' ', ',']);
323        if trimmed.is_empty() {
324            visitor_opts.to_string()
325        } else {
326            format!("{trimmed}, {visitor_opts}")
327        }
328    } else {
329        args_str
330    };
331
332    if expects_error {
333        let _ = writeln!(out, "test_that(\"{test_name}: {description}\", {{");
334        for line in &setup_lines {
335            let _ = writeln!(out, "  {line}");
336        }
337        let _ = writeln!(out, "  expect_error({function_name}({final_args}))");
338        let _ = writeln!(out, "}})");
339        return;
340    }
341
342    let _ = writeln!(out, "test_that(\"{test_name}: {description}\", {{");
343    for line in &setup_lines {
344        let _ = writeln!(out, "  {line}");
345    }
346    // The extendr extraction wrappers return JSON strings carrying the
347    // serialized core result; parse into an R list so tests can use `$`
348    // accessors. `result_is_simple` calls (e.g. `convert_html_to_markdown`)
349    // already return scalar values and must be passed through verbatim.
350    // `result_is_r_list` signals the binding returns a native R list (Robj),
351    // not a JSON string — skip `jsonlite::fromJSON` but keep `$` accessors.
352    if result_is_simple || result_is_r_list {
353        let _ = writeln!(out, "  {result_var} <- {function_name}({final_args})");
354    } else {
355        let _ = writeln!(
356            out,
357            "  {result_var} <- jsonlite::fromJSON({function_name}({final_args}), simplifyVector = FALSE)"
358        );
359    }
360
361    for assertion in &fixture.assertions {
362        render_assertion(out, assertion, result_var, field_resolver, result_is_simple, e2e_config);
363    }
364
365    let _ = writeln!(out, "}})");
366}
367
/// Remove the named `options = …` argument (if any) from an R call-args string.
///
/// The string is split on top-level commas only: commas nested inside
/// `()`/`[]`/`{}` or inside a single- or double-quoted string literal do not
/// terminate an argument. Backslash escapes inside string literals are
/// honoured, so an escaped quote (`\"`) does not end the literal early.
///
/// Returns the remaining arguments, each trimmed, joined with `", "`. An
/// argument is dropped when its trimmed text starts with `options ` or
/// `options=`; callers append a fresh `options` argument themselves.
fn strip_options_arg(args_str: &str) -> String {
    let mut parts: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut nesting: i32 = 0;
    // `Some(q)` while inside a string literal opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    // True when the previous character inside a literal was an unescaped `\`.
    let mut escaped = false;

    for c in args_str.chars() {
        match open_quote {
            // The character after a backslash is taken literally.
            Some(_) if escaped => escaped = false,
            Some(_) if c == '\\' => escaped = true,
            Some(q) if c == q => open_quote = None,
            Some(_) => {}
            None => match c {
                '(' | '[' | '{' => nesting += 1,
                ')' | ']' | '}' => nesting -= 1,
                '\'' | '"' => open_quote = Some(c),
                ',' if nesting == 0 => {
                    // Top-level comma: the current argument is complete.
                    parts.push(current.trim().to_string());
                    current.clear();
                    continue;
                }
                _ => {}
            },
        }
        current.push(c);
    }
    if !current.trim().is_empty() {
        parts.push(current.trim().to_string());
    }

    parts
        .into_iter()
        .filter(|part| !part.starts_with("options ") && !part.starts_with("options="))
        .collect::<Vec<_>>()
        .join(", ")
}
410
411fn build_args_string(
412    input: &serde_json::Value,
413    args: &[crate::config::ArgMapping],
414    arg_name_map: Option<&std::collections::HashMap<String, String>>,
415    options_type: Option<&str>,
416) -> String {
417    if args.is_empty() {
418        // No declared args means the wrapper takes zero parameters; emitting
419        // `list()` here would trigger an `unused argument (list())` error in R.
420        // Likewise, fall through to nothing if the fixture's input is empty.
421        if matches!(input, serde_json::Value::Null) || input.as_object().is_some_and(|m| m.is_empty()) {
422            return String::new();
423        }
424        return json_to_r(input, true);
425    }
426
427    let parts: Vec<String> = args
428        .iter()
429        .filter_map(|arg| {
430            // Apply per-language argument renames before emitting the call.
431            let arg_name: &str = arg_name_map
432                .and_then(|m| m.get(&arg.name).map(String::as_str))
433                .unwrap_or(&arg.name);
434
435            let field = arg.field.strip_prefix("input.").unwrap_or(&arg.field);
436            let val = input.get(field);
437            // R extendr-generated wrappers do not preserve Option<T> defaults from
438            // the Rust signature — every parameter is positional and required at
439            // the R level. To keep generated calls valid we must pass a placeholder
440            // (`NULL` for `Option<T>`, `ExtractionConfig$default()` for typed
441            // configs) whenever the fixture omits an optional value.
442            let val = match val {
443                Some(v) if !(v.is_null() && arg.optional) => v,
444                _ => {
445                    if !arg.optional {
446                        return None;
447                    }
448                    if arg.arg_type == "json_object" {
449                        let r_value = r_default_for_config_arg(arg_name, options_type);
450                        return Some(format!("{arg_name} = {r_value}"));
451                    }
452                    return Some(format!("{arg_name} = NULL"));
453                }
454            };
455            // The extendr bindings expect owned PORs (ExternalPtr) for typed
456            // config arguments — passing an R `list()` raises
457            // `Expected ExternalPtr got List`. The fixtures don't carry the
458            // option fields needed to round-trip through ExtractionConfig$new,
459            // so emit `ExtractionConfig$default()` whenever a `json_object` arg
460            // resolves to an empty / object-shaped JSON value.
461            if arg.arg_type == "json_object" && (val.is_null() || val.as_object().is_some_and(|m| m.is_empty())) {
462                let r_value = r_default_for_config_arg(arg_name, options_type);
463                return Some(format!("{arg_name} = {r_value}"));
464            }
465            // Non-empty json_object for typed config args (those whose default is a
466            // `$default()` constructor): use `TypeName$from_json(jsonlite::toJSON(...))`
467            // so the Rust function receives a proper ExternalPtr, not a list.
468            // For `options`-style args (default = NULL) emit as a plain R list.
469            if arg.arg_type == "json_object" && val.is_object() {
470                let default_expr = r_default_for_config_arg(arg_name, options_type);
471                if default_expr.ends_with("$default()") {
472                    // Extract the type name from "TypeName$default()"
473                    let type_name = default_expr.trim_end_matches("$default()");
474                    // Use the `I(...)` (AsIs) wrapper for array-valued fields so
475                    // `jsonlite::toJSON(..., auto_unbox = TRUE)` preserves them as
476                    // JSON arrays. Without this, single-element vectors get
477                    // unboxed to scalars (e.g. `c("foo")` → `"foo"`) and serde
478                    // rejects them when deserializing `Vec<T>` fields.
479                    let r_list = json_to_r_preserve_arrays(val, true);
480                    let r_value = format!("{type_name}$from_json(jsonlite::toJSON({r_list}, auto_unbox = TRUE))");
481                    return Some(format!("{arg_name} = {r_value}"));
482                }
483                let r_value = json_to_r(val, true);
484                return Some(format!("{arg_name} = {r_value}"));
485            }
486            // `json_object` arrays are passed to extendr functions whose Rust
487            // signature is `items: String` (JSON-serialized batch items). The
488            // wrapper has no R-list → JSON conversion, so we must serialize the
489            // fixture value to a literal JSON string at test-emit time.
490            //
491            // Exception: when `element_type = "String"` the Rust signature is
492            // `Vec<String>` (e.g. `embed_texts(texts: Vec<String>, ...)`), which
493            // extendr binds as a native R character vector. Passing a JSON
494            // literal there would land as a single-element character vector
495            // containing the literal bytes `["a","b"]`, which is not what the
496            // caller intended. Emit a plain `c("a","b")` literal instead.
497            if arg.arg_type == "json_object" && val.is_array() {
498                if arg.element_type.as_deref() == Some("String") {
499                    let r_value = json_to_r(val, false);
500                    return Some(format!("{arg_name} = {r_value}"));
501                }
502                let json_literal = serde_json::to_string(val).unwrap_or_else(|_| "[]".to_string());
503                let escaped = escape_r(&json_literal);
504                return Some(format!("{arg_name} = \"{escaped}\""));
505            }
506            // `bytes` arg type: convert string fixture values into runtime
507            // `readBin(...)` calls so the wrapper receives raw bytes instead
508            // of an R character vector. This mirrors the Python emit_bytes_arg
509            // helper and is what the extendr binding for Vec<u8> expects.
510            if arg.arg_type == "bytes" {
511                if let Some(raw) = val.as_str() {
512                    let r_value = render_bytes_value(raw);
513                    return Some(format!("{arg_name} = {r_value}"));
514                }
515            }
516            // `file_path` arg type: fixtures encode relative paths that resolve
517            // against the repo's `test_documents/` directory. Using a runtime
518            // helper that anchors paths to that directory avoids fragility from
519            // testthat resetting the working directory between files.
520            if arg.arg_type == "file_path" {
521                if let Some(raw) = val.as_str() {
522                    if !raw.starts_with('/') && !raw.is_empty() {
523                        let escaped = escape_r(raw);
524                        return Some(format!("{arg_name} = .resolve_fixture(\"{escaped}\")"));
525                    }
526                }
527            }
528            Some(format!("{arg_name} = {}", json_to_r(val, true)))
529        })
530        .collect();
531
532    parts.join(", ")
533}
534
535/// Render a `bytes` fixture value as the R expression that produces a raw
536/// vector at test time. Mirrors python's `emit_bytes_arg` classifier so we can
537/// support both file-path style fixtures (`"pdf/fake_memo.pdf"`) and inline
538/// text payloads (`"<html>..."`). The resulting expression is dropped directly
539/// into the call site, e.g. `content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), ...)`.
540fn render_bytes_value(raw: &str) -> String {
541    if raw.starts_with('<') || raw.starts_with('{') || raw.starts_with('[') || raw.contains(' ') {
542        // Inline text payload — encode to raw via charToRaw.
543        let escaped = escape_r(raw);
544        return format!("charToRaw(\"{escaped}\")");
545    }
546    let first = raw.chars().next().unwrap_or('\0');
547    if first.is_ascii_alphanumeric() || first == '_' {
548        if let Some(slash) = raw.find('/') {
549            if slash > 0 {
550                let after = &raw[slash + 1..];
551                if after.contains('.') && !after.is_empty() {
552                    let escaped = escape_r(raw);
553                    return format!(
554                        "readBin(.resolve_fixture(\"{escaped}\"), what = \"raw\", n = file.info(.resolve_fixture(\"{escaped}\"))$size)"
555                    );
556                }
557            }
558        }
559    }
560    // Default to inline text encoding — matches Python's InlineText branch.
561    let escaped = escape_r(raw);
562    format!("charToRaw(\"{escaped}\")")
563}
564
/// Choose the R placeholder expression for a config-style argument.
///
/// A pinned `options_type` always wins and yields `<Type>$default()`.
/// Otherwise the argument name selects a known `*Config$default()`
/// constructor; `options` maps to `NULL`, and any unrecognised name maps to
/// `list()`.
fn r_default_for_config_arg(arg_name: &str, options_type: Option<&str>) -> String {
    options_type
        .map(|pinned| format!("{pinned}$default()"))
        .unwrap_or_else(|| {
            let by_name = match arg_name {
                "config" => "ExtractionConfig$default()",
                "options" => "NULL",
                "html_output" => "HtmlOutputConfig$default()",
                "chunking" => "ChunkingConfig$default()",
                "ocr" => "OcrConfig$default()",
                "image" | "images" => "ImageExtractionConfig$default()",
                "language_detection" => "LanguageDetectionConfig$default()",
                _ => "list()",
            };
            by_name.to_string()
        })
}
588
589fn render_assertion(
590    out: &mut String,
591    assertion: &Assertion,
592    result_var: &str,
593    field_resolver: &FieldResolver,
594    result_is_simple: bool,
595    _e2e_config: &E2eConfig,
596) {
597    // Handle synthetic / derived fields before the is_valid_for_result check
598    // so they are never treated as struct attribute accesses on the result.
599    if let Some(f) = &assertion.field {
600        match f.as_str() {
601            "chunks_have_content" => {
602                let pred = format!("all(sapply({result_var}$chunks %||% list(), function(c) nchar(c$content) > 0))");
603                match assertion.assertion_type.as_str() {
604                    "is_true" => {
605                        let _ = writeln!(out, "  expect_true({pred})");
606                    }
607                    "is_false" => {
608                        let _ = writeln!(out, "  expect_false({pred})");
609                    }
610                    _ => {
611                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
612                    }
613                }
614                return;
615            }
616            "chunks_have_embeddings" => {
617                let pred = format!(
618                    "all(sapply({result_var}$chunks %||% list(), function(c) !is.null(c$embedding) && length(c$embedding) > 0))"
619                );
620                match assertion.assertion_type.as_str() {
621                    "is_true" => {
622                        let _ = writeln!(out, "  expect_true({pred})");
623                    }
624                    "is_false" => {
625                        let _ = writeln!(out, "  expect_false({pred})");
626                    }
627                    _ => {
628                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
629                    }
630                }
631                return;
632            }
633            // ---- EmbedResponse virtual fields ----
634            // The extendr binding cannot return `Vec<Vec<f32>>` directly (extendr's
635            // Robj conversion has no impl for nested numeric vectors), so the
636            // wrapper serializes the result to a JSON string at the FFI boundary.
637            // Parse it on demand here so length/index assertions operate on the
638            // matrix structure rather than on the single string scalar.
639            "embeddings" => {
640                let parsed = format!(
641                    "(if (is.character({result_var}) && length({result_var}) == 1) jsonlite::fromJSON({result_var}, simplifyVector = FALSE) else {result_var})"
642                );
643                match assertion.assertion_type.as_str() {
644                    "count_equals" => {
645                        if let Some(val) = &assertion.value {
646                            let r_val = json_to_r(val, false);
647                            let _ = writeln!(out, "  expect_equal(length({parsed}), {r_val})");
648                        }
649                    }
650                    "count_min" => {
651                        if let Some(val) = &assertion.value {
652                            let r_val = json_to_r(val, false);
653                            let _ = writeln!(out, "  expect_gte(length({parsed}), {r_val})");
654                        }
655                    }
656                    "not_empty" => {
657                        let _ = writeln!(out, "  expect_gt(length({parsed}), 0)");
658                    }
659                    "is_empty" => {
660                        let _ = writeln!(out, "  expect_equal(length({parsed}), 0)");
661                    }
662                    _ => {
663                        let _ = writeln!(
664                            out,
665                            "  # skipped: unsupported assertion type on synthetic field 'embeddings'"
666                        );
667                    }
668                }
669                return;
670            }
671            "embedding_dimensions" => {
672                let expr = format!("(if (length({result_var}) == 0) 0L else length({result_var}[[1]]))");
673                match assertion.assertion_type.as_str() {
674                    "equals" => {
675                        if let Some(val) = &assertion.value {
676                            let r_val = json_to_r(val, false);
677                            let _ = writeln!(out, "  expect_equal({expr}, {r_val})");
678                        }
679                    }
680                    "greater_than" => {
681                        if let Some(val) = &assertion.value {
682                            let r_val = json_to_r(val, false);
683                            let _ = writeln!(out, "  expect_gt({expr}, {r_val})");
684                        }
685                    }
686                    _ => {
687                        let _ = writeln!(
688                            out,
689                            "  # skipped: unsupported assertion type on synthetic field 'embedding_dimensions'"
690                        );
691                    }
692                }
693                return;
694            }
695            "embeddings_valid" | "embeddings_finite" | "embeddings_non_zero" | "embeddings_normalized" => {
696                let pred = match f.as_str() {
697                    "embeddings_valid" => {
698                        format!("all(sapply({result_var}, function(e) length(e) > 0))")
699                    }
700                    "embeddings_finite" => {
701                        format!("all(sapply({result_var}, function(e) all(is.finite(e))))")
702                    }
703                    "embeddings_non_zero" => {
704                        format!("all(sapply({result_var}, function(e) any(e != 0.0)))")
705                    }
706                    "embeddings_normalized" => {
707                        format!("all(sapply({result_var}, function(e) abs(sum(e * e) - 1.0) < 1e-3))")
708                    }
709                    _ => unreachable!(),
710                };
711                match assertion.assertion_type.as_str() {
712                    "is_true" => {
713                        let _ = writeln!(out, "  expect_true({pred})");
714                    }
715                    "is_false" => {
716                        let _ = writeln!(out, "  expect_false({pred})");
717                    }
718                    _ => {
719                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
720                    }
721                }
722                return;
723            }
724            // ---- keywords / keywords_count ----
725            // R ExtractionResult does not expose extracted_keywords; skip.
726            "keywords" | "keywords_count" => {
727                let _ = writeln!(out, "  # skipped: field '{f}' not available on R ExtractionResult");
728                return;
729            }
730            _ => {}
731        }
732    }
733
734    // Skip assertions on fields that don't exist on the result type.
735    if let Some(f) = &assertion.field {
736        if !f.is_empty() && !field_resolver.is_valid_for_result(f) {
737            let _ = writeln!(out, "  # skipped: field '{f}' not available on result type");
738            return;
739        }
740    }
741
742    // When result_is_simple, skip assertions that reference non-content fields
743    // (e.g., metadata, document, structure) since the binding returns a plain value.
744    if result_is_simple {
745        if let Some(f) = &assertion.field {
746            let f_lower = f.to_lowercase();
747            if !f.is_empty()
748                && f_lower != "content"
749                && (f_lower.starts_with("metadata")
750                    || f_lower.starts_with("document")
751                    || f_lower.starts_with("structure"))
752            {
753                let _ = writeln!(
754                    out,
755                    "  # skipped: result_is_simple for field '{f}' not available on result type"
756                );
757                return;
758            }
759        }
760    }
761
762    let field_expr = if result_is_simple {
763        result_var.to_string()
764    } else {
765        match &assertion.field {
766            Some(f) if !f.is_empty() => field_resolver.accessor(f, "r", result_var),
767            _ => result_var.to_string(),
768        }
769    };
770
771    match assertion.assertion_type.as_str() {
772        "equals" => {
773            if let Some(expected) = &assertion.value {
774                let r_val = json_to_r(expected, false);
775                let _ = writeln!(out, "  expect_equal(trimws({field_expr}), {r_val})");
776            }
777        }
778        "contains" => {
779            if let Some(expected) = &assertion.value {
780                let r_val = json_to_r(expected, false);
781                let _ = writeln!(out, "  expect_true(grepl({r_val}, {field_expr}, fixed = TRUE))");
782            }
783        }
784        "contains_all" => {
785            if let Some(values) = &assertion.values {
786                for val in values {
787                    let r_val = json_to_r(val, false);
788                    let _ = writeln!(out, "  expect_true(any(grepl({r_val}, {field_expr}, fixed = TRUE)))");
789                }
790            }
791        }
792        "not_contains" => {
793            if let Some(expected) = &assertion.value {
794                let r_val = json_to_r(expected, false);
795                let _ = writeln!(out, "  expect_false(grepl({r_val}, {field_expr}, fixed = TRUE))");
796            }
797        }
798        "not_empty" => {
799            // Multi-element character vectors (e.g. `list_embedding_presets`)
800            // would otherwise evaluate `nchar(x) > 0` element-wise and fail
801            // `expect_true`'s scalar-logical contract. Reduce with `any()` so
802            // the predicate stays a single TRUE/FALSE regardless of length,
803            // and treat zero-length vectors as empty.
804            let _ = writeln!(
805                out,
806                "  expect_true(if (is.character({field_expr})) length({field_expr}) > 0 && any(nchar({field_expr}) > 0) else length({field_expr}) > 0)"
807            );
808        }
809        "is_empty" => {
810            // Rust `Option<String>::None` surfaces as `NA_character_` through
811            // extendr, and `Vec<...>` empties as a zero-length vector. Treat
812            // NULL, NA, "", and zero-length collections as "empty" so the same
813            // assertion works for scalar Option returns (`get_embedding_preset`)
814            // and collection returns alike.
815            let _ = writeln!(
816                out,
817                "  expect_true(is.null({field_expr}) || length({field_expr}) == 0 || (length({field_expr}) == 1 && (is.na({field_expr}) || identical({field_expr}, \"\"))))"
818            );
819        }
820        "contains_any" => {
821            if let Some(values) = &assertion.values {
822                let items: Vec<String> = values.iter().map(|v| json_to_r(v, false)).collect();
823                let vec_str = items.join(", ");
824                let _ = writeln!(
825                    out,
826                    "  expect_true(any(sapply(c({vec_str}), function(v) grepl(v, {field_expr}, fixed = TRUE))))"
827                );
828            }
829        }
830        "greater_than" => {
831            if let Some(val) = &assertion.value {
832                let r_val = json_to_r(val, false);
833                let _ = writeln!(out, "  expect_true({field_expr} > {r_val})");
834            }
835        }
836        "less_than" => {
837            if let Some(val) = &assertion.value {
838                let r_val = json_to_r(val, false);
839                let _ = writeln!(out, "  expect_true({field_expr} < {r_val})");
840            }
841        }
842        "greater_than_or_equal" => {
843            if let Some(val) = &assertion.value {
844                let r_val = json_to_r(val, false);
845                let _ = writeln!(out, "  expect_true({field_expr} >= {r_val})");
846            }
847        }
848        "less_than_or_equal" => {
849            if let Some(val) = &assertion.value {
850                let r_val = json_to_r(val, false);
851                let _ = writeln!(out, "  expect_true({field_expr} <= {r_val})");
852            }
853        }
854        "starts_with" => {
855            if let Some(expected) = &assertion.value {
856                let r_val = json_to_r(expected, false);
857                let _ = writeln!(out, "  expect_true(startsWith({field_expr}, {r_val}))");
858            }
859        }
860        "ends_with" => {
861            if let Some(expected) = &assertion.value {
862                let r_val = json_to_r(expected, false);
863                let _ = writeln!(out, "  expect_true(endsWith({field_expr}, {r_val}))");
864            }
865        }
866        "min_length" => {
867            if let Some(val) = &assertion.value {
868                if let Some(n) = val.as_u64() {
869                    let _ = writeln!(out, "  expect_true(nchar({field_expr}) >= {n})");
870                }
871            }
872        }
873        "max_length" => {
874            if let Some(val) = &assertion.value {
875                if let Some(n) = val.as_u64() {
876                    let _ = writeln!(out, "  expect_true(nchar({field_expr}) <= {n})");
877                }
878            }
879        }
880        "count_min" => {
881            if let Some(val) = &assertion.value {
882                if let Some(n) = val.as_u64() {
883                    let _ = writeln!(out, "  expect_true(length({field_expr}) >= {n})");
884                }
885            }
886        }
887        "count_equals" => {
888            if let Some(val) = &assertion.value {
889                if let Some(n) = val.as_u64() {
890                    let _ = writeln!(out, "  expect_equal(length({field_expr}), {n})");
891                }
892            }
893        }
894        "is_true" => {
895            let _ = writeln!(out, "  expect_true({field_expr})");
896        }
897        "is_false" => {
898            let _ = writeln!(out, "  expect_false({field_expr})");
899        }
900        "method_result" => {
901            if let Some(method_name) = &assertion.method {
902                let call_expr = build_r_method_call(result_var, method_name, assertion.args.as_ref());
903                let check = assertion.check.as_deref().unwrap_or("is_true");
904                match check {
905                    "equals" => {
906                        if let Some(val) = &assertion.value {
907                            if val.is_boolean() {
908                                if val.as_bool() == Some(true) {
909                                    let _ = writeln!(out, "  expect_true({call_expr})");
910                                } else {
911                                    let _ = writeln!(out, "  expect_false({call_expr})");
912                                }
913                            } else {
914                                let r_val = json_to_r(val, false);
915                                let _ = writeln!(out, "  expect_equal({call_expr}, {r_val})");
916                            }
917                        }
918                    }
919                    "is_true" => {
920                        let _ = writeln!(out, "  expect_true({call_expr})");
921                    }
922                    "is_false" => {
923                        let _ = writeln!(out, "  expect_false({call_expr})");
924                    }
925                    "greater_than_or_equal" => {
926                        if let Some(val) = &assertion.value {
927                            let r_val = json_to_r(val, false);
928                            let _ = writeln!(out, "  expect_true({call_expr} >= {r_val})");
929                        }
930                    }
931                    "count_min" => {
932                        if let Some(val) = &assertion.value {
933                            let n = val.as_u64().unwrap_or(0);
934                            let _ = writeln!(out, "  expect_true(length({call_expr}) >= {n})");
935                        }
936                    }
937                    "is_error" => {
938                        let _ = writeln!(out, "  expect_error({call_expr})");
939                    }
940                    "contains" => {
941                        if let Some(val) = &assertion.value {
942                            let r_val = json_to_r(val, false);
943                            let _ = writeln!(out, "  expect_true(grepl({r_val}, {call_expr}, fixed = TRUE))");
944                        }
945                    }
946                    other_check => {
947                        panic!("R e2e generator: unsupported method_result check type: {other_check}");
948                    }
949                }
950            } else {
951                panic!("R e2e generator: method_result assertion missing 'method' field");
952            }
953        }
954        "matches_regex" => {
955            if let Some(expected) = &assertion.value {
956                let r_val = json_to_r(expected, false);
957                let _ = writeln!(out, "  expect_true(grepl({r_val}, {field_expr}))");
958            }
959        }
960        "not_error" => {
961            // The call itself stops the test on error; emit an explicit
962            // `expect_true(TRUE)` so testthat doesn't report the test as
963            // empty when this is the only assertion.
964            let _ = writeln!(out, "  expect_true(TRUE)");
965        }
966        "error" => {
967            // Handled at the test level.
968        }
969        other => {
970            panic!("R e2e generator: unsupported assertion type: {other}");
971        }
972    }
973}
974
/// Convert a PascalCase string to snake_case.
///
/// Every uppercase character except the very first one is preceded by an
/// underscore, and all characters are lowercased.
///
/// e.g. "DoubleEqual" → "double_equal", "Backticks" → "backticks"
fn pascal_to_snake_case(s: &str) -> String {
    // A little headroom for the underscores that get inserted.
    let mut snake = String::with_capacity(s.len() + 4);
    for (idx, c) in s.chars().enumerate() {
        if idx != 0 && c.is_uppercase() {
            snake.push('_');
        }
        // `to_lowercase` may yield more than one char for some code points.
        snake.extend(c.to_lowercase());
    }
    snake
}
995
996/// Convert a JSON value to an R expression suitable for embedding inside a
997/// `list(...)` that will be passed to `jsonlite::toJSON(..., auto_unbox = TRUE)`.
998///
999/// Differs from [`json_to_r`] in that any array-valued field is wrapped with
1000/// `I(...)` (jsonlite's `AsIs` marker) so it remains a JSON array after the
1001/// `auto_unbox` transform. Empty arrays become `I(list())` (→ `[]`) and
1002/// non-empty arrays become `I(c(...))` (→ `[..]`). Without this wrapping,
1003/// `Vec<String>` fields like `exclude_selectors` get unboxed to scalars and
1004/// serde deserialization on the Rust side fails with
1005/// `invalid type: string "foo", expected a sequence`.
1006fn json_to_r_preserve_arrays(value: &serde_json::Value, lowercase_enum_values: bool) -> String {
1007    match value {
1008        serde_json::Value::Array(arr) => {
1009            if arr.is_empty() {
1010                "I(list())".to_string()
1011            } else {
1012                let items: Vec<String> = arr.iter().map(|v| json_to_r(v, lowercase_enum_values)).collect();
1013                format!("I(c({}))", items.join(", "))
1014            }
1015        }
1016        serde_json::Value::Object(map) => {
1017            let entries: Vec<String> = map
1018                .iter()
1019                .map(|(k, v)| {
1020                    format!(
1021                        "\"{}\" = {}",
1022                        escape_r(k),
1023                        json_to_r_preserve_arrays(v, lowercase_enum_values)
1024                    )
1025                })
1026                .collect();
1027            format!("list({})", entries.join(", "))
1028        }
1029        _ => json_to_r(value, lowercase_enum_values),
1030    }
1031}
1032
1033/// * `lowercase_enum_values` - If true, convert PascalCase strings to snake_case (for enum values).
1034///   If false, preserve original case (for assertion expected values).
1035fn json_to_r(value: &serde_json::Value, lowercase_enum_values: bool) -> String {
1036    match value {
1037        serde_json::Value::String(s) => {
1038            // Convert PascalCase enum values to snake_case only if requested.
1039            // e.g. "Backticks" → "backticks", "DoubleEqual" → "double_equal"
1040            let normalized = if lowercase_enum_values && s.chars().next().is_some_and(|c| c.is_uppercase()) {
1041                pascal_to_snake_case(s)
1042            } else {
1043                s.clone()
1044            };
1045            format!("\"{}\"", escape_r(&normalized))
1046        }
1047        serde_json::Value::Bool(true) => "TRUE".to_string(),
1048        serde_json::Value::Bool(false) => "FALSE".to_string(),
1049        serde_json::Value::Number(n) => n.to_string(),
1050        serde_json::Value::Null => "NULL".to_string(),
1051        serde_json::Value::Array(arr) => {
1052            let items: Vec<String> = arr.iter().map(|v| json_to_r(v, lowercase_enum_values)).collect();
1053            format!("c({})", items.join(", "))
1054        }
1055        serde_json::Value::Object(map) => {
1056            let entries: Vec<String> = map
1057                .iter()
1058                .map(|(k, v)| format!("\"{}\" = {}", escape_r(k), json_to_r(v, lowercase_enum_values)))
1059                .collect();
1060            format!("list({})", entries.join(", "))
1061        }
1062    }
1063}
1064
1065/// Build an R visitor list and add setup line.
1066fn build_r_visitor(setup_lines: &mut Vec<String>, visitor_spec: &crate::fixture::VisitorSpec) {
1067    use std::fmt::Write as FmtWrite;
1068    // Collect each callback as a separate string, then join with ",\n" to avoid
1069    // trailing commas — R's list() does not accept a trailing comma.
1070    let methods: Vec<String> = visitor_spec
1071        .callbacks
1072        .iter()
1073        .map(|(method_name, action)| {
1074            let mut buf = String::new();
1075            emit_r_visitor_method(&mut buf, method_name, action);
1076            // strip the trailing ",\n" added by emit_r_visitor_method
1077            buf.trim_end_matches(['\n', ',']).to_string()
1078        })
1079        .collect();
1080    let mut visitor_obj = String::new();
1081    let _ = writeln!(visitor_obj, "list(");
1082    let _ = write!(visitor_obj, "{}", methods.join(",\n"));
1083    let _ = writeln!(visitor_obj);
1084    let _ = writeln!(visitor_obj, "  )");
1085
1086    setup_lines.push(format!("visitor <- {visitor_obj}"));
1087}
1088
1089/// Build an R call expression for a `method_result` assertion.
1090/// Maps method names to the appropriate R function or method calls.
1091fn build_r_method_call(result_var: &str, method_name: &str, args: Option<&serde_json::Value>) -> String {
1092    match method_name {
1093        "root_child_count" => format!("{result_var}$root_child_count()"),
1094        "root_node_type" => format!("{result_var}$root_node_type()"),
1095        "named_children_count" => format!("{result_var}$named_children_count()"),
1096        "has_error_nodes" => format!("tree_has_error_nodes({result_var})"),
1097        "error_count" | "tree_error_count" => format!("tree_error_count({result_var})"),
1098        "tree_to_sexp" => format!("tree_to_sexp({result_var})"),
1099        "contains_node_type" => {
1100            let node_type = args
1101                .and_then(|a| a.get("node_type"))
1102                .and_then(|v| v.as_str())
1103                .unwrap_or("");
1104            format!("tree_contains_node_type({result_var}, \"{node_type}\")")
1105        }
1106        "find_nodes_by_type" => {
1107            let node_type = args
1108                .and_then(|a| a.get("node_type"))
1109                .and_then(|v| v.as_str())
1110                .unwrap_or("");
1111            format!("find_nodes_by_type({result_var}, \"{node_type}\")")
1112        }
1113        "run_query" => {
1114            let query_source = args
1115                .and_then(|a| a.get("query_source"))
1116                .and_then(|v| v.as_str())
1117                .unwrap_or("");
1118            let language = args
1119                .and_then(|a| a.get("language"))
1120                .and_then(|v| v.as_str())
1121                .unwrap_or("");
1122            format!("run_query({result_var}, \"{language}\", \"{query_source}\", source)")
1123        }
1124        _ => {
1125            if let Some(args_val) = args {
1126                let arg_str = args_val
1127                    .as_object()
1128                    .map(|obj| {
1129                        obj.iter()
1130                            .map(|(k, v)| {
1131                                let r_val = json_to_r(v, false);
1132                                format!("{k} = {r_val}")
1133                            })
1134                            .collect::<Vec<_>>()
1135                            .join(", ")
1136                    })
1137                    .unwrap_or_default();
1138                format!("{result_var}${method_name}({arg_str})")
1139            } else {
1140                format!("{result_var}${method_name}()")
1141            }
1142        }
1143    }
1144}
1145
1146/// Emit an R visitor method for a callback action.
1147fn emit_r_visitor_method(out: &mut String, method_name: &str, action: &CallbackAction) {
1148    use std::fmt::Write as FmtWrite;
1149
1150    // R uses visit_ prefix (matches binding signature)
1151    let params = match method_name {
1152        "visit_link" => "ctx, href, text, title",
1153        "visit_image" => "ctx, src, alt, title",
1154        "visit_heading" => "ctx, level, text, id",
1155        "visit_code_block" => "ctx, lang, code",
1156        "visit_code_inline"
1157        | "visit_strong"
1158        | "visit_emphasis"
1159        | "visit_strikethrough"
1160        | "visit_underline"
1161        | "visit_subscript"
1162        | "visit_superscript"
1163        | "visit_mark"
1164        | "visit_button"
1165        | "visit_summary"
1166        | "visit_figcaption"
1167        | "visit_definition_term"
1168        | "visit_definition_description" => "ctx, text",
1169        "visit_text" => "ctx, text",
1170        "visit_list_item" => "ctx, ordered, marker, text",
1171        "visit_blockquote" => "ctx, content, depth",
1172        "visit_table_row" => "ctx, cells, is_header",
1173        "visit_custom_element" => "ctx, tag_name, html",
1174        "visit_form" => "ctx, action_url, method",
1175        "visit_input" => "ctx, input_type, name, value",
1176        "visit_audio" | "visit_video" | "visit_iframe" => "ctx, src",
1177        "visit_details" => "ctx, open",
1178        "visit_element_end" | "visit_table_end" | "visit_definition_list_end" | "visit_figure_end" => "ctx, output",
1179        "visit_list_start" => "ctx, ordered",
1180        "visit_list_end" => "ctx, ordered, output",
1181        _ => "ctx",
1182    };
1183
1184    let _ = writeln!(out, "    {method_name} = function({params}) {{");
1185    match action {
1186        CallbackAction::Skip => {
1187            let _ = writeln!(out, "      \"skip\"");
1188        }
1189        CallbackAction::Continue => {
1190            let _ = writeln!(out, "      \"continue\"");
1191        }
1192        CallbackAction::PreserveHtml => {
1193            let _ = writeln!(out, "      \"preserve_html\"");
1194        }
1195        CallbackAction::Custom { output } => {
1196            let escaped = escape_r(output);
1197            let _ = writeln!(out, "      list(custom = \"{escaped}\")");
1198        }
1199        CallbackAction::CustomTemplate { template, return_form } => {
1200            let r_expr = r_template_to_paste0(template);
1201            match return_form {
1202                TemplateReturnForm::BareString => {
1203                    let _ = writeln!(out, "      {r_expr}");
1204                }
1205                TemplateReturnForm::Dict => {
1206                    let _ = writeln!(out, "      list(custom = {r_expr})");
1207                }
1208            }
1209        }
1210    }
1211    let _ = writeln!(out, "    }},");
1212}