Skip to main content

alef_e2e/codegen/
r.rs

1//! R e2e test generator using testthat.
2
3use crate::config::E2eConfig;
4use crate::escape::{escape_r, r_template_to_paste0, sanitize_filename, sanitize_ident};
5use crate::field_access::FieldResolver;
6use crate::fixture::{Assertion, CallbackAction, Fixture, FixtureGroup, TemplateReturnForm};
7use alef_core::backend::GeneratedFile;
8use alef_core::config::ResolvedCrateConfig;
9use alef_core::hash::{self, CommentStyle};
10use anyhow::Result;
11use std::fmt::Write as FmtWrite;
12use std::path::PathBuf;
13
14use super::E2eCodegen;
15
16/// R e2e code generator.
17pub struct RCodegen;
18
19impl E2eCodegen for RCodegen {
20    fn generate(
21        &self,
22        groups: &[FixtureGroup],
23        e2e_config: &E2eConfig,
24        config: &ResolvedCrateConfig,
25        _type_defs: &[alef_core::ir::TypeDef],
26        _enums: &[alef_core::ir::EnumDef],
27    ) -> Result<Vec<GeneratedFile>> {
28        let lang = self.language_name();
29        let output_base = PathBuf::from(e2e_config.effective_output()).join(lang);
30
31        let mut files = Vec::new();
32
33        // Resolve call config with overrides.
34        let call = &e2e_config.call;
35        let overrides = call.overrides.get(lang);
36        let module_path = overrides
37            .and_then(|o| o.module.as_ref())
38            .cloned()
39            .unwrap_or_else(|| call.module.clone());
40        let _function_name = overrides
41            .and_then(|o| o.function.as_ref())
42            .cloned()
43            .unwrap_or_else(|| call.function.clone());
44        let result_is_simple = call.result_is_simple || overrides.is_some_and(|o| o.result_is_simple);
45        let result_is_r_list = overrides.is_some_and(|o| o.result_is_r_list);
46        let _result_var = &call.result_var;
47
48        // Resolve package config.
49        let r_pkg = e2e_config.resolve_package("r");
50        let pkg_name = r_pkg
51            .as_ref()
52            .and_then(|p| p.name.as_ref())
53            .cloned()
54            .unwrap_or_else(|| module_path.clone());
55        let pkg_path = r_pkg
56            .as_ref()
57            .and_then(|p| p.path.as_ref())
58            .cloned()
59            .unwrap_or_else(|| "../../packages/r".to_string());
60        let pkg_version = r_pkg
61            .as_ref()
62            .and_then(|p| p.version.as_ref())
63            .cloned()
64            .or_else(|| config.resolved_version())
65            .unwrap_or_else(|| "0.1.0".to_string());
66
67        // Generate DESCRIPTION file.
68        files.push(GeneratedFile {
69            path: output_base.join("DESCRIPTION"),
70            content: render_description(&pkg_name, &pkg_version, e2e_config.dep_mode),
71            generated_header: false,
72        });
73
74        // Generate test runner script.
75        files.push(GeneratedFile {
76            path: output_base.join("run_tests.R"),
77            content: render_test_runner(&pkg_path, e2e_config.dep_mode),
78            generated_header: true,
79        });
80
81        // setup-fixtures.R — testthat sources `setup-*.R` files in the tests
82        // directory once before any tests run, with the working directory set
83        // to the tests/ folder. We use this hook to chdir into the repo's
84        // shared `test_documents/` directory so that fixture paths like
85        // `pdf/fake_memo.pdf` resolve at extraction time.
86        files.push(GeneratedFile {
87            path: output_base.join("tests").join("setup-fixtures.R"),
88            content: render_setup_fixtures(&e2e_config.test_documents_relative_from(1)),
89            generated_header: true,
90        });
91
92        // Generate test files per category.
93        for group in groups {
94            let active: Vec<&Fixture> = group
95                .fixtures
96                .iter()
97                .filter(|f| super::should_include_fixture(f, lang, e2e_config))
98                .collect();
99
100            if active.is_empty() {
101                continue;
102            }
103
104            let filename = format!("test_{}.R", sanitize_filename(&group.category));
105            let field_resolver = FieldResolver::new(
106                &e2e_config.fields,
107                &e2e_config.fields_optional,
108                &e2e_config.result_fields,
109                &e2e_config.fields_array,
110                &std::collections::HashSet::new(),
111            );
112            let content = render_test_file(
113                &group.category,
114                &active,
115                &field_resolver,
116                result_is_simple,
117                result_is_r_list,
118                e2e_config,
119            );
120            files.push(GeneratedFile {
121                path: output_base.join("tests").join(filename),
122                content,
123                generated_header: true,
124            });
125        }
126
127        Ok(files)
128    }
129
130    fn language_name(&self) -> &'static str {
131        "r"
132    }
133}
134
135fn render_description(pkg_name: &str, pkg_version: &str, dep_mode: crate::config::DependencyMode) -> String {
136    let dep_line = match dep_mode {
137        crate::config::DependencyMode::Registry => {
138            format!("Imports: {pkg_name} ({pkg_version})\n")
139        }
140        crate::config::DependencyMode::Local => String::new(),
141    };
142    format!(
143        r#"Package: e2e.r
144Title: E2E Tests for {pkg_name}
145Version: 0.1.0
146Description: End-to-end test suite.
147{dep_line}Suggests: testthat (>= 3.0.0)
148Config/testthat/edition: 3
149"#
150    )
151}
152
153fn render_setup_fixtures(test_documents_path: &str) -> String {
154    let mut out = String::new();
155    out.push_str(&hash::header(CommentStyle::Hash));
156    let _ = writeln!(out);
157    let _ = writeln!(
158        out,
159        "# Resolve fixture paths against the repo's `test_documents/` directory."
160    );
161    let _ = writeln!(
162        out,
163        "# testthat sources setup-*.R with the working directory at tests/,"
164    );
165    let _ = writeln!(
166        out,
167        "# so test_documents lives three directories up: tests/ -> e2e/r/ -> e2e/ -> repo root."
168    );
169    let _ = writeln!(
170        out,
171        "# Each `test_that()` block has its working directory reset back to tests/, so"
172    );
173    let _ = writeln!(
174        out,
175        "# fixture lookups must be performed via this helper rather than relying on `setwd`."
176    );
177    let _ = writeln!(
178        out,
179        ".alef_test_documents <- normalizePath(\"{test_documents_path}\", mustWork = FALSE)"
180    );
181    let _ = writeln!(out, ".resolve_fixture <- function(path) {{");
182    let _ = writeln!(out, "  if (dir.exists(.alef_test_documents)) {{");
183    let _ = writeln!(out, "    file.path(.alef_test_documents, path)");
184    let _ = writeln!(out, "  }} else {{");
185    let _ = writeln!(out, "    path");
186    let _ = writeln!(out, "  }}");
187    let _ = writeln!(out, "}}");
188    out
189}
190
191fn render_test_runner(pkg_path: &str, dep_mode: crate::config::DependencyMode) -> String {
192    let mut out = String::new();
193    out.push_str(&hash::header(CommentStyle::Hash));
194    let _ = writeln!(out, "library(testthat)");
195    match dep_mode {
196        crate::config::DependencyMode::Registry => {
197            // In registry mode, require the installed CRAN package directly.
198            let _ = writeln!(out, "# Package loaded via library() from CRAN install.");
199        }
200        crate::config::DependencyMode::Local => {
201            // Use devtools::load_all() to load the local R package without requiring
202            // a full install, matching the e2e test runner convention.
203            let _ = writeln!(out, "devtools::load_all(\"{pkg_path}\")");
204        }
205    }
206    let _ = writeln!(out);
207    // Surface every failure rather than aborting at the default max_fails=10 —
208    // partial pass counts are essential for triage during e2e bring-up.
209    let _ = writeln!(out, "testthat::set_max_fails(Inf)");
210    // Resolve the tests/ directory relative to this script. testthat reads
211    // setup-*.R from there before each file runs, where path resolution
212    // against test_documents/ is handled by the `.resolve_fixture` helper.
213    let _ = writeln!(
214        out,
215        ".script_dir <- tryCatch(dirname(normalizePath(sys.frame(1)$ofile)), error = function(e) getwd())"
216    );
217    let _ = writeln!(out, "test_dir(file.path(.script_dir, \"tests\"))");
218    out
219}
220
221fn render_test_file(
222    category: &str,
223    fixtures: &[&Fixture],
224    field_resolver: &FieldResolver,
225    result_is_simple: bool,
226    result_is_r_list: bool,
227    e2e_config: &E2eConfig,
228) -> String {
229    let mut out = String::new();
230    out.push_str(&hash::header(CommentStyle::Hash));
231    let _ = writeln!(out, "# E2e tests for category: {category}");
232    let _ = writeln!(out);
233
234    for (i, fixture) in fixtures.iter().enumerate() {
235        render_test_case(
236            &mut out,
237            fixture,
238            e2e_config,
239            field_resolver,
240            result_is_simple,
241            result_is_r_list,
242        );
243        if i + 1 < fixtures.len() {
244            let _ = writeln!(out);
245        }
246    }
247
248    // Clean up trailing newlines.
249    while out.ends_with("\n\n") {
250        out.pop();
251    }
252    if !out.ends_with('\n') {
253        out.push('\n');
254    }
255    out
256}
257
258fn render_test_case(
259    out: &mut String,
260    fixture: &Fixture,
261    e2e_config: &E2eConfig,
262    field_resolver: &FieldResolver,
263    default_result_is_simple: bool,
264    default_result_is_r_list: bool,
265) {
266    let call_config = e2e_config.resolve_call_for_fixture(fixture.call.as_deref(), &fixture.input);
267    let function_name = &call_config.function;
268    let result_var = &call_config.result_var;
269    // Per-fixture call configs (e.g. `list_document_extractors`) may set
270    // `result_is_simple = true` even when the default `[e2e.call]` does not.
271    // Without this lookup the registry/detection wrappers (which return scalar
272    // strings or character vectors directly) get wrapped in
273    // `jsonlite::fromJSON(...)` and the parser fails on non-JSON output.
274    let r_override = call_config.overrides.get("r");
275    let result_is_simple = if fixture.call.is_some() {
276        call_config.result_is_simple || r_override.is_some_and(|o| o.result_is_simple)
277    } else {
278        default_result_is_simple
279    };
280    // Per-fixture override: when the R binding already returns a native R list
281    // (not a JSON string), suppress `jsonlite::fromJSON` wrapping while still
282    // using field-path (`result$field`) accessors in assertions.
283    let result_is_r_list = if fixture.call.is_some() {
284        r_override.is_some_and(|o| o.result_is_r_list)
285    } else {
286        default_result_is_r_list
287    };
288
289    let test_name = sanitize_ident(&fixture.id);
290    let description = fixture.description.replace('"', "\\\"");
291
292    let expects_error = fixture.assertions.iter().any(|a| a.assertion_type == "error");
293
294    // Allow per-call R overrides to remap fixture argument names. Many calls
295    // (e.g. `extract_bytes`, `batch_extract_files`) use language-neutral
296    // fixture field names (`data`, `paths`) that the R extendr binding
297    // exposes under different identifiers (`content`, `items`).
298    let arg_name_map = r_override.map(|o| &o.arg_name_map);
299    // Resolve `options_type` for typed config args. When set (e.g. via the
300    // C#/Java override that pins the `config` arg of `embed_texts` to
301    // `EmbeddingConfig`), we use it instead of the heuristic in
302    // `r_default_for_config_arg` so the extendr binding receives the right
303    // ExternalPtr type rather than a default `ExtractionConfig`.
304    let options_type = r_override.and_then(|o| o.options_type.as_deref()).or_else(|| {
305        // Fall back to any other language's override that pins the type —
306        // R doesn't define its own override list yet for most embed calls,
307        // and the underlying Rust signature is the same regardless of
308        // binding, so reusing csharp/java/go/php options_type is safe.
309        //
310        // Skip `Js`-prefixed types from the Node/wasm bindings: those are
311        // NAPI/wasm-bindgen specific wrapper types, while extendr exposes the
312        // bare Rust type names (e.g. `ExtractionConfig`, not `JsExtractionConfig`).
313        call_config
314            .overrides
315            .values()
316            .filter_map(|o| o.options_type.as_deref())
317            .find(|name| !name.starts_with("Js"))
318    });
319    let args_str = build_args_string(&fixture.input, &call_config.args, arg_name_map, options_type);
320
321    // Build visitor setup and args if present
322    let mut setup_lines = Vec::new();
323    let final_args = if let Some(visitor_spec) = &fixture.visitor {
324        build_r_visitor(&mut setup_lines, visitor_spec);
325        // R rejects duplicated named arguments ("matched by multiple actual arguments"), so
326        // strip any existing `options = ...` arg before appending the visitor-options list.
327        // Handles `options = NULL` (when no default) and `options = ConversionOptions$default()`
328        // (when build_args_string emits a default placeholder for an optional options arg).
329        let base = strip_options_arg(&args_str);
330        let visitor_opts = "options = list(visitor = visitor)";
331        let trimmed = base.trim_matches([' ', ',']);
332        if trimmed.is_empty() {
333            visitor_opts.to_string()
334        } else {
335            format!("{trimmed}, {visitor_opts}")
336        }
337    } else {
338        args_str
339    };
340
341    if expects_error {
342        let _ = writeln!(out, "test_that(\"{test_name}: {description}\", {{");
343        for line in &setup_lines {
344            let _ = writeln!(out, "  {line}");
345        }
346        let _ = writeln!(out, "  expect_error({function_name}({final_args}))");
347        let _ = writeln!(out, "}})");
348        return;
349    }
350
351    let _ = writeln!(out, "test_that(\"{test_name}: {description}\", {{");
352    for line in &setup_lines {
353        let _ = writeln!(out, "  {line}");
354    }
355    // The extendr extraction wrappers return JSON strings carrying the
356    // serialized core result; parse into an R list so tests can use `$`
357    // accessors. `result_is_simple` calls (e.g. `convert_html_to_markdown`)
358    // already return scalar values and must be passed through verbatim.
359    // `result_is_r_list` signals the binding returns a native R list (Robj),
360    // not a JSON string — skip `jsonlite::fromJSON` but keep `$` accessors.
361    if result_is_simple || result_is_r_list {
362        let _ = writeln!(out, "  {result_var} <- {function_name}({final_args})");
363    } else {
364        let _ = writeln!(
365            out,
366            "  {result_var} <- jsonlite::fromJSON({function_name}({final_args}), simplifyVector = FALSE)"
367        );
368    }
369
370    for assertion in &fixture.assertions {
371        render_assertion(out, assertion, result_var, field_resolver, result_is_simple, e2e_config);
372    }
373
374    let _ = writeln!(out, "}})");
375}
376
/// Remove the named `options = …` argument (if any) from an R call-args string.
///
/// The string is split on top-level commas only: commas nested inside
/// `()`, `[]`, `{}` or inside single/double-quoted string literals do not
/// terminate an argument. Backslash escapes inside a string literal are
/// honoured, so an escaped quote such as `\"` does not end the literal.
///
/// Each argument is trimmed, arguments whose name is exactly `options`
/// (followed by optional whitespace and `=`) are dropped, and the rest are
/// re-joined with `", "`. Callers append a fresh `options = …` themselves.
fn strip_options_arg(args_str: &str) -> String {
    /// True when `part` is a named argument whose name is `options`.
    fn names_options(part: &str) -> bool {
        part.strip_prefix("options")
            .is_some_and(|rest| rest.trim_start().starts_with('='))
    }

    let mut parts: Vec<&str> = Vec::new();
    let mut start = 0usize;
    let mut depth: i32 = 0;
    // `Some(q)` while inside a string literal opened with quote char `q`.
    let mut quote: Option<char> = None;
    // True when the previous character inside a literal was an unescaped `\`.
    let mut escaped = false;

    for (idx, c) in args_str.char_indices() {
        if let Some(q) = quote {
            if escaped {
                // This character is consumed by the preceding backslash.
                escaped = false;
            } else if c == '\\' {
                escaped = true;
            } else if c == q {
                quote = None;
            }
            continue;
        }
        match c {
            '(' | '[' | '{' => depth += 1,
            ')' | ']' | '}' => depth -= 1,
            '\'' | '"' => quote = Some(c),
            ',' if depth == 0 => {
                parts.push(args_str[start..idx].trim());
                // `,` is a single byte, so the next argument starts at idx + 1.
                start = idx + 1;
            }
            _ => {}
        }
    }

    let tail = args_str[start..].trim();
    if !tail.is_empty() {
        parts.push(tail);
    }

    parts
        .into_iter()
        .filter(|part| !names_options(part))
        .collect::<Vec<_>>()
        .join(", ")
}
419
420fn build_args_string(
421    input: &serde_json::Value,
422    args: &[crate::config::ArgMapping],
423    arg_name_map: Option<&std::collections::HashMap<String, String>>,
424    options_type: Option<&str>,
425) -> String {
426    if args.is_empty() {
427        // No declared args means the wrapper takes zero parameters; emitting
428        // `list()` here would trigger an `unused argument (list())` error in R.
429        // Likewise, fall through to nothing if the fixture's input is empty.
430        if matches!(input, serde_json::Value::Null) || input.as_object().is_some_and(|m| m.is_empty()) {
431            return String::new();
432        }
433        return json_to_r(input, true);
434    }
435
436    let parts: Vec<String> = args
437        .iter()
438        .filter_map(|arg| {
439            // Apply per-language argument renames before emitting the call.
440            let arg_name: &str = arg_name_map
441                .and_then(|m| m.get(&arg.name).map(String::as_str))
442                .unwrap_or(&arg.name);
443
444            let field = arg.field.strip_prefix("input.").unwrap_or(&arg.field);
445            let val = input.get(field);
446            // R extendr-generated wrappers do not preserve Option<T> defaults from
447            // the Rust signature — every parameter is positional and required at
448            // the R level. To keep generated calls valid we must pass a placeholder
449            // (`NULL` for `Option<T>`, `ExtractionConfig$default()` for typed
450            // configs) whenever the fixture omits an optional value.
451            let val = match val {
452                Some(v) if !(v.is_null() && arg.optional) => v,
453                _ => {
454                    if !arg.optional {
455                        return None;
456                    }
457                    if arg.arg_type == "json_object" {
458                        let r_value = r_default_for_config_arg(arg_name, options_type);
459                        return Some(format!("{arg_name} = {r_value}"));
460                    }
461                    return Some(format!("{arg_name} = NULL"));
462                }
463            };
464            // The extendr bindings expect owned PORs (ExternalPtr) for typed
465            // config arguments — passing an R `list()` raises
466            // `Expected ExternalPtr got List`. The fixtures don't carry the
467            // option fields needed to round-trip through ExtractionConfig$new,
468            // so emit `ExtractionConfig$default()` whenever a `json_object` arg
469            // resolves to an empty / object-shaped JSON value.
470            if arg.arg_type == "json_object" && (val.is_null() || val.as_object().is_some_and(|m| m.is_empty())) {
471                let r_value = r_default_for_config_arg(arg_name, options_type);
472                return Some(format!("{arg_name} = {r_value}"));
473            }
474            // Non-empty json_object for typed config args (those whose default is a
475            // `$default()` constructor): use `TypeName$from_json(jsonlite::toJSON(...))`
476            // so the Rust function receives a proper ExternalPtr, not a list.
477            // For `options`-style args (default = NULL) emit as a plain R list.
478            if arg.arg_type == "json_object" && val.is_object() {
479                let default_expr = r_default_for_config_arg(arg_name, options_type);
480                if default_expr.ends_with("$default()") {
481                    // Extract the type name from "TypeName$default()"
482                    let type_name = default_expr.trim_end_matches("$default()");
483                    // Use the `I(...)` (AsIs) wrapper for array-valued fields so
484                    // `jsonlite::toJSON(..., auto_unbox = TRUE)` preserves them as
485                    // JSON arrays. Without this, single-element vectors get
486                    // unboxed to scalars (e.g. `c("foo")` → `"foo"`) and serde
487                    // rejects them when deserializing `Vec<T>` fields.
488                    let r_list = json_to_r_preserve_arrays(val, true);
489                    let r_value = format!("{type_name}$from_json(jsonlite::toJSON({r_list}, auto_unbox = TRUE))");
490                    return Some(format!("{arg_name} = {r_value}"));
491                }
492                let r_value = json_to_r(val, true);
493                return Some(format!("{arg_name} = {r_value}"));
494            }
495            // `json_object` arrays are passed to extendr functions whose Rust
496            // signature is `items: String` (JSON-serialized batch items). The
497            // wrapper has no R-list → JSON conversion, so we must serialize the
498            // fixture value to a literal JSON string at test-emit time.
499            //
500            // Exception: when `element_type = "String"` the Rust signature is
501            // `Vec<String>` (e.g. `embed_texts(texts: Vec<String>, ...)`), which
502            // extendr binds as a native R character vector. Passing a JSON
503            // literal there would land as a single-element character vector
504            // containing the literal bytes `["a","b"]`, which is not what the
505            // caller intended. Emit a plain `c("a","b")` literal instead.
506            if arg.arg_type == "json_object" && val.is_array() {
507                if arg.element_type.as_deref() == Some("String") {
508                    let r_value = json_to_r(val, false);
509                    return Some(format!("{arg_name} = {r_value}"));
510                }
511                let json_literal = serde_json::to_string(val).unwrap_or_else(|_| "[]".to_string());
512                let escaped = escape_r(&json_literal);
513                return Some(format!("{arg_name} = \"{escaped}\""));
514            }
515            // `bytes` arg type: convert string fixture values into runtime
516            // `readBin(...)` calls so the wrapper receives raw bytes instead
517            // of an R character vector. This mirrors the Python emit_bytes_arg
518            // helper and is what the extendr binding for Vec<u8> expects.
519            if arg.arg_type == "bytes" {
520                if let Some(raw) = val.as_str() {
521                    let r_value = render_bytes_value(raw);
522                    return Some(format!("{arg_name} = {r_value}"));
523                }
524            }
525            // `file_path` arg type: fixtures encode relative paths that resolve
526            // against the repo's `test_documents/` directory. Using a runtime
527            // helper that anchors paths to that directory avoids fragility from
528            // testthat resetting the working directory between files.
529            if arg.arg_type == "file_path" {
530                if let Some(raw) = val.as_str() {
531                    if !raw.starts_with('/') && !raw.is_empty() {
532                        let escaped = escape_r(raw);
533                        return Some(format!("{arg_name} = .resolve_fixture(\"{escaped}\")"));
534                    }
535                }
536            }
537            Some(format!("{arg_name} = {}", json_to_r(val, true)))
538        })
539        .collect();
540
541    parts.join(", ")
542}
543
544/// Render a `bytes` fixture value as the R expression that produces a raw
545/// vector at test time. Mirrors python's `emit_bytes_arg` classifier so we can
546/// support both file-path style fixtures (`"pdf/fake_memo.pdf"`) and inline
547/// text payloads (`"<html>..."`). The resulting expression is dropped directly
548/// into the call site, e.g. `content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), ...)`.
549fn render_bytes_value(raw: &str) -> String {
550    if raw.starts_with('<') || raw.starts_with('{') || raw.starts_with('[') || raw.contains(' ') {
551        // Inline text payload — encode to raw via charToRaw.
552        let escaped = escape_r(raw);
553        return format!("charToRaw(\"{escaped}\")");
554    }
555    let first = raw.chars().next().unwrap_or('\0');
556    if first.is_ascii_alphanumeric() || first == '_' {
557        if let Some(slash) = raw.find('/') {
558            if slash > 0 {
559                let after = &raw[slash + 1..];
560                if after.contains('.') && !after.is_empty() {
561                    let escaped = escape_r(raw);
562                    return format!(
563                        "readBin(.resolve_fixture(\"{escaped}\"), what = \"raw\", n = file.info(.resolve_fixture(\"{escaped}\"))$size)"
564                    );
565                }
566            }
567        }
568    }
569    // Default to inline text encoding — matches Python's InlineText branch.
570    let escaped = escape_r(raw);
571    format!("charToRaw(\"{escaped}\")")
572}
573
/// Choose the R expression used as the default value of a config argument.
///
/// An explicit `options_type` always wins and yields `<options_type>$default()`.
/// Otherwise the argument name selects a known `*Config$default()`
/// constructor; `options` maps to `NULL`, and any unrecognised name falls
/// back to `list()`.
fn r_default_for_config_arg(arg_name: &str, options_type: Option<&str>) -> String {
    // Type whose `$default()` constructor should be called, if any.
    let constructor_type = options_type.or(match arg_name {
        "config" => Some("ExtractionConfig"),
        "html_output" => Some("HtmlOutputConfig"),
        "chunking" => Some("ChunkingConfig"),
        "ocr" => Some("OcrConfig"),
        "image" | "images" => Some("ImageExtractionConfig"),
        "language_detection" => Some("LanguageDetectionConfig"),
        _ => None,
    });

    match constructor_type {
        Some(type_name) => format!("{type_name}$default()"),
        None if arg_name == "options" => "NULL".to_string(),
        None => "list()".to_string(),
    }
}
597
598fn render_assertion(
599    out: &mut String,
600    assertion: &Assertion,
601    result_var: &str,
602    field_resolver: &FieldResolver,
603    result_is_simple: bool,
604    _e2e_config: &E2eConfig,
605) {
606    // Handle synthetic / derived fields before the is_valid_for_result check
607    // so they are never treated as struct attribute accesses on the result.
608    if let Some(f) = &assertion.field {
609        match f.as_str() {
610            "chunks_have_content" => {
611                let pred = format!("all(sapply({result_var}$chunks %||% list(), function(c) nchar(c$content) > 0))");
612                match assertion.assertion_type.as_str() {
613                    "is_true" => {
614                        let _ = writeln!(out, "  expect_true({pred})");
615                    }
616                    "is_false" => {
617                        let _ = writeln!(out, "  expect_false({pred})");
618                    }
619                    _ => {
620                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
621                    }
622                }
623                return;
624            }
625            "chunks_have_embeddings" => {
626                let pred = format!(
627                    "all(sapply({result_var}$chunks %||% list(), function(c) !is.null(c$embedding) && length(c$embedding) > 0))"
628                );
629                match assertion.assertion_type.as_str() {
630                    "is_true" => {
631                        let _ = writeln!(out, "  expect_true({pred})");
632                    }
633                    "is_false" => {
634                        let _ = writeln!(out, "  expect_false({pred})");
635                    }
636                    _ => {
637                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
638                    }
639                }
640                return;
641            }
642            // ---- EmbedResponse virtual fields ----
643            // The extendr binding cannot return `Vec<Vec<f32>>` directly (extendr's
644            // Robj conversion has no impl for nested numeric vectors), so the
645            // wrapper serializes the result to a JSON string at the FFI boundary.
646            // Parse it on demand here so length/index assertions operate on the
647            // matrix structure rather than on the single string scalar.
648            "embeddings" => {
649                let parsed = format!(
650                    "(if (is.character({result_var}) && length({result_var}) == 1) jsonlite::fromJSON({result_var}, simplifyVector = FALSE) else {result_var})"
651                );
652                match assertion.assertion_type.as_str() {
653                    "count_equals" => {
654                        if let Some(val) = &assertion.value {
655                            let r_val = json_to_r(val, false);
656                            let _ = writeln!(out, "  expect_equal(length({parsed}), {r_val})");
657                        }
658                    }
659                    "count_min" => {
660                        if let Some(val) = &assertion.value {
661                            let r_val = json_to_r(val, false);
662                            let _ = writeln!(out, "  expect_gte(length({parsed}), {r_val})");
663                        }
664                    }
665                    "not_empty" => {
666                        let _ = writeln!(out, "  expect_gt(length({parsed}), 0)");
667                    }
668                    "is_empty" => {
669                        let _ = writeln!(out, "  expect_equal(length({parsed}), 0)");
670                    }
671                    _ => {
672                        let _ = writeln!(
673                            out,
674                            "  # skipped: unsupported assertion type on synthetic field 'embeddings'"
675                        );
676                    }
677                }
678                return;
679            }
680            "embedding_dimensions" => {
681                let expr = format!("(if (length({result_var}) == 0) 0L else length({result_var}[[1]]))");
682                match assertion.assertion_type.as_str() {
683                    "equals" => {
684                        if let Some(val) = &assertion.value {
685                            let r_val = json_to_r(val, false);
686                            let _ = writeln!(out, "  expect_equal({expr}, {r_val})");
687                        }
688                    }
689                    "greater_than" => {
690                        if let Some(val) = &assertion.value {
691                            let r_val = json_to_r(val, false);
692                            let _ = writeln!(out, "  expect_gt({expr}, {r_val})");
693                        }
694                    }
695                    _ => {
696                        let _ = writeln!(
697                            out,
698                            "  # skipped: unsupported assertion type on synthetic field 'embedding_dimensions'"
699                        );
700                    }
701                }
702                return;
703            }
704            "embeddings_valid" | "embeddings_finite" | "embeddings_non_zero" | "embeddings_normalized" => {
705                let pred = match f.as_str() {
706                    "embeddings_valid" => {
707                        format!("all(sapply({result_var}, function(e) length(e) > 0))")
708                    }
709                    "embeddings_finite" => {
710                        format!("all(sapply({result_var}, function(e) all(is.finite(e))))")
711                    }
712                    "embeddings_non_zero" => {
713                        format!("all(sapply({result_var}, function(e) any(e != 0.0)))")
714                    }
715                    "embeddings_normalized" => {
716                        format!("all(sapply({result_var}, function(e) abs(sum(e * e) - 1.0) < 1e-3))")
717                    }
718                    _ => unreachable!(),
719                };
720                match assertion.assertion_type.as_str() {
721                    "is_true" => {
722                        let _ = writeln!(out, "  expect_true({pred})");
723                    }
724                    "is_false" => {
725                        let _ = writeln!(out, "  expect_false({pred})");
726                    }
727                    _ => {
728                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
729                    }
730                }
731                return;
732            }
733            // ---- keywords / keywords_count ----
734            // R ExtractionResult does not expose extracted_keywords; skip.
735            "keywords" | "keywords_count" => {
736                let _ = writeln!(out, "  # skipped: field '{f}' not available on R ExtractionResult");
737                return;
738            }
739            _ => {}
740        }
741    }
742
743    // Skip assertions on fields that don't exist on the result type.
744    if let Some(f) = &assertion.field {
745        if !f.is_empty() && !field_resolver.is_valid_for_result(f) {
746            let _ = writeln!(out, "  # skipped: field '{f}' not available on result type");
747            return;
748        }
749    }
750
751    // When result_is_simple, skip assertions that reference non-content fields
752    // (e.g., metadata, document, structure) since the binding returns a plain value.
753    if result_is_simple {
754        if let Some(f) = &assertion.field {
755            let f_lower = f.to_lowercase();
756            if !f.is_empty()
757                && f_lower != "content"
758                && (f_lower.starts_with("metadata")
759                    || f_lower.starts_with("document")
760                    || f_lower.starts_with("structure"))
761            {
762                let _ = writeln!(
763                    out,
764                    "  # skipped: result_is_simple for field '{f}' not available on result type"
765                );
766                return;
767            }
768        }
769    }
770
771    let field_expr = if result_is_simple {
772        result_var.to_string()
773    } else {
774        match &assertion.field {
775            Some(f) if !f.is_empty() => field_resolver.accessor(f, "r", result_var),
776            _ => result_var.to_string(),
777        }
778    };
779
780    match assertion.assertion_type.as_str() {
781        "equals" => {
782            if let Some(expected) = &assertion.value {
783                let r_val = json_to_r(expected, false);
784                let _ = writeln!(out, "  expect_equal(trimws({field_expr}), {r_val})");
785            }
786        }
787        "contains" => {
788            if let Some(expected) = &assertion.value {
789                let r_val = json_to_r(expected, false);
790                let _ = writeln!(out, "  expect_true(grepl({r_val}, {field_expr}, fixed = TRUE))");
791            }
792        }
793        "contains_all" => {
794            if let Some(values) = &assertion.values {
795                for val in values {
796                    let r_val = json_to_r(val, false);
797                    let _ = writeln!(out, "  expect_true(any(grepl({r_val}, {field_expr}, fixed = TRUE)))");
798                }
799            }
800        }
801        "not_contains" => {
802            if let Some(expected) = &assertion.value {
803                let r_val = json_to_r(expected, false);
804                let _ = writeln!(out, "  expect_false(grepl({r_val}, {field_expr}, fixed = TRUE))");
805            }
806        }
807        "not_empty" => {
808            // Multi-element character vectors (e.g. `list_embedding_presets`)
809            // would otherwise evaluate `nchar(x) > 0` element-wise and fail
810            // `expect_true`'s scalar-logical contract. Reduce with `any()` so
811            // the predicate stays a single TRUE/FALSE regardless of length,
812            // and treat zero-length vectors as empty.
813            let _ = writeln!(
814                out,
815                "  expect_true(if (is.character({field_expr})) length({field_expr}) > 0 && any(nchar({field_expr}) > 0) else length({field_expr}) > 0)"
816            );
817        }
818        "is_empty" => {
819            // Rust `Option<String>::None` surfaces as `NA_character_` through
820            // extendr, and `Vec<...>` empties as a zero-length vector. Treat
821            // NULL, NA, "", and zero-length collections as "empty" so the same
822            // assertion works for scalar Option returns (`get_embedding_preset`)
823            // and collection returns alike.
824            let _ = writeln!(
825                out,
826                "  expect_true(is.null({field_expr}) || length({field_expr}) == 0 || (length({field_expr}) == 1 && (is.na({field_expr}) || identical({field_expr}, \"\"))))"
827            );
828        }
829        "contains_any" => {
830            if let Some(values) = &assertion.values {
831                let items: Vec<String> = values.iter().map(|v| json_to_r(v, false)).collect();
832                let vec_str = items.join(", ");
833                let _ = writeln!(
834                    out,
835                    "  expect_true(any(sapply(c({vec_str}), function(v) grepl(v, {field_expr}, fixed = TRUE))))"
836                );
837            }
838        }
839        "greater_than" => {
840            if let Some(val) = &assertion.value {
841                let r_val = json_to_r(val, false);
842                let _ = writeln!(out, "  expect_true({field_expr} > {r_val})");
843            }
844        }
845        "less_than" => {
846            if let Some(val) = &assertion.value {
847                let r_val = json_to_r(val, false);
848                let _ = writeln!(out, "  expect_true({field_expr} < {r_val})");
849            }
850        }
851        "greater_than_or_equal" => {
852            if let Some(val) = &assertion.value {
853                let r_val = json_to_r(val, false);
854                let _ = writeln!(out, "  expect_true({field_expr} >= {r_val})");
855            }
856        }
857        "less_than_or_equal" => {
858            if let Some(val) = &assertion.value {
859                let r_val = json_to_r(val, false);
860                let _ = writeln!(out, "  expect_true({field_expr} <= {r_val})");
861            }
862        }
863        "starts_with" => {
864            if let Some(expected) = &assertion.value {
865                let r_val = json_to_r(expected, false);
866                let _ = writeln!(out, "  expect_true(startsWith({field_expr}, {r_val}))");
867            }
868        }
869        "ends_with" => {
870            if let Some(expected) = &assertion.value {
871                let r_val = json_to_r(expected, false);
872                let _ = writeln!(out, "  expect_true(endsWith({field_expr}, {r_val}))");
873            }
874        }
875        "min_length" => {
876            if let Some(val) = &assertion.value {
877                if let Some(n) = val.as_u64() {
878                    let _ = writeln!(out, "  expect_true(nchar({field_expr}) >= {n})");
879                }
880            }
881        }
882        "max_length" => {
883            if let Some(val) = &assertion.value {
884                if let Some(n) = val.as_u64() {
885                    let _ = writeln!(out, "  expect_true(nchar({field_expr}) <= {n})");
886                }
887            }
888        }
889        "count_min" => {
890            if let Some(val) = &assertion.value {
891                if let Some(n) = val.as_u64() {
892                    let _ = writeln!(out, "  expect_true(length({field_expr}) >= {n})");
893                }
894            }
895        }
896        "count_equals" => {
897            if let Some(val) = &assertion.value {
898                if let Some(n) = val.as_u64() {
899                    let _ = writeln!(out, "  expect_equal(length({field_expr}), {n})");
900                }
901            }
902        }
903        "is_true" => {
904            let _ = writeln!(out, "  expect_true({field_expr})");
905        }
906        "is_false" => {
907            let _ = writeln!(out, "  expect_false({field_expr})");
908        }
909        "method_result" => {
910            if let Some(method_name) = &assertion.method {
911                let call_expr = build_r_method_call(result_var, method_name, assertion.args.as_ref());
912                let check = assertion.check.as_deref().unwrap_or("is_true");
913                match check {
914                    "equals" => {
915                        if let Some(val) = &assertion.value {
916                            if val.is_boolean() {
917                                if val.as_bool() == Some(true) {
918                                    let _ = writeln!(out, "  expect_true({call_expr})");
919                                } else {
920                                    let _ = writeln!(out, "  expect_false({call_expr})");
921                                }
922                            } else {
923                                let r_val = json_to_r(val, false);
924                                let _ = writeln!(out, "  expect_equal({call_expr}, {r_val})");
925                            }
926                        }
927                    }
928                    "is_true" => {
929                        let _ = writeln!(out, "  expect_true({call_expr})");
930                    }
931                    "is_false" => {
932                        let _ = writeln!(out, "  expect_false({call_expr})");
933                    }
934                    "greater_than_or_equal" => {
935                        if let Some(val) = &assertion.value {
936                            let r_val = json_to_r(val, false);
937                            let _ = writeln!(out, "  expect_true({call_expr} >= {r_val})");
938                        }
939                    }
940                    "count_min" => {
941                        if let Some(val) = &assertion.value {
942                            let n = val.as_u64().unwrap_or(0);
943                            let _ = writeln!(out, "  expect_true(length({call_expr}) >= {n})");
944                        }
945                    }
946                    "is_error" => {
947                        let _ = writeln!(out, "  expect_error({call_expr})");
948                    }
949                    "contains" => {
950                        if let Some(val) = &assertion.value {
951                            let r_val = json_to_r(val, false);
952                            let _ = writeln!(out, "  expect_true(grepl({r_val}, {call_expr}, fixed = TRUE))");
953                        }
954                    }
955                    other_check => {
956                        panic!("R e2e generator: unsupported method_result check type: {other_check}");
957                    }
958                }
959            } else {
960                panic!("R e2e generator: method_result assertion missing 'method' field");
961            }
962        }
963        "matches_regex" => {
964            if let Some(expected) = &assertion.value {
965                let r_val = json_to_r(expected, false);
966                let _ = writeln!(out, "  expect_true(grepl({r_val}, {field_expr}))");
967            }
968        }
969        "not_error" => {
970            // The call itself stops the test on error; emit an explicit
971            // `expect_true(TRUE)` so testthat doesn't report the test as
972            // empty when this is the only assertion.
973            let _ = writeln!(out, "  expect_true(TRUE)");
974        }
975        "error" => {
976            // Handled at the test level.
977        }
978        other => {
979            panic!("R e2e generator: unsupported assertion type: {other}");
980        }
981    }
982}
983
/// Convert a PascalCase string to snake_case.
///
/// An underscore is inserted in front of every uppercase character except
/// the very first character of the input, and every character is replaced
/// by its lowercase form. Runs of capitals are therefore split letter by
/// letter.
///
/// e.g. "DoubleEqual" → "double_equal", "Backticks" → "backticks",
/// "HTMLParser" → "h_t_m_l_parser"
fn pascal_to_snake_case(s: &str) -> String {
    // A little headroom for the underscores that get inserted.
    let mut snake = String::with_capacity(s.len() + 4);
    let mut at_start = true;
    for ch in s.chars() {
        if !at_start && ch.is_uppercase() {
            snake.push('_');
        }
        at_start = false;
        // `to_lowercase` may expand to more than one char for some scripts.
        snake.extend(ch.to_lowercase());
    }
    snake
}
1004
1005/// Convert a JSON value to an R expression suitable for embedding inside a
1006/// `list(...)` that will be passed to `jsonlite::toJSON(..., auto_unbox = TRUE)`.
1007///
1008/// Differs from [`json_to_r`] in that any array-valued field is wrapped with
1009/// `I(...)` (jsonlite's `AsIs` marker) so it remains a JSON array after the
1010/// `auto_unbox` transform. Empty arrays become `I(list())` (→ `[]`) and
1011/// non-empty arrays become `I(c(...))` (→ `[..]`). Without this wrapping,
1012/// `Vec<String>` fields like `exclude_selectors` get unboxed to scalars and
1013/// serde deserialization on the Rust side fails with
1014/// `invalid type: string "foo", expected a sequence`.
1015fn json_to_r_preserve_arrays(value: &serde_json::Value, lowercase_enum_values: bool) -> String {
1016    match value {
1017        serde_json::Value::Array(arr) => {
1018            if arr.is_empty() {
1019                "I(list())".to_string()
1020            } else {
1021                let items: Vec<String> = arr.iter().map(|v| json_to_r(v, lowercase_enum_values)).collect();
1022                format!("I(c({}))", items.join(", "))
1023            }
1024        }
1025        serde_json::Value::Object(map) => {
1026            let entries: Vec<String> = map
1027                .iter()
1028                .map(|(k, v)| {
1029                    format!(
1030                        "\"{}\" = {}",
1031                        escape_r(k),
1032                        json_to_r_preserve_arrays(v, lowercase_enum_values)
1033                    )
1034                })
1035                .collect();
1036            format!("list({})", entries.join(", "))
1037        }
1038        _ => json_to_r(value, lowercase_enum_values),
1039    }
1040}
1041
1042/// * `lowercase_enum_values` - If true, convert PascalCase strings to snake_case (for enum values).
1043///   If false, preserve original case (for assertion expected values).
1044fn json_to_r(value: &serde_json::Value, lowercase_enum_values: bool) -> String {
1045    match value {
1046        serde_json::Value::String(s) => {
1047            // Convert PascalCase enum values to snake_case only if requested.
1048            // e.g. "Backticks" → "backticks", "DoubleEqual" → "double_equal"
1049            let normalized = if lowercase_enum_values && s.chars().next().is_some_and(|c| c.is_uppercase()) {
1050                pascal_to_snake_case(s)
1051            } else {
1052                s.clone()
1053            };
1054            format!("\"{}\"", escape_r(&normalized))
1055        }
1056        serde_json::Value::Bool(true) => "TRUE".to_string(),
1057        serde_json::Value::Bool(false) => "FALSE".to_string(),
1058        serde_json::Value::Number(n) => n.to_string(),
1059        serde_json::Value::Null => "NULL".to_string(),
1060        serde_json::Value::Array(arr) => {
1061            let items: Vec<String> = arr.iter().map(|v| json_to_r(v, lowercase_enum_values)).collect();
1062            format!("c({})", items.join(", "))
1063        }
1064        serde_json::Value::Object(map) => {
1065            let entries: Vec<String> = map
1066                .iter()
1067                .map(|(k, v)| format!("\"{}\" = {}", escape_r(k), json_to_r(v, lowercase_enum_values)))
1068                .collect();
1069            format!("list({})", entries.join(", "))
1070        }
1071    }
1072}
1073
1074/// Build an R visitor list and add setup line.
1075fn build_r_visitor(setup_lines: &mut Vec<String>, visitor_spec: &crate::fixture::VisitorSpec) {
1076    use std::fmt::Write as FmtWrite;
1077    // Collect each callback as a separate string, then join with ",\n" to avoid
1078    // trailing commas — R's list() does not accept a trailing comma.
1079    let methods: Vec<String> = visitor_spec
1080        .callbacks
1081        .iter()
1082        .map(|(method_name, action)| {
1083            let mut buf = String::new();
1084            emit_r_visitor_method(&mut buf, method_name, action);
1085            // strip the trailing ",\n" added by emit_r_visitor_method
1086            buf.trim_end_matches(['\n', ',']).to_string()
1087        })
1088        .collect();
1089    let mut visitor_obj = String::new();
1090    let _ = writeln!(visitor_obj, "list(");
1091    let _ = write!(visitor_obj, "{}", methods.join(",\n"));
1092    let _ = writeln!(visitor_obj);
1093    let _ = writeln!(visitor_obj, "  )");
1094
1095    setup_lines.push(format!("visitor <- {visitor_obj}"));
1096}
1097
1098/// Build an R call expression for a `method_result` assertion.
1099/// Maps method names to the appropriate R function or method calls.
1100fn build_r_method_call(result_var: &str, method_name: &str, args: Option<&serde_json::Value>) -> String {
1101    match method_name {
1102        "root_child_count" => format!("{result_var}$root_child_count()"),
1103        "root_node_type" => format!("{result_var}$root_node_type()"),
1104        "named_children_count" => format!("{result_var}$named_children_count()"),
1105        "has_error_nodes" => format!("tree_has_error_nodes({result_var})"),
1106        "error_count" | "tree_error_count" => format!("tree_error_count({result_var})"),
1107        "tree_to_sexp" => format!("tree_to_sexp({result_var})"),
1108        "contains_node_type" => {
1109            let node_type = args
1110                .and_then(|a| a.get("node_type"))
1111                .and_then(|v| v.as_str())
1112                .unwrap_or("");
1113            format!("tree_contains_node_type({result_var}, \"{node_type}\")")
1114        }
1115        "find_nodes_by_type" => {
1116            let node_type = args
1117                .and_then(|a| a.get("node_type"))
1118                .and_then(|v| v.as_str())
1119                .unwrap_or("");
1120            format!("find_nodes_by_type({result_var}, \"{node_type}\")")
1121        }
1122        "run_query" => {
1123            let query_source = args
1124                .and_then(|a| a.get("query_source"))
1125                .and_then(|v| v.as_str())
1126                .unwrap_or("");
1127            let language = args
1128                .and_then(|a| a.get("language"))
1129                .and_then(|v| v.as_str())
1130                .unwrap_or("");
1131            format!("run_query({result_var}, \"{language}\", \"{query_source}\", source)")
1132        }
1133        _ => {
1134            if let Some(args_val) = args {
1135                let arg_str = args_val
1136                    .as_object()
1137                    .map(|obj| {
1138                        obj.iter()
1139                            .map(|(k, v)| {
1140                                let r_val = json_to_r(v, false);
1141                                format!("{k} = {r_val}")
1142                            })
1143                            .collect::<Vec<_>>()
1144                            .join(", ")
1145                    })
1146                    .unwrap_or_default();
1147                format!("{result_var}${method_name}({arg_str})")
1148            } else {
1149                format!("{result_var}${method_name}()")
1150            }
1151        }
1152    }
1153}
1154
1155/// Emit an R visitor method for a callback action.
1156fn emit_r_visitor_method(out: &mut String, method_name: &str, action: &CallbackAction) {
1157    use std::fmt::Write as FmtWrite;
1158
1159    // R uses visit_ prefix (matches binding signature)
1160    let params = match method_name {
1161        "visit_link" => "ctx, href, text, title",
1162        "visit_image" => "ctx, src, alt, title",
1163        "visit_heading" => "ctx, level, text, id",
1164        "visit_code_block" => "ctx, lang, code",
1165        "visit_code_inline"
1166        | "visit_strong"
1167        | "visit_emphasis"
1168        | "visit_strikethrough"
1169        | "visit_underline"
1170        | "visit_subscript"
1171        | "visit_superscript"
1172        | "visit_mark"
1173        | "visit_button"
1174        | "visit_summary"
1175        | "visit_figcaption"
1176        | "visit_definition_term"
1177        | "visit_definition_description" => "ctx, text",
1178        "visit_text" => "ctx, text",
1179        "visit_list_item" => "ctx, ordered, marker, text",
1180        "visit_blockquote" => "ctx, content, depth",
1181        "visit_table_row" => "ctx, cells, is_header",
1182        "visit_custom_element" => "ctx, tag_name, html",
1183        "visit_form" => "ctx, action_url, method",
1184        "visit_input" => "ctx, input_type, name, value",
1185        "visit_audio" | "visit_video" | "visit_iframe" => "ctx, src",
1186        "visit_details" => "ctx, open",
1187        "visit_element_end" | "visit_table_end" | "visit_definition_list_end" | "visit_figure_end" => "ctx, output",
1188        "visit_list_start" => "ctx, ordered",
1189        "visit_list_end" => "ctx, ordered, output",
1190        _ => "ctx",
1191    };
1192
1193    let _ = writeln!(out, "    {method_name} = function({params}) {{");
1194    match action {
1195        CallbackAction::Skip => {
1196            let _ = writeln!(out, "      \"skip\"");
1197        }
1198        CallbackAction::Continue => {
1199            let _ = writeln!(out, "      \"continue\"");
1200        }
1201        CallbackAction::PreserveHtml => {
1202            let _ = writeln!(out, "      \"preserve_html\"");
1203        }
1204        CallbackAction::Custom { output } => {
1205            let escaped = escape_r(output);
1206            let _ = writeln!(out, "      list(custom = \"{escaped}\")");
1207        }
1208        CallbackAction::CustomTemplate { template, return_form } => {
1209            let r_expr = r_template_to_paste0(template);
1210            match return_form {
1211                TemplateReturnForm::BareString => {
1212                    let _ = writeln!(out, "      {r_expr}");
1213                }
1214                TemplateReturnForm::Dict => {
1215                    let _ = writeln!(out, "      list(custom = {r_expr})");
1216                }
1217            }
1218        }
1219    }
1220    let _ = writeln!(out, "    }},");
1221}