Skip to main content

alef_e2e/codegen/
r.rs

1//! R e2e test generator using testthat.
2
3use crate::config::E2eConfig;
4use crate::escape::{escape_r, r_template_to_paste0, sanitize_filename, sanitize_ident};
5use crate::field_access::FieldResolver;
6use crate::fixture::{Assertion, CallbackAction, Fixture, FixtureGroup, TemplateReturnForm};
7use alef_core::backend::GeneratedFile;
8use alef_core::config::ResolvedCrateConfig;
9use alef_core::hash::{self, CommentStyle};
10use anyhow::Result;
11use std::fmt::Write as FmtWrite;
12use std::path::PathBuf;
13
14use super::E2eCodegen;
15
16/// R e2e code generator.
17pub struct RCodegen;
18
19impl E2eCodegen for RCodegen {
20    fn generate(
21        &self,
22        groups: &[FixtureGroup],
23        e2e_config: &E2eConfig,
24        config: &ResolvedCrateConfig,
25        _type_defs: &[alef_core::ir::TypeDef],
26    ) -> Result<Vec<GeneratedFile>> {
27        let lang = self.language_name();
28        let output_base = PathBuf::from(e2e_config.effective_output()).join(lang);
29
30        let mut files = Vec::new();
31
32        // Resolve call config with overrides.
33        let call = &e2e_config.call;
34        let overrides = call.overrides.get(lang);
35        let module_path = overrides
36            .and_then(|o| o.module.as_ref())
37            .cloned()
38            .unwrap_or_else(|| call.module.clone());
39        let _function_name = overrides
40            .and_then(|o| o.function.as_ref())
41            .cloned()
42            .unwrap_or_else(|| call.function.clone());
43        let result_is_simple = call.result_is_simple || overrides.is_some_and(|o| o.result_is_simple);
44        let result_is_r_list = overrides.is_some_and(|o| o.result_is_r_list);
45        let _result_var = &call.result_var;
46
47        // Resolve package config.
48        let r_pkg = e2e_config.resolve_package("r");
49        let pkg_name = r_pkg
50            .as_ref()
51            .and_then(|p| p.name.as_ref())
52            .cloned()
53            .unwrap_or_else(|| module_path.clone());
54        let pkg_path = r_pkg
55            .as_ref()
56            .and_then(|p| p.path.as_ref())
57            .cloned()
58            .unwrap_or_else(|| "../../packages/r".to_string());
59        let pkg_version = r_pkg
60            .as_ref()
61            .and_then(|p| p.version.as_ref())
62            .cloned()
63            .or_else(|| config.resolved_version())
64            .unwrap_or_else(|| "0.1.0".to_string());
65
66        // Generate DESCRIPTION file.
67        files.push(GeneratedFile {
68            path: output_base.join("DESCRIPTION"),
69            content: render_description(&pkg_name, &pkg_version, e2e_config.dep_mode),
70            generated_header: false,
71        });
72
73        // Generate test runner script.
74        files.push(GeneratedFile {
75            path: output_base.join("run_tests.R"),
76            content: render_test_runner(&pkg_path, e2e_config.dep_mode),
77            generated_header: true,
78        });
79
80        // setup-fixtures.R — testthat sources `setup-*.R` files in the tests
81        // directory once before any tests run, with the working directory set
82        // to the tests/ folder. We use this hook to chdir into the repo's
83        // shared `test_documents/` directory so that fixture paths like
84        // `pdf/fake_memo.pdf` resolve at extraction time.
85        files.push(GeneratedFile {
86            path: output_base.join("tests").join("setup-fixtures.R"),
87            content: render_setup_fixtures(&e2e_config.test_documents_relative_from(1)),
88            generated_header: true,
89        });
90
91        // Generate test files per category.
92        for group in groups {
93            let active: Vec<&Fixture> = group
94                .fixtures
95                .iter()
96                .filter(|f| super::should_include_fixture(f, lang, e2e_config))
97                .collect();
98
99            if active.is_empty() {
100                continue;
101            }
102
103            let filename = format!("test_{}.R", sanitize_filename(&group.category));
104            let field_resolver = FieldResolver::new(
105                &e2e_config.fields,
106                &e2e_config.fields_optional,
107                &e2e_config.result_fields,
108                &e2e_config.fields_array,
109                &std::collections::HashSet::new(),
110            );
111            let content = render_test_file(
112                &group.category,
113                &active,
114                &field_resolver,
115                result_is_simple,
116                result_is_r_list,
117                e2e_config,
118            );
119            files.push(GeneratedFile {
120                path: output_base.join("tests").join(filename),
121                content,
122                generated_header: true,
123            });
124        }
125
126        Ok(files)
127    }
128
129    fn language_name(&self) -> &'static str {
130        "r"
131    }
132}
133
134fn render_description(pkg_name: &str, pkg_version: &str, dep_mode: crate::config::DependencyMode) -> String {
135    let dep_line = match dep_mode {
136        crate::config::DependencyMode::Registry => {
137            format!("Imports: {pkg_name} ({pkg_version})\n")
138        }
139        crate::config::DependencyMode::Local => String::new(),
140    };
141    format!(
142        r#"Package: e2e.r
143Title: E2E Tests for {pkg_name}
144Version: 0.1.0
145Description: End-to-end test suite.
146{dep_line}Suggests: testthat (>= 3.0.0)
147Config/testthat/edition: 3
148"#
149    )
150}
151
152fn render_setup_fixtures(test_documents_path: &str) -> String {
153    let mut out = String::new();
154    out.push_str(&hash::header(CommentStyle::Hash));
155    let _ = writeln!(out);
156    let _ = writeln!(
157        out,
158        "# Resolve fixture paths against the repo's `test_documents/` directory."
159    );
160    let _ = writeln!(
161        out,
162        "# testthat sources setup-*.R with the working directory at tests/,"
163    );
164    let _ = writeln!(
165        out,
166        "# so test_documents lives three directories up: tests/ -> e2e/r/ -> e2e/ -> repo root."
167    );
168    let _ = writeln!(
169        out,
170        "# Each `test_that()` block has its working directory reset back to tests/, so"
171    );
172    let _ = writeln!(
173        out,
174        "# fixture lookups must be performed via this helper rather than relying on `setwd`."
175    );
176    let _ = writeln!(
177        out,
178        ".alef_test_documents <- normalizePath(\"{test_documents_path}\", mustWork = FALSE)"
179    );
180    let _ = writeln!(out, ".resolve_fixture <- function(path) {{");
181    let _ = writeln!(out, "  if (dir.exists(.alef_test_documents)) {{");
182    let _ = writeln!(out, "    file.path(.alef_test_documents, path)");
183    let _ = writeln!(out, "  }} else {{");
184    let _ = writeln!(out, "    path");
185    let _ = writeln!(out, "  }}");
186    let _ = writeln!(out, "}}");
187    out
188}
189
190fn render_test_runner(pkg_path: &str, dep_mode: crate::config::DependencyMode) -> String {
191    let mut out = String::new();
192    out.push_str(&hash::header(CommentStyle::Hash));
193    let _ = writeln!(out, "library(testthat)");
194    match dep_mode {
195        crate::config::DependencyMode::Registry => {
196            // In registry mode, require the installed CRAN package directly.
197            let _ = writeln!(out, "# Package loaded via library() from CRAN install.");
198        }
199        crate::config::DependencyMode::Local => {
200            // Use devtools::load_all() to load the local R package without requiring
201            // a full install, matching the e2e test runner convention.
202            let _ = writeln!(out, "devtools::load_all(\"{pkg_path}\")");
203        }
204    }
205    let _ = writeln!(out);
206    // Surface every failure rather than aborting at the default max_fails=10 —
207    // partial pass counts are essential for triage during e2e bring-up.
208    let _ = writeln!(out, "testthat::set_max_fails(Inf)");
209    // Resolve the tests/ directory relative to this script. testthat reads
210    // setup-*.R from there before each file runs, where path resolution
211    // against test_documents/ is handled by the `.resolve_fixture` helper.
212    let _ = writeln!(
213        out,
214        ".script_dir <- tryCatch(dirname(normalizePath(sys.frame(1)$ofile)), error = function(e) getwd())"
215    );
216    let _ = writeln!(out, "test_dir(file.path(.script_dir, \"tests\"))");
217    out
218}
219
220fn render_test_file(
221    category: &str,
222    fixtures: &[&Fixture],
223    field_resolver: &FieldResolver,
224    result_is_simple: bool,
225    result_is_r_list: bool,
226    e2e_config: &E2eConfig,
227) -> String {
228    let mut out = String::new();
229    out.push_str(&hash::header(CommentStyle::Hash));
230    let _ = writeln!(out, "# E2e tests for category: {category}");
231    let _ = writeln!(out);
232
233    for (i, fixture) in fixtures.iter().enumerate() {
234        render_test_case(
235            &mut out,
236            fixture,
237            e2e_config,
238            field_resolver,
239            result_is_simple,
240            result_is_r_list,
241        );
242        if i + 1 < fixtures.len() {
243            let _ = writeln!(out);
244        }
245    }
246
247    // Clean up trailing newlines.
248    while out.ends_with("\n\n") {
249        out.pop();
250    }
251    if !out.ends_with('\n') {
252        out.push('\n');
253    }
254    out
255}
256
257fn render_test_case(
258    out: &mut String,
259    fixture: &Fixture,
260    e2e_config: &E2eConfig,
261    field_resolver: &FieldResolver,
262    default_result_is_simple: bool,
263    default_result_is_r_list: bool,
264) {
265    let call_config = e2e_config.resolve_call_for_fixture(fixture.call.as_deref(), &fixture.input);
266    let function_name = &call_config.function;
267    let result_var = &call_config.result_var;
268    // Per-fixture call configs (e.g. `list_document_extractors`) may set
269    // `result_is_simple = true` even when the default `[e2e.call]` does not.
270    // Without this lookup the registry/detection wrappers (which return scalar
271    // strings or character vectors directly) get wrapped in
272    // `jsonlite::fromJSON(...)` and the parser fails on non-JSON output.
273    let r_override = call_config.overrides.get("r");
274    let result_is_simple = if fixture.call.is_some() {
275        call_config.result_is_simple || r_override.is_some_and(|o| o.result_is_simple)
276    } else {
277        default_result_is_simple
278    };
279    // Per-fixture override: when the R binding already returns a native R list
280    // (not a JSON string), suppress `jsonlite::fromJSON` wrapping while still
281    // using field-path (`result$field`) accessors in assertions.
282    let result_is_r_list = if fixture.call.is_some() {
283        r_override.is_some_and(|o| o.result_is_r_list)
284    } else {
285        default_result_is_r_list
286    };
287
288    let test_name = sanitize_ident(&fixture.id);
289    let description = fixture.description.replace('"', "\\\"");
290
291    let expects_error = fixture.assertions.iter().any(|a| a.assertion_type == "error");
292
293    // Allow per-call R overrides to remap fixture argument names. Many calls
294    // (e.g. `extract_bytes`, `batch_extract_files`) use language-neutral
295    // fixture field names (`data`, `paths`) that the R extendr binding
296    // exposes under different identifiers (`content`, `items`).
297    let arg_name_map = r_override.map(|o| &o.arg_name_map);
298    // Resolve `options_type` for typed config args. When set (e.g. via the
299    // C#/Java override that pins the `config` arg of `embed_texts` to
300    // `EmbeddingConfig`), we use it instead of the heuristic in
301    // `r_default_for_config_arg` so the extendr binding receives the right
302    // ExternalPtr type rather than a default `ExtractionConfig`.
303    let options_type = r_override.and_then(|o| o.options_type.as_deref()).or_else(|| {
304        // Fall back to any other language's override that pins the type —
305        // R doesn't define its own override list yet for most embed calls,
306        // and the underlying Rust signature is the same regardless of
307        // binding, so reusing csharp/java/go/php options_type is safe.
308        //
309        // Skip `Js`-prefixed types from the Node/wasm bindings: those are
310        // NAPI/wasm-bindgen specific wrapper types, while extendr exposes the
311        // bare Rust type names (e.g. `ExtractionConfig`, not `JsExtractionConfig`).
312        call_config
313            .overrides
314            .values()
315            .filter_map(|o| o.options_type.as_deref())
316            .find(|name| !name.starts_with("Js"))
317    });
318    let args_str = build_args_string(&fixture.input, &call_config.args, arg_name_map, options_type);
319
320    // Build visitor setup and args if present
321    let mut setup_lines = Vec::new();
322    let final_args = if let Some(visitor_spec) = &fixture.visitor {
323        build_r_visitor(&mut setup_lines, visitor_spec);
324        // R rejects duplicated named arguments ("matched by multiple actual arguments"), so
325        // strip any existing `options = ...` arg before appending the visitor-options list.
326        // Handles `options = NULL` (when no default) and `options = ConversionOptions$default()`
327        // (when build_args_string emits a default placeholder for an optional options arg).
328        let base = strip_options_arg(&args_str);
329        let visitor_opts = "options = list(visitor = visitor)";
330        let trimmed = base.trim_matches([' ', ',']);
331        if trimmed.is_empty() {
332            visitor_opts.to_string()
333        } else {
334            format!("{trimmed}, {visitor_opts}")
335        }
336    } else {
337        args_str
338    };
339
340    if expects_error {
341        let _ = writeln!(out, "test_that(\"{test_name}: {description}\", {{");
342        for line in &setup_lines {
343            let _ = writeln!(out, "  {line}");
344        }
345        let _ = writeln!(out, "  expect_error({function_name}({final_args}))");
346        let _ = writeln!(out, "}})");
347        return;
348    }
349
350    let _ = writeln!(out, "test_that(\"{test_name}: {description}\", {{");
351    for line in &setup_lines {
352        let _ = writeln!(out, "  {line}");
353    }
354    // The extendr extraction wrappers return JSON strings carrying the
355    // serialized core result; parse into an R list so tests can use `$`
356    // accessors. `result_is_simple` calls (e.g. `convert_html_to_markdown`)
357    // already return scalar values and must be passed through verbatim.
358    // `result_is_r_list` signals the binding returns a native R list (Robj),
359    // not a JSON string — skip `jsonlite::fromJSON` but keep `$` accessors.
360    if result_is_simple || result_is_r_list {
361        let _ = writeln!(out, "  {result_var} <- {function_name}({final_args})");
362    } else {
363        let _ = writeln!(
364            out,
365            "  {result_var} <- jsonlite::fromJSON({function_name}({final_args}), simplifyVector = FALSE)"
366        );
367    }
368
369    for assertion in &fixture.assertions {
370        render_assertion(out, assertion, result_var, field_resolver, result_is_simple, e2e_config);
371    }
372
373    let _ = writeln!(out, "}})");
374}
375
/// Remove the named `options = …` argument (if any) from an R call-args string.
///
/// The string is split on top-level commas only: commas nested inside
/// `()`/`[]`/`{}` or inside single- or double-quoted string literals do not end
/// an argument. Backslash escapes inside a string literal are honoured, so an
/// escaped quote such as `\"` does not terminate the literal — this keeps
/// string arguments that contain escaped quotes and commas intact.
///
/// Each argument is trimmed, arguments starting with `options ` or `options=`
/// are dropped, and the remainder is re-joined with `", "`. Callers append a
/// fresh `options = …` argument themselves.
fn strip_options_arg(args_str: &str) -> String {
    let mut parts: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut depth: i32 = 0;
    // `Some(q)` while inside a string literal opened by quote character `q`.
    let mut quote: Option<char> = None;
    // True when the previous character inside a literal was an unescaped `\`.
    let mut escaped = false;

    for c in args_str.chars() {
        match quote {
            Some(open) => {
                if escaped {
                    // This character is consumed by the preceding backslash.
                    escaped = false;
                } else if c == '\\' {
                    escaped = true;
                } else if c == open {
                    quote = None;
                }
            }
            None => match c {
                '(' | '[' | '{' => depth += 1,
                ')' | ']' | '}' => depth -= 1,
                '\'' | '"' => quote = Some(c),
                ',' if depth == 0 => {
                    parts.push(current.trim().to_string());
                    current.clear();
                    continue;
                }
                _ => {}
            },
        }
        current.push(c);
    }

    let tail = current.trim();
    if !tail.is_empty() {
        parts.push(tail.to_string());
    }

    parts
        .into_iter()
        .filter(|p| !p.starts_with("options ") && !p.starts_with("options="))
        .collect::<Vec<_>>()
        .join(", ")
}
418
419fn build_args_string(
420    input: &serde_json::Value,
421    args: &[crate::config::ArgMapping],
422    arg_name_map: Option<&std::collections::HashMap<String, String>>,
423    options_type: Option<&str>,
424) -> String {
425    if args.is_empty() {
426        // No declared args means the wrapper takes zero parameters; emitting
427        // `list()` here would trigger an `unused argument (list())` error in R.
428        // Likewise, fall through to nothing if the fixture's input is empty.
429        if matches!(input, serde_json::Value::Null) || input.as_object().is_some_and(|m| m.is_empty()) {
430            return String::new();
431        }
432        return json_to_r(input, true);
433    }
434
435    let parts: Vec<String> = args
436        .iter()
437        .filter_map(|arg| {
438            // Apply per-language argument renames before emitting the call.
439            let arg_name: &str = arg_name_map
440                .and_then(|m| m.get(&arg.name).map(String::as_str))
441                .unwrap_or(&arg.name);
442
443            let field = arg.field.strip_prefix("input.").unwrap_or(&arg.field);
444            let val = input.get(field);
445            // R extendr-generated wrappers do not preserve Option<T> defaults from
446            // the Rust signature — every parameter is positional and required at
447            // the R level. To keep generated calls valid we must pass a placeholder
448            // (`NULL` for `Option<T>`, `ExtractionConfig$default()` for typed
449            // configs) whenever the fixture omits an optional value.
450            let val = match val {
451                Some(v) if !(v.is_null() && arg.optional) => v,
452                _ => {
453                    if !arg.optional {
454                        return None;
455                    }
456                    if arg.arg_type == "json_object" {
457                        let r_value = r_default_for_config_arg(arg_name, options_type);
458                        return Some(format!("{arg_name} = {r_value}"));
459                    }
460                    return Some(format!("{arg_name} = NULL"));
461                }
462            };
463            // The extendr bindings expect owned PORs (ExternalPtr) for typed
464            // config arguments — passing an R `list()` raises
465            // `Expected ExternalPtr got List`. The fixtures don't carry the
466            // option fields needed to round-trip through ExtractionConfig$new,
467            // so emit `ExtractionConfig$default()` whenever a `json_object` arg
468            // resolves to an empty / object-shaped JSON value.
469            if arg.arg_type == "json_object" && (val.is_null() || val.as_object().is_some_and(|m| m.is_empty())) {
470                let r_value = r_default_for_config_arg(arg_name, options_type);
471                return Some(format!("{arg_name} = {r_value}"));
472            }
473            // Non-empty json_object for typed config args (those whose default is a
474            // `$default()` constructor): use `TypeName$from_json(jsonlite::toJSON(...))`
475            // so the Rust function receives a proper ExternalPtr, not a list.
476            // For `options`-style args (default = NULL) emit as a plain R list.
477            if arg.arg_type == "json_object" && val.is_object() {
478                let default_expr = r_default_for_config_arg(arg_name, options_type);
479                if default_expr.ends_with("$default()") {
480                    // Extract the type name from "TypeName$default()"
481                    let type_name = default_expr.trim_end_matches("$default()");
482                    // Use the `I(...)` (AsIs) wrapper for array-valued fields so
483                    // `jsonlite::toJSON(..., auto_unbox = TRUE)` preserves them as
484                    // JSON arrays. Without this, single-element vectors get
485                    // unboxed to scalars (e.g. `c("foo")` → `"foo"`) and serde
486                    // rejects them when deserializing `Vec<T>` fields.
487                    let r_list = json_to_r_preserve_arrays(val, true);
488                    let r_value = format!("{type_name}$from_json(jsonlite::toJSON({r_list}, auto_unbox = TRUE))");
489                    return Some(format!("{arg_name} = {r_value}"));
490                }
491                let r_value = json_to_r(val, true);
492                return Some(format!("{arg_name} = {r_value}"));
493            }
494            // `json_object` arrays are passed to extendr functions whose Rust
495            // signature is `items: String` (JSON-serialized batch items). The
496            // wrapper has no R-list → JSON conversion, so we must serialize the
497            // fixture value to a literal JSON string at test-emit time.
498            //
499            // Exception: when `element_type = "String"` the Rust signature is
500            // `Vec<String>` (e.g. `embed_texts(texts: Vec<String>, ...)`), which
501            // extendr binds as a native R character vector. Passing a JSON
502            // literal there would land as a single-element character vector
503            // containing the literal bytes `["a","b"]`, which is not what the
504            // caller intended. Emit a plain `c("a","b")` literal instead.
505            if arg.arg_type == "json_object" && val.is_array() {
506                if arg.element_type.as_deref() == Some("String") {
507                    let r_value = json_to_r(val, false);
508                    return Some(format!("{arg_name} = {r_value}"));
509                }
510                let json_literal = serde_json::to_string(val).unwrap_or_else(|_| "[]".to_string());
511                let escaped = escape_r(&json_literal);
512                return Some(format!("{arg_name} = \"{escaped}\""));
513            }
514            // `bytes` arg type: convert string fixture values into runtime
515            // `readBin(...)` calls so the wrapper receives raw bytes instead
516            // of an R character vector. This mirrors the Python emit_bytes_arg
517            // helper and is what the extendr binding for Vec<u8> expects.
518            if arg.arg_type == "bytes" {
519                if let Some(raw) = val.as_str() {
520                    let r_value = render_bytes_value(raw);
521                    return Some(format!("{arg_name} = {r_value}"));
522                }
523            }
524            // `file_path` arg type: fixtures encode relative paths that resolve
525            // against the repo's `test_documents/` directory. Using a runtime
526            // helper that anchors paths to that directory avoids fragility from
527            // testthat resetting the working directory between files.
528            if arg.arg_type == "file_path" {
529                if let Some(raw) = val.as_str() {
530                    if !raw.starts_with('/') && !raw.is_empty() {
531                        let escaped = escape_r(raw);
532                        return Some(format!("{arg_name} = .resolve_fixture(\"{escaped}\")"));
533                    }
534                }
535            }
536            Some(format!("{arg_name} = {}", json_to_r(val, true)))
537        })
538        .collect();
539
540    parts.join(", ")
541}
542
543/// Render a `bytes` fixture value as the R expression that produces a raw
544/// vector at test time. Mirrors python's `emit_bytes_arg` classifier so we can
545/// support both file-path style fixtures (`"pdf/fake_memo.pdf"`) and inline
546/// text payloads (`"<html>..."`). The resulting expression is dropped directly
547/// into the call site, e.g. `content = readBin(.resolve_fixture("pdf/fake_memo.pdf"), ...)`.
548fn render_bytes_value(raw: &str) -> String {
549    if raw.starts_with('<') || raw.starts_with('{') || raw.starts_with('[') || raw.contains(' ') {
550        // Inline text payload — encode to raw via charToRaw.
551        let escaped = escape_r(raw);
552        return format!("charToRaw(\"{escaped}\")");
553    }
554    let first = raw.chars().next().unwrap_or('\0');
555    if first.is_ascii_alphanumeric() || first == '_' {
556        if let Some(slash) = raw.find('/') {
557            if slash > 0 {
558                let after = &raw[slash + 1..];
559                if after.contains('.') && !after.is_empty() {
560                    let escaped = escape_r(raw);
561                    return format!(
562                        "readBin(.resolve_fixture(\"{escaped}\"), what = \"raw\", n = file.info(.resolve_fixture(\"{escaped}\"))$size)"
563                    );
564                }
565            }
566        }
567    }
568    // Default to inline text encoding — matches Python's InlineText branch.
569    let escaped = escape_r(raw);
570    format!("charToRaw(\"{escaped}\")")
571}
572
/// Pick the R expression used as the default value for a config-style argument.
///
/// When `options_type` is given it always wins and the result is
/// `<options_type>$default()`, regardless of the argument name. Otherwise the
/// argument name selects a known `*Config$default()` constructor; `options`
/// maps to `NULL` and any unrecognised name maps to `list()`.
fn r_default_for_config_arg(arg_name: &str, options_type: Option<&str>) -> String {
    let type_name = match options_type {
        Some(pinned) => pinned,
        None => match arg_name {
            "options" => return String::from("NULL"),
            "config" => "ExtractionConfig",
            "html_output" => "HtmlOutputConfig",
            "chunking" => "ChunkingConfig",
            "ocr" => "OcrConfig",
            "image" | "images" => "ImageExtractionConfig",
            "language_detection" => "LanguageDetectionConfig",
            _ => return String::from("list()"),
        },
    };
    format!("{type_name}$default()")
}
596
597fn render_assertion(
598    out: &mut String,
599    assertion: &Assertion,
600    result_var: &str,
601    field_resolver: &FieldResolver,
602    result_is_simple: bool,
603    _e2e_config: &E2eConfig,
604) {
605    // Handle synthetic / derived fields before the is_valid_for_result check
606    // so they are never treated as struct attribute accesses on the result.
607    if let Some(f) = &assertion.field {
608        match f.as_str() {
609            "chunks_have_content" => {
610                let pred = format!("all(sapply({result_var}$chunks %||% list(), function(c) nchar(c$content) > 0))");
611                match assertion.assertion_type.as_str() {
612                    "is_true" => {
613                        let _ = writeln!(out, "  expect_true({pred})");
614                    }
615                    "is_false" => {
616                        let _ = writeln!(out, "  expect_false({pred})");
617                    }
618                    _ => {
619                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
620                    }
621                }
622                return;
623            }
624            "chunks_have_embeddings" => {
625                let pred = format!(
626                    "all(sapply({result_var}$chunks %||% list(), function(c) !is.null(c$embedding) && length(c$embedding) > 0))"
627                );
628                match assertion.assertion_type.as_str() {
629                    "is_true" => {
630                        let _ = writeln!(out, "  expect_true({pred})");
631                    }
632                    "is_false" => {
633                        let _ = writeln!(out, "  expect_false({pred})");
634                    }
635                    _ => {
636                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
637                    }
638                }
639                return;
640            }
641            // ---- EmbedResponse virtual fields ----
642            // The extendr binding cannot return `Vec<Vec<f32>>` directly (extendr's
643            // Robj conversion has no impl for nested numeric vectors), so the
644            // wrapper serializes the result to a JSON string at the FFI boundary.
645            // Parse it on demand here so length/index assertions operate on the
646            // matrix structure rather than on the single string scalar.
647            "embeddings" => {
648                let parsed = format!(
649                    "(if (is.character({result_var}) && length({result_var}) == 1) jsonlite::fromJSON({result_var}, simplifyVector = FALSE) else {result_var})"
650                );
651                match assertion.assertion_type.as_str() {
652                    "count_equals" => {
653                        if let Some(val) = &assertion.value {
654                            let r_val = json_to_r(val, false);
655                            let _ = writeln!(out, "  expect_equal(length({parsed}), {r_val})");
656                        }
657                    }
658                    "count_min" => {
659                        if let Some(val) = &assertion.value {
660                            let r_val = json_to_r(val, false);
661                            let _ = writeln!(out, "  expect_gte(length({parsed}), {r_val})");
662                        }
663                    }
664                    "not_empty" => {
665                        let _ = writeln!(out, "  expect_gt(length({parsed}), 0)");
666                    }
667                    "is_empty" => {
668                        let _ = writeln!(out, "  expect_equal(length({parsed}), 0)");
669                    }
670                    _ => {
671                        let _ = writeln!(
672                            out,
673                            "  # skipped: unsupported assertion type on synthetic field 'embeddings'"
674                        );
675                    }
676                }
677                return;
678            }
679            "embedding_dimensions" => {
680                let expr = format!("(if (length({result_var}) == 0) 0L else length({result_var}[[1]]))");
681                match assertion.assertion_type.as_str() {
682                    "equals" => {
683                        if let Some(val) = &assertion.value {
684                            let r_val = json_to_r(val, false);
685                            let _ = writeln!(out, "  expect_equal({expr}, {r_val})");
686                        }
687                    }
688                    "greater_than" => {
689                        if let Some(val) = &assertion.value {
690                            let r_val = json_to_r(val, false);
691                            let _ = writeln!(out, "  expect_gt({expr}, {r_val})");
692                        }
693                    }
694                    _ => {
695                        let _ = writeln!(
696                            out,
697                            "  # skipped: unsupported assertion type on synthetic field 'embedding_dimensions'"
698                        );
699                    }
700                }
701                return;
702            }
703            "embeddings_valid" | "embeddings_finite" | "embeddings_non_zero" | "embeddings_normalized" => {
704                let pred = match f.as_str() {
705                    "embeddings_valid" => {
706                        format!("all(sapply({result_var}, function(e) length(e) > 0))")
707                    }
708                    "embeddings_finite" => {
709                        format!("all(sapply({result_var}, function(e) all(is.finite(e))))")
710                    }
711                    "embeddings_non_zero" => {
712                        format!("all(sapply({result_var}, function(e) any(e != 0.0)))")
713                    }
714                    "embeddings_normalized" => {
715                        format!("all(sapply({result_var}, function(e) abs(sum(e * e) - 1.0) < 1e-3))")
716                    }
717                    _ => unreachable!(),
718                };
719                match assertion.assertion_type.as_str() {
720                    "is_true" => {
721                        let _ = writeln!(out, "  expect_true({pred})");
722                    }
723                    "is_false" => {
724                        let _ = writeln!(out, "  expect_false({pred})");
725                    }
726                    _ => {
727                        let _ = writeln!(out, "  # skipped: unsupported assertion type on synthetic field '{f}'");
728                    }
729                }
730                return;
731            }
732            // ---- keywords / keywords_count ----
733            // R ExtractionResult does not expose extracted_keywords; skip.
734            "keywords" | "keywords_count" => {
735                let _ = writeln!(out, "  # skipped: field '{f}' not available on R ExtractionResult");
736                return;
737            }
738            _ => {}
739        }
740    }
741
742    // Skip assertions on fields that don't exist on the result type.
743    if let Some(f) = &assertion.field {
744        if !f.is_empty() && !field_resolver.is_valid_for_result(f) {
745            let _ = writeln!(out, "  # skipped: field '{f}' not available on result type");
746            return;
747        }
748    }
749
750    // When result_is_simple, skip assertions that reference non-content fields
751    // (e.g., metadata, document, structure) since the binding returns a plain value.
752    if result_is_simple {
753        if let Some(f) = &assertion.field {
754            let f_lower = f.to_lowercase();
755            if !f.is_empty()
756                && f_lower != "content"
757                && (f_lower.starts_with("metadata")
758                    || f_lower.starts_with("document")
759                    || f_lower.starts_with("structure"))
760            {
761                let _ = writeln!(
762                    out,
763                    "  # skipped: result_is_simple for field '{f}' not available on result type"
764                );
765                return;
766            }
767        }
768    }
769
770    let field_expr = if result_is_simple {
771        result_var.to_string()
772    } else {
773        match &assertion.field {
774            Some(f) if !f.is_empty() => field_resolver.accessor(f, "r", result_var),
775            _ => result_var.to_string(),
776        }
777    };
778
779    match assertion.assertion_type.as_str() {
780        "equals" => {
781            if let Some(expected) = &assertion.value {
782                let r_val = json_to_r(expected, false);
783                let _ = writeln!(out, "  expect_equal(trimws({field_expr}), {r_val})");
784            }
785        }
786        "contains" => {
787            if let Some(expected) = &assertion.value {
788                let r_val = json_to_r(expected, false);
789                let _ = writeln!(out, "  expect_true(grepl({r_val}, {field_expr}, fixed = TRUE))");
790            }
791        }
792        "contains_all" => {
793            if let Some(values) = &assertion.values {
794                for val in values {
795                    let r_val = json_to_r(val, false);
796                    let _ = writeln!(out, "  expect_true(any(grepl({r_val}, {field_expr}, fixed = TRUE)))");
797                }
798            }
799        }
800        "not_contains" => {
801            if let Some(expected) = &assertion.value {
802                let r_val = json_to_r(expected, false);
803                let _ = writeln!(out, "  expect_false(grepl({r_val}, {field_expr}, fixed = TRUE))");
804            }
805        }
806        "not_empty" => {
807            // Multi-element character vectors (e.g. `list_embedding_presets`)
808            // would otherwise evaluate `nchar(x) > 0` element-wise and fail
809            // `expect_true`'s scalar-logical contract. Reduce with `any()` so
810            // the predicate stays a single TRUE/FALSE regardless of length,
811            // and treat zero-length vectors as empty.
812            let _ = writeln!(
813                out,
814                "  expect_true(if (is.character({field_expr})) length({field_expr}) > 0 && any(nchar({field_expr}) > 0) else length({field_expr}) > 0)"
815            );
816        }
817        "is_empty" => {
818            // Rust `Option<String>::None` surfaces as `NA_character_` through
819            // extendr, and `Vec<...>` empties as a zero-length vector. Treat
820            // NULL, NA, "", and zero-length collections as "empty" so the same
821            // assertion works for scalar Option returns (`get_embedding_preset`)
822            // and collection returns alike.
823            let _ = writeln!(
824                out,
825                "  expect_true(is.null({field_expr}) || length({field_expr}) == 0 || (length({field_expr}) == 1 && (is.na({field_expr}) || identical({field_expr}, \"\"))))"
826            );
827        }
828        "contains_any" => {
829            if let Some(values) = &assertion.values {
830                let items: Vec<String> = values.iter().map(|v| json_to_r(v, false)).collect();
831                let vec_str = items.join(", ");
832                let _ = writeln!(
833                    out,
834                    "  expect_true(any(sapply(c({vec_str}), function(v) grepl(v, {field_expr}, fixed = TRUE))))"
835                );
836            }
837        }
838        "greater_than" => {
839            if let Some(val) = &assertion.value {
840                let r_val = json_to_r(val, false);
841                let _ = writeln!(out, "  expect_true({field_expr} > {r_val})");
842            }
843        }
844        "less_than" => {
845            if let Some(val) = &assertion.value {
846                let r_val = json_to_r(val, false);
847                let _ = writeln!(out, "  expect_true({field_expr} < {r_val})");
848            }
849        }
850        "greater_than_or_equal" => {
851            if let Some(val) = &assertion.value {
852                let r_val = json_to_r(val, false);
853                let _ = writeln!(out, "  expect_true({field_expr} >= {r_val})");
854            }
855        }
856        "less_than_or_equal" => {
857            if let Some(val) = &assertion.value {
858                let r_val = json_to_r(val, false);
859                let _ = writeln!(out, "  expect_true({field_expr} <= {r_val})");
860            }
861        }
862        "starts_with" => {
863            if let Some(expected) = &assertion.value {
864                let r_val = json_to_r(expected, false);
865                let _ = writeln!(out, "  expect_true(startsWith({field_expr}, {r_val}))");
866            }
867        }
868        "ends_with" => {
869            if let Some(expected) = &assertion.value {
870                let r_val = json_to_r(expected, false);
871                let _ = writeln!(out, "  expect_true(endsWith({field_expr}, {r_val}))");
872            }
873        }
874        "min_length" => {
875            if let Some(val) = &assertion.value {
876                if let Some(n) = val.as_u64() {
877                    let _ = writeln!(out, "  expect_true(nchar({field_expr}) >= {n})");
878                }
879            }
880        }
881        "max_length" => {
882            if let Some(val) = &assertion.value {
883                if let Some(n) = val.as_u64() {
884                    let _ = writeln!(out, "  expect_true(nchar({field_expr}) <= {n})");
885                }
886            }
887        }
888        "count_min" => {
889            if let Some(val) = &assertion.value {
890                if let Some(n) = val.as_u64() {
891                    let _ = writeln!(out, "  expect_true(length({field_expr}) >= {n})");
892                }
893            }
894        }
895        "count_equals" => {
896            if let Some(val) = &assertion.value {
897                if let Some(n) = val.as_u64() {
898                    let _ = writeln!(out, "  expect_equal(length({field_expr}), {n})");
899                }
900            }
901        }
902        "is_true" => {
903            let _ = writeln!(out, "  expect_true({field_expr})");
904        }
905        "is_false" => {
906            let _ = writeln!(out, "  expect_false({field_expr})");
907        }
908        "method_result" => {
909            if let Some(method_name) = &assertion.method {
910                let call_expr = build_r_method_call(result_var, method_name, assertion.args.as_ref());
911                let check = assertion.check.as_deref().unwrap_or("is_true");
912                match check {
913                    "equals" => {
914                        if let Some(val) = &assertion.value {
915                            if val.is_boolean() {
916                                if val.as_bool() == Some(true) {
917                                    let _ = writeln!(out, "  expect_true({call_expr})");
918                                } else {
919                                    let _ = writeln!(out, "  expect_false({call_expr})");
920                                }
921                            } else {
922                                let r_val = json_to_r(val, false);
923                                let _ = writeln!(out, "  expect_equal({call_expr}, {r_val})");
924                            }
925                        }
926                    }
927                    "is_true" => {
928                        let _ = writeln!(out, "  expect_true({call_expr})");
929                    }
930                    "is_false" => {
931                        let _ = writeln!(out, "  expect_false({call_expr})");
932                    }
933                    "greater_than_or_equal" => {
934                        if let Some(val) = &assertion.value {
935                            let r_val = json_to_r(val, false);
936                            let _ = writeln!(out, "  expect_true({call_expr} >= {r_val})");
937                        }
938                    }
939                    "count_min" => {
940                        if let Some(val) = &assertion.value {
941                            let n = val.as_u64().unwrap_or(0);
942                            let _ = writeln!(out, "  expect_true(length({call_expr}) >= {n})");
943                        }
944                    }
945                    "is_error" => {
946                        let _ = writeln!(out, "  expect_error({call_expr})");
947                    }
948                    "contains" => {
949                        if let Some(val) = &assertion.value {
950                            let r_val = json_to_r(val, false);
951                            let _ = writeln!(out, "  expect_true(grepl({r_val}, {call_expr}, fixed = TRUE))");
952                        }
953                    }
954                    other_check => {
955                        panic!("R e2e generator: unsupported method_result check type: {other_check}");
956                    }
957                }
958            } else {
959                panic!("R e2e generator: method_result assertion missing 'method' field");
960            }
961        }
962        "matches_regex" => {
963            if let Some(expected) = &assertion.value {
964                let r_val = json_to_r(expected, false);
965                let _ = writeln!(out, "  expect_true(grepl({r_val}, {field_expr}))");
966            }
967        }
968        "not_error" => {
969            // The call itself stops the test on error; emit an explicit
970            // `expect_true(TRUE)` so testthat doesn't report the test as
971            // empty when this is the only assertion.
972            let _ = writeln!(out, "  expect_true(TRUE)");
973        }
974        "error" => {
975            // Handled at the test level.
976        }
977        other => {
978            panic!("R e2e generator: unsupported assertion type: {other}");
979        }
980    }
981}
982
/// Convert a PascalCase string to snake_case.
///
/// Every uppercase character except the very first one is preceded by an
/// underscore, and every character is lowercased.
/// e.g. "DoubleEqual" → "double_equal", "Backticks" → "backticks"
fn pascal_to_snake_case(s: &str) -> String {
    // Reserve a little headroom for the underscores that get inserted.
    let buffer = String::with_capacity(s.len() + 4);
    s.char_indices().fold(buffer, |mut snake, (offset, ch)| {
        // Byte offset 0 identifies the first character, which never gets a
        // leading separator.
        if offset != 0 && ch.is_uppercase() {
            snake.push('_');
        }
        // `to_lowercase` may yield more than one char for some code points.
        snake.extend(ch.to_lowercase());
        snake
    })
}
1003
1004/// Convert a JSON value to an R expression suitable for embedding inside a
1005/// `list(...)` that will be passed to `jsonlite::toJSON(..., auto_unbox = TRUE)`.
1006///
1007/// Differs from [`json_to_r`] in that any array-valued field is wrapped with
1008/// `I(...)` (jsonlite's `AsIs` marker) so it remains a JSON array after the
1009/// `auto_unbox` transform. Empty arrays become `I(list())` (→ `[]`) and
1010/// non-empty arrays become `I(c(...))` (→ `[..]`). Without this wrapping,
1011/// `Vec<String>` fields like `exclude_selectors` get unboxed to scalars and
1012/// serde deserialization on the Rust side fails with
1013/// `invalid type: string "foo", expected a sequence`.
1014fn json_to_r_preserve_arrays(value: &serde_json::Value, lowercase_enum_values: bool) -> String {
1015    match value {
1016        serde_json::Value::Array(arr) => {
1017            if arr.is_empty() {
1018                "I(list())".to_string()
1019            } else {
1020                let items: Vec<String> = arr.iter().map(|v| json_to_r(v, lowercase_enum_values)).collect();
1021                format!("I(c({}))", items.join(", "))
1022            }
1023        }
1024        serde_json::Value::Object(map) => {
1025            let entries: Vec<String> = map
1026                .iter()
1027                .map(|(k, v)| {
1028                    format!(
1029                        "\"{}\" = {}",
1030                        escape_r(k),
1031                        json_to_r_preserve_arrays(v, lowercase_enum_values)
1032                    )
1033                })
1034                .collect();
1035            format!("list({})", entries.join(", "))
1036        }
1037        _ => json_to_r(value, lowercase_enum_values),
1038    }
1039}
1040
1041/// * `lowercase_enum_values` - If true, convert PascalCase strings to snake_case (for enum values).
1042///   If false, preserve original case (for assertion expected values).
1043fn json_to_r(value: &serde_json::Value, lowercase_enum_values: bool) -> String {
1044    match value {
1045        serde_json::Value::String(s) => {
1046            // Convert PascalCase enum values to snake_case only if requested.
1047            // e.g. "Backticks" → "backticks", "DoubleEqual" → "double_equal"
1048            let normalized = if lowercase_enum_values && s.chars().next().is_some_and(|c| c.is_uppercase()) {
1049                pascal_to_snake_case(s)
1050            } else {
1051                s.clone()
1052            };
1053            format!("\"{}\"", escape_r(&normalized))
1054        }
1055        serde_json::Value::Bool(true) => "TRUE".to_string(),
1056        serde_json::Value::Bool(false) => "FALSE".to_string(),
1057        serde_json::Value::Number(n) => n.to_string(),
1058        serde_json::Value::Null => "NULL".to_string(),
1059        serde_json::Value::Array(arr) => {
1060            let items: Vec<String> = arr.iter().map(|v| json_to_r(v, lowercase_enum_values)).collect();
1061            format!("c({})", items.join(", "))
1062        }
1063        serde_json::Value::Object(map) => {
1064            let entries: Vec<String> = map
1065                .iter()
1066                .map(|(k, v)| format!("\"{}\" = {}", escape_r(k), json_to_r(v, lowercase_enum_values)))
1067                .collect();
1068            format!("list({})", entries.join(", "))
1069        }
1070    }
1071}
1072
1073/// Build an R visitor list and add setup line.
1074fn build_r_visitor(setup_lines: &mut Vec<String>, visitor_spec: &crate::fixture::VisitorSpec) {
1075    use std::fmt::Write as FmtWrite;
1076    // Collect each callback as a separate string, then join with ",\n" to avoid
1077    // trailing commas — R's list() does not accept a trailing comma.
1078    let methods: Vec<String> = visitor_spec
1079        .callbacks
1080        .iter()
1081        .map(|(method_name, action)| {
1082            let mut buf = String::new();
1083            emit_r_visitor_method(&mut buf, method_name, action);
1084            // strip the trailing ",\n" added by emit_r_visitor_method
1085            buf.trim_end_matches(['\n', ',']).to_string()
1086        })
1087        .collect();
1088    let mut visitor_obj = String::new();
1089    let _ = writeln!(visitor_obj, "list(");
1090    let _ = write!(visitor_obj, "{}", methods.join(",\n"));
1091    let _ = writeln!(visitor_obj);
1092    let _ = writeln!(visitor_obj, "  )");
1093
1094    setup_lines.push(format!("visitor <- {visitor_obj}"));
1095}
1096
1097/// Build an R call expression for a `method_result` assertion.
1098/// Maps method names to the appropriate R function or method calls.
1099fn build_r_method_call(result_var: &str, method_name: &str, args: Option<&serde_json::Value>) -> String {
1100    match method_name {
1101        "root_child_count" => format!("{result_var}$root_child_count()"),
1102        "root_node_type" => format!("{result_var}$root_node_type()"),
1103        "named_children_count" => format!("{result_var}$named_children_count()"),
1104        "has_error_nodes" => format!("tree_has_error_nodes({result_var})"),
1105        "error_count" | "tree_error_count" => format!("tree_error_count({result_var})"),
1106        "tree_to_sexp" => format!("tree_to_sexp({result_var})"),
1107        "contains_node_type" => {
1108            let node_type = args
1109                .and_then(|a| a.get("node_type"))
1110                .and_then(|v| v.as_str())
1111                .unwrap_or("");
1112            format!("tree_contains_node_type({result_var}, \"{node_type}\")")
1113        }
1114        "find_nodes_by_type" => {
1115            let node_type = args
1116                .and_then(|a| a.get("node_type"))
1117                .and_then(|v| v.as_str())
1118                .unwrap_or("");
1119            format!("find_nodes_by_type({result_var}, \"{node_type}\")")
1120        }
1121        "run_query" => {
1122            let query_source = args
1123                .and_then(|a| a.get("query_source"))
1124                .and_then(|v| v.as_str())
1125                .unwrap_or("");
1126            let language = args
1127                .and_then(|a| a.get("language"))
1128                .and_then(|v| v.as_str())
1129                .unwrap_or("");
1130            format!("run_query({result_var}, \"{language}\", \"{query_source}\", source)")
1131        }
1132        _ => {
1133            if let Some(args_val) = args {
1134                let arg_str = args_val
1135                    .as_object()
1136                    .map(|obj| {
1137                        obj.iter()
1138                            .map(|(k, v)| {
1139                                let r_val = json_to_r(v, false);
1140                                format!("{k} = {r_val}")
1141                            })
1142                            .collect::<Vec<_>>()
1143                            .join(", ")
1144                    })
1145                    .unwrap_or_default();
1146                format!("{result_var}${method_name}({arg_str})")
1147            } else {
1148                format!("{result_var}${method_name}()")
1149            }
1150        }
1151    }
1152}
1153
1154/// Emit an R visitor method for a callback action.
1155fn emit_r_visitor_method(out: &mut String, method_name: &str, action: &CallbackAction) {
1156    use std::fmt::Write as FmtWrite;
1157
1158    // R uses visit_ prefix (matches binding signature)
1159    let params = match method_name {
1160        "visit_link" => "ctx, href, text, title",
1161        "visit_image" => "ctx, src, alt, title",
1162        "visit_heading" => "ctx, level, text, id",
1163        "visit_code_block" => "ctx, lang, code",
1164        "visit_code_inline"
1165        | "visit_strong"
1166        | "visit_emphasis"
1167        | "visit_strikethrough"
1168        | "visit_underline"
1169        | "visit_subscript"
1170        | "visit_superscript"
1171        | "visit_mark"
1172        | "visit_button"
1173        | "visit_summary"
1174        | "visit_figcaption"
1175        | "visit_definition_term"
1176        | "visit_definition_description" => "ctx, text",
1177        "visit_text" => "ctx, text",
1178        "visit_list_item" => "ctx, ordered, marker, text",
1179        "visit_blockquote" => "ctx, content, depth",
1180        "visit_table_row" => "ctx, cells, is_header",
1181        "visit_custom_element" => "ctx, tag_name, html",
1182        "visit_form" => "ctx, action_url, method",
1183        "visit_input" => "ctx, input_type, name, value",
1184        "visit_audio" | "visit_video" | "visit_iframe" => "ctx, src",
1185        "visit_details" => "ctx, open",
1186        "visit_element_end" | "visit_table_end" | "visit_definition_list_end" | "visit_figure_end" => "ctx, output",
1187        "visit_list_start" => "ctx, ordered",
1188        "visit_list_end" => "ctx, ordered, output",
1189        _ => "ctx",
1190    };
1191
1192    let _ = writeln!(out, "    {method_name} = function({params}) {{");
1193    match action {
1194        CallbackAction::Skip => {
1195            let _ = writeln!(out, "      \"skip\"");
1196        }
1197        CallbackAction::Continue => {
1198            let _ = writeln!(out, "      \"continue\"");
1199        }
1200        CallbackAction::PreserveHtml => {
1201            let _ = writeln!(out, "      \"preserve_html\"");
1202        }
1203        CallbackAction::Custom { output } => {
1204            let escaped = escape_r(output);
1205            let _ = writeln!(out, "      list(custom = \"{escaped}\")");
1206        }
1207        CallbackAction::CustomTemplate { template, return_form } => {
1208            let r_expr = r_template_to_paste0(template);
1209            match return_form {
1210                TemplateReturnForm::BareString => {
1211                    let _ = writeln!(out, "      {r_expr}");
1212                }
1213                TemplateReturnForm::Dict => {
1214                    let _ = writeln!(out, "      list(custom = {r_expr})");
1215                }
1216            }
1217        }
1218    }
1219    let _ = writeln!(out, "    }},");
1220}