// rivet-cli 0.9.5

//! YAML scaffold generation for `rivet init`.

use crate::error::Result;

use super::{InitYamlDestination, TableInfo};

/// Break a table reference into its `(schema, table)` parts.
///
/// The split happens at the first `.`; everything after it is returned as the
/// table part. A reference without a `.` is assigned the `public` schema.
pub(super) fn parse_table(table: &str) -> (String, &str) {
    if let Some((schema, name)) = table.split_once('.') {
        return (schema.to_owned(), name);
    }
    (String::from("public"), table)
}

/// Render the `rivet init` YAML scaffold for a single table.
///
/// The output is a comment header (table name plus an approximate row count),
/// the `source:` section, and one entry under `exports:`. The table name in
/// the header is schema-qualified unless the schema is `public` or the source
/// type is `mysql`. The returned text always ends with a newline.
///
/// # Errors
///
/// Propagates the error from `super::source_type` when it rejects `source_url`.
pub(super) fn generate_config(
    info: &TableInfo,
    source_url: &str,
    dest: &InitYamlDestination,
) -> Result<String> {
    let st = super::source_type(source_url)?;
    let qualified_table = if info.schema == "public" || st == "mysql" {
        info.table.clone()
    } else {
        format!("{}.{}", info.schema, info.table)
    };
    // Tier bounds are inclusive so a round estimate reads `~1.0M rows` /
    // `~1K rows` rather than `~1000K rows` / `~1000 rows`.
    let row_note = if info.row_estimate >= 1_000_000 {
        format!("~{:.1}M rows", info.row_estimate as f64 / 1_000_000.0)
    } else if info.row_estimate >= 1_000 {
        format!("~{:.0}K rows", info.row_estimate as f64 / 1_000.0)
    } else {
        format!("~{} rows", info.row_estimate)
    };
    let header = format!("# Generated by rivet init — {qualified_table} ({row_note})");

    // The header carries an extra NOTE line when any decimal column will fall
    // back to the default precision/scale.
    let unbounded = table_has_unbounded_decimal_columns(info);
    let mut lines = config_header_lines(st, &header, unbounded);
    lines.push("exports:".to_string());
    lines.extend(export_block_lines(info, st, dest));
    Ok(lines.join("\n") + "\n")
}

/// Render the `rivet init` YAML scaffold covering several tables at once.
///
/// `scope_label` is placed verbatim in the title comment. Every entry of
/// `infos` contributes one block under `exports:`, in input order. The note
/// above `exports:` mentions the per-table bucket prefix only when a GCS or S3
/// bucket is configured in `dest`. The returned text always ends with a newline.
///
/// # Errors
///
/// Propagates the error from `super::source_type` when it rejects `source_url`.
pub(super) fn generate_schema_config(
    infos: &[TableInfo],
    source_url: &str,
    scope_label: &str,
    dest: &InitYamlDestination,
) -> Result<String> {
    let engine = super::source_type(source_url)?;
    let title = format!("# Generated by rivet init — {scope_label}");
    let any_unbounded = infos.iter().any(table_has_unbounded_decimal_columns);
    let to_bucket = dest.gcs_bucket.is_some() || dest.s3_bucket.is_some();

    let mut out = config_header_lines(engine, &title, any_unbounded);
    if to_bucket {
        out.push(
            "# One export per table/view — per-table prefix `exports/<name>/` under the given bucket; review modes before running."
                .to_string(),
        );
    } else {
        out.push(
            "# One export per table/view — review modes and destinations before running."
                .to_string(),
        );
    }
    out.push("exports:".to_string());
    out.extend(
        infos
            .iter()
            .flat_map(|info| export_block_lines(info, engine, dest)),
    );

    let mut yaml = out.join("\n");
    yaml.push('\n');
    Ok(yaml)
}

/// Build the lines that open every generated config: the title comment, the
/// "review before running" reminder, an optional NOTE about defaulted decimal
/// columns, a blank line, and the `source:` section followed by a blank line.
///
/// `title_line` is emitted as given; `source_type` becomes the `type:` value.
fn config_header_lines(
    source_type: &str,
    title_line: &str,
    unbounded_decimal_note: bool,
) -> Vec<String> {
    let mut out: Vec<String> = Vec::with_capacity(8);
    out.push(title_line.to_owned());
    out.push("# Review and adjust before running: rivet check --config <this-file>".to_owned());
    if unbounded_decimal_note {
        out.push(
            "# NOTE: some exports use default decimal(p,s) for NUMERIC without precision in the DDL — search for \"# REVIEW:\" under columns:."
                .to_owned(),
        );
    }
    out.push(String::new());
    out.push("source:".to_owned());
    out.push(format!("  type: {source_type}"));
    out.push("  url_env: DATABASE_URL  # export DATABASE_URL='<your-url>'".to_owned());
    out.push(String::new());
    out
}

/// Object-store prefix for one table's export: `exports/<table>/`, or
/// `exports/<schema>__<table>/` when the source type is `postgres` and the
/// schema is not `public`.
fn table_export_prefix(info: &TableInfo, source_type: &str) -> String {
    let schema_qualified = source_type == "postgres" && info.schema != "public";
    if schema_qualified {
        format!("exports/{}__{}/", info.schema, info.table)
    } else {
        format!("exports/{}/", info.table)
    }
}

/// Render `v` as a YAML scalar that loads back as the same string.
///
/// Values that are safe as a plain scalar are returned untouched. Anything
/// else (as decided by `needs_yaml_quoting`) is wrapped as a double-quoted
/// scalar with escapes by `yaml_double_quote`.
pub(super) fn yaml_quote_if_needed(v: &str) -> String {
    if !needs_yaml_quoting(v) {
        return v.to_owned();
    }
    yaml_double_quote(v)
}

/// Decide whether `v` must be quoted to survive a YAML round-trip as a string.
///
/// Returns `true` when the value is empty, has leading/trailing whitespace,
/// spells a reserved scalar (`true`, `null`, `~`, ...), reads as a number,
/// starts with a YAML indicator character, or contains a control character,
/// a flow indicator, a quote, a backslash, a ` #` comment starter, or a `:`
/// followed by whitespace or end-of-value.
///
/// Number detection covers what Rust's `i64`/`f64` parsers accept plus the
/// YAML-only spellings they reject: `0x`/`0o`/`0b` radix literals and
/// `.inf` / `.nan` (optionally signed, any ASCII case). Quoting more than
/// strictly necessary is harmless; quoting too little changes the loaded type.
fn needs_yaml_quoting(v: &str) -> bool {
    if v.is_empty() {
        return true;
    }
    // Leading or trailing whitespace would be stripped or break parsing.
    if v.trim() != v {
        return true;
    }
    // YAML 1.1/1.2 reserved scalars that must not look like a bare string.
    let lower = v.to_ascii_lowercase();
    if matches!(
        lower.as_str(),
        "true" | "false" | "yes" | "no" | "on" | "off" | "null" | "~"
    ) {
        return true;
    }
    // Anything that parses as a number would be loaded as int/float, not string.
    if v.parse::<i64>().is_ok() || v.parse::<f64>().is_ok() {
        return true;
    }
    // YAML number spellings that the Rust parsers above do not recognise.
    let magnitude = lower
        .strip_prefix('+')
        .or_else(|| lower.strip_prefix('-'))
        .unwrap_or(lower.as_str());
    let has_radix_digits = |prefix: &str, radix: u32| {
        magnitude
            .strip_prefix(prefix)
            .is_some_and(|digits| !digits.is_empty() && digits.chars().all(|c| c.is_digit(radix)))
    };
    if matches!(magnitude, ".inf" | ".nan")
        || has_radix_digits("0x", 16)
        || has_radix_digits("0o", 8)
        || has_radix_digits("0b", 2)
    {
        return true;
    }
    // YAML indicators that are unsafe at the start of a plain scalar.
    let starts_with_indicator = v.chars().next().is_some_and(|first| {
        matches!(
            first,
            '!' | '&'
                | '*'
                | '@'
                | '`'
                | '|'
                | '>'
                | '%'
                | '?'
                | ':'
                | '-'
                | '['
                | ']'
                | '{'
                | '}'
                | ','
                | '#'
                | '\''
                | '"'
        )
    });
    if starts_with_indicator {
        return true;
    }
    // Anywhere in the string: control chars, ` #` comment starter, `: ` mapping
    // separator, or flow-context indicators all need quoting to be safe.
    let mut prev: Option<char> = None;
    let mut rest = v.chars().peekable();
    while let Some(c) = rest.next() {
        if c.is_control() {
            return true;
        }
        if matches!(c, '[' | ']' | '{' | '}' | ',' | '"' | '\'' | '\\' | '\t') {
            return true;
        }
        if c == '#' && prev.is_some_and(char::is_whitespace) {
            return true;
        }
        // ": " or ":" at end-of-value both terminate the scalar in plain style.
        if c == ':' && rest.peek().map_or(true, |next| next.is_whitespace()) {
            return true;
        }
        prev = Some(c);
    }
    false
}

/// Wrap `v` in a YAML double-quoted scalar, escaping what cannot appear raw.
///
/// Backslash, double quote, `\n`, `\r` and `\t` use their short escapes. Every
/// other control character — C0, DEL and the C1 range U+0080–U+009F (which
/// includes NEL, U+0085) — is written as `\xNN`. The Unicode line and
/// paragraph separators (U+2028 / U+2029) are written as `\uNNNN` so a parser
/// that treats them as line breaks cannot fold them. All other characters are
/// copied through unchanged.
fn yaml_double_quote(v: &str) -> String {
    let mut s = String::with_capacity(v.len() + 2);
    s.push('"');
    for c in v.chars() {
        match c {
            '\\' => s.push_str("\\\\"),
            '"' => s.push_str("\\\""),
            '\n' => s.push_str("\\n"),
            '\r' => s.push_str("\\r"),
            '\t' => s.push_str("\\t"),
            // `is_control` covers U+0000–U+001F and U+007F–U+009F, all of which
            // fit the two-digit `\x` escape.
            c if c.is_control() => {
                s.push_str(&format!("\\x{:02X}", c as u32));
            }
            '\u{2028}' | '\u{2029}' => {
                s.push_str(&format!("\\u{:04X}", c as u32));
            }
            c => s.push(c),
        }
    }
    s.push('"');
    s
}

/// Whether `data_type` names the NUMERIC / DECIMAL family, i.e. a column type
/// Rivet cannot safely auto-map without explicit precision/scale.
///
/// The comparison ignores ASCII case and requires the bare type name.
fn is_decimal_type(data_type: &str) -> bool {
    ["numeric", "decimal"]
        .iter()
        .any(|name| data_type.eq_ignore_ascii_case(name))
}

/// Mirror of the validation in [`crate::config::models::validate_table_shortcut_ident`]:
/// accepts `<name>` or `<schema>.<name>`, where each part starts with an ASCII
/// letter or `_` and continues with ASCII letters, digits or `_`.
fn is_simple_pg_ident(s: &str) -> bool {
    /// One dot-free identifier segment; an empty segment is rejected.
    fn is_ident(part: &str) -> bool {
        let mut rest = part.chars();
        rest.next()
            .is_some_and(|head| head.is_ascii_alphabetic() || head == '_')
            && rest.all(|c| c.is_ascii_alphanumeric() || c == '_')
    }

    match s.split_once('.') {
        // A second dot lands in `name` and fails the segment check, so at most
        // two parts are ever accepted.
        Some((schema, name)) => is_ident(schema) && is_ident(name),
        None => is_ident(s),
    }
}

/// `true` when at least one `DECIMAL`/`NUMERIC` column of the table lacks a
/// precision or a scale in the introspected metadata (`numeric` without
/// `(p,s)` in the DDL).
pub(super) fn table_has_unbounded_decimal_columns(info: &TableInfo) -> bool {
    info.columns
        .iter()
        .filter(|col| is_decimal_type(&col.data_type))
        .any(|col| col.numeric_precision.is_none() || col.numeric_scale.is_none())
}

/// Default precision `p` of the `decimal(p,s)` emitted when the DDL has no
/// `(precision, scale)` for `numeric`/`decimal`.
/// Fits Arrow Decimal128 (precision ≤ 38). Replace in YAML (or constrain the DDL) when domain rules differ.
const INIT_UNBOUNDED_DECIMAL_DEFAULT_PRECISION: u32 = 38;
/// Default scale `s` paired with the precision above.
const INIT_UNBOUNDED_DECIMAL_DEFAULT_SCALE: u32 = 18;

/// Present on generated `columns:` lines that use the default above — `rivet init` reminds on stderr.
pub(crate) const INIT_DECIMAL_REVIEW_MARKER: &str = "# REVIEW:";

fn init_default_decimal_yaml_line(col_name: &str) -> String {
    format!(
        "      {}: decimal({},{})  # REVIEW: DDL has no numeric(p,s); edit to the real decimal(p,s) or change the column type — values outside this bound may truncate or fail export.",
        col_name, INIT_UNBOUNDED_DECIMAL_DEFAULT_PRECISION, INIT_UNBOUNDED_DECIMAL_DEFAULT_SCALE,
    )
}

/// Build the YAML lines for one entry under `exports:`.
///
/// The block contains, in order: the export `name`, either the `table:`
/// shortcut or an explicit `query:`, a rationale comment and the suggested
/// `mode`, the mode-specific keys (`chunk_*` / `parallel` for `chunked`,
/// `cursor_column` for `incremental`), `format`, an optional `parquet:`
/// tuning block, `meta_columns`, the `destination:` block, and `columns:`
/// overrides for NUMERIC/DECIMAL columns.
///
/// Every identifier placed in a YAML value or key position (`name`, `table`,
/// `chunk_column`, `cursor_column`, `columns:` keys) goes through
/// `yaml_quote_if_needed`, so a name such as `null` or one containing
/// YAML-significant characters still loads as the intended string.
fn export_block_lines(
    info: &TableInfo,
    source_type: &str,
    dest: &InitYamlDestination,
) -> Vec<String> {
    let mode = info.suggest_mode();
    let columns: Vec<&str> = info.columns.iter().map(|c| c.name.as_str()).collect();
    let col_list = columns.join(", ");
    let qualified_table = if info.schema == "public" || source_type == "mysql" {
        info.table.clone()
    } else {
        format!("{}.{}", info.schema, info.table)
    };

    // For `mode: full` on a plain table, emit the `table:` shortcut: it produces
    // `SELECT * FROM <schema>.<table>` internally, which is the only form the PG
    // numeric-catalog-hint resolver recognises — so `numeric(p,s)` columns get
    // typed correctly even if the user later strips the `columns:` overrides.
    //
    // For `chunked` / `incremental` we keep the explicit `SELECT col1, ... FROM`
    // form: those modes usually start from a curated column set and benefit from
    // a self-documenting YAML.
    let mut lines = vec![format!("  - name: {}", yaml_quote_if_needed(&info.table))];
    if mode == "full" && source_type == "postgres" && is_simple_pg_ident(&qualified_table) {
        // A simple identifier can still spell a YAML keyword (`null`, `true`,
        // `on`, ...); quoting keeps the value a string. Ordinary names are
        // emitted unchanged.
        lines.push(format!(
            "    table: {}",
            yaml_quote_if_needed(&qualified_table)
        ));
    } else {
        lines.push("    query: >".to_string());
        lines.push(format!("      SELECT {col_list}"));
        lines.push(format!("      FROM {qualified_table}"));
    }
    // Inline rationale above `mode:` so the operator can see *why* this
    // mode got picked, not just *what*. Easy to delete; the suggestion
    // is documentation, not a contract.
    lines.push(format!("    # {}", info.mode_rationale(mode)));
    lines.push(format!("    mode: {mode}"));

    match mode {
        "chunked" => {
            let chunk_col = info.best_chunk_column().unwrap_or("id");
            let parallel = suggest_parallel(info.row_estimate, info.avg_row_bytes(), source_type);
            let chunk_size = info.suggest_chunk_size();
            lines.push(format!(
                "    chunk_column: {}",
                yaml_quote_if_needed(chunk_col)
            ));
            // Scale chunk_size by row estimate so the per-table file count
            // stays in a humane range. See `TableInfo::suggest_chunk_size`
            // for the bands. A 10 M-row export used to produce 100 files;
            // it now lands at ~10. Operators who want different geometry
            // override this line directly.
            lines.push(format!("    chunk_size: {chunk_size}"));
            lines.push("    chunk_checkpoint: true".to_string());
            if parallel > 1 {
                // Correction #5: show the predicted peak so the operator can
                // trade memory for speed without guessing. RSS scales with
                // worker count × row width (not chunk_size). See the sweep in
                // docs/bench/reports/REPORT_full_vs_parallel.md.
                if let Some(b) = info.avg_row_bytes() {
                    lines.push(format!(
                        "    # est. peak RSS ≈ {} MB ({} workers × ~{} MB/worker @ ~{} B/row); lower `parallel` to spend less memory",
                        estimate_peak_rss_mb(parallel, b),
                        parallel,
                        per_worker_rss_mb(b),
                        b,
                    ));
                }
                lines.push(format!("    parallel: {parallel}"));
            } else if source_type == "mysql" && info.avg_row_bytes().is_some_and(|b| b >= 1024) {
                // Wide MySQL: a single sequential scan beats parallel chunks
                // (the range-chunk contention regresses throughput). Left at
                // parallel: 1 on purpose — raise it only if you've measured a
                // gain on your hardware.
                lines.push(
                    "    # parallel: 1 (wide rows on MySQL: single scan is faster than chunks)"
                        .to_string(),
                );
            }
        }
        "incremental" => {
            let cursor = info.best_cursor_column().unwrap_or("updated_at");
            lines.push(format!(
                "    cursor_column: {}",
                yaml_quote_if_needed(cursor)
            ));
        }
        _ => {}
    }

    lines.push("    format: parquet".to_string());

    // Row-group auto-tuning: emit for chunked (always large) and full-mode
    // exports big enough for tuning to have measurable effect.
    if mode == "chunked" || info.row_estimate > 100_000 {
        lines.push("    parquet:".to_string());
        lines.push("      row_group_strategy: auto".to_string());
        lines.push(format!(
            "      target_row_group_mb: {}",
            suggest_row_group_mb(info)
        ));
    }

    lines.push("    meta_columns:".to_string());
    lines.push("      exported_at: true".to_string());
    lines.push("      row_hash: true".to_string());
    lines.extend(destination_scaffold(info, source_type, dest));

    // Emit `columns:` overrides for NUMERIC/DECIMAL columns so the export
    // doesn't fail with "precision/scale unavailable".
    let decimal_cols: Vec<&super::ColumnInfo> = info
        .columns
        .iter()
        .filter(|c| is_decimal_type(&c.data_type))
        .collect();
    if !decimal_cols.is_empty() {
        lines.push("    columns:".to_string());
        for col in decimal_cols {
            // The column name is a YAML mapping key here; quote it when a plain
            // scalar would be mis-read. Ordinary names are emitted unchanged.
            let key = yaml_quote_if_needed(&col.name);
            match (col.numeric_precision, col.numeric_scale) {
                (Some(p), Some(s)) => {
                    lines.push(format!("      {key}: decimal({p},{s})"));
                }
                _ => {
                    lines.push(init_default_decimal_yaml_line(&key));
                }
            }
        }
    }

    lines
}

/// Build the `destination:` block for one export.
///
/// A configured GCS bucket wins over an S3 bucket; with neither, a local
/// `./output` destination is emitted. Object-store variants carry the quoted
/// bucket and the per-table prefix, plus `credentials_file` (GCS) or `region`
/// (S3) when those are set in `dest`.
fn destination_scaffold(
    info: &TableInfo,
    source_type: &str,
    dest: &InitYamlDestination,
) -> Vec<String> {
    let prefix = yaml_quote_if_needed(&table_export_prefix(info, source_type));

    // The first four lines are shared by both object-store variants.
    let object_store = |type_line: &str, bucket: &str| -> Vec<String> {
        vec![
            "    destination:".to_string(),
            type_line.to_string(),
            format!("      bucket: {}", yaml_quote_if_needed(bucket)),
            format!("      prefix: {prefix}"),
        ]
    };

    if let Some(bucket) = dest.gcs_bucket.as_ref() {
        let mut out = object_store("      type: gcs", bucket);
        if let Some(path) = dest.gcs_credentials_file.as_ref() {
            out.push(format!(
                "      credentials_file: {}",
                yaml_quote_if_needed(path)
            ));
        }
        return out;
    }

    if let Some(bucket) = dest.s3_bucket.as_ref() {
        let mut out = object_store("      type: s3", bucket);
        if let Some(region) = dest.s3_region.as_ref() {
            out.push(format!("      region: {}", yaml_quote_if_needed(region)));
        }
        return out;
    }

    vec![
        "    destination:".to_string(),
        "      type: local".to_string(),
        "      path: ./output".to_string(),
    ]
}

/// Pick `target_row_group_mb` from the table's column shape.
///
/// Tables with five or more text/JSON/binary columns get 64 MB groups so
/// downstream readers can push predicates without decoding the full group;
/// everything else gets 128 MB. Type names are compared ignoring ASCII case.
fn suggest_row_group_mb(info: &TableInfo) -> u64 {
    const WIDE_TYPES: [&str; 6] = [
        "text",
        "varchar",
        "character varying",
        "jsonb",
        "json",
        "bytea",
    ];
    let wide_count = info
        .columns
        .iter()
        .filter(|col| {
            WIDE_TYPES
                .iter()
                .any(|ty| col.data_type.eq_ignore_ascii_case(ty))
        })
        .count();
    if wide_count < 5 { 128 } else { 64 }
}

/// Suggested worker count — cost- *and* engine-aware (see
/// `docs/bench/reports/REPORT_full_vs_parallel.md`).
///
/// Decision order:
///   1. **Wide rows on MySQL** (average row ≥ 1024 bytes): always 1. The
///      single sequential scan is already fast there and splitting it was
///      measured to regress.
///   2. **Everything else**: tier by row estimate — below 500 000 rows → 1,
///      below 5 000 000 → 2, otherwise 4.
///   3. When the average row size is known, the tier is capped so the
///      predicted peak RSS stays within `DEFAULT_MEM_BUDGET_MB`.
///
/// `avg_row_bytes == None` (unknown size) skips steps 1 and 3 and returns the
/// row-count tier as is.
fn suggest_parallel(rows: i64, avg_row_bytes: Option<i64>, source_type: &str) -> usize {
    /// At/above this width a row is "wide" for the contention trade-off.
    const WIDE_BYTES: i64 = 1024;

    let is_wide = matches!(avg_row_bytes, Some(b) if b >= WIDE_BYTES);
    if is_wide && source_type == "mysql" {
        return 1;
    }

    let by_rows = if rows < 500_000 {
        1
    } else if rows < 5_000_000 {
        2
    } else {
        4
    };

    // Correction #5: never suggest a worker count whose predicted peak RSS
    // breaches the memory budget. Without a width there is nothing to cap on.
    avg_row_bytes.map_or(by_rows, |b| {
        memory_capped_parallel(by_rows, b, DEFAULT_MEM_BUDGET_MB)
    })
}

/// Default memory budget (MB) the scaffold sizes worker count against;
/// `suggest_parallel` passes it to `memory_capped_parallel`. Generous on
/// purpose — a guard against pathological widths, not a tuner. Operators on
/// constrained boxes lower `parallel` directly (the emitted comment shows the
/// predicted peak).
const DEFAULT_MEM_BUDGET_MB: u64 = 2048;

/// Per-worker peak RSS (MB) under the default *adaptive* batching, fitted to the
/// sweep in `docs/bench/reports/REPORT_full_vs_parallel.md`.
///
/// The model is linear in row width: `18 + width_bytes × 87 / 4096`, clamped
/// to `18..=130` MB. It yields 18 MB at ~40 B/row (narrow) and 105 MB at
/// 4 KB/row (wide); the 130 MB ceiling is ≈ 2× the 64 MB adaptive batch
/// target. The driver is **row width × in-flight batch, not chunk_size**
/// (chunk_size only sets file count). An explicit large `tuning.batch_size`
/// overrides adaptive batching and raises this beyond the model.
///
/// Negative widths are treated as 0. The arithmetic saturates, so an absurdly
/// large width lands on the ceiling instead of overflowing.
fn per_worker_rss_mb(avg_row_bytes: i64) -> u64 {
    const FLOOR_MB: u64 = 18;
    const CEIL_MB: u64 = 130;
    let b = avg_row_bytes.max(0) as u64;
    let width_mb = b.saturating_mul(87) / 4096;
    FLOOR_MB
        .saturating_add(width_mb)
        .clamp(FLOOR_MB, CEIL_MB)
}

/// Predicted peak process RSS (MB) for a chunked export with `parallel` workers:
/// a fixed 16 MB process base plus `parallel × per_worker_rss_mb(width)`.
///
/// Linear in `parallel`; slightly *over*-estimates past ~4 workers (allocator
/// reuse) — the safe direction for a budget. Validated against the sweep (par 4
/// wide: est 436 vs measured 444 MB; par 8 narrow: est 166 vs 169 MB).
pub(crate) fn estimate_peak_rss_mb(parallel: usize, avg_row_bytes: i64) -> u64 {
    const PROCESS_BASE_MB: u64 = 16;
    let workers = parallel as u64;
    let all_workers_mb = workers * per_worker_rss_mb(avg_row_bytes);
    PROCESS_BASE_MB + all_workers_mb
}

/// Cap `suggested` at the largest worker count whose predicted peak RSS stays
/// within `budget_mb`. The result is never below 1, even when the budget is
/// smaller than the fixed process base.
fn memory_capped_parallel(suggested: usize, avg_row_bytes: i64, budget_mb: u64) -> usize {
    const PROCESS_BASE_MB: u64 = 16;
    // `.max(1)` keeps the division below well-defined.
    let worker_mb = per_worker_rss_mb(avg_row_bytes).max(1);
    let headroom_mb = budget_mb.saturating_sub(PROCESS_BASE_MB);
    let affordable = std::cmp::max(headroom_mb / worker_mb, 1) as usize;
    std::cmp::min(suggested, affordable)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::init::{ColumnInfo, TableInfo};

    /// Cost+engine-aware parallelism (REPORT_full_vs_parallel.md): the wide
    /// MySQL case is the only one forced single-threaded; narrow scales on
    /// every engine, and wide PG/MSSQL still scale.
    #[test]
    fn suggest_parallel_is_cost_and_engine_aware() {
        let big = 10_000_000;
        let wide = Some(4096);
        let narrow = Some(40);
        // Wide MySQL → single scan beats chunks, regardless of row count.
        assert_eq!(suggest_parallel(big, wide, "mysql"), 1);
        assert_eq!(suggest_parallel(500_000, wide, "mysql"), 1);
        // Narrow rows → headroom on every engine → scale with rows.
        assert_eq!(suggest_parallel(big, narrow, "mysql"), 4);
        assert_eq!(suggest_parallel(big, narrow, "postgres"), 4);
        // Wide PG / MSSQL → slow single scan (MSSQL full even times out) →
        // still parallelise.
        assert_eq!(suggest_parallel(big, wide, "postgres"), 4);
        assert_eq!(suggest_parallel(big, wide, "mssql"), 4);
        // Unknown size → row-count tiers (no width signal to act on).
        assert_eq!(suggest_parallel(big, None, "mysql"), 4);
        assert_eq!(suggest_parallel(100_000, None, "postgres"), 1);
    }

    /// Correction #5: the peak-RSS estimate tracks the measured sweep
    /// (REPORT_full_vs_parallel.md) within the 12% tolerance asserted below,
    /// and over-estimates past 4 workers (the safe direction). Anchors: par4
    /// wide ≈444 MB, par8 narrow ≈169 MB, par1 narrow ≈34 MB.
    #[test]
    fn peak_rss_estimate_matches_sweep() {
        let wide = 4096;
        let narrow = 40;
        // Relative-error check: |est - measured| / measured must be ≤ 12%.
        let near = |est: u64, measured: u64| {
            let d = est.abs_diff(measured);
            assert!(
                d * 100 / measured <= 12,
                "estimate {est} too far from measured {measured}"
            );
        };
        near(estimate_peak_rss_mb(4, wide), 444);
        near(estimate_peak_rss_mb(1, narrow), 34);
        near(estimate_peak_rss_mb(4, narrow), 92);
        near(estimate_peak_rss_mb(8, narrow), 169);
        // Per-worker is width-driven and clamps to the adaptive-batch ceiling.
        assert_eq!(per_worker_rss_mb(40), 18);
        assert!(per_worker_rss_mb(65_536) <= 130, "must clamp to ceiling");
    }

    /// The memory cap reduces workers only when the predicted peak would breach
    /// the budget; it is inert at a generous budget and never returns 0.
    #[test]
    fn memory_cap_binds_only_under_budget() {
        // Wide rows, tiny 256 MB budget → (256-16)/105 ≈ 2 workers.
        assert_eq!(memory_capped_parallel(4, 4096, 256), 2);
        // Same suggestion, generous budget → untouched.
        assert_eq!(memory_capped_parallel(4, 4096, 2048), 4);
        // Never below 1 even with an absurd budget.
        assert_eq!(memory_capped_parallel(4, 65_536, 1), 1);
    }

    /// Fixture: `public.payments` with a 100-row estimate, unknown total size,
    /// and the given columns.
    fn make_table(cols: Vec<ColumnInfo>) -> TableInfo {
        TableInfo {
            schema: "public".into(),
            table: "payments".into(),
            row_estimate: 100,
            total_bytes: None,
            columns: cols,
        }
    }

    /// Fixture: a nullable, non-primary-key column of type `ty` with no
    /// numeric precision/scale.
    fn col(name: &str, ty: &str) -> ColumnInfo {
        ColumnInfo {
            name: name.into(),
            data_type: ty.into(),
            is_primary_key: false,
            is_nullable: true,
            numeric_precision: None,
            numeric_scale: None,
        }
    }

    /// Fixture: a `numeric` column with explicit precision `p` and scale `s`.
    fn decimal_col(name: &str, p: u32, s: u32) -> ColumnInfo {
        ColumnInfo {
            numeric_precision: Some(p),
            numeric_scale: Some(s),
            ..col(name, "numeric")
        }
    }

    /// Fixture: a `numeric` column without precision/scale.
    fn unbounded_col(name: &str) -> ColumnInfo {
        col(name, "numeric")
    }

    /// A decimal column with known precision/scale gets a `columns:` override
    /// carrying exactly that `decimal(p,s)`.
    #[test]
    fn decimal_with_precision_emits_override() {
        let info = make_table(vec![col("id", "bigint"), decimal_col("amount", 18, 2)]);
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(yaml.contains("    columns:"), "columns block missing");
        assert!(
            yaml.contains("      amount: decimal(18,2)"),
            "decimal override missing:\n{yaml}"
        );
        assert!(
            !yaml.contains("id:"),
            "non-decimal column must not appear in columns block"
        );
    }

    /// A decimal column without precision/scale gets the default
    /// `decimal(38,18)` with a `# REVIEW:` comment, and the header gains a NOTE.
    #[test]
    fn unbounded_decimal_emits_default_with_review_marker() {
        let info = make_table(vec![col("id", "bigint"), unbounded_col("price")]);
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            yaml.contains("# NOTE: some exports use default decimal"),
            "header NOTE missing:\n{yaml}"
        );
        assert!(
            yaml.contains("      price: decimal(38,18)  # REVIEW:"),
            "default decimal with REVIEW missing:\n{yaml}"
        );
    }

    /// Without any decimal column no `columns:` block is emitted.
    #[test]
    fn no_decimal_columns_no_columns_block() {
        let info = make_table(vec![col("id", "bigint"), col("label", "text")]);
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            !yaml.contains("    columns:"),
            "columns block must not appear:\n{yaml}"
        );
    }

    /// Fixture: `public.events` with the given row estimate, unknown total
    /// size, and the given columns.
    fn chunked_table(rows: i64, cols: Vec<ColumnInfo>) -> TableInfo {
        TableInfo {
            schema: "public".into(),
            table: "events".into(),
            row_estimate: rows,
            total_bytes: None,
            columns: cols,
        }
    }

    /// A 2 M-row table with a single text column gets the `parquet:` block with
    /// `row_group_strategy: auto` and the 128 MB target.
    #[test]
    fn chunked_large_table_emits_parquet_block() {
        let info = chunked_table(
            2_000_000,
            vec![
                col("id", "bigint"),
                col("name", "text"),
                col("ts", "timestamptz"),
            ],
        );
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            yaml.contains("    parquet:"),
            "parquet block must be emitted for chunked large table:\n{yaml}"
        );
        assert!(
            yaml.contains("      row_group_strategy: auto"),
            "auto strategy must be present:\n{yaml}"
        );
        assert!(
            yaml.contains("      target_row_group_mb: 128"),
            "128 MB target expected for narrow table:\n{yaml}"
        );
    }

    /// Five text/JSON columns are enough to drop the row-group target to 64 MB.
    #[test]
    fn wide_table_suggests_smaller_row_group_mb() {
        let info = chunked_table(
            5_000_000,
            vec![
                col("id", "bigint"),
                col("body", "text"),
                col("raw_html", "text"),
                col("metadata", "jsonb"),
                col("extra", "json"),
                col("notes", "text"),
            ],
        );
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            yaml.contains("      target_row_group_mb: 64"),
            "wide table (≥5 text/json cols) must suggest 64 MB:\n{yaml}"
        );
    }

    /// A 100-row table gets no `parquet:` tuning block.
    #[test]
    fn small_full_mode_table_has_no_parquet_block() {
        let info = make_table(vec![col("id", "bigint"), col("label", "text")]);
        // row_estimate = 100 (from make_table), mode = full
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            !yaml.contains("    parquet:"),
            "small full-mode table must not emit parquet block:\n{yaml}"
        );
    }

    // ── `table:` shortcut emission ───────────────────────────────────────────

    /// Full-mode PostgreSQL exports use the `table:` shortcut instead of an
    /// enumerated `SELECT`.
    #[test]
    fn full_mode_pg_emits_table_shortcut_not_select_query() {
        let info = make_table(vec![col("id", "bigint"), col("name", "text")]);
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            yaml.contains("    table: payments"),
            "full-mode PG export should emit `table:` shortcut:\n{yaml}"
        );
        assert!(
            !yaml.contains("    query: >"),
            "explicit SELECT block should be replaced by `table:`:\n{yaml}"
        );
        assert!(
            !yaml.contains("SELECT id"),
            "no enumerated SELECT for the table-shortcut form:\n{yaml}"
        );
    }

    /// A non-`public` schema is kept in the `table:` shortcut value.
    #[test]
    fn full_mode_pg_non_public_schema_emits_qualified_table() {
        let info = TableInfo {
            schema: "billing".into(),
            table: "invoices".into(),
            row_estimate: 100,
            total_bytes: None,
            columns: vec![col("id", "bigint")],
        };
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            yaml.contains("    table: billing.invoices"),
            "qualified table name expected for non-public schema:\n{yaml}"
        );
    }

    /// MySQL sources keep the explicit `query:` form even in full mode.
    #[test]
    fn full_mode_mysql_keeps_select_query_form() {
        // MySQL does not benefit from the PG catalog-hint trick and the
        // `table:` shortcut is documented as PG-first; emit explicit SELECT.
        let info = make_table(vec![col("id", "bigint"), col("name", "text")]);
        let dest = InitYamlDestination::default();
        let yaml = generate_config(&info, "mysql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            yaml.contains("    query: >"),
            "MySQL full-mode should keep the explicit SELECT form:\n{yaml}"
        );
        assert!(
            !yaml.contains("    table: "),
            "no `table:` shortcut for MySQL:\n{yaml}"
        );
    }

    /// A 2 M-row PostgreSQL table keeps the explicit `query:` form and never
    /// gets the `table:` shortcut.
    #[test]
    fn chunked_mode_keeps_select_query_form() {
        // chunked is curated; explicit SELECT remains.
        let info = chunked_table(2_000_000, vec![col("id", "bigint"), col("name", "text")]);
        let dest = InitYamlDestination::default();
        let yaml =
            generate_config(&info, "postgresql://rivet:rivet@localhost/rivet", &dest).unwrap();
        assert!(
            yaml.contains("    query: >"),
            "chunked mode should preserve explicit SELECT:\n{yaml}"
        );
        assert!(
            !yaml.contains("    table: "),
            "no `table:` shortcut for chunked mode:\n{yaml}"
        );
    }
}