rivet-cli 0.7.7

//! YAML scaffold generation for `rivet init`.

use crate::error::Result;

use super::{InitYamlDestination, TableInfo};

/// Break a table reference into `(schema, table)`.
///
/// The split happens at the first `.` only, so `"a.b.c"` yields `("a", "b.c")`.
/// A reference without a dot is assigned the schema `"public"`.
pub(super) fn parse_table(table: &str) -> (String, &str) {
    if let Some((schema, name)) = table.split_once('.') {
        return (schema.to_owned(), name);
    }
    (String::from("public"), table)
}

/// Render the approximate row count shown in the generated file's title comment.
///
/// Counts from one million up are shown as `~N.NM rows`, counts from one
/// thousand up as `~NK rows`, and smaller counts verbatim.
fn row_estimate_note(rows: i64) -> String {
    if rows < 0 {
        // NOTE(review): how `row_estimate` is populated is not visible here.
        // PostgreSQL reports `reltuples = -1` for a never-analyzed table, so a
        // negative value is treated as "unknown" instead of printing "~-1 rows".
        "row count unknown".to_string()
    } else if rows >= 1_000_000 {
        // `>=` so that exactly one million reads "~1.0M rows", not "~1000K rows".
        format!("~{:.1}M rows", rows as f64 / 1_000_000.0)
    } else if rows >= 1_000 {
        format!("~{:.0}K rows", rows as f64 / 1_000.0)
    } else {
        format!("~{rows} rows")
    }
}

/// Build the YAML scaffold for a single table.
///
/// The output is the shared header (title comment, review hint, `source:`
/// section), followed by `exports:` and one export block for `info`. It always
/// ends with a newline.
///
/// # Errors
///
/// Propagates the error returned by `super::source_type` for `source_url`.
pub(super) fn generate_config(
    info: &TableInfo,
    source_url: &str,
    dest: &InitYamlDestination,
) -> Result<String> {
    let st = super::source_type(source_url)?;
    // The schema prefix is dropped for `public` and for MySQL sources.
    let qualified_table = if info.schema == "public" || st == "mysql" {
        info.table.clone()
    } else {
        format!("{}.{}", info.schema, info.table)
    };
    let row_note = row_estimate_note(info.row_estimate);
    let header = format!("# Generated by rivet init — {qualified_table} ({row_note})");

    let unbounded = table_has_unbounded_decimal_columns(info);
    let mut lines = config_header_lines(st, &header, unbounded);
    lines.push("exports:".to_string());
    lines.extend(export_block_lines(info, st, dest));
    Ok(lines.join("\n") + "\n")
}

/// Build one YAML scaffold containing an export block for every entry of `infos`.
///
/// `scope_label` is placed in the title comment. The hint line above
/// `exports:` depends on whether a GCS or S3 bucket was supplied in `dest`.
/// The output always ends with a newline.
///
/// # Errors
///
/// Propagates the error returned by `super::source_type` for `source_url`.
pub(super) fn generate_schema_config(
    infos: &[TableInfo],
    source_url: &str,
    scope_label: &str,
    dest: &InitYamlDestination,
) -> Result<String> {
    let st = super::source_type(source_url)?;
    let any_unbounded = infos.iter().any(table_has_unbounded_decimal_columns);
    let title = format!("# Generated by rivet init — {scope_label}");
    let targets_bucket = dest.gcs_bucket.is_some() || dest.s3_bucket.is_some();

    let mut out = config_header_lines(st, &title, any_unbounded);
    if targets_bucket {
        out.push(
            "# One export per table/view — per-table prefix `exports/<name>/` under the given bucket; review modes before running."
                .to_string(),
        );
    } else {
        out.push(
            "# One export per table/view — review modes and destinations before running."
                .to_string(),
        );
    }
    out.push("exports:".to_string());
    out.extend(
        infos
            .iter()
            .flat_map(|info| export_block_lines(info, st, dest)),
    );

    let mut yaml = out.join("\n");
    yaml.push('\n');
    Ok(yaml)
}

/// Produce the lines that open every generated config.
///
/// In order: `title_line`, the "review before running" hint, an optional NOTE
/// about defaulted decimals (when `unbounded_decimal_note` is set), a blank
/// line, the `source:` section for `source_type`, and a trailing blank line.
fn config_header_lines(
    source_type: &str,
    title_line: &str,
    unbounded_decimal_note: bool,
) -> Vec<String> {
    // Eight lines at most: seven fixed ones plus the optional NOTE.
    let mut out: Vec<String> = Vec::with_capacity(8);
    out.push(title_line.to_owned());
    out.push("# Review and adjust before running: rivet check --config <this-file>".to_owned());
    if unbounded_decimal_note {
        out.push(
            "# NOTE: some exports use default decimal(p,s) for NUMERIC without precision in the DDL — search for \"# REVIEW:\" under columns:."
                .to_owned(),
        );
    }
    out.push(String::new());
    out.push("source:".to_owned());
    out.push(format!("  type: {source_type}"));
    out.push("  url_env: DATABASE_URL  # export DATABASE_URL='<your-url>'".to_owned());
    out.push(String::new());
    out
}

/// Object-store prefix for one table's export.
///
/// Yields `exports/<schema>__<table>/` for a PostgreSQL table outside the
/// `public` schema, and `exports/<table>/` in every other case.
fn table_export_prefix(info: &TableInfo, source_type: &str) -> String {
    let needs_schema = source_type == "postgres" && info.schema != "public";
    if needs_schema {
        format!("exports/{}__{}/", info.schema, info.table)
    } else {
        format!("exports/{}/", info.table)
    }
}

/// Render `v` as a YAML scalar that will be read back as the same string.
///
/// Values that are safe as a plain scalar are returned untouched. Anything
/// else (for example `1`, `true`, text containing `: ` or ` #`, a leading `-`,
/// surrounding whitespace) comes back as a double-quoted scalar with escapes.
pub(super) fn yaml_quote_if_needed(v: &str) -> String {
    if !needs_yaml_quoting(v) {
        return v.to_owned();
    }
    yaml_double_quote(v)
}

/// Decide whether `v` has to be written as a quoted YAML scalar.
///
/// Returns `true` when a plain (unquoted) rendering would be parsed as
/// something other than the string `v`:
///
/// * empty input, or leading/trailing whitespace;
/// * boolean/null keywords (`true`, `yes`, `on`, `null`, `~`, ... in any case);
/// * anything numeric: whatever Rust parses as `i64`/`f64`, plus the YAML
///   core-schema forms Rust rejects (`.inf`, `.nan`, `0x..`, `0o..`);
/// * a leading YAML indicator character;
/// * control characters, flow indicators, quotes or backslashes anywhere;
/// * ` #` (comment start), `: ` or a trailing `:` (mapping separator).
///
/// The checks err on the side of quoting: a needless pair of quotes is
/// harmless, whereas a missing one changes the loaded value.
fn needs_yaml_quoting(v: &str) -> bool {
    // Empty, or whitespace at either end that a parser would strip.
    if v.is_empty() || v.trim() != v {
        return true;
    }

    // YAML 1.1/1.2 reserved scalars that must not look like a bare string.
    let lower = v.to_ascii_lowercase();
    if matches!(
        lower.as_str(),
        "true" | "false" | "yes" | "no" | "on" | "off" | "null" | "~"
    ) {
        return true;
    }

    // Anything that parses as a number would be loaded as int/float, not string.
    if v.parse::<i64>().is_ok() || v.parse::<f64>().is_ok() {
        return true;
    }

    // YAML core-schema numerics that Rust's own parsers do not accept:
    // signed/unsigned `.inf`, `.nan`, and `0x` / `0o` integer literals.
    let unsigned = lower
        .strip_prefix(|c: char| c == '+' || c == '-')
        .unwrap_or(lower.as_str());
    if matches!(unsigned, ".inf" | ".nan") {
        return true;
    }
    let is_radix_literal = |prefix: &str, radix: u32| {
        lower
            .strip_prefix(prefix)
            .is_some_and(|digits| !digits.is_empty() && digits.chars().all(|d| d.is_digit(radix)))
    };
    if is_radix_literal("0x", 16) || is_radix_literal("0o", 8) {
        return true;
    }

    // YAML indicators that are unsafe at the start of a plain scalar.
    let starts_with_indicator = v.starts_with(|c: char| {
        matches!(
            c,
            '!' | '&'
                | '*'
                | '@'
                | '`'
                | '|'
                | '>'
                | '%'
                | '?'
                | ':'
                | '-'
                | '['
                | ']'
                | '{'
                | '}'
                | ','
                | '#'
                | '\''
                | '"'
        )
    });
    if starts_with_indicator {
        return true;
    }

    // Anywhere in the string: control chars (tab included), flow indicators,
    // quotes/backslashes, a ` #` comment starter, or a `: ` / trailing `:`.
    let mut prev: Option<char> = None;
    let mut rest = v.chars().peekable();
    while let Some(c) = rest.next() {
        if c.is_control() {
            return true;
        }
        if matches!(c, '[' | ']' | '{' | '}' | ',' | '"' | '\'' | '\\') {
            return true;
        }
        if c == '#' && prev.is_some_and(char::is_whitespace) {
            return true;
        }
        // ": " or ":" at end-of-value both terminate the scalar in plain style.
        if c == ':' && rest.peek().map_or(true, |next| next.is_whitespace()) {
            return true;
        }
        prev = Some(c);
    }
    false
}

/// Wrap `v` in a YAML double-quoted scalar, escaping what cannot appear raw.
///
/// Backslash and `"` are backslash-escaped; newline, carriage return and tab
/// use their short escapes; every other control character is written as
/// `\xNN`. That covers C0 (U+0000–U+001F), DEL (U+007F) and C1
/// (U+0080–U+009F), all of which fit in two hex digits. All remaining
/// characters are copied unchanged.
fn yaml_double_quote(v: &str) -> String {
    let mut quoted = String::with_capacity(v.len() + 2);
    quoted.push('"');
    for c in v.chars() {
        match c {
            '\\' => quoted.push_str("\\\\"),
            '"' => quoted.push_str("\\\""),
            '\n' => quoted.push_str("\\n"),
            '\r' => quoted.push_str("\\r"),
            '\t' => quoted.push_str("\\t"),
            // `is_control` also matches the C1 block (incl. NEL, U+0085), which
            // YAML does not allow unescaped inside a quoted scalar.
            c if c.is_control() => {
                quoted.push_str(&format!("\\x{:02X}", c as u32));
            }
            c => quoted.push(c),
        }
    }
    quoted.push('"');
    quoted
}

/// Whether `data_type` names the NUMERIC / DECIMAL family, i.e. a type that
/// Rivet cannot safely auto-map without explicit precision/scale.
///
/// The comparison ignores ASCII case and requires an exact match, so
/// `"Numeric"` qualifies while `"numeric(10,2)"` does not.
fn is_decimal_type(data_type: &str) -> bool {
    ["numeric", "decimal"]
        .iter()
        .any(|family| data_type.eq_ignore_ascii_case(family))
}

/// Mirror of the validation in [`crate::config::models::validate_table_shortcut_ident`]:
/// accepts `<name>` or `<schema>.<name>` with ASCII-only identifier characters.
///
/// Each dot-separated part must begin with an ASCII letter or `_` and continue
/// with ASCII letters, digits or `_`. Empty parts and more than two parts are
/// rejected.
fn is_simple_pg_ident(s: &str) -> bool {
    fn is_ident(part: &str) -> bool {
        let mut rest = part.chars();
        let starts_ok = rest
            .next()
            .is_some_and(|c| c.is_ascii_alphabetic() || c == '_');
        starts_ok && rest.all(|c| c.is_ascii_alphanumeric() || c == '_')
    }

    // Pull at most three segments: a third one means the name is over-qualified.
    let mut segments = s.split('.');
    match (segments.next(), segments.next(), segments.next()) {
        (Some(name), None, _) => is_ident(name),
        (Some(schema), Some(name), None) => is_ident(schema) && is_ident(name),
        _ => false,
    }
}

/// Reports whether any `DECIMAL`/`NUMERIC` column of `info` lacks precision or
/// scale in the introspected metadata (a `numeric` declared without `(p,s)`).
pub(super) fn table_has_unbounded_decimal_columns(info: &TableInfo) -> bool {
    info.columns.iter().any(|column| {
        // Bounded means precision AND scale are both known.
        let bounded = column.numeric_precision.is_some() && column.numeric_scale.is_some();
        is_decimal_type(&column.data_type) && !bounded
    })
}

/// Default `decimal(p,s)` when DDL has no `(precision, scale)` for `numeric`/`decimal`.
/// Fits Arrow Decimal128 (precision ≤ 38). Replace in YAML (or constrain the DDL) when domain rules differ.
///
/// This is the `p` (total digits) half of the default.
const INIT_UNBOUNDED_DECIMAL_DEFAULT_PRECISION: u32 = 38;
/// The `s` (digits after the decimal point) half of the default `decimal(p,s)`
/// emitted alongside [`INIT_UNBOUNDED_DECIMAL_DEFAULT_PRECISION`].
const INIT_UNBOUNDED_DECIMAL_DEFAULT_SCALE: u32 = 18;

/// Present on generated `columns:` lines that use the default above — `rivet init` reminds on stderr.
pub(crate) const INIT_DECIMAL_REVIEW_MARKER: &str = "# REVIEW:";

/// Build the `columns:` entry for a decimal column whose DDL carries no
/// precision/scale.
///
/// `col_name` is inserted verbatim as the mapping key. The value is the default
/// `decimal(p,s)` followed by a trailing `# REVIEW:` comment asking the user
/// to confirm or replace it.
fn init_default_decimal_yaml_line(col_name: &str) -> String {
    let precision = INIT_UNBOUNDED_DECIMAL_DEFAULT_PRECISION;
    let scale = INIT_UNBOUNDED_DECIMAL_DEFAULT_SCALE;
    format!(
        "      {col_name}: decimal({precision},{scale})  # REVIEW: DDL has no numeric(p,s); edit to the real decimal(p,s) or change the column type — values outside this bound may truncate or fail export."
    )
}

/// Produce the YAML lines of one `exports:` list entry for `info`.
///
/// The entry contains, in order: `name`, either the `table:` shortcut or an
/// explicit `query:`, `mode` with its mode-specific keys, `format`, an optional
/// `parquet:` tuning block, `meta_columns`, the `destination` scaffold and,
/// when the table has NUMERIC/DECIMAL columns, a `columns:` override map.
///
/// Every user-derived scalar written as a YAML key or value goes through
/// [`yaml_quote_if_needed`]. Identifiers inside the SQL text of `query:` are
/// written as-is.
fn export_block_lines(
    info: &TableInfo,
    source_type: &str,
    dest: &InitYamlDestination,
) -> Vec<String> {
    let mode = info.suggest_mode();
    let col_list = info
        .columns
        .iter()
        .map(|c| c.name.as_str())
        .collect::<Vec<_>>()
        .join(", ");
    // The schema prefix is dropped for `public` and for MySQL sources.
    let qualified_table = if info.schema == "public" || source_type == "mysql" {
        info.table.clone()
    } else {
        format!("{}.{}", info.schema, info.table)
    };

    // For `mode: full` on a plain table, emit the `table:` shortcut: it produces
    // `SELECT * FROM <schema>.<table>` internally, which is the only form the PG
    // numeric-catalog-hint resolver recognises — so `numeric(p,s)` columns get
    // typed correctly even if the user later strips the `columns:` overrides.
    //
    // For `chunked` / `incremental` we keep the explicit `SELECT col1, ... FROM`
    // form: those modes usually start from a curated column set and benefit from
    // a self-documenting YAML.
    let mut lines = vec![format!("  - name: {}", yaml_quote_if_needed(&info.table))];
    if mode == "full" && source_type == "postgres" && is_simple_pg_ident(&qualified_table) {
        // A simple identifier can still collide with a YAML keyword (`null`,
        // `true`, `no`, ...); quoting keeps it a string. Ordinary names are
        // emitted unchanged.
        lines.push(format!(
            "    table: {}",
            yaml_quote_if_needed(&qualified_table)
        ));
    } else {
        lines.push("    query: >".to_string());
        lines.push(format!("      SELECT {col_list}"));
        lines.push(format!("      FROM {qualified_table}"));
    }
    lines.push(format!("    mode: {mode}"));

    match mode {
        "chunked" => {
            // Falls back to `id` when the table offers no chunk-column candidate.
            let chunk_col = info.best_chunk_column().unwrap_or("id");
            let parallel = suggest_parallel(info.row_estimate);
            lines.push(format!(
                "    chunk_column: {}",
                yaml_quote_if_needed(chunk_col)
            ));
            lines.push("    chunk_size: 100000".to_string());
            lines.push("    chunk_checkpoint: true".to_string());
            if parallel > 1 {
                lines.push(format!("    parallel: {parallel}"));
            }
        }
        "incremental" => {
            // Falls back to `updated_at` when no cursor candidate is found.
            let cursor = info.best_cursor_column().unwrap_or("updated_at");
            lines.push(format!(
                "    cursor_column: {}",
                yaml_quote_if_needed(cursor)
            ));
        }
        _ => {}
    }

    lines.push("    format: parquet".to_string());

    // Row-group auto-tuning: emit for chunked exports, and for any other mode
    // once the estimate exceeds 100k rows (large enough for tuning to matter).
    if mode == "chunked" || info.row_estimate > 100_000 {
        lines.push("    parquet:".to_string());
        lines.push("      row_group_strategy: auto".to_string());
        lines.push(format!(
            "      target_row_group_mb: {}",
            suggest_row_group_mb(info)
        ));
    }

    lines.push("    meta_columns:".to_string());
    lines.push("      exported_at: true".to_string());
    lines.push("      row_hash: true".to_string());
    lines.extend(destination_scaffold(info, source_type, dest));

    // Emit `columns:` overrides for NUMERIC/DECIMAL columns so the export
    // doesn't fail with "precision/scale unavailable".
    let mut decimal_cols = info
        .columns
        .iter()
        .filter(|c| is_decimal_type(&c.data_type))
        .peekable();
    if decimal_cols.peek().is_some() {
        lines.push("    columns:".to_string());
        for col in decimal_cols {
            // Column names become YAML mapping keys, so names like `true`, `1`
            // or `a: b` must be quoted to load back as the same string.
            let key = yaml_quote_if_needed(&col.name);
            match (col.numeric_precision, col.numeric_scale) {
                (Some(p), Some(s)) => {
                    lines.push(format!("      {key}: decimal({p},{s})"));
                }
                _ => {
                    lines.push(init_default_decimal_yaml_line(&key));
                }
            }
        }
    }

    lines
}

/// Produce the `destination:` section of one export block.
///
/// A GCS bucket in `dest` takes precedence over an S3 bucket. With neither
/// set, a local `./output` destination is emitted. Bucket destinations get the
/// per-table prefix, plus `credentials_file` (GCS) or `region` (S3) when the
/// corresponding option is present.
fn destination_scaffold(
    info: &TableInfo,
    source_type: &str,
    dest: &InitYamlDestination,
) -> Vec<String> {
    let prefix = yaml_quote_if_needed(&table_export_prefix(info, source_type));
    let mut out = vec!["    destination:".to_string()];

    if let Some(bucket) = &dest.gcs_bucket {
        out.push("      type: gcs".to_string());
        out.push(format!("      bucket: {}", yaml_quote_if_needed(bucket)));
        out.push(format!("      prefix: {prefix}"));
        if let Some(creds) = &dest.gcs_credentials_file {
            out.push(format!(
                "      credentials_file: {}",
                yaml_quote_if_needed(creds)
            ));
        }
        return out;
    }

    if let Some(bucket) = &dest.s3_bucket {
        out.push("      type: s3".to_string());
        out.push(format!("      bucket: {}", yaml_quote_if_needed(bucket)));
        out.push(format!("      prefix: {prefix}"));
        if let Some(region) = &dest.s3_region {
            out.push(format!("      region: {}", yaml_quote_if_needed(region)));
        }
        return out;
    }

    out.push("      type: local".to_string());
    out.push("      path: ./output".to_string());
    out
}

/// Pick a `target_row_group_mb` value from the table's column shape.
///
/// Tables with five or more text/JSON/binary columns get 64 MB groups so
/// downstream readers can push predicates without decoding the full group;
/// all other tables get 128 MB.
fn suggest_row_group_mb(info: &TableInfo) -> u64 {
    // Type names compared case-insensitively against the column's data type.
    const WIDE_TYPES: [&str; 6] = [
        "text",
        "varchar",
        "character varying",
        "jsonb",
        "json",
        "bytea",
    ];

    let wide_count = info
        .columns
        .iter()
        .filter(|col| {
            WIDE_TYPES
                .iter()
                .any(|ty| col.data_type.eq_ignore_ascii_case(ty))
        })
        .count();

    if wide_count < 5 { 128 } else { 64 }
}

/// Suggested `parallel` worker count for a chunked export of `rows` rows:
/// 1 below 500k rows, 2 below 5M rows, otherwise 4.
fn suggest_parallel(rows: i64) -> usize {
    if rows < 500_000 {
        1
    } else if rows < 5_000_000 {
        2
    } else {
        4
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::init::{ColumnInfo, TableInfo};

    /// Source URL passed to `generate_config` by the PostgreSQL-flavoured tests.
    const PG_URL: &str = "postgresql://rivet:rivet@localhost/rivet";
    /// Source URL passed to `generate_config` by the MySQL-flavoured test.
    const MYSQL_URL: &str = "mysql://rivet:rivet@localhost/rivet";

    /// Fixture builder shared by all table helpers below.
    fn table(schema: &str, name: &str, rows: i64, cols: Vec<ColumnInfo>) -> TableInfo {
        TableInfo {
            schema: schema.into(),
            table: name.into(),
            row_estimate: rows,
            total_bytes: None,
            columns: cols,
        }
    }

    /// Small `public.payments` table (100 rows).
    fn make_table(cols: Vec<ColumnInfo>) -> TableInfo {
        table("public", "payments", 100, cols)
    }

    /// `public.events` table with a caller-chosen row estimate.
    fn chunked_table(rows: i64, cols: Vec<ColumnInfo>) -> TableInfo {
        table("public", "events", rows, cols)
    }

    /// Nullable, non-key column without precision/scale metadata.
    fn col(name: &str, ty: &str) -> ColumnInfo {
        ColumnInfo {
            name: name.into(),
            data_type: ty.into(),
            is_primary_key: false,
            is_nullable: true,
            numeric_precision: None,
            numeric_scale: None,
        }
    }

    /// `numeric(p,s)` column.
    fn decimal_col(name: &str, p: u32, s: u32) -> ColumnInfo {
        let mut column = col(name, "numeric");
        column.numeric_precision = Some(p);
        column.numeric_scale = Some(s);
        column
    }

    /// `numeric` column declared without `(p,s)`.
    fn unbounded_col(name: &str) -> ColumnInfo {
        col(name, "numeric")
    }

    /// Generate the single-table config for `info` with the default destination.
    fn render(info: &TableInfo, url: &str) -> String {
        generate_config(info, url, &InitYamlDestination::default()).unwrap()
    }

    #[test]
    fn decimal_with_precision_emits_override() {
        let info = make_table(vec![col("id", "bigint"), decimal_col("amount", 18, 2)]);
        let yaml = render(&info, PG_URL);
        assert!(yaml.contains("    columns:"), "columns block missing");
        assert!(
            yaml.contains("      amount: decimal(18,2)"),
            "decimal override missing:\n{yaml}"
        );
        assert!(
            !yaml.contains("id:"),
            "non-decimal column must not appear in columns block"
        );
    }

    #[test]
    fn unbounded_decimal_emits_default_with_review_marker() {
        let info = make_table(vec![col("id", "bigint"), unbounded_col("price")]);
        let yaml = render(&info, PG_URL);
        assert!(
            yaml.contains("# NOTE: some exports use default decimal"),
            "header NOTE missing:\n{yaml}"
        );
        assert!(
            yaml.contains("      price: decimal(38,18)  # REVIEW:"),
            "default decimal with REVIEW missing:\n{yaml}"
        );
    }

    #[test]
    fn no_decimal_columns_no_columns_block() {
        let info = make_table(vec![col("id", "bigint"), col("label", "text")]);
        let yaml = render(&info, PG_URL);
        assert!(
            !yaml.contains("    columns:"),
            "columns block must not appear:\n{yaml}"
        );
    }

    #[test]
    fn chunked_large_table_emits_parquet_block() {
        let narrow = vec![
            col("id", "bigint"),
            col("name", "text"),
            col("ts", "timestamptz"),
        ];
        let yaml = render(&chunked_table(2_000_000, narrow), PG_URL);
        assert!(
            yaml.contains("    parquet:"),
            "parquet block must be emitted for chunked large table:\n{yaml}"
        );
        assert!(
            yaml.contains("      row_group_strategy: auto"),
            "auto strategy must be present:\n{yaml}"
        );
        assert!(
            yaml.contains("      target_row_group_mb: 128"),
            "128 MB target expected for narrow table:\n{yaml}"
        );
    }

    #[test]
    fn wide_table_suggests_smaller_row_group_mb() {
        let wide = vec![
            col("id", "bigint"),
            col("body", "text"),
            col("raw_html", "text"),
            col("metadata", "jsonb"),
            col("extra", "json"),
            col("notes", "text"),
        ];
        let yaml = render(&chunked_table(5_000_000, wide), PG_URL);
        assert!(
            yaml.contains("      target_row_group_mb: 64"),
            "wide table (≥5 text/json cols) must suggest 64 MB:\n{yaml}"
        );
    }

    #[test]
    fn small_full_mode_table_has_no_parquet_block() {
        // row_estimate = 100 (from make_table), mode = full
        let info = make_table(vec![col("id", "bigint"), col("label", "text")]);
        let yaml = render(&info, PG_URL);
        assert!(
            !yaml.contains("    parquet:"),
            "small full-mode table must not emit parquet block:\n{yaml}"
        );
    }

    // ── `table:` shortcut emission ───────────────────────────────────────────

    #[test]
    fn full_mode_pg_emits_table_shortcut_not_select_query() {
        let info = make_table(vec![col("id", "bigint"), col("name", "text")]);
        let yaml = render(&info, PG_URL);
        assert!(
            yaml.contains("    table: payments"),
            "full-mode PG export should emit `table:` shortcut:\n{yaml}"
        );
        assert!(
            !yaml.contains("    query: >"),
            "explicit SELECT block should be replaced by `table:`:\n{yaml}"
        );
        assert!(
            !yaml.contains("SELECT id"),
            "no enumerated SELECT for the table-shortcut form:\n{yaml}"
        );
    }

    #[test]
    fn full_mode_pg_non_public_schema_emits_qualified_table() {
        let info = table("billing", "invoices", 100, vec![col("id", "bigint")]);
        let yaml = render(&info, PG_URL);
        assert!(
            yaml.contains("    table: billing.invoices"),
            "qualified table name expected for non-public schema:\n{yaml}"
        );
    }

    #[test]
    fn full_mode_mysql_keeps_select_query_form() {
        // MySQL does not benefit from the PG catalog-hint trick and the
        // `table:` shortcut is documented as PG-first; emit explicit SELECT.
        let info = make_table(vec![col("id", "bigint"), col("name", "text")]);
        let yaml = render(&info, MYSQL_URL);
        assert!(
            yaml.contains("    query: >"),
            "MySQL full-mode should keep the explicit SELECT form:\n{yaml}"
        );
        assert!(
            !yaml.contains("    table: "),
            "no `table:` shortcut for MySQL:\n{yaml}"
        );
    }

    #[test]
    fn chunked_mode_keeps_select_query_form() {
        // chunked is curated; explicit SELECT remains.
        let info = chunked_table(2_000_000, vec![col("id", "bigint"), col("name", "text")]);
        let yaml = render(&info, PG_URL);
        assert!(
            yaml.contains("    query: >"),
            "chunked mode should preserve explicit SELECT:\n{yaml}"
        );
        assert!(
            !yaml.contains("    table: "),
            "no `table:` shortcut for chunked mode:\n{yaml}"
        );
    }
}