use std::path::{Path, PathBuf};
use std::process::Command;
fn workspace_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.parent()
.unwrap()
.to_path_buf()
}
/// Runs `cargo run -q -p finetype-cli -- validate <args...>` from the workspace root.
///
/// Returns `(exit_code, stdout, stderr)`. The exit code is `-1` when the child
/// reports no code, and both streams are decoded lossily as UTF-8.
///
/// # Panics
/// Panics if `cargo` cannot be spawned.
fn run_validate(args: &[&str]) -> (i32, String, String) {
    let finished = Command::new("cargo")
        // Fixed prefix first, then the caller-supplied arguments in order.
        .args(["run", "-q", "-p", "finetype-cli", "--", "validate"])
        .args(args)
        .current_dir(workspace_root())
        .output()
        .expect("failed to spawn cargo run");
    let exit_code = finished.status.code().unwrap_or(-1);
    let out_text = String::from_utf8_lossy(&finished.stdout).into_owned();
    let err_text = String::from_utf8_lossy(&finished.stderr).into_owned();
    (exit_code, out_text, err_text)
}
/// Runs `cargo run -q -p finetype-cli -- <args...>` from the workspace root,
/// without inserting any subcommand.
///
/// Returns `(exit_code, stdout, stderr)`. The exit code is `-1` when the child
/// reports no code, and both streams are decoded lossily as UTF-8.
///
/// # Panics
/// Panics if `cargo` cannot be spawned.
fn run_finetype_raw(args: &[&str]) -> (i32, String, String) {
    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-q", "-p", "finetype-cli", "--"]);
    cmd.args(args);
    cmd.current_dir(workspace_root());
    let finished = cmd.output().expect("failed to spawn cargo run");
    (
        finished.status.code().unwrap_or(-1),
        String::from_utf8_lossy(&finished.stdout).into_owned(),
        String::from_utf8_lossy(&finished.stderr).into_owned(),
    )
}
/// Executes `sql` against the database file `db` through the `duckdb` CLI
/// (`-noheader -list`) and returns its stdout with surrounding whitespace trimmed.
///
/// # Panics
/// Panics if `duckdb` cannot be spawned or exits unsuccessfully; the failure
/// message includes the CLI's stderr and the SQL text.
fn duckdb_query(db: &Path, sql: &str) -> String {
    let finished = Command::new("duckdb")
        .args(["-noheader", "-list"])
        .arg(db)
        .args(["-c", sql])
        .output()
        .expect("failed to spawn duckdb");
    let succeeded = finished.status.success();
    assert!(
        succeeded,
        "duckdb query failed: {}\nsql: {}",
        String::from_utf8_lossy(&finished.stderr),
        sql
    );
    let raw_stdout = String::from_utf8_lossy(&finished.stdout);
    raw_stdout.trim().to_string()
}
/// Writes `body` to `<dir>/schema.json` and returns the path of the written file.
///
/// # Panics
/// Panics if the file cannot be written.
fn write_schema(dir: &Path, body: &str) -> PathBuf {
    let schema_path = dir.join("schema.json");
    std::fs::write(&schema_path, body).unwrap();
    schema_path
}
/// Writes `body` to `<dir>/<name>` and returns the path of the written file.
///
/// # Panics
/// Panics if the file cannot be written.
fn write_csv(dir: &Path, name: &str, body: &str) -> PathBuf {
    let csv_path = dir.join(name);
    std::fs::write(&csv_path, body).unwrap();
    csv_path
}
/// JSON Schema fixture with two string properties: `order_id` carries a
/// `pattern` plus the `x-finetype-label` / `x-finetype-confidence` extension
/// keys, while `status` is a plain `enum` with no extension keys.
const SCHEMA_WITH_EXT: &str = r#"{
"type": "object",
"properties": {
"order_id": {
"type": "string",
"pattern": "^ORD-[0-9]{5}$",
"x-finetype-label": "identity.code.id",
"x-finetype-confidence": 0.99
},
"status": {"type": "string", "enum": ["pending", "shipped"]}
}
}"#;
/// Same `pattern` / `enum` constraints as `SCHEMA_WITH_EXT`, but with no
/// `x-finetype-*` extension keys on any property.
const SCHEMA_NO_EXT: &str = r#"{
"type": "object",
"properties": {
"order_id": {"type": "string", "pattern": "^ORD-[0-9]{5}$"},
"status": {"type": "string", "enum": ["pending", "shipped"]}
}
}"#;
/// CSV fixture with a header and three data rows. The middle row
/// (`XXX-22222,UNKNOWN`) matches neither the `^ORD-[0-9]{5}$` pattern nor the
/// `pending`/`shipped` enum used by the schema fixtures in this file.
const CSV_TWO_REJECTS: &str =
"order_id,status\nORD-11111,pending\nXXX-22222,UNKNOWN\nORD-33333,shipped\n";
/// Validates the rejecting fixture into a DuckDB file and checks that:
/// the CLI exits 1, `finetype_reject_errors` has the expected column order,
/// its only `error_type` is `SEMANTIC_TYPE`, and `orders` holds 2 rows.
///
/// Ignored by default; run with `cargo test -- --ignored`
/// (presumably because it needs `cargo run` and the `duckdb` CLI — confirm).
#[test]
#[ignore]
fn test_vrp_ac13_cli_writes_db_with_sidecar() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path();
    let schema_path = write_schema(root, SCHEMA_WITH_EXT);
    let csv_path = write_csv(root, "in.csv", CSV_TWO_REJECTS);
    let db_path = root.join("out.db");

    let cli_args = [
        csv_path.to_str().unwrap(),
        schema_path.to_str().unwrap(),
        "--db",
        db_path.to_str().unwrap(),
        "--table",
        "orders",
    ];
    let (code, _stdout, stderr) = run_validate(&cli_args);
    assert_eq!(
        code, 1,
        "expected exit 1 (rejects present): stderr={stderr}"
    );

    // Column names of the reject table, comma-joined in declaration order.
    let actual_columns = duckdb_query(
        &db_path,
        "SELECT string_agg(column_name, ',' ORDER BY column_index) \
         FROM duckdb_columns WHERE table_name = 'finetype_reject_errors';",
    );
    let wanted_columns = "scan_id,file_id,line,column_idx,column_name,error_type,\
                          csv_line,byte_position,error_message,type_confidence,\
                          expected_type,constraint_failed,constraint_value";
    assert_eq!(actual_columns, wanted_columns);

    let error_kinds = duckdb_query(
        &db_path,
        "SELECT DISTINCT error_type FROM finetype_reject_errors;",
    );
    assert_eq!(error_kinds, "SEMANTIC_TYPE");

    let accepted_rows = duckdb_query(&db_path, "SELECT COUNT(*) FROM orders;");
    assert_eq!(accepted_rows, "2");
}
/// After a completed validate run, no table whose name matches
/// `__finetype_staging_%` may remain in the output database.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_ac13_cli_staging_cleanup_on_success() {
    let tmp = tempfile::tempdir().unwrap();
    let schema = write_schema(tmp.path(), SCHEMA_WITH_EXT);
    let csv = write_csv(tmp.path(), "in.csv", CSV_TWO_REJECTS);
    let db = tmp.path().join("out.db");
    let (code, _stdout, stderr) = run_validate(&[
        csv.to_str().unwrap(),
        schema.to_str().unwrap(),
        "--db",
        db.to_str().unwrap(),
        "--table",
        "orders",
    ]);
    // Guard against a vacuous pass: if validate failed before writing anything,
    // the duckdb CLI would open a fresh, empty database and the staging count
    // below would trivially be 0. The rejecting fixture must complete with exit 1.
    assert_eq!(
        code, 1,
        "expected exit 1 (rejects present): stderr={stderr}"
    );
    assert!(
        db.exists(),
        "validate did not write a database at {}",
        db.display()
    );
    // `_` is a single-character wildcard in LIKE, so this matches slightly more
    // than the literal prefix; that only makes the leak check stricter.
    let staging_count = duckdb_query(
        &db,
        "SELECT COUNT(*) FROM duckdb_tables \
         WHERE table_name LIKE '__finetype_staging_%';",
    );
    assert_eq!(staging_count, "0", "staging table leaked after success");
}
/// A validate run against a nonexistent input CSV must exit 2 and must not
/// leave a database file behind at the `--db` path.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_ac13_cli_staging_cleanup_on_failure() {
    let workdir = tempfile::tempdir().unwrap();
    let schema_path = write_schema(workdir.path(), SCHEMA_WITH_EXT);
    let db_path = workdir.path().join("out.db");

    let cli_args = [
        "/nonexistent/input.csv",
        schema_path.to_str().unwrap(),
        "--db",
        db_path.to_str().unwrap(),
        "--table",
        "orders",
    ];
    let (exit_code, _stdout, _stderr) = run_validate(&cli_args);
    assert_eq!(exit_code, 2);

    let db_was_written = db_path.exists();
    assert!(
        !db_was_written,
        "error path wrote a partial .db at {}",
        db_path.display()
    );
}
/// Exercises the exit-code matrix of `validate`:
/// clean input → 0, rejects → 1, rejects with `--lenient` → 0,
/// missing input with `--lenient` → 2.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_ac13_cli_exit_code_grid() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path();
    let schema_path = write_schema(root, SCHEMA_WITH_EXT);
    let clean_csv = write_csv(
        root,
        "clean.csv",
        "order_id,status\nORD-11111,pending\n",
    );
    let rejecting_csv = write_csv(root, "dirty.csv", CSV_TWO_REJECTS);

    // Runs validate for one grid cell, each into its own database file, and
    // returns only the exit code.
    let exit_code_for = |csv_arg: &str, db_file: &str, lenient: bool| -> i32 {
        let db_path = root.join(db_file);
        let mut cli_args = vec![
            csv_arg,
            schema_path.to_str().unwrap(),
            "--db",
            db_path.to_str().unwrap(),
            "--table",
            "orders",
        ];
        if lenient {
            cli_args.push("--lenient");
        }
        run_validate(&cli_args).0
    };

    let code_a = exit_code_for(clean_csv.to_str().unwrap(), "a.db", false);
    assert_eq!(code_a, 0, "zero-reject should exit 0");

    let code_b = exit_code_for(rejecting_csv.to_str().unwrap(), "b.db", false);
    assert_eq!(code_b, 1, "reject-without-lenient should exit 1");

    let code_c = exit_code_for(rejecting_csv.to_str().unwrap(), "c.db", true);
    assert_eq!(code_c, 0, "reject-with-lenient should exit 0");

    let code_d = exit_code_for("/nonexistent/input.csv", "d.db", true);
    assert_eq!(code_d, 2, "error path should exit 2 even with --lenient");
}
/// Checks that each kind of unusable schema makes `validate` exit 2, print the
/// expected stderr fragment, and leave no database file behind:
/// missing file, invalid JSON, schema without `properties`, and (unix only)
/// an unreadable file.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_ac13_cli_malformed_schema_error_grid() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path();
    let csv = write_csv(root, "in.csv", CSV_TWO_REJECTS);

    // Runs validate with the shared CSV against one schema path, returning
    // the exit code and stderr.
    let validate_against = |schema_arg: &str, db: &Path| -> (i32, String) {
        let (code, _, stderr) = run_validate(&[
            csv.to_str().unwrap(),
            schema_arg,
            "--db",
            db.to_str().unwrap(),
            "--table",
            "orders",
        ]);
        (code, stderr)
    };

    // Case 1: the schema path does not exist.
    let db1 = root.join("db1.db");
    let (code1, stderr1) = validate_against("/nonexistent/schema.json", &db1);
    assert_eq!(code1, 2);
    assert!(stderr1.contains("schema file not found"), "got: {stderr1}");
    assert!(!db1.exists());

    // Case 2: the schema file is not valid JSON.
    let bad_json = root.join("bad.json");
    std::fs::write(&bad_json, "{ this is { not valid json").unwrap();
    let db2 = root.join("db2.db");
    let (code2, stderr2) = validate_against(bad_json.to_str().unwrap(), &db2);
    assert_eq!(code2, 2);
    assert!(stderr2.contains("invalid JSON"), "got: {stderr2}");
    assert!(!db2.exists());

    // Case 3: valid JSON, but no `properties` key.
    let no_props = root.join("noprops.json");
    std::fs::write(&no_props, r#"{"type": "object"}"#).unwrap();
    let db3 = root.join("db3.db");
    let (code3, stderr3) = validate_against(no_props.to_str().unwrap(), &db3);
    assert_eq!(code3, 2);
    assert!(
        stderr3.contains("missing required `properties`"),
        "got: {stderr3}"
    );
    assert!(!db3.exists());

    // Case 4 (unix only): the schema file exists but cannot be read.
    let unreadable = root.join("nope.json");
    std::fs::write(&unreadable, SCHEMA_WITH_EXT).unwrap();
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let mut locked = std::fs::metadata(&unreadable).unwrap().permissions();
        locked.set_mode(0o000);
        let chmod_ok = std::fs::set_permissions(&unreadable, locked).is_ok();
        // Only run the case when mode 000 really blocks reads for this process
        // (presumably it does not when running as root — confirm).
        let effective = chmod_ok && std::fs::read_to_string(&unreadable).is_err();
        if effective {
            let db4 = root.join("db4.db");
            let (code4, stderr4) = validate_against(unreadable.to_str().unwrap(), &db4);
            assert_eq!(code4, 2);
            assert!(
                stderr4.contains("permission denied") || stderr4.contains("not found"),
                "got: {stderr4}"
            );
            assert!(!db4.exists());
        }
        // Best-effort restore of a readable mode; a failure here is ignored.
        let mut restore = std::fs::metadata(&unreadable).unwrap().permissions();
        restore.set_mode(0o644);
        let _ = std::fs::set_permissions(&unreadable, restore);
    }
}
/// The `x-finetype-label` / `x-finetype-confidence` values from the schema must
/// appear as `expected_type` / `type_confidence` on the `order_id` reject row,
/// and both must be NULL on the `status` reject row, whose schema property has
/// no extension keys.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_ac11_xft_extensions_surface() {
    let tmp = tempfile::tempdir().unwrap();
    let schema = write_schema(tmp.path(), SCHEMA_WITH_EXT);
    let csv = write_csv(tmp.path(), "in.csv", CSV_TWO_REJECTS);
    let db = tmp.path().join("out.db");
    let (code, _, stderr) = run_validate(&[
        csv.to_str().unwrap(),
        schema.to_str().unwrap(),
        "--db",
        db.to_str().unwrap(),
        "--table",
        "orders",
    ]);
    // Surface a CLI failure here with its stderr, instead of as an opaque
    // duckdb error on the queries below.
    assert_eq!(
        code, 1,
        "expected exit 1 (rejects present): stderr={stderr}"
    );
    let order_id_row = duckdb_query(
        &db,
        "SELECT expected_type || '|' || type_confidence \
         FROM finetype_reject_errors WHERE column_name='order_id';",
    );
    assert_eq!(order_id_row, "identity.code.id|0.99");
    let status_nulls = duckdb_query(
        &db,
        "SELECT COUNT(*) FROM finetype_reject_errors \
         WHERE column_name='status' AND expected_type IS NULL AND type_confidence IS NULL;",
    );
    assert_eq!(status_nulls, "1");
}
/// With a schema that has no `x-finetype-*` keys, every reject row must have
/// NULL `expected_type` and NULL `type_confidence`, and at least one reject
/// row must exist.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_ac11_null_on_absence() {
    let tmp = tempfile::tempdir().unwrap();
    let schema = write_schema(tmp.path(), SCHEMA_NO_EXT);
    let csv = write_csv(tmp.path(), "in.csv", CSV_TWO_REJECTS);
    let db = tmp.path().join("out.db");
    let (code, _, stderr) = run_validate(&[
        csv.to_str().unwrap(),
        schema.to_str().unwrap(),
        "--db",
        db.to_str().unwrap(),
        "--table",
        "orders",
    ]);
    // Surface a CLI failure here with its stderr, instead of as an opaque
    // duckdb error on the queries below.
    assert_eq!(
        code, 1,
        "expected exit 1 (rejects present): stderr={stderr}"
    );
    let all_null = duckdb_query(
        &db,
        "SELECT COUNT(*) FROM finetype_reject_errors \
         WHERE expected_type IS NULL AND type_confidence IS NULL;",
    );
    let total = duckdb_query(&db, "SELECT COUNT(*) FROM finetype_reject_errors;");
    assert_eq!(all_null, total);
    // Without this, an empty reject table would satisfy the equality above.
    assert_ne!(total, "0", "test fixture must produce rejects");
}
/// End-to-end run on a slice of `eval/datasets/csv/ecommerce_orders.csv`:
/// projects the first 50 data rows down to `order_id,status`, replaces the
/// third row with a deliberately invalid one, validates the slice, and expects
/// at least one `order_id` reject with `type_confidence >= 0.99` and a maximum
/// `scan_id` of 1.
///
/// Returns early (test passes) when the fixture file is missing.
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_ac12_ecommerce_end_to_end() {
    let tmp = tempfile::tempdir().unwrap();
    let fixture = workspace_root().join("eval/datasets/csv/ecommerce_orders.csv");
    if !fixture.exists() {
        eprintln!("skipping ac-12: fixture missing at {}", fixture.display());
        return;
    }
    let slice_path = tmp.path().join("slice.csv");
    let raw = std::fs::read_to_string(&fixture).unwrap();
    let mut lines = raw.lines();
    let header = lines.next().unwrap();
    // NOTE(review): plain comma splitting assumes the fixture has no quoted
    // fields containing commas — confirm against the dataset.
    let header_cols: Vec<&str> = header.split(',').collect();
    let oi = header_cols.iter().position(|h| *h == "order_id").unwrap();
    let si = header_cols.iter().position(|h| *h == "status").unwrap();
    let mut slice = String::from("order_id,status\n");
    for (i, row) in lines.take(50).enumerate() {
        if i == 2 {
            // Inject one row that violates both the pattern and the enum. The
            // numeric part is the row index (the original used the column
            // position `oi` here).
            slice.push_str(&format!("BAD-{i:05},INVALID_STATUS\n"));
            continue;
        }
        let cols: Vec<&str> = row.split(',').collect();
        // Fail with context rather than a bare index-out-of-bounds panic when
        // a row has fewer columns than the header.
        let order_id = cols
            .get(oi)
            .unwrap_or_else(|| panic!("fixture row {i} has no order_id column: {row:?}"));
        let status = cols
            .get(si)
            .unwrap_or_else(|| panic!("fixture row {i} has no status column: {row:?}"));
        slice.push_str(&format!("{order_id},{status}\n"));
    }
    std::fs::write(&slice_path, &slice).unwrap();
    let schema = write_schema(tmp.path(), SCHEMA_WITH_EXT);
    let db = tmp.path().join("ecom.db");
    let (code, _, stderr) = run_validate(&[
        slice_path.to_str().unwrap(),
        schema.to_str().unwrap(),
        "--db",
        db.to_str().unwrap(),
        "--table",
        "orders",
    ]);
    // The slice contains an injected bad row, so a completed run exits 1.
    // Checking it here reports a CLI failure with its stderr.
    assert_eq!(
        code, 1,
        "ac-12: expected exit 1 (rejects present): stderr={stderr}"
    );
    let high_conf_rejects = duckdb_query(
        &db,
        "SELECT COUNT(*) FROM finetype_reject_errors \
         WHERE column_name='order_id' AND type_confidence >= 0.99;",
    );
    assert_ne!(
        high_conf_rejects, "0",
        "ac-12: expected at least one classifier-confident reject on order_id"
    );
    let scan = duckdb_query(&db, "SELECT MAX(scan_id) FROM finetype_reject_errors;");
    assert_eq!(scan, "1");
}
/// JSON Schema fixture with two string properties that both carry
/// `x-finetype-*` keys: `order_id` labelled `identity.code.id` (0.99) and
/// `delivery_date` labelled `datetime.date.iso` (0.97). Neither property has a
/// `pattern` or `enum` constraint.
const SCHEMA_TYPED_DATE: &str = r#"{
"type": "object",
"properties": {
"order_id": {
"type": "string",
"x-finetype-label": "identity.code.id",
"x-finetype-confidence": 0.99
},
"delivery_date": {
"type": "string",
"x-finetype-label": "datetime.date.iso",
"x-finetype-confidence": 0.97
}
}
}"#;
/// A `delivery_date` value of `2024-02-30` must yield exactly one
/// `TRANSFORM_FAILED` reject row (with `constraint_failed = transform`, the
/// offending value, and the schema's label/confidence), exit code 1, two rows
/// in `orders`, and a `DATE`-typed `delivery_date` column.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_transform_failure_emits_reject_row() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path();
    let schema_path = write_schema(root, SCHEMA_TYPED_DATE);
    let csv_path = write_csv(
        root,
        "in.csv",
        "order_id,delivery_date\nORD-1,2024-01-15\nORD-2,2024-02-30\nORD-3,2024-03-20\n",
    );
    let db_path = root.join("out.db");

    let cli_args = [
        csv_path.to_str().unwrap(),
        schema_path.to_str().unwrap(),
        "--db",
        db_path.to_str().unwrap(),
        "--table",
        "orders",
    ];
    let (code, _, stderr) = run_validate(&cli_args);
    assert_eq!(
        code, 1,
        "transform-failure should produce a reject row → exit 1; stderr={stderr}"
    );

    // Exactly one transform failure, attributed to the date column.
    let failure_rows = duckdb_query(
        &db_path,
        "SELECT COUNT(*) FROM finetype_reject_errors \
         WHERE error_type = 'TRANSFORM_FAILED' AND column_name = 'delivery_date';",
    );
    assert_eq!(failure_rows, "1", "expected exactly one TRANSFORM_FAILED row");

    let failed_constraint = duckdb_query(
        &db_path,
        "SELECT DISTINCT constraint_failed FROM finetype_reject_errors \
         WHERE error_type = 'TRANSFORM_FAILED';",
    );
    assert_eq!(failed_constraint, "transform");

    let offending_value = duckdb_query(
        &db_path,
        "SELECT constraint_value FROM finetype_reject_errors \
         WHERE error_type = 'TRANSFORM_FAILED';",
    );
    assert_eq!(offending_value, "2024-02-30");

    let label_and_confidence = duckdb_query(
        &db_path,
        "SELECT expected_type || '|' || type_confidence FROM finetype_reject_errors \
         WHERE error_type = 'TRANSFORM_FAILED';",
    );
    assert_eq!(label_and_confidence, "datetime.date.iso|0.97");

    // The two valid rows still land in the user table, with a typed column.
    let accepted_rows = duckdb_query(&db_path, "SELECT COUNT(*) FROM orders;");
    assert_eq!(accepted_rows, "2");

    let date_column_type = duckdb_query(
        &db_path,
        "SELECT data_type FROM duckdb_columns \
         WHERE table_name = 'orders' AND column_name = 'delivery_date';",
    );
    assert_eq!(date_column_type, "DATE");
}
/// An empty `delivery_date` cell must not produce a reject: the run exits 0,
/// no `TRANSFORM_FAILED` rows exist, all three rows land in `orders`, and
/// exactly one of them has a NULL `delivery_date`.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_null_staging_passes_to_typed_null() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path();
    let schema_path = write_schema(root, SCHEMA_TYPED_DATE);
    let csv_path = write_csv(
        root,
        "in.csv",
        "order_id,delivery_date\nORD-1,2024-01-15\nORD-2,\nORD-3,2024-03-20\n",
    );
    let db_path = root.join("out.db");

    let cli_args = [
        csv_path.to_str().unwrap(),
        schema_path.to_str().unwrap(),
        "--db",
        db_path.to_str().unwrap(),
        "--table",
        "orders",
    ];
    let (code, _, stderr) = run_validate(&cli_args);
    assert_eq!(
        code, 0,
        "NULL staging cell must NOT register a reject; stderr={stderr}"
    );

    let failure_rows = duckdb_query(
        &db_path,
        "SELECT COUNT(*) FROM finetype_reject_errors \
         WHERE error_type = 'TRANSFORM_FAILED';",
    );
    assert_eq!(failure_rows, "0");

    let accepted_rows = duckdb_query(&db_path, "SELECT COUNT(*) FROM orders;");
    assert_eq!(accepted_rows, "3");

    let rows_with_null_date = duckdb_query(
        &db_path,
        "SELECT COUNT(*) FROM orders WHERE delivery_date IS NULL;",
    );
    assert_eq!(rows_with_null_date, "1");
}
/// A clean two-row fixture must exit 0 and produce an `orders` table whose
/// `delivery_date` column is `DATE` and whose `order_id` column is `VARCHAR`;
/// date arithmetic on the stored values must give a 65-day span
/// (2024-01-15 to 2024-03-20).
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_typed_ctas_round_trip() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path();
    let schema_path = write_schema(root, SCHEMA_TYPED_DATE);
    let csv_path = write_csv(
        root,
        "in.csv",
        "order_id,delivery_date\nORD-1,2024-01-15\nORD-2,2024-03-20\n",
    );
    let db_path = root.join("out.db");

    let cli_args = [
        csv_path.to_str().unwrap(),
        schema_path.to_str().unwrap(),
        "--db",
        db_path.to_str().unwrap(),
        "--table",
        "orders",
    ];
    let (code, _, stderr) = run_validate(&cli_args);
    assert_eq!(code, 0, "clean fixture should exit 0; stderr={stderr}");

    let date_column_type = duckdb_query(
        &db_path,
        "SELECT data_type FROM duckdb_columns \
         WHERE table_name = 'orders' AND column_name = 'delivery_date';",
    );
    assert_eq!(date_column_type, "DATE");

    let id_column_type = duckdb_query(
        &db_path,
        "SELECT data_type FROM duckdb_columns \
         WHERE table_name = 'orders' AND column_name = 'order_id';",
    );
    assert_eq!(id_column_type, "VARCHAR");

    // `date_diff` only works if the values were stored as real dates.
    let day_span = duckdb_query(
        &db_path,
        "SELECT date_diff('day', MIN(delivery_date), MAX(delivery_date)) FROM orders;",
    );
    assert_eq!(day_span, "65");
}
/// With `SCHEMA_WITH_EXT` (label `identity.code.id` on `order_id`) and a clean
/// fixture, the run must exit 0, `order_id` must be stored as `VARCHAR`, and
/// no `TRANSFORM_FAILED` reject rows may exist.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_unknown_label_passes_as_varchar() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path();
    let schema_path = write_schema(root, SCHEMA_WITH_EXT);
    let csv_path = write_csv(
        root,
        "in.csv",
        "order_id,status\nORD-11111,pending\nORD-22222,shipped\n",
    );
    let db_path = root.join("out.db");

    let cli_args = [
        csv_path.to_str().unwrap(),
        schema_path.to_str().unwrap(),
        "--db",
        db_path.to_str().unwrap(),
        "--table",
        "orders",
    ];
    let (code, _, stderr) = run_validate(&cli_args);
    assert_eq!(code, 0, "clean fixture should exit 0; stderr={stderr}");

    let id_column_type = duckdb_query(
        &db_path,
        "SELECT data_type FROM duckdb_columns \
         WHERE table_name = 'orders' AND column_name = 'order_id';",
    );
    assert_eq!(id_column_type, "VARCHAR");

    let failure_rows = duckdb_query(
        &db_path,
        "SELECT COUNT(*) FROM finetype_reject_errors \
         WHERE error_type = 'TRANSFORM_FAILED';",
    );
    assert_eq!(failure_rows, "0");
}
/// Invoking the CLI with a `load` subcommand must exit 2, and stderr
/// (compared case-insensitively) must mention `load` together with one of
/// `unrecognized`, `unknown`, or `invalid`.
///
/// Ignored by default; run with `cargo test -- --ignored`.
#[test]
#[ignore]
fn test_vrp_load_subcommand_removed() {
    let (code, _stdout, stderr) = run_finetype_raw(&["load", "--file", "any.csv"]);
    assert_eq!(
        code, 2,
        "`finetype load` must exit 2 via clap unknown-subcommand handler; \
         got code={code}, stderr={stderr}"
    );

    let lowered = stderr.to_lowercase();
    let mentions_load = lowered.contains("load");
    // Accept any of the wordings an argument parser might use for a bad subcommand.
    let flags_unknown = ["unrecognized", "unknown", "invalid"]
        .iter()
        .any(|keyword| lowered.contains(*keyword));
    assert!(
        mentions_load && flags_unknown,
        "stderr should signal unknown subcommand mentioning 'load'; got: {stderr}"
    );
}