use serde_json::Value;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
fn workspace_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent() .unwrap()
.parent() .unwrap()
.to_path_buf()
}
/// Runs `finetype profile -f <csv_path> -o json` via `cargo run` from the
/// workspace root and returns the parsed JSON document.
///
/// # Panics
///
/// Panics if the command cannot be started, exits unsuccessfully (stderr is
/// included in the message), emits non-UTF-8 stdout, or emits stdout that is
/// not valid JSON (the raw output is included in the message).
fn run_profile_json(csv_path: &Path) -> Value {
    let output = Command::new("cargo")
        .args(["run", "-p", "finetype-cli", "--", "profile", "-f"])
        // Hand the path over as an OS string: no UTF-8 round-trip, so a
        // non-UTF-8 path no longer panics before the command even starts.
        .arg(csv_path)
        .args(["-o", "json"])
        .current_dir(workspace_root())
        .output()
        .expect("failed to run finetype profile");
    assert!(
        output.status.success(),
        "profile failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    let stdout = String::from_utf8(output.stdout).expect("invalid utf8");
    serde_json::from_str(&stdout).unwrap_or_else(|e| {
        panic!("failed to parse profile JSON: {e}\nOutput: {stdout}");
    })
}
/// Runs `finetype taxonomy --output json` via `cargo run` from the workspace
/// root and returns the parsed JSON document.
///
/// # Panics
///
/// Panics if the command cannot be started, exits unsuccessfully, emits
/// non-UTF-8 stdout, or emits stdout that is not valid JSON.
fn run_taxonomy_json() -> Value {
    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-p", "finetype-cli", "--"])
        .args(["taxonomy", "--output", "json"])
        .current_dir(workspace_root());
    let output = cmd.output().expect("failed to run finetype taxonomy");
    if !output.status.success() {
        panic!(
            "taxonomy failed: {}",
            String::from_utf8_lossy(&output.stderr)
        );
    }
    let stdout = String::from_utf8(output.stdout).expect("invalid utf8");
    match serde_json::from_str(&stdout) {
        Ok(parsed) => parsed,
        Err(e) => panic!("failed to parse taxonomy JSON: {e}"),
    }
}
/// Runs `finetype infer -i <input> --mode <mode> --output json` via
/// `cargo run` from the workspace root and returns the parsed JSON document.
///
/// # Panics
///
/// Panics if the command cannot be started, exits unsuccessfully, emits
/// non-UTF-8 stdout, or emits stdout that is not valid JSON.
fn run_infer_json(input: &str, mode: &str) -> Value {
    let cli_args = [
        "run",
        "-p",
        "finetype-cli",
        "--",
        "infer",
        "-i",
        input,
        "--mode",
        mode,
        "--output",
        "json",
    ];
    let output = Command::new("cargo")
        .args(cli_args)
        .current_dir(workspace_root())
        .output()
        .expect("failed to run finetype infer");
    if !output.status.success() {
        panic!(
            "infer failed: {}",
            String::from_utf8_lossy(&output.stderr)
        );
    }
    let stdout = String::from_utf8(output.stdout).expect("invalid utf8");
    match serde_json::from_str(&stdout) {
        Ok(parsed) => parsed,
        Err(e) => panic!("failed to parse infer JSON: {e}\nOutput: {stdout}"),
    }
}
/// Runs `finetype taxonomy <type_key> -o json-schema` via `cargo run` from
/// the workspace root and returns a clone of the first element of the JSON
/// array it prints.
///
/// # Panics
///
/// Panics if the command cannot be started, exits unsuccessfully, emits
/// non-UTF-8 or non-JSON stdout, or if the parsed output is not a non-empty
/// array.
fn run_taxonomy_json_schema(type_key: &str) -> Value {
    let cli_args = [
        "run",
        "-p",
        "finetype-cli",
        "--",
        "taxonomy",
        type_key,
        "-o",
        "json-schema",
    ];
    let output = Command::new("cargo")
        .args(cli_args)
        .current_dir(workspace_root())
        .output()
        .expect("failed to run finetype taxonomy");
    if !output.status.success() {
        panic!(
            "taxonomy json-schema failed: {}",
            String::from_utf8_lossy(&output.stderr)
        );
    }
    let stdout = String::from_utf8(output.stdout).expect("invalid utf8");
    let parsed: Value = match serde_json::from_str(&stdout) {
        Ok(value) => value,
        Err(e) => panic!("failed to parse taxonomy json-schema output: {e}\nOutput: {stdout}"),
    };
    let schemas = parsed
        .as_array()
        .expect("taxonomy -o json-schema should always emit an array");
    // Only the first schema is handed back to the caller.
    match schemas.first() {
        Some(first) => first.clone(),
        None => panic!("taxonomy {type_key} -o json-schema returned an empty array"),
    }
}
/// Flattens `profile["columns"]` into `(column, type, broad_type)` triples.
///
/// A field that is missing or not a string becomes an empty string.
///
/// # Panics
///
/// Panics if `profile["columns"]` is not a JSON array.
fn extract_columns(profile: &Value) -> Vec<(String, String, String)> {
    let entries = profile["columns"]
        .as_array()
        .expect("profile missing columns array");
    let mut triples = Vec::with_capacity(entries.len());
    for entry in entries {
        // Missing / non-string fields deliberately fall back to "".
        let text = |key: &str| entry[key].as_str().unwrap_or("").to_string();
        triples.push((text("column"), text("type"), text("broad_type")));
    }
    triples
}
/// Asserts that the first column named `column_name` has exactly
/// `expected_type` as its type label (the second tuple field).
///
/// # Panics
///
/// Panics if no column with that name exists or if the type differs.
fn assert_column_type(
    columns: &[(String, String, String)],
    column_name: &str,
    expected_type: &str,
) {
    let lookup = columns
        .iter()
        .find_map(|(name, ty, _)| (name == column_name).then(|| ty.as_str()));
    let actual_type = match lookup {
        Some(ty) => ty,
        None => panic!("column '{column_name}' not found in profile"),
    };
    assert_eq!(
        actual_type, expected_type,
        "column '{}': expected type '{}', got '{}'",
        column_name, expected_type, actual_type
    );
}
/// Asserts that the first column named `column_name` has exactly
/// `expected_broad_type` as its broad type (the third tuple field).
///
/// # Panics
///
/// Panics if no column with that name exists or if the broad type differs.
fn assert_column_broad_type(
    columns: &[(String, String, String)],
    column_name: &str,
    expected_broad_type: &str,
) {
    let position = columns
        .iter()
        .position(|(name, _, _)| name == column_name);
    let actual_broad_type = match position {
        Some(index) => columns[index].2.as_str(),
        None => panic!("column '{column_name}' not found in profile"),
    };
    assert_eq!(
        actual_broad_type, expected_broad_type,
        "column '{}': expected broad_type '{}', got '{}'",
        column_name, expected_broad_type, actual_broad_type
    );
}
/// Asserts that the type label of the first column named `column_name`
/// starts with `expected_domain` (a plain string-prefix check).
///
/// # Panics
///
/// Panics if no column with that name exists or if the prefix does not match.
fn assert_column_domain(
    columns: &[(String, String, String)],
    column_name: &str,
    expected_domain: &str,
) {
    let actual_type = match columns.iter().find(|entry| entry.0 == column_name) {
        Some(entry) => &entry.1,
        None => panic!("column '{column_name}' not found in profile"),
    };
    assert!(
        actual_type.starts_with(expected_domain),
        "column '{}': expected domain '{}', got type '{}'",
        column_name,
        expected_domain,
        actual_type
    );
}
/// Builds `<workspace root>/eval/datasets/csv/<name>`.
fn dataset_path(name: &str) -> PathBuf {
    let mut path = workspace_root();
    // Push each component separately so the platform separator is used.
    path.extend(["eval", "datasets", "csv", name]);
    path
}
/// Builds `<workspace root>/tests/fixtures/<name>`.
fn fixture_path(name: &str) -> PathBuf {
    let mut path = workspace_root();
    path.push("tests");
    path.push("fixtures");
    path.push(name);
    path
}
/// Golden test: profiles `datetime_formats.csv` and pins the type label of
/// all 14 columns plus the broad type of a selection of them.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_datetime_formats() {
    let profile = run_profile_json(&dataset_path("datetime_formats.csv"));
    let cols = extract_columns(&profile);
    assert_eq!(cols.len(), 14, "datetime_formats should have 14 columns");

    let expected_types = [
        ("iso_date", "datetime.date.iso"),
        ("us_date", "datetime.date.mdy_slash"),
        ("eu_date", "datetime.date.dmy_slash"),
        ("iso_timestamp", "datetime.timestamp.iso_8601"),
        ("sql_timestamp", "datetime.timestamp.sql_standard"),
        ("unix_epoch", "datetime.epoch.unix_seconds"),
        ("unix_ms", "datetime.epoch.unix_milliseconds"),
        ("year", "datetime.component.year"),
        ("month_name", "datetime.component.month_name"),
        ("day_of_week", "datetime.component.day_of_week"),
        ("time_24h", "datetime.time.hms_24h"),
        ("duration_iso", "datetime.duration.iso_8601"),
        ("utc_offset", "datetime.offset.utc"),
        ("timezone", "datetime.offset.iana"),
    ];
    for (column, expected) in expected_types {
        assert_column_type(&cols, column, expected);
    }

    let expected_broad_types = [
        ("iso_date", "DATE"),
        ("iso_timestamp", "TIMESTAMP"),
        ("unix_epoch", "TIMESTAMP"),
        ("time_24h", "TIME"),
        ("duration_iso", "INTERVAL"),
        ("year", "SMALLINT"),
    ];
    for (column, expected) in expected_broad_types {
        assert_column_broad_type(&cols, column, expected);
    }
}
/// Golden test: profiles `ecommerce_orders.csv` and pins the type label of
/// all 12 columns plus three broad types.
///
/// NOTE(review): `credit_card_last4` is pinned to
/// `geography.address.postal_code`, which looks like a snapshot of current
/// classifier output rather than a semantic expectation — confirm.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_ecommerce_orders() {
    let profile = run_profile_json(&dataset_path("ecommerce_orders.csv"));
    let cols = extract_columns(&profile);
    assert_eq!(cols.len(), 12, "ecommerce_orders should have 12 columns");

    let expected_types = [
        ("order_id", "representation.identifier.alphanumeric_id"),
        ("customer_email", "identity.person.email"),
        ("order_date", "datetime.date.iso"),
        ("total_price", "finance.currency.amount"),
        ("currency", "finance.currency.currency_code"),
        ("credit_card_last4", "geography.address.postal_code"),
        ("shipping_country", "geography.location.country"),
        ("shipping_postal_code", "geography.address.postal_code"),
        ("status", "representation.text.word"),
        ("is_gift", "representation.boolean.terms"),
        ("tracking_url", "technology.internet.url"),
        ("phone", "identity.person.phone_number"),
    ];
    for (column, expected) in expected_types {
        assert_column_type(&cols, column, expected);
    }

    let expected_broad_types = [
        ("order_date", "DATE"),
        ("total_price", "DECIMAL"),
        ("is_gift", "BOOLEAN"),
    ];
    for (column, expected) in expected_broad_types {
        assert_column_broad_type(&cols, column, expected);
    }
}
/// Golden test: profiles `titanic.csv`, checks it has 12 columns, and pins
/// the type (or, for `Cabin`, only the domain prefix) of a subset of them,
/// plus three broad types.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_titanic() {
    let profile = run_profile_json(&dataset_path("titanic.csv"));
    let cols = extract_columns(&profile);
    assert_eq!(cols.len(), 12, "titanic should have 12 columns");

    let leading_types = [
        ("Name", "identity.person.full_name"),
        ("Survived", "representation.boolean.binary"),
        ("Sex", "identity.person.gender"),
        ("Fare", "finance.currency.amount"),
        ("Embarked", "representation.text.word"),
    ];
    for (column, expected) in leading_types {
        assert_column_type(&cols, column, expected);
    }

    // `Cabin` is only pinned to its domain, not to an exact type.
    assert_column_domain(&cols, "Cabin", "representation.");

    let trailing_types = [
        ("Age", "representation.numeric.integer_number"),
        ("SibSp", "representation.discrete.ordinal"),
        ("Parch", "representation.numeric.integer_number"),
    ];
    for (column, expected) in trailing_types {
        assert_column_type(&cols, column, expected);
    }

    let expected_broad_types = [
        ("Survived", "BOOLEAN"),
        ("Fare", "DECIMAL"),
        ("Parch", "BIGINT"),
    ];
    for (column, expected) in expected_broad_types {
        assert_column_broad_type(&cols, column, expected);
    }
}
/// Golden test: profiles `people_directory.csv`, checks it has 14 columns,
/// and pins the type (or, for `date_of_birth`, only the domain prefix) of 13
/// of them, plus two broad types.
///
/// NOTE(review): `phone` is pinned to `identity.government.ssn`, which looks
/// like a snapshot of current classifier output rather than a semantic
/// expectation — confirm.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_people_directory() {
    let profile = run_profile_json(&dataset_path("people_directory.csv"));
    let cols = extract_columns(&profile);
    assert_eq!(cols.len(), 14, "people_directory should have 14 columns");

    let leading_types = [
        ("full_name", "identity.person.full_name"),
        ("first_name", "identity.person.first_name"),
        ("last_name", "identity.person.last_name"),
        ("email", "identity.person.email"),
        ("phone", "identity.government.ssn"),
        ("gender", "identity.person.gender"),
        ("ssn", "identity.government.ssn"),
        ("height_cm", "identity.person.height"),
        ("weight_kg", "identity.person.weight"),
    ];
    for (column, expected) in leading_types {
        assert_column_type(&cols, column, expected);
    }

    // `date_of_birth` is only pinned to its domain, not to an exact type.
    assert_column_domain(&cols, "date_of_birth", "datetime.");

    let trailing_types = [
        ("company", "representation.text.entity_name"),
        ("job_title", "representation.text.word"),
        ("salary", "finance.currency.amount"),
    ];
    for (column, expected) in trailing_types {
        assert_column_type(&cols, column, expected);
    }

    for (column, expected) in [("salary", "DECIMAL"), ("height_cm", "DOUBLE")] {
        assert_column_broad_type(&cols, column, expected);
    }
}
/// Golden test: profiles the `ambiguous_headers.csv` fixture and pins only
/// the domain prefix of each of its 6 columns.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_ambiguous_headers() {
    let profile = run_profile_json(&fixture_path("ambiguous_headers.csv"));
    let cols = extract_columns(&profile);
    assert_eq!(cols.len(), 6, "ambiguous_headers should have 6 columns");

    let expected_domains = [
        ("id", "representation."),
        ("code", "representation."),
        ("value", "representation."),
        ("status", "representation."),
        ("date", "datetime."),
        ("name", "identity."),
    ];
    for (column, domain) in expected_domains {
        assert_column_domain(&cols, column, domain);
    }
}
/// Golden test: profiles the `numeric_edge_cases.csv` fixture (6 columns)
/// and pins each column's type or domain; `zip_code` is additionally pinned
/// to the `VARCHAR` broad type.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_numeric_edge_cases() {
    let profile = run_profile_json(&fixture_path("numeric_edge_cases.csv"));
    let cols = extract_columns(&profile);
    assert_eq!(cols.len(), 6, "numeric_edge_cases should have 6 columns");

    assert_column_domain(&cols, "count", "representation.");

    for (column, expected) in [
        ("price", "finance.currency.amount"),
        ("zip_code", "geography.address.postal_code"),
    ] {
        assert_column_type(&cols, column, expected);
    }
    assert_column_broad_type(&cols, "zip_code", "VARCHAR");

    for (column, expected) in [
        ("percentage", "representation.numeric.decimal_number"),
        ("population", "representation.numeric.integer_number"),
        ("temperature", "representation.numeric.decimal_number"),
    ] {
        assert_column_type(&cols, column, expected);
    }
}
/// Golden test: profiles the `categoricals.csv` fixture (5 columns) and pins
/// four exact types plus the domain prefix of `color`.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_categoricals() {
    let profile = run_profile_json(&fixture_path("categoricals.csv"));
    let cols = extract_columns(&profile);
    assert_eq!(cols.len(), 5, "categoricals should have 5 columns");

    let expected_types = [
        ("active", "representation.boolean.binary"),
        ("gender_code", "identity.person.gender"),
        ("priority", "representation.discrete.ordinal"),
        ("is_verified", "representation.boolean.terms"),
    ];
    for (column, expected) in expected_types {
        assert_column_type(&cols, column, expected);
    }

    assert_column_domain(&cols, "color", "representation.");
}
/// Golden test: the taxonomy JSON must be an array of 245 entries, each with
/// string `key`, `broad_type` and `title` fields, and must contain four
/// well-known keys.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_taxonomy_structure() {
    let taxonomy = run_taxonomy_json();
    let entries = taxonomy
        .as_array()
        .expect("taxonomy should be a JSON array");
    assert_eq!(entries.len(), 245, "taxonomy should have 245 types");

    for entry in entries {
        for field in ["key", "broad_type", "title"] {
            assert!(
                entry[field].is_string(),
                "entry missing '{field}': {entry:?}"
            );
        }
    }

    // Every `key` was verified to be a string above, so comparing against
    // `Some(..)` cannot hide a malformed entry.
    let has_key = |wanted: &str| entries.iter().any(|e| e["key"].as_str() == Some(wanted));
    assert!(has_key("identity.person.email"), "should contain email");
    assert!(has_key("datetime.date.iso"), "should contain iso date");
    assert!(
        has_key("geography.address.postal_code"),
        "should contain postal_code"
    );
    assert!(
        has_key("finance.currency.currency_code"),
        "should contain currency_code"
    );
}
/// Golden test: groups taxonomy keys by the segment before the first `.` and
/// pins the number of types in seven domains.
///
/// NOTE(review): the seven pinned counts sum to 240 while
/// `golden_taxonomy_structure` pins 245 types in total; presumably other
/// domains exist that are not counted here — confirm.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_taxonomy_domains() {
    let taxonomy = run_taxonomy_json();
    let entries = taxonomy.as_array().unwrap();

    let mut domain_counts: std::collections::HashMap<String, i32> =
        std::collections::HashMap::new();
    for entry in entries {
        let key = entry["key"].as_str().unwrap();
        // A key without any `.` counts as its own domain.
        let domain = key.split_once('.').map_or(key, |(head, _)| head);
        domain_counts
            .entry(domain.to_owned())
            .and_modify(|count| *count += 1)
            .or_insert(1);
    }

    let expected_counts = [
        ("container", 11),
        ("datetime", 84),
        ("finance", 28),
        ("geography", 25),
        ("identity", 33),
        ("representation", 33),
        ("technology", 26),
    ];
    for (domain, expected) in expected_counts {
        assert_eq!(domain_counts.get(domain), Some(&expected));
    }
}
/// Golden test: pins the shape of the JSON Schema exported for
/// `identity.person.email` — required fields, the `x-finetype-*` extensions
/// that must be present, and the ones that must be absent.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_taxonomy_json_schema_email() {
    let schema = run_taxonomy_json_schema("identity.person.email");

    assert!(
        schema["$schema"].is_string(),
        "schema should have $schema field"
    );
    let schema_type = schema["type"].as_str();
    assert_eq!(
        schema_type,
        Some("string"),
        "email schema type should be 'string'"
    );
    assert!(
        schema["pattern"].is_string(),
        "email schema should have pattern"
    );

    let label = schema["x-finetype-label"].as_str();
    assert_eq!(
        label,
        Some("identity.person.email"),
        "x-finetype-label should equal the queried key (added in v0.6.19)"
    );
    let pii = schema["x-finetype-pii"].as_bool();
    assert_eq!(pii, Some(true), "email should be marked as PII");

    // Indexing a missing key yields JSON null, so `is_null` means "absent".
    for dropped in [
        "x-finetype-broad-type",
        "x-finetype-transform",
        "x-finetype-format-string",
    ] {
        assert!(
            schema[dropped].is_null(),
            "{dropped} was dropped from schema export in v0.6.19"
        );
    }

    assert!(
        schema["examples"].is_array(),
        "email schema should have examples"
    );
}
/// Golden test: a single email value inferred in `column` mode must be
/// labelled `identity.person.email`.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_infer_n1_email_column() {
    let inferred = run_infer_json("john@example.com", "column");
    let label = inferred["label"].as_str();
    assert_eq!(
        label,
        Some("identity.person.email"),
        "N=1 email column should classify as identity.person.email \
         (regression guard from v16-era behaviour)"
    );
}
/// Golden test: a single URL value inferred in `column` mode must be
/// labelled `technology.internet.url`.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_infer_n1_url_column() {
    let inferred = run_infer_json("https://example.com/path", "column");
    let label = inferred["label"].as_str();
    assert_eq!(
        label,
        Some("technology.internet.url"),
        "N=1 URL column should classify as technology.internet.url"
    );
}
/// Golden test: a single IPv4 value inferred in `column` mode must be
/// labelled `technology.internet.ip_v4`.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_infer_n1_ipv4_column() {
    let inferred = run_infer_json("192.168.1.1", "column");
    let label = inferred["label"].as_str();
    assert_eq!(
        label,
        Some("technology.internet.ip_v4"),
        "N=1 IPv4 column should classify as technology.internet.ip_v4"
    );
}
/// Golden test: pins the shape of the JSON Schema exported for
/// `datetime.date.iso` — string type with a pattern, matching label, not
/// PII, and no `x-finetype-broad-type`.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_taxonomy_json_schema_iso_date() {
    let schema = run_taxonomy_json_schema("datetime.date.iso");

    let schema_type = schema["type"].as_str();
    assert_eq!(schema_type, Some("string"));
    assert!(schema["pattern"].is_string());

    let label = schema["x-finetype-label"].as_str();
    assert_eq!(
        label,
        Some("datetime.date.iso"),
        "x-finetype-label should equal the queried key (added in v0.6.19)"
    );
    let pii = schema["x-finetype-pii"].as_bool();
    assert_eq!(pii, Some(false), "iso_date should not be marked as PII");

    // Indexing a missing key yields JSON null, so `is_null` means "absent".
    assert!(
        schema["x-finetype-broad-type"].is_null(),
        "x-finetype-broad-type was dropped from schema export in v0.6.19"
    );
}
/// Runs `finetype profile -f <csv_path> -o json-schema <extra_args...>` via
/// `cargo run` from the workspace root and returns the parsed JSON document.
///
/// `extra_args` are appended verbatim after `-o json-schema`.
///
/// # Panics
///
/// Panics if the command cannot be started, exits unsuccessfully (stderr is
/// included in the message), emits non-UTF-8 stdout, or emits stdout that is
/// not valid JSON (the raw output is included in the message).
fn run_profile_json_schema(csv_path: &Path, extra_args: &[&str]) -> Value {
    let output = Command::new("cargo")
        .args(["run", "-p", "finetype-cli", "--", "profile", "-f"])
        // Hand the path over as an OS string: no UTF-8 round-trip, so a
        // non-UTF-8 path no longer panics before the command even starts.
        .arg(csv_path)
        .args(["-o", "json-schema"])
        .args(extra_args)
        .current_dir(workspace_root())
        .output()
        .expect("failed to run finetype profile -o json-schema");
    assert!(
        output.status.success(),
        "profile -o json-schema failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    let stdout = String::from_utf8(output.stdout).expect("invalid utf8");
    serde_json::from_str(&stdout).unwrap_or_else(|e| {
        panic!("failed to parse profile json-schema output: {e}\nOutput: {stdout}");
    })
}
/// Golden test: `profile -o json-schema` on `people_directory.csv` (no extra
/// flags) must emit an object schema with `$schema`, `$id` and non-empty
/// `properties`; at least one property carries `x-finetype-label` and one
/// carries `x-finetype-pii`; none carries the dropped or stats-only
/// `x-finetype-*` keys.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_json_schema_people_directory() {
    const DROPPED_KEYS: [&str; 6] = [
        "x-finetype-broad-type",
        "x-finetype-transform",
        "x-finetype-transform-ext",
        "x-finetype-format-string",
        "x-finetype-domain",
        "x-finetype-confidence",
    ];
    const STATS_ONLY_KEYS: [&str; 2] = ["x-finetype-null-rate", "x-finetype-cardinality"];

    let path = workspace_root().join("eval/datasets/csv/people_directory.csv");
    let schema = run_profile_json_schema(&path, &[]);

    assert!(
        schema["$schema"].is_string(),
        "json-schema output must declare $schema URI"
    );
    let schema_type = schema["type"].as_str();
    assert_eq!(
        schema_type,
        Some("object"),
        "json-schema output must be a JSON Schema object type"
    );
    assert!(
        schema["$id"].is_string(),
        "json-schema output must declare $id"
    );

    let properties = schema["properties"]
        .as_object()
        .expect("properties must be a JSON object");
    assert!(
        !properties.is_empty(),
        "people_directory should produce non-empty properties"
    );

    // True when at least one property object contains `key`.
    let any_property_has = |key: &str| properties.values().any(|p| p.get(key).is_some());
    let has_label = any_property_has("x-finetype-label");
    assert!(
        has_label,
        "at least one property must carry x-finetype-label"
    );
    let has_pii = any_property_has("x-finetype-pii");
    assert!(has_pii, "at least one property must carry x-finetype-pii");

    for (col, prop) in properties {
        for dropped in DROPPED_KEYS {
            assert!(
                prop.get(dropped).is_none(),
                "{dropped} was dropped in v0.6.19 (column {col})"
            );
        }
    }

    for (col, prop) in properties {
        for stats_diagnostic in STATS_ONLY_KEYS {
            assert!(
                prop.get(stats_diagnostic).is_none(),
                "{stats_diagnostic} should only appear with --stats (column {col})"
            );
        }
    }
}
/// Golden test: with `--stats --enum-threshold 50`, every property of the
/// `ecommerce_orders.csv` schema must carry `x-finetype-null-rate` and
/// `x-finetype-cardinality`, and at least one property must carry both
/// `minLength` and `maxLength`.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_json_schema_stats_ecommerce_orders() {
    let path = workspace_root().join("eval/datasets/csv/ecommerce_orders.csv");
    let stats_flags = ["--stats", "--enum-threshold", "50"];
    let schema = run_profile_json_schema(&path, &stats_flags);

    let properties = schema["properties"]
        .as_object()
        .expect("properties must be a JSON object");
    assert!(!properties.is_empty(), "expected non-empty properties");

    for (col, prop) in properties {
        for key in ["x-finetype-null-rate", "x-finetype-cardinality"] {
            assert!(
                prop.get(key).is_some(),
                "{col} should carry {key} under --stats"
            );
        }
    }

    let has_both_lengths =
        |p: &Value| p.get("minLength").is_some() && p.get("maxLength").is_some();
    let any_min_length = properties.values().any(has_both_lengths);
    assert!(
        any_min_length,
        "at least one string column should produce minLength/maxLength under --stats"
    );
}
/// Golden test: on `titanic.csv`, the columns that carry `enum` at
/// `--enum-threshold 0` must be a subset of those at `--enum-threshold 50`,
/// and the latter set must be non-empty.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn golden_profile_json_schema_enum_threshold_titanic() {
    let path = workspace_root().join("eval/datasets/csv/titanic.csv");

    // Profiles the dataset at the given threshold and returns the names of
    // the properties that contain an `enum` key.
    let enum_columns = |threshold: &str| -> std::collections::BTreeSet<String> {
        let schema = run_profile_json_schema(&path, &["--stats", "--enum-threshold", threshold]);
        let props = schema["properties"]
            .as_object()
            .expect("properties object");
        let mut names = std::collections::BTreeSet::new();
        for (name, prop) in props {
            if prop.get("enum").is_some() {
                names.insert(name.clone());
            }
        }
        names
    };

    let enums_off = enum_columns("0");
    let enums_on = enum_columns("50");

    assert!(
        enums_off.is_subset(&enums_on),
        "enum-bearing columns at threshold=0 must be a subset of those at threshold=50 \
         (off={enums_off:?}, on={enums_on:?})"
    );
    assert!(
        !enums_on.is_empty(),
        "at least one low-cardinality column should carry enum under --enum-threshold=50"
    );
}
/// The mechanism tokens that `infer --explain` output rows are allowed to
/// use: `infer_explain_batch_preserves_input_order` asserts that every row's
/// `mechanism` is one of these.
///
/// The failure message in that test hard-codes the count ("closed 10-token
/// set"), so keep it in sync when entries are added or removed.
const CLOSED_MECHANISMS: &[&str] = &[
"format_diversity_path_a",
"format_diversity_path_b",
"code_vs_canonical_path_a",
"code_vs_canonical_path_b",
"enum_overfit",
"misclassification",
"prediction_confirmed",
"validator_widening",
"unknown_no_fit",
"fallthrough",
];
/// Runs `finetype infer --mode column --batch --explain` via `cargo run`,
/// feeds `input_lines` (one per line) to its stdin, and returns every
/// non-blank stdout line parsed as JSON, in output order.
///
/// # Panics
///
/// Panics if the process cannot be spawned, stdin cannot be written, the
/// process exits unsuccessfully, stdout is not UTF-8, or any non-blank
/// stdout line is not valid JSON.
fn run_infer_explain_batch(input_lines: &[&str]) -> Vec<Value> {
    use std::io::Write as _;

    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-p", "finetype-cli", "--"])
        .args(["infer", "--mode", "column", "--batch", "--explain"])
        .current_dir(workspace_root())
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let mut child = cmd.spawn().expect("failed to spawn finetype infer --explain");

    // Each input line is terminated by a newline, exactly as `writeln!` would.
    let payload: String = input_lines.iter().map(|line| format!("{line}\n")).collect();
    child
        .stdin
        .as_mut()
        .expect("piped stdin")
        .write_all(payload.as_bytes())
        .expect("write stdin");

    // `wait_with_output` closes the child's stdin before waiting, which lets
    // the child observe end-of-input.
    let out = child
        .wait_with_output()
        .expect("failed waiting for finetype infer --explain");
    assert!(
        out.status.success(),
        "infer --explain failed (rc={:?}): {}",
        out.status.code(),
        String::from_utf8_lossy(&out.stderr)
    );

    let stdout = String::from_utf8(out.stdout).expect("invalid utf8");
    let mut rows: Vec<Value> = Vec::new();
    for raw in stdout.lines() {
        if raw.trim().is_empty() {
            continue;
        }
        match serde_json::from_str(raw) {
            Ok(row) => rows.push(row),
            Err(e) => panic!("output line not JSON: {e} ({raw})"),
        }
    }
    rows
}
/// A single batch line describing an email column predicted as
/// `identity.person.email` must yield exactly one output row that confirms
/// the prediction with confidence of at least 0.5.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn infer_explain_single_line_email_confirms() {
    let input = r#"{"column_name":"email","predicted_type":"identity.person.email","samples":["alice@example.com","bob@example.com","carol@example.com","dave@example.org","eve@example.net","frank@example.com","grace@example.io","henry@example.com"]}"#;
    let rows = run_infer_explain_batch(&[input]);
    assert_eq!(rows.len(), 1, "expected 1 output line, got {}", rows.len());

    let row = &rows[0];
    assert_eq!(row["inferred_correct_type"], "identity.person.email");
    assert_eq!(row["mechanism"], "prediction_confirmed");

    let confidence = &row["confidence"];
    // A missing or non-numeric confidence is treated as 0.0 and therefore fails.
    let score = confidence.as_f64().unwrap_or(0.0);
    assert!(
        score >= 0.5,
        "confidence {:?} below 0.5 for canonical email column",
        confidence
    );
}
/// Three batch lines must yield three output rows, each with a `mechanism`
/// from `CLOSED_MECHANISMS` plus `inferred_correct_type` and `signals` keys.
///
/// NOTE(review): despite the name, nothing here compares the output rows
/// against the input order; only the row count and per-row shape are
/// checked — confirm whether an ordering assertion is intended.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn infer_explain_batch_preserves_input_order() {
    let inputs: [&str; 3] = [
        r#"{"column_name":"email","predicted_type":"identity.person.email","samples":["a@x.com","b@x.com","c@x.com"]}"#,
        r#"{"column_name":"age","predicted_type":"representation.numeric.integer","samples":["25","30","45"]}"#,
        r#"{"column_name":"weird","predicted_type":"identity.person.email","samples":["foo","bar","baz"]}"#,
    ];
    let rows = run_infer_explain_batch(&inputs);
    assert_eq!(rows.len(), 3, "expected 3 output lines, got {}", rows.len());

    for (i, row) in rows.iter().enumerate() {
        let mech = row["mechanism"].as_str().expect("mechanism present");
        let is_known = CLOSED_MECHANISMS.contains(&mech);
        assert!(
            is_known,
            "row {i}: mechanism {mech:?} not in closed 10-token set"
        );
        for (key, problem) in [
            ("inferred_correct_type", "missing inferred_correct_type"),
            ("signals", "missing signals"),
        ] {
            assert!(row.get(key).is_some(), "row {i}: {problem}");
        }
    }
}
/// `infer --mode column --explain` without `--batch` must exit
/// unsuccessfully and print the guard message on stderr.
#[test]
fn infer_explain_without_batch_is_rejected() {
    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-p", "finetype-cli", "--"])
        .args(["infer", "--mode", "column", "--explain"])
        .current_dir(workspace_root())
        .stdin(Stdio::null());
    let out = cmd.output().expect("failed to run finetype infer");

    let exit_code = out.status.code();
    assert!(
        !out.status.success(),
        "--explain without --batch must fail; got rc={:?}",
        exit_code
    );

    let stderr = String::from_utf8_lossy(&out.stderr);
    let mentions_guard = stderr.contains("--explain requires --mode column --batch");
    assert!(
        mentions_guard,
        "expected guard message in stderr; got: {stderr}"
    );
}
/// `infer --batch --explain --mode row` must exit unsuccessfully.
///
/// Only the exit status is checked; stderr is not inspected.
#[test]
fn infer_explain_without_column_mode_is_rejected() {
    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-p", "finetype-cli", "--"])
        .args(["infer", "--batch", "--explain", "--mode", "row"])
        .current_dir(workspace_root())
        .stdin(Stdio::null());
    let out = cmd.output().expect("failed to run finetype infer");

    let exit_code = out.status.code();
    assert!(
        !out.status.success(),
        "--explain without --mode column must fail; got rc={:?}",
        exit_code
    );
}
/// Creates a temporary directory containing `in/t0.csv .. in/t{n-1}.csv`
/// (two data rows each) and a `paths.txt` listing those files one per line.
///
/// Returns `(temp dir guard, path to paths.txt, path to a "schemas" output
/// directory)`. The output directory is NOT created here. The caller must
/// keep the guard alive for as long as the files are needed.
///
/// # Panics
///
/// Panics if the temp dir, the input directory, or any file cannot be
/// created.
fn build_batch_fixture(n: usize) -> (tempfile::TempDir, PathBuf, PathBuf) {
    let tmp = tempfile::tempdir().expect("tempdir");
    let root = tmp.path();
    let in_dir = root.join("in");
    let out_dir = root.join("schemas");
    std::fs::create_dir_all(&in_dir).expect("mkdir in");

    let paths: Vec<String> = (0..n)
        .map(|i| {
            let csv = in_dir.join(format!("t{i}.csv"));
            let body = format!(
                "email,age\nalice@example.com,{}\nbob@example.com,{}\n",
                i,
                i + 1
            );
            std::fs::write(&csv, body).expect("write csv");
            csv.to_string_lossy().into_owned()
        })
        .collect();

    // One path per line, always followed by a trailing newline.
    let mut listing = paths.join("\n");
    listing.push('\n');
    let paths_file = root.join("paths.txt");
    std::fs::write(&paths_file, listing).expect("write paths");

    (tmp, paths_file, out_dir)
}
/// `profile --files <list> --out-dir <dir> -o json-schema` over three CSVs
/// must succeed, write `t0.json .. t2.json` (each a JSON document with an
/// object `properties`), and print "Loaded multi-branch classifier" on
/// stderr at most once.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn profile_files_batch_produces_one_output_per_input() {
    let (_tmp, paths, out_dir) = build_batch_fixture(3);

    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-p", "finetype-cli", "--", "profile"])
        .args(["--files", paths.to_str().unwrap()])
        .args(["--out-dir", out_dir.to_str().unwrap()])
        .args(["-o", "json-schema"])
        .current_dir(workspace_root());
    let out = cmd.output().expect("failed to run profile --files");
    assert!(
        out.status.success(),
        "profile --files failed: {}",
        String::from_utf8_lossy(&out.stderr)
    );

    for i in 0..3 {
        let schema_path = out_dir.join(format!("t{i}.json"));
        assert!(
            schema_path.exists(),
            "expected {} to exist",
            schema_path.display()
        );
        let body = std::fs::read_to_string(&schema_path).expect("read schema");
        let v: Value = match serde_json::from_str(&body) {
            Ok(parsed) => parsed,
            Err(e) => panic!("schema {i} not JSON: {e}\n{body}"),
        };
        // Indexing yields null for a missing key, so this is false when
        // `properties` is absent as well as when it is not an object.
        assert!(
            v["properties"].is_object(),
            "schema {i} missing properties: {body}"
        );
    }

    let stderr = String::from_utf8_lossy(&out.stderr);
    let load_lines = stderr.matches("Loaded multi-branch classifier").count();
    assert!(
        load_lines <= 1,
        "expected ≤1 'Loaded multi-branch classifier' line across batch \
         of 3 (amortisation), got {load_lines}:\n{stderr}"
    );
}
/// `profile --files <list> -o json-schema` without `--out-dir` must exit
/// unsuccessfully and mention `out-dir` on stderr.
#[test]
fn profile_files_requires_out_dir() {
    let (_tmp, paths, _out_dir) = build_batch_fixture(1);

    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-p", "finetype-cli", "--", "profile"])
        .args(["--files", paths.to_str().unwrap()])
        .args(["-o", "json-schema"])
        .current_dir(workspace_root());
    let out = cmd.output().expect("failed to run profile --files");

    let exit_code = out.status.code();
    assert!(
        !out.status.success(),
        "--files without --out-dir must fail; got rc={:?}",
        exit_code
    );

    let stderr = String::from_utf8_lossy(&out.stderr);
    let mentions_flag = stderr.contains("out-dir");
    assert!(
        mentions_flag,
        "expected clap error mentioning --out-dir; got: {stderr}"
    );
}
/// `profile --files <list> --out-dir <dir> -o plain` must exit
/// unsuccessfully and mention `json-schema` on stderr.
#[test]
fn profile_files_rejects_non_json_schema_output() {
    let (_tmp, paths, out_dir) = build_batch_fixture(1);

    let mut cmd = Command::new("cargo");
    cmd.args(["run", "-p", "finetype-cli", "--", "profile"])
        .args(["--files", paths.to_str().unwrap()])
        .args(["--out-dir", out_dir.to_str().unwrap()])
        .args(["-o", "plain"])
        .current_dir(workspace_root());
    let out = cmd.output().expect("failed to run profile --files");

    let exit_code = out.status.code();
    assert!(
        !out.status.success(),
        "profile --files -o plain must fail; got rc={:?}",
        exit_code
    );

    let stderr = String::from_utf8_lossy(&out.stderr);
    let mentions_format = stderr.contains("json-schema");
    assert!(
        mentions_format,
        "expected clap error mentioning json-schema; got: {stderr}"
    );
}