#![allow(
clippy::cast_lossless,
clippy::cast_precision_loss,
clippy::cast_possible_truncation
)]
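//! QA fixture generator: writes well-formed Parquet/CSV/NDJSON inputs plus
//! edge cases (empty, corrupt, messy, large, and drifted datasets) under
//! `test_fixtures/`. Re-running overwrites the fixtures in place.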
use std::{
fs::{self, File},
io::Write,
sync::Arc,
};
use arrow::{
array::{Float64Array, Int32Array, StringArray},
datatypes::{DataType, Field, Schema},
record_batch::RecordBatch,
};
use parquet::{arrow::ArrowWriter, file::properties::WriterProperties};
const FIXTURE_DIR: &str = "test_fixtures";
fn main() -> Result<(), Box<dyn std::error::Error>> {
println!("=== Alimentar QA Fixture Generator ===");
println!("Toyota Way: Heijunka (Leveling)\n");
fs::create_dir_all(FIXTURE_DIR)?;
println!("Created directory: {FIXTURE_DIR}/");
generate_basic_parquet()?;
generate_basic_csv()?;
generate_basic_json()?;
generate_empty_parquet()?;
generate_corrupt_parquet()?;
generate_messy_dataset()?;
generate_large_parquet()?;
generate_drift_datasets()?;
println!("\n✓ All fixtures generated successfully");
println!("Ready for: docs/specifications/100-cargo-run-examples-spec.md");
Ok(())
}
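/// Writes `data.parquet`: 1000 rows of (id, name, value) with no nulls,
/// the happy-path input for the basic load examples.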
fn generate_basic_parquet() -> Result<(), Box<dyn std::error::Error>> {
let path = format!("{FIXTURE_DIR}/data.parquet");
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, false),
Field::new("value", DataType::Float64, false),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Int32Array::from((1..=1000).collect::<Vec<_>>())),
Arc::new(StringArray::from(
(1..=1000).map(|i| format!("item_{i}")).collect::<Vec<_>>(),
)),
Arc::new(Float64Array::from(
(1..=1000).map(|i| i as f64 * 0.1).collect::<Vec<_>>(),
)),
],
)?;
let file = File::create(&path)?;
let props = WriterProperties::builder().build();
let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
writer.write(&batch)?;
writer.close()?;
println!(" ✓ {path} (1000 rows)");
Ok(())
}
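/// Writes `input.csv`: 500 rows with a header row and a `region` column
/// that cycles through US/EU/APAC/LATAM.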
fn generate_basic_csv() -> Result<(), Box<dyn std::error::Error>> {
let path = format!("{FIXTURE_DIR}/input.csv");
let mut file = File::create(&path)?;
writeln!(file, "id,name,value,region")?;
for i in 1..=500 {
let region = match i % 4 {
0 => "US",
1 => "EU",
2 => "APAC",
_ => "LATAM",
};
writeln!(file, "{i},item_{i},{:.2},{region}", i as f64 * 0.5)?;
}
println!(" ✓ {path} (500 rows)");
Ok(())
}
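/// Writes `data.json`: 100 rows of newline-delimited JSON (NDJSON), one
/// object per line, with `active` alternating between true and false.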
fn generate_basic_json() -> Result<(), Box<dyn std::error::Error>> {
let path = format!("{FIXTURE_DIR}/data.json");
let mut file = File::create(&path)?;
for i in 1..=100 {
let json = format!(
r#"{{"id":{},"name":"item_{}","value":{:.2},"active":{}}}"#,
i,
i,
i as f64 * 0.3,
i % 2 == 0
);
writeln!(file, "{json}")?;
}
println!(" ✓ {path} (100 rows, NDJSON)");
Ok(())
}
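/// Writes `empty.parquet`: a valid Parquet file with a two-column schema
/// but zero rows, for exercising empty-dataset handling.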
fn generate_empty_parquet() -> Result<(), Box<dyn std::error::Error>> {
let path = format!("{FIXTURE_DIR}/empty.parquet");
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("value", DataType::Float64, false),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Int32Array::from(Vec::<i32>::new())),
Arc::new(Float64Array::from(Vec::<f64>::new())),
],
)?;
let file = File::create(&path)?;
let props = WriterProperties::builder().build();
let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
writer.write(&batch)?;
writer.close()?;
println!(" ✓ {path} (0 rows - empty dataset test)");
Ok(())
}
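/// Writes `corrupt.parquet`: the file opens with the 4-byte `PAR1` magic so
/// it looks like Parquet, but the body is junk and the footer is missing,
/// so any reader should fail fast (Jidoka stop-to-fix).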
fn generate_corrupt_parquet() -> Result<(), Box<dyn std::error::Error>> {
let path = format!("{FIXTURE_DIR}/corrupt.parquet");
let mut file = File::create(&path)?;
file.write_all(b"PAR1\x00\x00\x00\x00INVALID_CONTENT_FOR_JIDOKA_TEST")?;
println!(" ✓ {path} (corrupt - Jidoka stop-to-fix test)");
Ok(())
}
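/// Writes `messy.parquet`: 200 rows where every 10th `name` is null, every
/// 15th `score` is null, and every 50th `score` (unless already null) is a
/// 999.0 outlier, for data-quality checks.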
fn generate_messy_dataset() -> Result<(), Box<dyn std::error::Error>> {
let path = format!("{FIXTURE_DIR}/messy.parquet");
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, true), Field::new("score", DataType::Float64, true), Field::new("category", DataType::Utf8, false),
]));
let ids: Vec<i32> = (1..=200).collect();
let names: Vec<Option<&str>> = (1..=200)
.map(|i| if i % 10 == 0 { None } else { Some("name") })
.collect();
let scores: Vec<Option<f64>> = (1..=200)
.map(|i| {
if i % 15 == 0 {
None
} else if i % 50 == 0 {
                Some(999.0)
            } else {
Some(i as f64 * 0.5)
}
})
.collect();
let categories: Vec<&str> = (1..=200)
.map(|i| match i % 3 {
0 => "A",
1 => "B",
_ => "C",
})
.collect();
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Int32Array::from(ids)),
Arc::new(StringArray::from(names)),
Arc::new(Float64Array::from(scores)),
Arc::new(StringArray::from(categories)),
],
)?;
let file = File::create(&path)?;
let props = WriterProperties::builder().build();
let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
writer.write(&batch)?;
writer.close()?;
println!(" ✓ {path} (200 rows - nulls, outliers for quality tests)");
Ok(())
}
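/// Writes `large.parquet`: 1M rows streamed as ten 100k-row batches with
/// Snappy compression, so peak memory stays at one batch.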
fn generate_large_parquet() -> Result<(), Box<dyn std::error::Error>> {
let path = format!("{FIXTURE_DIR}/large.parquet");
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("value", DataType::Float64, false),
Field::new("category", DataType::Utf8, false),
]));
let file = File::create(&path)?;
let props = WriterProperties::builder()
.set_compression(parquet::basic::Compression::SNAPPY)
.build();
let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
for chunk in 0..10 {
let start = chunk * 100_000;
let ids: Vec<i32> = (start..start + 100_000).collect();
let values: Vec<f64> = ids.iter().map(|&i| i as f64 * 0.001).collect();
let cats: Vec<&str> = ids
.iter()
.map(|i| match i % 5 {
0 => "alpha",
1 => "beta",
2 => "gamma",
3 => "delta",
_ => "epsilon",
})
.collect();
let batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Int32Array::from(ids)),
Arc::new(Float64Array::from(values)),
Arc::new(StringArray::from(cats)),
],
)?;
writer.write(&batch)?;
}
writer.close()?;
let size = fs::metadata(&path)?.len();
println!(
" ✓ {path} (1M rows, {:.1}MB compressed - large dataset test)",
size as f64 / 1_000_000.0
);
Ok(())
}
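/// Writes the drift pair: `baseline.parquet` and `current.parquet` share a
/// schema, but `current` shifts `age` upward (base 30 -> 45), shifts
/// `income` up by ~20k, and reweights `category` from a uniform A/B/C split
/// to 40/20/40, so drift detectors should fire on all three columns.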
fn generate_drift_datasets() -> Result<(), Box<dyn std::error::Error>> {
let schema = Arc::new(Schema::new(vec![
Field::new("age", DataType::Float64, false),
Field::new("income", DataType::Float64, false),
Field::new("category", DataType::Utf8, false),
]));
let baseline_path = format!("{FIXTURE_DIR}/baseline.parquet");
let baseline_ages: Vec<f64> = (0..1000).map(|i| 30.0 + (i % 40) as f64).collect();
let baseline_incomes: Vec<f64> = (0..1000).map(|i| 50000.0 + (i % 50000) as f64).collect();
let baseline_cats: Vec<&str> = (0..1000)
.map(|i| match i % 3 {
0 => "A",
1 => "B",
_ => "C",
})
.collect();
let baseline_batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Float64Array::from(baseline_ages)),
Arc::new(Float64Array::from(baseline_incomes)),
Arc::new(StringArray::from(baseline_cats)),
],
)?;
let file = File::create(&baseline_path)?;
let mut writer = ArrowWriter::try_new(file, schema.clone(), None)?;
writer.write(&baseline_batch)?;
writer.close()?;
let current_path = format!("{FIXTURE_DIR}/current.parquet");
    let current_ages: Vec<f64> = (0..1000).map(|i| 45.0 + (i % 30) as f64).collect();
    let current_incomes: Vec<f64> = (0..1000).map(|i| 70000.0 + (i % 60000) as f64).collect();
    let current_cats: Vec<&str> = (0..1000)
.map(|i| match i % 5 {
0 | 1 => "A",
2 => "B",
_ => "C",
})
.collect();
let current_batch = RecordBatch::try_new(
schema.clone(),
vec![
Arc::new(Float64Array::from(current_ages)),
Arc::new(Float64Array::from(current_incomes)),
Arc::new(StringArray::from(current_cats)),
],
)?;
    let file = File::create(&current_path)?;
let mut writer = ArrowWriter::try_new(file, schema, None)?;
    writer.write(&current_batch)?;
writer.close()?;
println!(" ✓ {baseline_path} (1000 rows - drift baseline)");
println!(" ✓ {current_path} (1000 rows - drifted distribution)");
Ok(())
}