use super::*;
use arrow::array::{Array, Float64Array, Float64Builder, StringBuilder, StringViewArray};
use arrow::datatypes::{Field, Schema};
use indexmap::IndexMap;
use std::collections::HashSet;
use std::io::Cursor;
use std::sync::Arc;
use crate::constants::*;
use crate::metadata::{MissingSpec, SpssMetadata, Value, specs_to_missing};
use crate::variable::MissingValues;
fn make_simple_batch() -> RecordBatch {
let schema = Arc::new(Schema::new(vec![
Field::new("age", DataType::Float64, true),
Field::new("name", DataType::Utf8, true),
]));
let mut age_builder = Float64Builder::new();
age_builder.append_value(25.0);
age_builder.append_value(30.0);
age_builder.append_null();
let mut name_builder = StringBuilder::new();
name_builder.append_value("Alice");
name_builder.append_value("Bob");
name_builder.append_null();
RecordBatch::try_new(
schema,
vec![
Arc::new(age_builder.finish()),
Arc::new(name_builder.finish()),
],
)
.unwrap()
}
#[test]
// The fixed SAV header record should come out at exactly 176 bytes for this
// layout. NOTE(review): 176 matches the documented size of the SPSS system
// file header record — confirm against records::write_header if it changes.
fn test_write_header_size() {
let batch = make_simple_batch();
let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
let layout = layout::compute_layout(&batch, &meta).unwrap();
let mut cursor = Cursor::new(Vec::new());
// Trailing `3` presumably is the case (row) count — TODO confirm against
// write_header's signature.
records::write_header(&mut cursor, &layout, &meta, Compression::None, 3).unwrap();
assert_eq!(cursor.into_inner().len(), 176);
}
#[test]
fn test_write_and_read_back() {
    // Uncompressed write of the shared fixture, then read back: shape and
    // variable count must match what was written.
    let batch = make_simple_batch();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::None, None).unwrap();
    let bytes = cursor.into_inner();
    let (batch2, meta2) = crate::read_sav_from_reader(Cursor::new(bytes)).unwrap();
    assert_eq!((batch2.num_rows(), batch2.num_columns()), (3, 2));
    assert_eq!(meta2.variable_names.len(), 2);
}
#[test]
fn test_format_roundtrip() {
    // Parse -> pack -> unpack must preserve type, width and decimals for a
    // representative set of SPSS format strings.
    let formats = ["F8.2", "A50", "DATE11", "DATETIME23.2", "TIME11.2", "F1.0", "N5"];
    for fmt_str in &formats {
        let fmt = SpssFormat::from_string(fmt_str);
        assert!(fmt.is_some(), "failed to parse {fmt_str}");
        let fmt = fmt.unwrap();
        let unpacked = SpssFormat::from_packed(fmt.to_packed());
        assert!(unpacked.is_some(), "failed to unpack {fmt_str}");
        let unpacked = unpacked.unwrap();
        assert_eq!(fmt.format_type, unpacked.format_type, "type mismatch for {fmt_str}");
        assert_eq!(fmt.width, unpacked.width, "width mismatch for {fmt_str}");
        assert_eq!(fmt.decimals, unpacked.decimals, "decimals mismatch for {fmt_str}");
    }
}
#[test]
fn test_short_name_generation() {
    // Short names are capped at 8 bytes (SAV short-name limit). Both inputs
    // share the same 8-char prefix ("variable"), so the second call exercises
    // collision handling and must produce a distinct name.
    let mut used = HashSet::new();
    let name1 = layout::generate_short_name("variable_long_name_1", &mut used);
    assert!(name1.len() <= 8);
    assert!(used.contains(&name1));
    let name2 = layout::generate_short_name("variable_long_name_2", &mut used);
    assert!(name2.len() <= 8);
    // Previously only name1's registration was checked; name2 must also be
    // recorded in `used`, or a later call could collide with it.
    assert!(used.contains(&name2));
    assert_ne!(name1, name2);
}
#[test]
fn test_missing_values_roundtrip() {
    // Two discrete numeric missing codes must convert into
    // MissingValues::DiscreteNumeric with order preserved.
    let specs = vec![MissingSpec::Value(99.0), MissingSpec::Value(-1.0)];
    let MissingValues::DiscreteNumeric(vals) = specs_to_missing(&specs) else {
        panic!("expected DiscreteNumeric");
    };
    assert_eq!(vals.len(), 2);
    assert_eq!(vals[0], 99.0);
    assert_eq!(vals[1], -1.0);
}
/// Reads `path`, rewrites it uncompressed into memory, reads it back, and
/// asserts that row count, column count, variable-name count and (for the
/// first five columns) null counts are preserved.
/// NOTE(review): near-duplicate of `roundtrip_file_bytecode` and
/// `roundtrip_file_zsav`, differing only in the `Compression` argument.
fn roundtrip_file(path: &str) {
let (batch, meta) = crate::read_sav(path).unwrap();
let orig_rows = batch.num_rows();
let orig_cols = batch.num_columns();
let mut cursor = Cursor::new(Vec::new());
write_sav_to_writer(&mut cursor, &batch, &meta, Compression::None, None).unwrap();
let reader = Cursor::new(cursor.into_inner());
let (batch2, meta2) = crate::read_sav_from_reader(reader).unwrap();
assert_eq!(
batch2.num_rows(),
orig_rows,
"row count mismatch for {path}"
);
assert_eq!(
batch2.num_columns(),
orig_cols,
"column count mismatch for {path}"
);
assert_eq!(
meta2.variable_names.len(),
meta.variable_names.len(),
"variable name count mismatch for {path}"
);
// Null counts are only spot-checked on the first 5 columns to keep the
// check cheap on wide files.
for col_idx in 0..orig_cols.min(5) {
let col1 = batch.column(col_idx);
let col2 = batch2.column(col_idx);
assert_eq!(
col1.null_count(),
col2.null_count(),
"null count mismatch for column {} in {path}",
col_idx,
);
}
}
// Fixture-backed uncompressed round trips; each test is skipped when its
// .sav fixture is not present on disk.
#[test]
fn test_roundtrip_test1_small() {
    let path = "test_data/test_1_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file(path);
}

#[test]
fn test_roundtrip_test4_small() {
    let path = "test_data/test_4_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file(path);
}

#[test]
fn test_roundtrip_test5_small() {
    let path = "test_data/test_5_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file(path);
}

#[test]
fn test_roundtrip_test7_nulls() {
    let path = "test_data/test7_null_column.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file(path);
}
#[test]
fn test_roundtrip_test9_empty_metadata() {
    // Fixture with variables but zero data rows. Reading may panic in some
    // reader versions, so the call is guarded with catch_unwind; when it
    // succeeds, the variable metadata must still be present.
    let path = "test_data/test9_empty_file_with_metadata.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    let result = std::panic::catch_unwind(|| crate::read_sav(path));
    if let Ok(Ok((_batch, meta))) = result {
        // Removed an unused `RecordBatch::new_empty(...)` local that was
        // constructed here and never read (dead code).
        assert!(!meta.variable_names.is_empty());
    }
}
#[test]
fn test_write_with_value_labels() {
    // A numeric "gender" column with value labels and a variable label must
    // survive an uncompressed write/read cycle.
    let schema = Arc::new(Schema::new(vec![Field::new("gender", DataType::Float64, true)]));
    let mut builder = Float64Builder::new();
    builder.append_slice(&[1.0, 2.0, 1.0]);
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();

    let mut meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let labels: IndexMap<_, _> = [
        (Value::Numeric(1.0), "Male".to_string()),
        (Value::Numeric(2.0), "Female".to_string()),
    ]
    .into_iter()
    .collect();
    meta.variable_value_labels.insert("gender".to_string(), labels);
    meta.variable_labels
        .insert("gender".to_string(), "Respondent gender".to_string());

    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::None, None).unwrap();
    let (batch2, meta2) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();

    assert_eq!(batch2.num_rows(), 3);
    let read_labels = meta2
        .variable_value_labels
        .get("gender")
        .expect("value labels missing");
    assert_eq!(read_labels.len(), 2);
    assert_eq!(read_labels.get(&Value::Numeric(1.0)), Some(&"Male".to_string()));
    assert_eq!(
        meta2.variable_labels.get("gender"),
        Some(&"Respondent gender".to_string())
    );
}
#[test]
fn test_write_numeric_data_accuracy() {
    // Float64 values across several magnitudes must survive an uncompressed
    // write/read cycle to within 1e-15 absolute error.
    let values = [0.0, 1.0, -1.0, 3.14159, 1e10, -1e10, 0.001];
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Float64, true)]));
    let mut builder = Float64Builder::new();
    builder.append_slice(&values);
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::None, None).unwrap();
    let (batch2, _) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();
    let col = batch2
        .column(0)
        .as_any()
        .downcast_ref::<Float64Array>()
        .unwrap();
    assert_eq!(col.len(), values.len());
    for (i, (&expected, &actual)) in values.iter().zip(col.values().iter()).enumerate() {
        assert!(
            (actual - expected).abs() < 1e-15,
            "row {i}: expected {expected}, got {actual}"
        );
    }
}
#[test]
fn test_bytecode_write_and_read_back() {
    // Bytecode-compressed round trip of the shared fixture; the reader must
    // also report the compression mode it detected.
    let batch = make_simple_batch();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Bytecode, None).unwrap();
    let bytes = cursor.into_inner();
    let (batch2, meta2) = crate::read_sav_from_reader(Cursor::new(bytes)).unwrap();
    assert_eq!((batch2.num_rows(), batch2.num_columns()), (3, 2));
    assert_eq!(meta2.variable_names.len(), 2);
    assert_eq!(meta2.compression, Compression::Bytecode);
}
#[test]
fn test_bytecode_numeric_accuracy() {
    // A spread of magnitudes, including small integers (presumably chosen to
    // exercise the bytecode compressor's special-value range), must read
    // back within 1e-15 absolute error.
    let values = [0.0, 1.0, -1.0, 3.14159, 1e10, -1e10, 0.001, -99.0, 42.0, 151.0];
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Float64, true)]));
    let mut builder = Float64Builder::new();
    builder.append_slice(&values);
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Bytecode, None).unwrap();
    let (batch2, _) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();
    let col = batch2
        .column(0)
        .as_any()
        .downcast_ref::<Float64Array>()
        .unwrap();
    assert_eq!(col.len(), values.len());
    for (i, (&expected, &actual)) in values.iter().zip(col.values().iter()).enumerate() {
        assert!(
            (actual - expected).abs() < 1e-15,
            "row {i}: expected {expected}, got {actual}"
        );
    }
}
#[test]
fn test_bytecode_with_nulls() {
    // Null slots in both numeric and string columns must survive bytecode
    // compression; the numeric column's null mask is checked row by row.
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Float64, true),
        Field::new("b", DataType::Utf8, true),
    ]));
    let mut a_builder = Float64Builder::new();
    for v in [Some(1.0), None, Some(3.0), None] {
        a_builder.append_option(v);
    }
    let mut b_builder = StringBuilder::new();
    for v in [Some("hello"), None, None, Some("world")] {
        b_builder.append_option(v);
    }
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(a_builder.finish()), Arc::new(b_builder.finish())],
    )
    .unwrap();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Bytecode, None).unwrap();
    let (batch2, _) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();
    assert_eq!(batch2.num_rows(), 4);
    let col_a = batch2
        .column(0)
        .as_any()
        .downcast_ref::<Float64Array>()
        .unwrap();
    // Written pattern: value, null, value, null.
    for (row, expect_null) in [false, true, false, true].into_iter().enumerate() {
        assert_eq!(col_a.is_null(row), expect_null, "row {row} null mismatch");
    }
}
#[test]
fn test_bytecode_with_value_labels() {
    // Value labels must survive a bytecode-compressed round trip.
    let schema = Arc::new(Schema::new(vec![Field::new("gender", DataType::Float64, true)]));
    let mut builder = Float64Builder::new();
    builder.append_slice(&[1.0, 2.0, 1.0]);
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();

    let mut meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let labels: IndexMap<_, _> = [
        (Value::Numeric(1.0), "Male".to_string()),
        (Value::Numeric(2.0), "Female".to_string()),
    ]
    .into_iter()
    .collect();
    meta.variable_value_labels.insert("gender".to_string(), labels);

    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Bytecode, None).unwrap();
    let (batch2, meta2) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();
    assert_eq!(batch2.num_rows(), 3);
    let gender_labels = meta2.variable_value_labels.get("gender").unwrap();
    assert_eq!(gender_labels.len(), 2);
    assert_eq!(
        gender_labels.get(&Value::Numeric(1.0)),
        Some(&"Male".to_string())
    );
}
#[test]
fn test_bytecode_smaller_than_uncompressed() {
    // Repetitive numeric data should shrink under bytecode compression.
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Float64, true)]));
    let mut builder = Float64Builder::new();
    (0..100).for_each(|i| builder.append_value((i % 50) as f64));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    // Write the same batch under a given mode and return the byte size.
    let written_len = |compression| {
        let mut cursor = Cursor::new(Vec::new());
        write_sav_to_writer(&mut cursor, &batch, &meta, compression, None).unwrap();
        cursor.into_inner().len()
    };
    let len_none = written_len(Compression::None);
    let len_bytecode = written_len(Compression::Bytecode);
    assert!(
        len_bytecode < len_none,
        "bytecode ({len_bytecode}) should be smaller than uncompressed ({len_none})",
    );
}
/// Same as `roundtrip_file` but writes with `Compression::Bytecode` and
/// additionally asserts that the reader reports bytecode compression.
fn roundtrip_file_bytecode(path: &str) {
let (batch, meta) = crate::read_sav(path).unwrap();
let orig_rows = batch.num_rows();
let orig_cols = batch.num_columns();
let mut cursor = Cursor::new(Vec::new());
write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Bytecode, None).unwrap();
let reader = Cursor::new(cursor.into_inner());
let (batch2, meta2) = crate::read_sav_from_reader(reader).unwrap();
assert_eq!(
batch2.num_rows(),
orig_rows,
"row count mismatch for {path}"
);
assert_eq!(
batch2.num_columns(),
orig_cols,
"column count mismatch for {path}"
);
assert_eq!(
meta2.variable_names.len(),
meta.variable_names.len(),
"variable name count mismatch for {path}"
);
assert_eq!(
meta2.compression,
Compression::Bytecode,
"compression should be bytecode for {path}"
);
// Null counts are spot-checked on the first 5 columns only.
for col_idx in 0..orig_cols.min(5) {
let col1 = batch.column(col_idx);
let col2 = batch2.column(col_idx);
assert_eq!(
col1.null_count(),
col2.null_count(),
"null count mismatch for column {} in {path}",
col_idx,
);
}
}
// Fixture-backed bytecode round trips; skipped when the fixture is absent.
#[test]
fn test_bytecode_roundtrip_test1_small() {
    let path = "test_data/test_1_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_bytecode(path);
}

#[test]
fn test_bytecode_roundtrip_test4_small() {
    let path = "test_data/test_4_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_bytecode(path);
}

#[test]
fn test_bytecode_roundtrip_test5_small() {
    let path = "test_data/test_5_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_bytecode(path);
}

#[test]
fn test_bytecode_roundtrip_test7_nulls() {
    let path = "test_data/test7_null_column.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_bytecode(path);
}
#[test]
fn test_zsav_write_and_read_back() {
    // Zlib-compressed (.zsav) round trip of the shared fixture; the reader
    // must also report the compression mode it detected.
    let batch = make_simple_batch();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Zlib, None).unwrap();
    let bytes = cursor.into_inner();
    let (batch2, meta2) = crate::read_sav_from_reader(Cursor::new(bytes)).unwrap();
    assert_eq!((batch2.num_rows(), batch2.num_columns()), (3, 2));
    assert_eq!(meta2.variable_names.len(), 2);
    assert_eq!(meta2.compression, Compression::Zlib);
}
#[test]
fn test_zsav_numeric_accuracy() {
    // The same magnitude spread as the bytecode test, but through the zlib
    // (.zsav) path; values must read back within 1e-15 absolute error.
    let values = [0.0, 1.0, -1.0, 3.14159, 1e10, -1e10, 0.001, -99.0, 42.0, 151.0];
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Float64, true)]));
    let mut builder = Float64Builder::new();
    builder.append_slice(&values);
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Zlib, None).unwrap();
    let (batch2, _) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();
    let col = batch2
        .column(0)
        .as_any()
        .downcast_ref::<Float64Array>()
        .unwrap();
    assert_eq!(col.len(), values.len());
    for (i, (&expected, &actual)) in values.iter().zip(col.values().iter()).enumerate() {
        assert!(
            (actual - expected).abs() < 1e-15,
            "row {i}: expected {expected}, got {actual}"
        );
    }
}
#[test]
fn test_zsav_with_nulls() {
    // Null slots in numeric and string columns must survive zlib compression.
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Float64, true),
        Field::new("b", DataType::Utf8, true),
    ]));
    let mut a_builder = Float64Builder::new();
    for v in [Some(1.0), None, Some(3.0), None] {
        a_builder.append_option(v);
    }
    let mut b_builder = StringBuilder::new();
    for v in [Some("hello"), None, None, Some("world")] {
        b_builder.append_option(v);
    }
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(a_builder.finish()), Arc::new(b_builder.finish())],
    )
    .unwrap();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Zlib, None).unwrap();
    let (batch2, _) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();
    assert_eq!(batch2.num_rows(), 4);
    let col_a = batch2
        .column(0)
        .as_any()
        .downcast_ref::<Float64Array>()
        .unwrap();
    // Written pattern: value, null, value, null.
    for (row, expect_null) in [false, true, false, true].into_iter().enumerate() {
        assert_eq!(col_a.is_null(row), expect_null, "row {row} null mismatch");
    }
}
#[test]
fn test_zsav_smaller_than_uncompressed() {
    // Repetitive numeric data should shrink under zlib compression.
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Float64, true)]));
    let mut builder = Float64Builder::new();
    (0..100).for_each(|i| builder.append_value((i % 50) as f64));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();
    let meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    // Write the same batch under a given mode and return the byte size.
    let written_len = |compression| {
        let mut cursor = Cursor::new(Vec::new());
        write_sav_to_writer(&mut cursor, &batch, &meta, compression, None).unwrap();
        cursor.into_inner().len()
    };
    let len_none = written_len(Compression::None);
    let len_zlib = written_len(Compression::Zlib);
    assert!(
        len_zlib < len_none,
        "zsav ({len_zlib}) should be smaller than uncompressed ({len_none})",
    );
}
/// Same as `roundtrip_file` but writes with `Compression::Zlib` (.zsav) and
/// additionally asserts that the reader reports zlib compression.
fn roundtrip_file_zsav(path: &str) {
let (batch, meta) = crate::read_sav(path).unwrap();
let orig_rows = batch.num_rows();
let orig_cols = batch.num_columns();
let mut cursor = Cursor::new(Vec::new());
write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Zlib, None).unwrap();
let reader = Cursor::new(cursor.into_inner());
let (batch2, meta2) = crate::read_sav_from_reader(reader).unwrap();
assert_eq!(
batch2.num_rows(),
orig_rows,
"row count mismatch for {path}"
);
assert_eq!(
batch2.num_columns(),
orig_cols,
"column count mismatch for {path}"
);
assert_eq!(
meta2.variable_names.len(),
meta.variable_names.len(),
"variable name count mismatch for {path}"
);
assert_eq!(
meta2.compression,
Compression::Zlib,
"compression should be zlib for {path}"
);
// Null counts are spot-checked on the first 5 columns only.
for col_idx in 0..orig_cols.min(5) {
let col1 = batch.column(col_idx);
let col2 = batch2.column(col_idx);
assert_eq!(
col1.null_count(),
col2.null_count(),
"null count mismatch for column {} in {path}",
col_idx,
);
}
}
// Fixture-backed zsav round trips; skipped when the fixture is absent.
#[test]
fn test_zsav_roundtrip_test1_small() {
    let path = "test_data/test_1_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_zsav(path);
}

#[test]
fn test_zsav_roundtrip_test4_small() {
    let path = "test_data/test_4_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_zsav(path);
}

#[test]
fn test_zsav_roundtrip_test5_small() {
    let path = "test_data/test_5_small.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_zsav(path);
}

#[test]
fn test_zsav_roundtrip_test7_nulls() {
    let path = "test_data/test7_null_column.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_zsav(path);
}
// Medium fixture through all three compression paths; skipped when absent.
#[test]
fn test_roundtrip_test2_medium() {
    let path = "test_data/test_2_medium.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file(path);
}

#[test]
fn test_bytecode_roundtrip_test2_medium() {
    let path = "test_data/test_2_medium.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_bytecode(path);
}

#[test]
fn test_zsav_roundtrip_test2_medium() {
    let path = "test_data/test_2_medium.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_zsav(path);
}
// NOTE(review): these tests exercise test_3_medium.sav but were previously
// named "..._test3_large", contradicting the fixture they actually use.
// Renamed to match; test functions have no callers, so this is safe.
#[test]
fn test_roundtrip_test3_medium() {
    let path = "test_data/test_3_medium.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file(path);
}

#[test]
fn test_bytecode_roundtrip_test3_medium() {
    let path = "test_data/test_3_medium.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_bytecode(path);
}

#[test]
fn test_zsav_roundtrip_test3_medium() {
    let path = "test_data/test_3_medium.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    roundtrip_file_zsav(path);
}
#[test]
fn test_vls_last_segment_layout() {
    // A string with declared width 500 is written as a very-long-string (VLS)
    // variable split into two segments (500 = 252 + 248, per the assertion
    // messages below): the non-last segment spans 32 slots (raw_type 255),
    // the last one 31 slots (raw_type 248).
    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Float64, true),
        Field::new("longtext", DataType::Utf8, true),
    ]));
    let mut id_builder = Float64Builder::new();
    id_builder.append_value(1.0);
    id_builder.append_value(2.0);
    let mut text_builder = StringBuilder::new();
    text_builder.append_value("A".repeat(400));
    text_builder.append_value("B".repeat(300));
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(id_builder.finish()),
            Arc::new(text_builder.finish()),
        ],
    )
    .unwrap();
    let mut meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    // Declared storage width 500 (larger than any actual value) forces VLS.
    meta.variable_formats
        .insert("longtext".to_string(), "A500".to_string());
    meta.variable_storage_widths
        .insert("longtext".to_string(), 500);
    let layout = layout::compute_layout(&batch, &meta).unwrap();
    let vls_var = layout
        .write_vars
        .iter()
        .find(|v| v.long_name == "longtext")
        .unwrap();
    assert_eq!(vls_var.n_segments, 2, "width=500 needs 2 segments");
    assert_eq!(vls_var.n_slots, 32, "non-last segment: 32 slots");
    assert_eq!(
        vls_var.last_n_slots, 31,
        "last segment: 31 slots (remaining=248)"
    );
    assert_eq!(vls_var.total_slots(), 63, "total: 32 + 31 = 63 slots");
    // Only named, non-ghost records represent user-visible variable records:
    // id + two longtext segments = 3. (Was `short_name.len() > 0`; clippy
    // `len_zero` prefers `is_empty`.)
    let non_ghost_records: Vec<&layout::SlotRecord> = layout
        .slot_records
        .iter()
        .filter(|r| !r.is_ghost && !r.short_name.is_empty())
        .collect();
    assert_eq!(non_ghost_records.len(), 3);
    assert_eq!(non_ghost_records[1].raw_type, 255, "seg0 raw_type=255");
    assert_eq!(
        non_ghost_records[2].raw_type, 248,
        "seg1 raw_type=248 (last)"
    );
}
#[test]
fn test_vls_three_segment_layout() {
    // Declared width 600 splits into three VLS segments; the last one
    // occupies 12 slots, giving 32 + 32 + 12 = 76 slots in total.
    let schema = Arc::new(Schema::new(vec![Field::new("bigtext", DataType::Utf8, true)]));
    let mut builder = StringBuilder::new();
    builder.append_value("X".repeat(600));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(builder.finish())]).unwrap();

    let mut meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
    meta.variable_formats
        .insert("bigtext".to_string(), "A600".to_string());
    meta.variable_storage_widths
        .insert("bigtext".to_string(), 600);

    let layout = layout::compute_layout(&batch, &meta).unwrap();
    let vls_var = layout
        .write_vars
        .iter()
        .find(|v| v.long_name == "bigtext")
        .unwrap();
    assert_eq!(vls_var.n_segments, 3);
    assert_eq!(vls_var.last_n_slots, 12);
    assert_eq!(vls_var.total_slots(), 32 + 32 + 12, "32+32+12=76");
}
/// End-to-end VLS round trip: two very-long-string columns (declared widths
/// 500 and 1000) mixed with normal columns, written under every compression
/// mode. The reader must reconstruct exactly 5 logical columns — VLS ghost
/// segment columns must not leak into the result — with the data intact.
#[test]
fn test_vls_roundtrip_no_ghost_columns() {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Float64, true),
Field::new("short_str", DataType::Utf8, true),
Field::new("vls_500", DataType::Utf8, true),
Field::new("vls_1000", DataType::Utf8, true),
Field::new("score", DataType::Float64, true),
]));
let n = 10;
let mut id_b = Float64Builder::new();
let mut short_b = StringBuilder::new();
let mut vls500_b = StringBuilder::new();
let mut vls1000_b = StringBuilder::new();
let mut score_b = Float64Builder::new();
// Each VLS cell ends with its row index so truncation or corruption at the
// segment boundary is detectable via the ends_with check below.
for i in 0..n {
id_b.append_value(i as f64);
short_b.append_value(&format!("row_{i}"));
vls500_b.append_value(&format!("{}{}", "A".repeat(400), i));
vls1000_b.append_value(&format!("{}{}", "B".repeat(900), i));
score_b.append_value(i as f64 * 1.5);
}
let batch = RecordBatch::try_new(
schema,
vec![
Arc::new(id_b.finish()),
Arc::new(short_b.finish()),
Arc::new(vls500_b.finish()),
Arc::new(vls1000_b.finish()),
Arc::new(score_b.finish()),
],
)
.unwrap();
let mut meta = SpssMetadata::from_arrow_schema(batch.schema().as_ref());
// Declared storage widths of 500/1000 force VLS segmentation in the writer.
meta.variable_formats
.insert("short_str".to_string(), "A20".to_string());
meta.variable_formats
.insert("vls_500".to_string(), "A500".to_string());
meta.variable_storage_widths
.insert("vls_500".to_string(), 500);
meta.variable_formats
.insert("vls_1000".to_string(), "A1000".to_string());
meta.variable_storage_widths
.insert("vls_1000".to_string(), 1000);
for compression in [Compression::None, Compression::Bytecode, Compression::Zlib] {
let mut cursor = Cursor::new(Vec::new());
write_sav_to_writer(&mut cursor, &batch, &meta, compression, None).unwrap();
let reader = Cursor::new(cursor.into_inner());
let (batch2, meta2) = crate::read_sav_from_reader(reader).unwrap();
assert_eq!(
batch2.num_columns(),
5,
"VLS roundtrip ({compression:?}): expected 5 columns, got {}",
batch2.num_columns()
);
assert_eq!(
batch2.num_rows(),
n,
"VLS roundtrip ({compression:?}): row count mismatch"
);
assert_eq!(
meta2.variable_names, meta.variable_names,
"VLS roundtrip ({compression:?}): variable names mismatch"
);
// NOTE(review): the reader evidently materializes strings as
// StringViewArray (not StringArray) — revisit if the reader changes.
let vls500_col = batch2.column(2);
let vls500_arr = vls500_col
.as_any()
.downcast_ref::<StringViewArray>()
.unwrap();
assert!(
vls500_arr.value(0).starts_with("AAAA"),
"VLS 500 data corrupted ({compression:?})"
);
assert!(
vls500_arr.value(0).ends_with("0"),
"VLS 500 data corrupted ({compression:?})"
);
let vls1000_col = batch2.column(3);
let vls1000_arr = vls1000_col
.as_any()
.downcast_ref::<StringViewArray>()
.unwrap();
assert!(
vls1000_arr.value(0).starts_with("BBBB"),
"VLS 1000 data corrupted ({compression:?})"
);
}
}
#[test]
fn test_roundtrip_test3_metadata() {
    // Variable names, value labels and variable labels must survive a
    // bytecode-compressed rewrite of the medium fixture (skipped if absent).
    let path = "test_data/test_3_medium.sav";
    if !std::path::Path::new(path).exists() {
        return;
    }
    let (batch, meta) = crate::read_sav(path).unwrap();
    let mut cursor = Cursor::new(Vec::new());
    write_sav_to_writer(&mut cursor, &batch, &meta, Compression::Bytecode, None).unwrap();
    let (_batch2, meta2) = crate::read_sav_from_reader(Cursor::new(cursor.into_inner())).unwrap();

    assert_eq!(
        meta.variable_names, meta2.variable_names,
        "variable_names mismatch"
    );
    assert_eq!(
        meta.variable_value_labels.len(),
        meta2.variable_value_labels.len(),
        "variable_value_labels count mismatch: orig={}, written={}",
        meta.variable_value_labels.len(),
        meta2.variable_value_labels.len(),
    );
    for (var, labels) in &meta.variable_value_labels {
        let labels2 = meta2
            .variable_value_labels
            .get(var)
            .unwrap_or_else(|| panic!("missing value labels for variable: {var}"));
        assert_eq!(
            labels.len(),
            labels2.len(),
            "value label count mismatch for {var}"
        );
    }
    assert_eq!(
        meta.variable_labels.len(),
        meta2.variable_labels.len(),
        "variable_labels count mismatch"
    );
}