use arrow::datatypes::{DataType, Schema, TimeUnit};
use indexmap::IndexMap;
use crate::constants::{Alignment, Compression, Measure, Role};
use crate::variable::MissingValues;
/// A value-label key: either a numeric code or a string code.
///
/// `f64` has neither total equality nor a total order, so numeric values are
/// hashed, compared and ordered through their IEEE-754 representation:
/// `0.0` and `-0.0` are distinct keys, and a NaN is equal to another NaN with
/// the same bit pattern. This keeps `Hash`, `Eq` and `Ord` mutually
/// consistent, which map and sort implementations rely on.
#[derive(Debug, Clone)]
pub enum Value {
    /// A numeric code.
    Numeric(f64),
    /// A string code.
    String(String),
}

impl std::hash::Hash for Value {
    /// Hashes a variant tag followed by the payload (the raw bits for numerics),
    /// matching the equality defined by `PartialEq`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        match self {
            Value::Numeric(v) => {
                0_u8.hash(state);
                v.to_bits().hash(state);
            }
            Value::String(s) => {
                1_u8.hash(state);
                s.hash(state);
            }
        }
    }
}

impl PartialEq for Value {
    /// Numerics are equal when their bit patterns are equal; strings compare
    /// as usual; values of different variants are never equal.
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (Value::Numeric(a), Value::Numeric(b)) => a.to_bits() == b.to_bits(),
            (Value::String(a), Value::String(b)) => a == b,
            _ => false,
        }
    }
}

impl Eq for Value {}

impl PartialOrd for Value {
    /// Delegates to the total order defined by `Ord`.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Value {
    /// Total order: every numeric sorts before every string.
    ///
    /// Numerics are ordered with `f64::total_cmp`, which returns `Equal` only
    /// for identical bit patterns. Falling back to `Equal` whenever
    /// `partial_cmp` fails would make NaN "equal" to every number and
    /// `-0.0` "equal" to `0.0`, contradicting `PartialEq` and breaking the
    /// transitivity that sorting and ordered collections require.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;

        match (self, other) {
            (Value::Numeric(a), Value::Numeric(b)) => a.total_cmp(b),
            (Value::String(a), Value::String(b)) => a.cmp(b),
            (Value::Numeric(_), Value::String(_)) => Ordering::Less,
            (Value::String(_), Value::Numeric(_)) => Ordering::Greater,
        }
    }
}

impl std::fmt::Display for Value {
    /// Writes whole numbers without a fractional part (`3` rather than `3.0`),
    /// other numerics with the default `f64` formatting, and strings verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // 2^63: the first magnitude an `i64` cannot hold. A float-to-int `as`
        // cast saturates, so larger whole numbers must not take the integer path.
        const I64_LIMIT: f64 = 9_223_372_036_854_775_808.0;

        match self {
            Value::Numeric(v) => {
                if v.is_finite() && v.fract() == 0.0 && v.abs() < I64_LIMIT {
                    write!(f, "{}", *v as i64)
                } else {
                    write!(f, "{v}")
                }
            }
            Value::String(s) => write!(f, "{s}"),
        }
    }
}
/// One missing-value rule for a variable.
///
/// `specs_to_missing` and `missing_to_specs` in this file convert between a
/// list of these rules and the `MissingValues` representation.
#[derive(Debug, Clone)]
pub enum MissingSpec {
/// A single numeric value.
Value(f64),
/// A numeric range from `lo` to `hi`.
/// NOTE(review): whether the bounds are inclusive is not visible here — confirm.
Range { lo: f64, hi: f64 },
/// A single string value.
StringValue(String),
}
/// Collapses a list of [`MissingSpec`] rules into one `MissingValues`.
///
/// Precedence, in order:
/// 1. no rules at all -> `MissingValues::None`;
/// 2. any string rule -> `DiscreteString` holding every string rule, each cut
///    or space-padded to exactly 8 bytes (numeric rules are then ignored);
/// 3. any range rule -> the first range, combined with the first discrete
///    number when one exists (`RangeAndValue`), otherwise `Range`;
/// 4. otherwise -> `DiscreteNumeric` with every discrete number.
pub fn specs_to_missing(specs: &[MissingSpec]) -> MissingValues {
    if specs.is_empty() {
        return MissingValues::None;
    }

    // String rules win outright; each is forced to a fixed 8-byte width.
    let padded_strings: Vec<Vec<u8>> = specs
        .iter()
        .filter_map(|spec| match spec {
            MissingSpec::StringValue(text) => Some(
                text.bytes()
                    .chain(std::iter::repeat(b' '))
                    .take(8)
                    .collect::<Vec<u8>>(),
            ),
            _ => None,
        })
        .collect();
    if !padded_strings.is_empty() {
        return MissingValues::DiscreteString(padded_strings);
    }

    // Only the first declared range is ever used.
    let first_range: Option<(f64, f64)> = specs.iter().find_map(|spec| match spec {
        MissingSpec::Range { lo, hi } => Some((*lo, *hi)),
        _ => None,
    });
    let numeric_values: Vec<f64> = specs
        .iter()
        .filter_map(|spec| match spec {
            MissingSpec::Value(number) => Some(*number),
            _ => None,
        })
        .collect();

    match (first_range, numeric_values.first().copied()) {
        (Some((low, high)), Some(value)) => MissingValues::RangeAndValue { low, high, value },
        (Some((low, high)), None) => MissingValues::Range { low, high },
        (None, _) => MissingValues::DiscreteNumeric(numeric_values),
    }
}
/// Expands a `MissingValues` into the equivalent list of [`MissingSpec`] rules.
///
/// * `None` yields an empty list.
/// * `DiscreteNumeric` yields one `Value` per number, in order.
/// * `Range` yields a single `Range`.
/// * `RangeAndValue` yields the `Range` followed by the `Value`.
/// * `DiscreteString` yields one `StringValue` per entry, decoded lossily as
///   UTF-8 with trailing whitespace removed.
pub fn missing_to_specs(mv: &MissingValues) -> Vec<MissingSpec> {
    let mut specs: Vec<MissingSpec> = Vec::new();

    match mv {
        MissingValues::None => {}
        MissingValues::DiscreteNumeric(vals) => {
            for &number in vals.iter() {
                specs.push(MissingSpec::Value(number));
            }
        }
        MissingValues::Range { low, high } => {
            specs.push(MissingSpec::Range {
                lo: *low,
                hi: *high,
            });
        }
        MissingValues::RangeAndValue { low, high, value } => {
            specs.push(MissingSpec::Range {
                lo: *low,
                hi: *high,
            });
            specs.push(MissingSpec::Value(*value));
        }
        MissingValues::DiscreteString(vals) => {
            for raw in vals.iter() {
                let decoded = String::from_utf8_lossy(raw);
                specs.push(MissingSpec::StringValue(decoded.trim_end().to_string()));
            }
        }
    }

    specs
}
/// A named group of variables with a label and a set type.
///
/// NOTE(review): presumably models an SPSS multiple-response set — confirm
/// against the reader/writer that populates it.
#[derive(Debug, Clone)]
pub struct MrSet {
/// Name of the set.
pub name: String,
/// Label of the set.
pub label: String,
/// Kind of set; see [`MrType`].
pub mr_type: MrType,
/// Optional counted value.
/// NOTE(review): presumably only meaningful for `MrType::MultipleDichotomy` — confirm.
pub counted_value: Option<String>,
/// Names of the variables that belong to the set.
pub variables: Vec<String>,
}
/// The kind of an [`MrSet`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MrType {
/// Multiple-dichotomy set.
MultipleDichotomy,
/// Multiple-category set.
MultipleCategory,
}
/// File-level and per-variable metadata for an SPSS dataset.
///
/// The per-variable maps are `IndexMap`s keyed by `String`;
/// `SpssMetadata::from_arrow_schema` keys them by column (variable) name.
#[derive(Debug, Clone)]
pub struct SpssMetadata {
/// File label; empty by default.
pub file_label: String,
/// Encoding name; `"UTF-8"` by default.
pub file_encoding: String,
/// Compression setting; `Compression::None` by default.
pub compression: Compression,
/// Creation timestamp as text; empty by default.
/// NOTE(review): expected format is not visible in this struct — confirm.
pub creation_time: String,
/// Free-form notes; empty by default.
pub notes: Vec<String>,
/// Row count when known; `None` by default.
pub number_rows: Option<i64>,
/// Number of columns; `from_arrow_schema` sets it to the schema's field count.
pub number_columns: usize,
/// File format name; `"sav"` by default.
pub file_format: String,
/// Variable names; `from_arrow_schema` fills this in schema field order.
pub variable_names: Vec<String>,
/// Label per variable.
pub variable_labels: IndexMap<String, String>,
/// Format string per variable (e.g. `"F8.2"`, `"A255"` as produced by `from_arrow_schema`).
pub variable_formats: IndexMap<String, String>,
/// Data-type name per variable (e.g. `"f64"`, `"String"` as produced by `from_arrow_schema`).
pub arrow_data_types: IndexMap<String, String>,
/// Value labels per variable: value -> label text.
pub variable_value_labels: IndexMap<String, IndexMap<Value, String>>,
/// Alignment per variable.
pub variable_alignments: IndexMap<String, Alignment>,
/// Storage width per variable.
/// NOTE(review): unit is presumably bytes — confirm.
pub variable_storage_widths: IndexMap<String, usize>,
/// Display width per variable.
pub variable_display_widths: IndexMap<String, u32>,
/// Measure per variable.
pub variable_measures: IndexMap<String, Measure>,
/// Missing-value rules per variable.
pub variable_missing_values: IndexMap<String, Vec<MissingSpec>>,
/// `MrSet` entries keyed by `String`.
/// NOTE(review): key is presumably the set name — confirm.
pub mr_sets: IndexMap<String, MrSet>,
/// Role per variable.
pub variable_roles: IndexMap<String, Role>,
/// Custom attributes per variable: attribute name -> list of values.
pub variable_attributes: IndexMap<String, IndexMap<String, Vec<String>>>,
/// Name of the weight variable, if any; `None` by default.
pub weight_variable: Option<String>,
}
impl SpssMetadata {
    /// Returns the label recorded for variable `name`, if any.
    pub fn label(&self, name: &str) -> Option<&str> {
        self.variable_labels.get(name).map(String::as_str)
    }

    /// Returns the value-label map recorded for variable `name`, if any.
    pub fn value_labels(&self, name: &str) -> Option<&IndexMap<Value, String>> {
        self.variable_value_labels.get(name)
    }

    /// Returns the format string recorded for variable `name`, if any.
    pub fn format(&self, name: &str) -> Option<&str> {
        self.variable_formats.get(name).map(String::as_str)
    }

    /// Returns the measure recorded for variable `name`, if any.
    pub fn measure(&self, name: &str) -> Option<Measure> {
        self.variable_measures.get(name).copied()
    }

    /// Returns the role recorded for variable `name`, if any.
    pub fn role(&self, name: &str) -> Option<Role> {
        self.variable_roles.get(name).copied()
    }

    /// Returns every custom attribute recorded for variable `var_name`, if any.
    pub fn attributes(&self, var_name: &str) -> Option<&IndexMap<String, Vec<String>>> {
        self.variable_attributes.get(var_name)
    }

    /// Returns the values of attribute `attr_name` on variable `var_name`.
    ///
    /// `None` when either the variable has no attributes or the attribute is absent.
    pub fn attribute(&self, var_name: &str, attr_name: &str) -> Option<&Vec<String>> {
        self.attributes(var_name)
            .and_then(|attrs| attrs.get(attr_name))
    }

    /// Builds default metadata for the columns of an Arrow `schema`.
    ///
    /// Each field contributes its name plus a format string, data-type name,
    /// measure, alignment, display width (always 8) and storage width chosen
    /// from its Arrow data type. Data types without a dedicated rule are
    /// described like `Float64` (`"F8.2"`, `"f64"`, scale, right-aligned).
    pub fn from_arrow_schema(schema: &Schema) -> Self {
        let mut meta = SpssMetadata {
            number_columns: schema.fields().len(),
            file_format: "sav".to_string(),
            file_encoding: "UTF-8".to_string(),
            ..Default::default()
        };

        for field in schema.fields() {
            let (format, type_name, measure, alignment) = match field.data_type() {
                DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => {
                    ("A255", "String", Measure::Nominal, Alignment::Left)
                }
                DataType::Boolean => ("F1.0", "f64", Measure::Nominal, Alignment::Right),
                DataType::Int64 | DataType::Int32 | DataType::Int16 | DataType::Int8 => {
                    ("F8.0", "f64", Measure::Scale, Alignment::Right)
                }
                DataType::Date32 => ("DATE11", "Date32", Measure::Scale, Alignment::Right),
                DataType::Timestamp(TimeUnit::Microsecond, _) => (
                    "DATETIME23.2",
                    "Timestamp[us]",
                    Measure::Scale,
                    Alignment::Right,
                ),
                DataType::Duration(TimeUnit::Microsecond) => {
                    ("TIME11.2", "Duration[us]", Measure::Scale, Alignment::Right)
                }
                // `Float64` and every data type without a dedicated rule.
                _ => ("F8.2", "f64", Measure::Scale, Alignment::Right),
            };

            // String formats ("A<width>") carry their own width; everything else uses 8.
            let storage_width = match format.strip_prefix('A') {
                Some(digits) => digits.parse::<usize>().unwrap_or(255),
                None => 8,
            };

            let name = field.name();
            meta.variable_names.push(name.clone());
            meta.variable_formats
                .insert(name.clone(), format.to_string());
            meta.arrow_data_types
                .insert(name.clone(), type_name.to_string());
            meta.variable_measures.insert(name.clone(), measure);
            meta.variable_alignments.insert(name.clone(), alignment);
            meta.variable_display_widths.insert(name.clone(), 8);
            meta.variable_storage_widths
                .insert(name.clone(), storage_width);
        }

        meta
    }
}
impl Default for SpssMetadata {
    /// Empty metadata: no variables, no compression, `"UTF-8"` encoding and
    /// the `"sav"` file format.
    fn default() -> Self {
        Self {
            // File-level properties.
            file_label: String::new(),
            file_encoding: String::from("UTF-8"),
            compression: Compression::None,
            creation_time: String::new(),
            notes: Vec::default(),
            number_rows: None,
            number_columns: 0,
            file_format: String::from("sav"),
            // Per-variable information, all empty.
            variable_names: Vec::default(),
            variable_labels: IndexMap::default(),
            variable_formats: IndexMap::default(),
            arrow_data_types: IndexMap::default(),
            variable_value_labels: IndexMap::default(),
            variable_alignments: IndexMap::default(),
            variable_storage_widths: IndexMap::default(),
            variable_display_widths: IndexMap::default(),
            variable_measures: IndexMap::default(),
            variable_missing_values: IndexMap::default(),
            mr_sets: IndexMap::default(),
            variable_roles: IndexMap::default(),
            variable_attributes: IndexMap::default(),
            weight_variable: None,
        }
    }
}
/// English three-letter month abbreviations, January first.
const MONTH_ABBR: [&str; 12] = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
];

/// Converts a `dd mmm yy` date and a time string into `YYYY-MM-DD <time>`.
///
/// `time_str` is appended verbatim. When `date_str` cannot be interpreted
/// (wrong number of fields, non-numeric day or year, unknown month, day
/// outside 1..=31) the two inputs are returned joined by a single space.
///
/// Two-digit years use the POSIX `%y` pivot (00–68 -> 20xx, 69–99 -> 19xx)
/// so that dates written last century round-trip through
/// [`parse_iso_to_spss_parts`]; years of 100 or more are taken as full years.
pub(crate) fn format_spss_datetime(date_str: &str, time_str: &str) -> String {
    match parse_spss_date(date_str) {
        Some((year, month, day)) => format!("{year:04}-{month:02}-{day:02} {time_str}"),
        None => format!("{date_str} {time_str}"),
    }
}

/// Parses `dd mmm yy` into `(year, month, day)` with a 1-based month.
fn parse_spss_date(date_str: &str) -> Option<(u32, usize, u32)> {
    let mut fields = date_str.split_whitespace();
    let (day_text, month_text, year_text) = (fields.next()?, fields.next()?, fields.next()?);
    if fields.next().is_some() {
        return None;
    }

    let day: u32 = day_text.parse().ok()?;
    if !(1..=31).contains(&day) {
        return None;
    }
    let month = MONTH_ABBR
        .iter()
        .position(|abbr| abbr.eq_ignore_ascii_case(month_text))?
        + 1;
    let raw_year: u32 = year_text.parse().ok()?;
    // Never add the century to a value that already is a full year: that
    // would yield e.g. 4024 and could overflow on very large input.
    let year = match raw_year {
        0..=68 => 2000 + raw_year,
        69..=99 => 1900 + raw_year,
        full => full,
    };

    Some((year, month, day))
}

/// Splits `YYYY-MM-DD<sep>time` into a `dd mmm yy` date and the time text.
///
/// `<sep>` may be a space or the ISO 8601 `T`. The time text is returned
/// verbatim. Returns `None` when the separator is missing, the date does not
/// consist of exactly three `-`-separated numbers, the month is outside
/// 1..=12, or the day is outside 1..=31.
pub(crate) fn parse_iso_to_spss_parts(datetime: &str) -> Option<(String, String)> {
    let (date_part, time_part) = datetime.split_once(|c: char| c == ' ' || c == 'T')?;

    let mut segments = date_part.split('-');
    let (year_text, month_text, day_text) =
        (segments.next()?, segments.next()?, segments.next()?);
    if segments.next().is_some() {
        return None;
    }

    let year: u32 = year_text.parse().ok()?;
    let month: usize = month_text.parse().ok()?;
    let day: u32 = day_text.parse().ok()?;
    if !(1..=31).contains(&day) {
        return None;
    }
    // `checked_sub` rejects month 0, `get` rejects months above 12.
    let month_name = MONTH_ABBR.get(month.checked_sub(1)?)?;

    let date = format!("{:02} {} {:02}", day, month_name, year % 100);
    Some((date, time_part.to_string()))
}