Skip to main content

ambers/
metadata.rs

1use arrow::datatypes::{DataType, Schema, TimeUnit};
2use indexmap::IndexMap;
3
4use crate::constants::{Alignment, Compression, Measure, Role};
5use crate::variable::MissingValues;
6
/// A value that can be used as a key in value label maps.
///
/// Numeric keys are compared, ordered and hashed through their IEEE-754 bit
/// pattern, so `Eq`, `Ord` and `Hash` agree with each other even for `NaN`
/// and for signed zeros (`-0.0` and `0.0` are distinct keys).
#[derive(Debug, Clone)]
pub enum Value {
    /// A numeric key.
    Numeric(f64),
    /// A string key.
    String(String),
}

// Manual Hash/Eq for Value since f64 doesn't implement Hash.
// We use the raw bit pattern for numeric values.
impl std::hash::Hash for Value {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        // A leading tag byte keeps a numeric key and a string key from
        // feeding identical byte streams into the hasher.
        match self {
            Value::Numeric(v) => {
                0_u8.hash(state);
                v.to_bits().hash(state);
            }
            Value::String(s) => {
                1_u8.hash(state);
                s.hash(state);
            }
        }
    }
}

impl PartialEq for Value {
    /// Two values are equal when they are the same variant and hold the same
    /// string, or the same numeric bit pattern.
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (Value::Numeric(a), Value::Numeric(b)) => a.to_bits() == b.to_bits(),
            (Value::String(a), Value::String(b)) => a == b,
            _ => false,
        }
    }
}

impl Eq for Value {}

impl PartialOrd for Value {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Value {
    /// Total order: every numeric value sorts before every string value.
    ///
    /// Numbers use `f64::total_cmp`, which returns `Equal` exactly when the
    /// bit patterns match. That keeps this impl consistent with `PartialEq`
    /// and gives `NaN` a fixed position instead of making it compare equal
    /// to every other number.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        match (self, other) {
            (Value::Numeric(a), Value::Numeric(b)) => a.total_cmp(b),
            (Value::String(a), Value::String(b)) => a.cmp(b),
            // Numeric sorts before String
            (Value::Numeric(_), Value::String(_)) => std::cmp::Ordering::Less,
            (Value::String(_), Value::Numeric(_)) => std::cmp::Ordering::Greater,
        }
    }
}

impl std::fmt::Display for Value {
    /// Numbers without a fractional part are printed as integers; everything
    /// else uses the default `f64` / `String` formatting.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `i64::MIN as f64` is exactly -2^63 and `i64::MAX as f64` rounds up
        // to 2^63, so this half-open range is precisely the set of floats
        // whose `as i64` cast does not saturate.
        const I64_LOWER: f64 = i64::MIN as f64;
        const I64_UPPER: f64 = i64::MAX as f64;

        match self {
            Value::Numeric(v) => {
                // NaN and infinities fail the `fract` test; whole numbers
                // outside the i64 range fall through to the float formatter
                // rather than being clamped to i64::MIN / i64::MAX.
                if v.fract() == 0.0 && (I64_LOWER..I64_UPPER).contains(v) {
                    write!(f, "{}", *v as i64)
                } else {
                    write!(f, "{v}")
                }
            }
            Value::String(s) => write!(f, "{s}"),
        }
    }
}
78
/// Public-API description of one user-defined missing value.
///
/// A variable's missing values are expressed as a list of these entries.
#[derive(Debug, Clone)]
pub enum MissingSpec {
    /// One discrete numeric value treated as missing.
    Value(f64),
    /// A numeric interval treated as missing, given by its two bounds.
    Range {
        /// Lower bound of the interval.
        lo: f64,
        /// Upper bound of the interval.
        hi: f64,
    },
    /// One discrete string value treated as missing.
    StringValue(String),
}
89
90/// Convert public MissingSpec list back to internal MissingValues.
91pub fn specs_to_missing(specs: &[MissingSpec]) -> MissingValues {
92    if specs.is_empty() {
93        return MissingValues::None;
94    }
95    // Separate ranges from discrete values
96    let mut ranges: Vec<(f64, f64)> = Vec::new();
97    let mut discrete_f64: Vec<f64> = Vec::new();
98    let mut discrete_str: Vec<Vec<u8>> = Vec::new();
99    for spec in specs {
100        match spec {
101            MissingSpec::Range { lo, hi } => ranges.push((*lo, *hi)),
102            MissingSpec::Value(v) => discrete_f64.push(*v),
103            MissingSpec::StringValue(s) => {
104                let mut bytes = s.as_bytes().to_vec();
105                bytes.resize(8, b' ');
106                discrete_str.push(bytes);
107            }
108        }
109    }
110    if !discrete_str.is_empty() {
111        return MissingValues::DiscreteString(discrete_str);
112    }
113    if let Some((lo, hi)) = ranges.first() {
114        if let Some(&val) = discrete_f64.first() {
115            return MissingValues::RangeAndValue {
116                low: *lo,
117                high: *hi,
118                value: val,
119            };
120        }
121        return MissingValues::Range {
122            low: *lo,
123            high: *hi,
124        };
125    }
126    MissingValues::DiscreteNumeric(discrete_f64)
127}
128
129/// Convert internal MissingValues to public MissingSpec list.
130pub fn missing_to_specs(mv: &MissingValues) -> Vec<MissingSpec> {
131    match mv {
132        MissingValues::None => vec![],
133        MissingValues::DiscreteNumeric(vals) => {
134            vals.iter().map(|&v| MissingSpec::Value(v)).collect()
135        }
136        MissingValues::Range { low, high } => {
137            vec![MissingSpec::Range {
138                lo: *low,
139                hi: *high,
140            }]
141        }
142        MissingValues::RangeAndValue { low, high, value } => {
143            vec![
144                MissingSpec::Range {
145                    lo: *low,
146                    hi: *high,
147                },
148                MissingSpec::Value(*value),
149            ]
150        }
151        MissingValues::DiscreteString(vals) => vals
152            .iter()
153            .map(|v| MissingSpec::StringValue(String::from_utf8_lossy(v).trim_end().to_string()))
154            .collect(),
155    }
156}
157
158/// Multiple response set definition.
159#[derive(Debug, Clone)]
160pub struct MrSet {
161    pub name: String,
162    pub label: String,
163    pub mr_type: MrType,
164    pub counted_value: Option<String>,
165    pub variables: Vec<String>,
166}
167
/// The kind of a multiple response set (see `MrSet::mr_type`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MrType {
    /// A multiple dichotomy set.
    MultipleDichotomy,
    /// A multiple category set.
    MultipleCategory,
}
173
174/// The complete metadata for an SPSS file.
175#[derive(Debug, Clone)]
176pub struct SpssMetadata {
177    // File-level
178    pub file_label: String,
179    pub file_encoding: String,
180    pub compression: Compression,
181    pub creation_time: String,
182    pub notes: Vec<String>,
183    pub number_rows: Option<i64>,
184    pub number_columns: usize,
185    pub file_format: String,
186
187    // Variable names (ordered -- defines Arrow schema column order)
188    pub variable_names: Vec<String>,
189
190    // Variable labels: {name -> label}
191    pub variable_labels: IndexMap<String, String>,
192
193    // Type info
194    pub variable_formats: IndexMap<String, String>,
195    pub arrow_data_types: IndexMap<String, String>,
196
197    // Value labels: {var_name -> {value -> label}}
198    pub variable_value_labels: IndexMap<String, IndexMap<Value, String>>,
199
200    // Display properties
201    pub variable_alignments: IndexMap<String, Alignment>,
202    pub variable_storage_widths: IndexMap<String, usize>,
203    pub variable_display_widths: IndexMap<String, u32>,
204    pub variable_measures: IndexMap<String, Measure>,
205
206    // Missing values
207    pub variable_missing_values: IndexMap<String, Vec<MissingSpec>>,
208
209    // SPSS-specific
210    pub mr_sets: IndexMap<String, MrSet>,
211    pub variable_roles: IndexMap<String, Role>,
212    pub variable_attributes: IndexMap<String, IndexMap<String, Vec<String>>>,
213    pub weight_variable: Option<String>,
214}
215
216impl SpssMetadata {
217    /// Get a variable label by name.
218    pub fn label(&self, name: &str) -> Option<&str> {
219        self.variable_labels.get(name).map(|s| s.as_str())
220    }
221
222    /// Get value labels for a variable.
223    pub fn value_labels(&self, name: &str) -> Option<&IndexMap<Value, String>> {
224        self.variable_value_labels.get(name)
225    }
226
227    /// Get the SPSS format string for a variable (e.g., "F8.2", "A50").
228    pub fn format(&self, name: &str) -> Option<&str> {
229        self.variable_formats.get(name).map(|s| s.as_str())
230    }
231
232    /// Get the measurement level for a variable.
233    pub fn measure(&self, name: &str) -> Option<Measure> {
234        self.variable_measures.get(name).copied()
235    }
236
237    /// Get the role for a variable.
238    pub fn role(&self, name: &str) -> Option<Role> {
239        self.variable_roles.get(name).copied()
240    }
241
242    /// Get all custom attributes for a variable.
243    pub fn attributes(&self, var_name: &str) -> Option<&IndexMap<String, Vec<String>>> {
244        self.variable_attributes.get(var_name)
245    }
246
247    /// Get a specific custom attribute's values for a variable.
248    pub fn attribute(&self, var_name: &str, attr_name: &str) -> Option<&Vec<String>> {
249        self.variable_attributes.get(var_name)?.get(attr_name)
250    }
251
252    /// Infer metadata from an Arrow schema (for write_sav without prior read metadata).
253    pub fn from_arrow_schema(schema: &Schema) -> Self {
254        let mut meta = SpssMetadata {
255            file_encoding: "UTF-8".to_string(),
256            file_format: "sav".to_string(),
257            number_columns: schema.fields().len(),
258            ..Default::default()
259        };
260
261        for field in schema.fields() {
262            let name = field.name().clone();
263            meta.variable_names.push(name.clone());
264
265            let (fmt_str, rust_type, measure, alignment) = match field.data_type() {
266                DataType::Float64 => ("F8.2".to_string(), "f64", Measure::Scale, Alignment::Right),
267                DataType::Int64 | DataType::Int32 | DataType::Int16 | DataType::Int8 => {
268                    ("F8.0".to_string(), "f64", Measure::Scale, Alignment::Right)
269                }
270                DataType::Boolean => (
271                    "F1.0".to_string(),
272                    "f64",
273                    Measure::Nominal,
274                    Alignment::Right,
275                ),
276                DataType::Date32 => (
277                    "DATE11".to_string(),
278                    "Date32",
279                    Measure::Scale,
280                    Alignment::Right,
281                ),
282                DataType::Timestamp(TimeUnit::Microsecond, _) => (
283                    "DATETIME23.2".to_string(),
284                    "Timestamp[us]",
285                    Measure::Scale,
286                    Alignment::Right,
287                ),
288                DataType::Duration(TimeUnit::Microsecond) => (
289                    "TIME11.2".to_string(),
290                    "Duration[us]",
291                    Measure::Scale,
292                    Alignment::Right,
293                ),
294                DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => (
295                    "A255".to_string(),
296                    "String",
297                    Measure::Nominal,
298                    Alignment::Left,
299                ),
300                _ => ("F8.2".to_string(), "f64", Measure::Scale, Alignment::Right),
301            };
302
303            // String storage width must match the format width so compute_layout()
304            // uses the correct VLS segment count.
305            let sw = if let Some(width_str) = fmt_str.strip_prefix('A') {
306                width_str.parse::<usize>().unwrap_or(255)
307            } else {
308                8
309            };
310            meta.variable_formats.insert(name.clone(), fmt_str);
311            meta.arrow_data_types
312                .insert(name.clone(), rust_type.to_string());
313            meta.variable_measures.insert(name.clone(), measure);
314            meta.variable_alignments.insert(name.clone(), alignment);
315            meta.variable_display_widths.insert(name.clone(), 8);
316            meta.variable_storage_widths.insert(name.clone(), sw);
317        }
318
319        meta
320    }
321}
322
323impl Default for SpssMetadata {
324    fn default() -> Self {
325        SpssMetadata {
326            file_label: String::new(),
327            file_encoding: "UTF-8".to_string(),
328            compression: Compression::None,
329            creation_time: String::new(),
330            notes: Vec::new(),
331            number_rows: None,
332            number_columns: 0,
333            file_format: "sav".to_string(),
334            variable_names: Vec::new(),
335            variable_labels: IndexMap::new(),
336            variable_formats: IndexMap::new(),
337            arrow_data_types: IndexMap::new(),
338            variable_value_labels: IndexMap::new(),
339            variable_alignments: IndexMap::new(),
340            variable_storage_widths: IndexMap::new(),
341            variable_display_widths: IndexMap::new(),
342            variable_measures: IndexMap::new(),
343            variable_missing_values: IndexMap::new(),
344            mr_sets: IndexMap::new(),
345            variable_roles: IndexMap::new(),
346            variable_attributes: IndexMap::new(),
347            weight_variable: None,
348        }
349    }
350}
351
352// ---------------------------------------------------------------------------
353// Timestamp helpers
354// ---------------------------------------------------------------------------
355
/// Three-letter English month abbreviations, January first; the entry at
/// index `i` belongs to month number `i + 1`.
const MONTH_ABBR: [&str; 12] = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun", // first half of the year
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", // second half of the year
];
359
360/// Parse SPSS header date ("21 Feb 26") + time ("12:38:47") into "2026-02-21 12:38:47".
361pub(crate) fn format_spss_datetime(date_str: &str, time_str: &str) -> String {
362    let parts: Vec<&str> = date_str.split_whitespace().collect();
363    if parts.len() == 3 {
364        let day: u32 = parts[0].parse().unwrap_or(0);
365        let month = MONTH_ABBR
366            .iter()
367            .position(|&m| m.eq_ignore_ascii_case(parts[1]))
368            .map(|i| i + 1)
369            .unwrap_or(0);
370        let yy: u32 = parts[2].parse().unwrap_or(0);
371        let year = 2000 + yy;
372        if day > 0 && month > 0 {
373            return format!("{year:04}-{month:02}-{day:02} {time_str}");
374        }
375    }
376    format!("{date_str} {time_str}")
377}
378
379/// Parse "2026-02-21 12:38:47" back into SPSS parts ("21 Feb 26", "12:38:47").
380/// Returns None if the format doesn't match.
381pub(crate) fn parse_iso_to_spss_parts(datetime: &str) -> Option<(String, String)> {
382    let (date_part, time_part) = datetime.split_once(' ')?;
383    let segs: Vec<&str> = date_part.split('-').collect();
384    if segs.len() != 3 {
385        return None;
386    }
387    let year: u32 = segs[0].parse().ok()?;
388    let month: usize = segs[1].parse().ok()?;
389    let day: u32 = segs[2].parse().ok()?;
390    if month == 0 || month > 12 {
391        return None;
392    }
393    let yy = year % 100;
394    let date = format!("{:02} {} {:02}", day, MONTH_ABBR[month - 1], yy);
395    Some((date, time_part.to_string()))
396}