Skip to main content

ambers/
lib.rs

1//! ambers: Pure Rust reader for SPSS .sav and .zsav files.
2//!
3//! This library reads SPSS SAV/ZSAV files natively in Rust with no C dependencies.
4//! Data is returned as Apache Arrow RecordBatch for seamless integration with
5//! Polars, DataFusion, and other Arrow-compatible tools.
6//!
7//! # Quick Start
8//!
9//! ```no_run
10//! use ambers::read_sav;
11//!
12//! let (batch, meta) = read_sav("survey.sav").unwrap();
13//! println!("Rows: {}", batch.num_rows());
14//! println!("Columns: {}", batch.num_columns());
15//! println!("Variables: {:?}", meta.variable_names);
16//! ```
17
18pub(crate) mod arrow_convert;
19pub(crate) mod columnar;
20pub(crate) mod compression;
21pub mod constants;
22pub(crate) mod dictionary;
23pub(crate) mod document;
24pub(crate) mod encoding;
25pub mod error;
26pub(crate) mod header;
27pub(crate) mod info_records;
28pub(crate) mod io_utils;
29pub mod metadata;
30pub mod scanner;
31pub(crate) mod value_labels;
32pub(crate) mod variable;
33
34#[cfg(feature = "python")]
35mod python;
36
37use std::fs::File;
38use std::io::{BufReader, Read, Seek};
39use std::path::Path;
40
41use arrow::record_batch::RecordBatch;
42
43use crate::error::Result;
44use crate::scanner::SavScanner;
45
46// Re-export key public types
47pub use crate::constants::{Alignment, Measure};
48pub use crate::metadata::{MissingSpec, MrSet, MrType, SpssMetadata, Value};
49pub use crate::scanner::SavScanner as Scanner;
50
51/// Read an SPSS .sav or .zsav file, returning all data as an Arrow RecordBatch
52/// plus the file's metadata.
53///
54/// This loads the entire dataset into memory. For streaming batch reads or
55/// column projection, use `scan_sav()` instead.
56pub fn read_sav(path: impl AsRef<Path>) -> Result<(RecordBatch, SpssMetadata)> {
57    let mut scanner = scan_sav(path)?;
58    let metadata = scanner.metadata().clone();
59    let batch = scanner.collect_single()?;
60    Ok((batch, metadata))
61}
62
63/// Read an SPSS file from any reader that supports Read + Seek.
64pub fn read_sav_from_reader<R: Read + Seek>(reader: R) -> Result<(RecordBatch, SpssMetadata)> {
65    let mut scanner = scan_sav_from_reader(reader, usize::MAX)?;
66    let metadata = scanner.metadata().clone();
67    let batch = scanner.collect_single()?;
68    Ok((batch, metadata))
69}
70
71/// Read only the metadata from an SPSS file (no data).
72///
73/// This is much faster than `read_sav()` for files where you only need
74/// variable information, labels, or other metadata.
75pub fn read_sav_metadata(path: impl AsRef<Path>) -> Result<SpssMetadata> {
76    let file = File::open(path)?;
77    let buf_reader = BufReader::with_capacity(4 * 1024 * 1024, file);
78    let scanner = SavScanner::open(buf_reader, 0)?;
79    Ok(scanner.metadata().clone())
80}
81
82/// Create a streaming scanner for an SPSS .sav or .zsav file.
83///
84/// Reads metadata immediately. Data is read on demand via `next_batch()`
85/// or `collect_single()`. Supports column projection via `select()` and
86/// row limits via `limit()`.
87///
88/// Default batch size: 100,000 rows.
89///
90/// # Example
91/// ```no_run
92/// let mut scanner = ambers::scan_sav("survey.sav").unwrap();
93/// scanner.select(&["age", "gender"]).unwrap();
94/// scanner.limit(1000);
95/// while let Some(batch) = scanner.next_batch().unwrap() {
96///     println!("Batch: {} rows", batch.num_rows());
97/// }
98/// ```
99pub fn scan_sav(path: impl AsRef<Path>) -> Result<SavScanner<BufReader<File>>> {
100    let file = File::open(path)?;
101    let buf_reader = BufReader::with_capacity(4 * 1024 * 1024, file);
102    SavScanner::open(buf_reader, 100_000)
103}
104
105/// Create a streaming scanner from any Read+Seek source.
106pub fn scan_sav_from_reader<R: Read + Seek>(
107    reader: R,
108    batch_size: usize,
109) -> Result<SavScanner<R>> {
110    SavScanner::open(reader, batch_size)
111}