dta 0.3.0

Pure Rust streaming reader and writer for Stata's DTA file format, covering every released version (104-119), including XML-framed releases, tagged missing values, value-label sets, and long-string (strL) storage.
Documentation
use std::fs::{self, File};
use std::path::{Path, PathBuf};

use dta::stata::dta::dta_reader::DtaReader;
use dta::stata::dta::long_string_table::LongStringTable;
use dta::stata::dta::value::Value;

/// Reads every eligible `.dta` fixture in the local fixture directory and
/// prints the per-section counts of each one via `read_dta_section_counts`.
///
/// The directory defaults to `/mnt/c/Publish/pandas-stata-fixtures`. Set the
/// `DTA_FIXTURE_DIR` environment variable to a non-empty path to run the test
/// against a different location.
///
/// # Panics
///
/// Panics if the fixture directory cannot be read, if it contains no eligible
/// `.dta` file (so the test cannot pass without checking anything), or if
/// reading any fixture fails.
#[test]
#[ignore = "Using local files that require a license"]
fn read_auto_dta_section_counts() {
    const DEFAULT_FIXTURE_DIR: &str = "/mnt/c/Publish/pandas-stata-fixtures";

    // stata1_encoding.dta (V114/Windows-1252) reads fine, but
    // stata1_encoding_118.dta contains UTF-16-LE byte sequences
    // inside a file that declares itself UTF-8 — pandas bug or
    // not, the data genuinely isn't valid UTF-8 and the strict
    // V118 decoder rejects it. Separate concern from library
    // support.
    const SKIPPED_FIXTURE: &str = "stata1_encoding_118.dta";

    // An unset or empty override falls back to the default location.
    let fixture_dir = std::env::var_os("DTA_FIXTURE_DIR")
        .filter(|dir| !dir.is_empty())
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(DEFAULT_FIXTURE_DIR));

    let mut paths: Vec<PathBuf> = fs::read_dir(&fixture_dir)
        .unwrap_or_else(|e| {
            panic!(
                "failed to read fixture directory {}: {e}",
                fixture_dir.display()
            )
        })
        .filter_map(Result::ok)
        .map(|entry| entry.path())
        .filter(|path| path.extension().and_then(|e| e.to_str()) == Some("dta"))
        // A file name that is missing or not valid UTF-8 can never equal the
        // skipped name, so such paths are kept.
        .filter(|path| path.file_name().and_then(|n| n.to_str()) != Some(SKIPPED_FIXTURE))
        .collect();
    // Sort so that the fixtures are processed in a stable order.
    paths.sort();

    assert!(
        !paths.is_empty(),
        "no .dta fixtures found in {}",
        fixture_dir.display()
    );

    for path in &paths {
        read_dta_section_counts(path);
    }
}

/// Walks a single DTA file section by section and prints what it finds to
/// stderr: the header counts, the schema counts, the characteristic count,
/// every record (one line each, values joined by `  |  `), the long-string
/// count, and the value-label-set count.
///
/// Long strings are visited twice: once up front, loaded into a
/// [`LongStringTable`] so that long-string references inside records can be
/// turned into text while printing, and once more after the records, where
/// they are only counted.
///
/// # Panics
///
/// Panics if `path` cannot be opened or if any section fails to read.
fn read_dta_section_counts(path: &Path) {
    // Marker printed for missing values and for long-string references that
    // yield no text.
    const MISSING: &str = "NA";
    // Lazy fallback: the marker string is only allocated when it is needed.
    let missing = || MISSING.to_string();

    eprintln!("File: {}", path.display());
    let file =
        File::open(path).unwrap_or_else(|e| panic!("failed to open {}: {e}", path.display()));
    let header_reader = DtaReader::default().from_file(file);

    // Header + Schema
    let schema_reader = header_reader.read_header().expect("failed to read header");
    let header = schema_reader.header();
    eprintln!("Variable count: {}", header.variable_count());
    eprintln!("Observation count: {}", header.observation_count());

    let mut characteristic_reader = schema_reader.read_schema().expect("failed to read schema");
    let schema = characteristic_reader.schema();
    eprintln!("Actual variable count: {}", schema.variables().len());
    eprintln!("Sort order count: {}", schema.sort_order().len());

    // Characteristics
    let mut characteristic_count = 0;
    while let Some(_characteristic) = characteristic_reader
        .read_characteristic()
        .expect("failed to read characteristic")
    {
        characteristic_count += 1;
    }
    eprintln!("Characteristic count: {characteristic_count}");

    // Long strings (strls come before value labels in the file).
    // Jump forward to the strL section, populate the resolve table,
    // then jump to records.
    let mut long_string_reader = characteristic_reader
        .seek_long_strings()
        .expect("failed to jump to long string reader");

    let mut long_string_table = LongStringTable::for_reading();
    long_string_reader
        .read_remaining_into(&mut long_string_table)
        .expect("Could not read long string table");

    let mut record_reader = long_string_reader
        .seek_records()
        .expect("failed to jump to records");

    // Records
    let mut record_count = 0u64;
    let encoding = record_reader.encoding();
    while let Some(record) = record_reader.read_record().expect("failed to read record") {
        let mut value_strings = Vec::with_capacity(record.values().len());
        for value in record.values() {
            let value_str = match &value {
                Value::Byte(b) => b.present().map_or_else(missing, |b| b.to_string()),
                Value::Int(i) => i.present().map_or_else(missing, |b| b.to_string()),
                Value::Long(l) => l.present().map_or_else(missing, |b| b.to_string()),
                // Floating-point values are printed with four decimal places.
                Value::Float(f) => f.present().map_or_else(missing, |b| format!("{b:0.4}")),
                Value::Double(d) => d.present().map_or_else(missing, |b| format!("{b:0.4}")),
                Value::String(d) => d.to_string(),
                // Look the reference up in the table filled before the
                // records were read.
                Value::LongStringRef(r) => long_string_table
                    .get(r)
                    .and_then(|s| s.data_str(encoding).map(|s| s.to_string()))
                    .unwrap_or_else(missing),
            };
            value_strings.push(value_str);
        }
        let joined = value_strings.join("  |  ");
        eprintln!("{joined}");
        record_count += 1;
    }
    eprintln!("Actual observation count: {record_count}");

    // Long strings, second pass: transition from the record reader and count
    // the entries one by one.
    let mut long_string_reader = record_reader
        .into_long_string_reader()
        .expect("failed to transition to long string reader");

    let mut long_string_count = 0;
    while let Some(_long_string) = long_string_reader
        .read_long_string()
        .expect("failed to read long string")
    {
        long_string_count += 1;
    }
    eprintln!("Long string count: {long_string_count}");

    // Value labels
    let mut value_label_reader = long_string_reader
        .into_value_label_reader()
        .expect("failed to transition to value label reader");

    let mut value_label_set_count = 0;
    while let Some(_value_label_set) = value_label_reader
        .read_value_label_set()
        .expect("failed to read value label set")
    {
        value_label_set_count += 1;
    }
    eprintln!("Value label count: {value_label_set_count}");
}