Skip to main content

Crate dta

Crate dta 

Source
Expand description

A pure Rust reader and writer for Stata data formats.

Two related formats live in this crate:

  • DTA — Stata’s binary dataset format (stata::dta). Every released version is supported (102 through 119), including XML-framed releases (117+), tagged missing values, value-label sets, and long-string (strL) storage. The API is built around a typestate chain — you walk through the sections of a file in order, and each phase hands the underlying I/O handle to the next.
  • DCT — Stata’s plain-text dictionary format (stata::dct). A dictionary describes the schema of a fixed-width or free-format data file. The reader is a two-step builder: parse the dictionary, then pair the resulting schema with a data source.

Format-agnostic Stata-domain types — MissingValue, StataByte, StataInt, StataLong, StataFloat, StataDouble, StataTimestamp, and the temporal helpers — live in the stata module and are shared between the two formats.

See the README for the full tour, including DCT examples.

§Reading a DTA file

use dta::stata::dta::dta_reader::DtaReader;
use dta::stata::dta::dta_error::Result;

let mut characteristic_reader = DtaReader::new()
    .from_path("example.dta")?
    .read_header()?
    .read_schema()?;

// Characteristics are optional — skip them if you don't care.
characteristic_reader.skip_to_end()?;

// Iterate observation rows.
let mut record_reader = characteristic_reader.into_record_reader()?;
let schema = record_reader.schema().clone();
while let Some(record) = record_reader.read_record()? {
    for (variable, value) in schema.variables().iter().zip(record.values()) {
        println!("{}: {:?}", variable.name(), value);
    }
}

§Writing a DTA file

use dta::stata::dta::byte_order::ByteOrder;
use dta::stata::dta::dta_error::Result;
use dta::stata::dta::dta_writer::DtaWriter;
use dta::stata::dta::header::Header;
use dta::stata::dta::release::Release;
use dta::stata::dta::schema::Schema;
use dta::stata::dta::value::Value;
use dta::stata::dta::variable::Variable;
use dta::stata::dta::variable_type::VariableType;
use dta::stata::stata_long::StataLong;

let header = Header::builder(Release::V118, ByteOrder::LittleEndian).build();
let schema = Schema::builder()
    .add_variable(Variable::builder(VariableType::Long, "id").format("%12.0g"))
    .build()?;

let mut record_writer = DtaWriter::new()
    .from_path("example.dta")?
    .write_header(header)?
    .write_schema(schema)?
    .into_record_writer()?;
record_writer.write_record(&[Value::Long(StataLong::Present(1))])?;

record_writer
    .into_long_string_writer()?
    .into_value_label_writer()?
    .finish()?;

§Round-trip (runnable)

This example runs the writer and the reader back to back against an in-memory buffer, so it actually executes in the test harness:

use std::io::Cursor;
use dta::stata::dta::byte_order::ByteOrder;
use dta::stata::dta::dta_error::Result;
use dta::stata::dta::dta_reader::DtaReader;
use dta::stata::dta::dta_writer::DtaWriter;
use dta::stata::dta::header::Header;
use dta::stata::dta::release::Release;
use dta::stata::dta::schema::Schema;
use dta::stata::dta::value::Value;
use dta::stata::dta::variable::Variable;
use dta::stata::dta::variable_type::VariableType;
use dta::stata::stata_long::StataLong;

let header = Header::builder(Release::V118, ByteOrder::LittleEndian).build();
let schema = Schema::builder()
    .add_variable(Variable::builder(VariableType::Long, "id").format("%12.0g"))
    .build()?;

let mut record_writer = DtaWriter::new()
    .from_writer(Cursor::new(Vec::<u8>::new()))
    .write_header(header)?
    .write_schema(schema)?
    .into_record_writer()?;
record_writer.write_record(&[Value::Long(StataLong::Present(42))])?;
let bytes = record_writer
    .into_long_string_writer()?
    .into_value_label_writer()?
    .finish()?
    .into_inner();

let mut characteristic_reader = DtaReader::new()
    .from_reader(Cursor::new(bytes))
    .read_header()?
    .read_schema()?;
characteristic_reader.skip_to_end()?;
let mut record_reader = characteristic_reader.into_record_reader()?;
let record = record_reader.read_record()?.unwrap();
assert_eq!(record.values().len(), 1);

§Reading a DCT dictionary + data file

use dta::stata::dct::dct_reader::DctReader;
use dta::stata::dct::dct_source::DctSource;
use dta::stata::dct::dct_error::Result;

let source = DctSource::options().from_path("schema.dct")?;
let mut reader = match source {
    DctSource::External(schema) => {
        DctReader::options(schema).from_path("data.dat")?
    }
    DctSource::Embedded { schema, reader } => {
        DctReader::options(schema).from_reader(reader)
    }
};

// Capture column names up front: the lending pattern means
// `record` borrows the reader exclusively, so `reader.schema()`
// can't be called inside the loop body.
let column_names: Vec<String> = reader
    .schema()
    .columns()
    .iter()
    .map(|c| c.name().to_string())
    .collect();

while let Some(record) = reader.read_record()? {
    for (name, value) in column_names.iter().zip(record.values()) {
        println!("{}: {:?}", name, value);
    }
}

§Async

Enable the tokio feature for async mirrors of every entry point. The async API follows the same typestate chain, with an .await at each step:

  • DTA: DtaReader::new().from_tokio_* / DtaWriter::new().from_tokio_*
  • DCT: DctSource::options().from_tokio_* and DctReader::options(schema).from_tokio_*

The async DCT paths share the same pure parsing state with the sync paths — the only difference is .await on read_line and fill_buf.

Modules§

stata
Stata file format types and utilities.