use std::io::BufRead;
use super::dct_error::{DctError, Result};
use super::dct_source::DctSource;
use super::dct_source_state::{DctSourceState, FeedOutcome};
/// Parses a dictionary from `reader` and reports where its data lives.
///
/// Lines are pulled from `reader` one at a time into the parser state's
/// buffer and fed to the state until it answers [`FeedOutcome::Done`].
/// Once the dictionary is complete, the reader is peeked: if any bytes
/// remain, the result is [`DctSource::Embedded`] (which keeps the reader);
/// otherwise it is [`DctSource::External`].
///
/// # Errors
///
/// * [`DctError::UnexpectedEofInDictionary`] when the input ends before the
///   state reports `Done`.
/// * Any error raised while reading a line, feeding it to the state, or
///   peeking at the remaining input is propagated.
pub(super) fn parse_dct<R: BufRead>(mut reader: R) -> Result<DctSource<R>> {
    let mut state = DctSourceState::new();
    let mut dictionary_complete = false;

    while !dictionary_complete {
        // NOTE(review): `read_line` appends to the buffer; presumably
        // `feed_buffered_line` consumes or clears it — confirm.
        let bytes_read = reader.read_line(state.buffer_mut())?;
        if bytes_read == 0 {
            // A zero-byte read means end of input.
            return Err(DctError::UnexpectedEofInDictionary);
        }

        let outcome = state.feed_buffered_line()?;
        dictionary_complete = matches!(outcome, FeedOutcome::Done);
    }

    let schema = state.into_schema();
    if has_more_data(&mut reader)? {
        Ok(DctSource::Embedded { schema, reader })
    } else {
        Ok(DctSource::External(schema))
    }
}
/// Reports whether `reader` still has unread bytes, without consuming them.
///
/// The check peeks through [`BufRead::fill_buf`]; an empty buffer means the
/// reader is at end of input.
///
/// `ErrorKind::Interrupted` is a transient, retryable condition, so the peek
/// is retried instead of being surfaced as a failure. This matches what
/// `BufRead::read_line` does internally.
///
/// # Errors
///
/// Any other I/O error from `fill_buf` is converted into this module's error
/// type and returned.
fn has_more_data<R: BufRead>(reader: &mut R) -> Result<bool> {
    loop {
        match reader.fill_buf() {
            Ok(pending) => return Ok(!pending.is_empty()),
            // Interrupted reads are non-fatal: try the peek again.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err.into()),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::stata::dct::column_anchor::ColumnAnchor;
    use crate::stata::dct::dct_warning::DctWarning;
    use crate::stata::dct::input_format::InputFormat;
    use crate::stata::dct::numeric_style::NumericStyle;
    use crate::stata::dct::variable_type::VariableType;
    use std::io::Cursor;

    /// Runs `parse_dct` over an in-memory byte slice.
    fn parse(input: &[u8]) -> Result<DctSource<Cursor<&[u8]>>> {
        let cursor = Cursor::new(input);
        parse_dct(cursor)
    }

    /// Like `parse`, but panics (at the caller's location) on a parse error.
    #[track_caller]
    fn parse_ok(input: &[u8]) -> DctSource<Cursor<&[u8]>> {
        parse(input).unwrap()
    }

    /// A one-variable dictionary yields a single free-numeric float column.
    #[test]
    fn parses_minimal_dictionary() {
        let source = parse_ok(b"dictionary {\n_column(1) myvar\n}\n");
        let schema = source.schema();
        let columns = schema.columns();
        assert_eq!(columns.len(), 1);

        let only = &columns[0];
        assert_eq!(only.name(), "myvar");
        assert_eq!(only.anchor(), ColumnAnchor::Absolute(0));
        assert_eq!(only.storage_type(), VariableType::Float);
        assert!(matches!(only.input_format(), InputFormat::FreeNumeric));

        assert_eq!(schema.declared_data_path(), None);
        assert!(matches!(source, DctSource::External(_)));
    }

    /// A `using` clause records the path and raises the "ignored" warning.
    #[test]
    fn parses_using_clause_emits_path_warning() {
        let source = parse_ok(b"dictionary using foo.raw {\n_column(1) v\n}\n");
        let schema = source.schema();
        assert_eq!(schema.declared_data_path(), Some("foo.raw"));

        let warned = schema.warnings().iter().any(
            |w| matches!(w, DctWarning::DeclaredPathIgnored { path } if path == "foo.raw"),
        );
        assert!(warned);
    }

    /// A double-quoted `using` path may contain spaces.
    #[test]
    fn parses_quoted_using_path() {
        let source =
            parse_ok(b"dictionary using \"path with spaces.raw\" {\n_column(1) v\n}\n");
        let declared = source.schema().declared_data_path();
        assert_eq!(declared, Some("path with spaces.raw"));
    }

    /// A leading `infile` keyword is accepted before `dictionary`.
    #[test]
    fn parses_infile_prefix() {
        let source = parse_ok(b"infile dictionary using foo.raw {\n_column(1) v\n}\n");
        let declared = source.schema().declared_data_path();
        assert_eq!(declared, Some("foo.raw"));
    }

    /// The opening brace may sit on the line after the header.
    #[test]
    fn handles_brace_on_separate_line() {
        let source = parse_ok(b"dictionary using foo.raw\n{\n_column(1) v\n}\n");
        let schema = source.schema();
        assert_eq!(schema.declared_data_path(), Some("foo.raw"));
        assert_eq!(schema.columns().len(), 1);
    }

    /// `*` comment lines before and inside the dictionary are ignored.
    #[test]
    fn skips_comments_inside_and_around_dictionary() {
        let text = b"* leading comment\n\
            dictionary {\n\
            * inline comment\n\
            _column(1) v\n\
            }\n";
        let source = parse_ok(text);
        assert_eq!(source.schema().columns().len(), 1);
    }

    /// Storage types, read formats and labels are all picked up per column.
    #[test]
    fn parses_storage_types_and_formats() {
        let text = b"dictionary {\n\
            _column(1) byte b1 %3.0f \"a byte\"\n\
            _column(4) int i1\n\
            _column(6) long l1 %10.0f\n\
            _column(16) float f1 %9.2f\n\
            _column(25) double d1\n\
            _column(33) str s1 %20s\n\
            _column(53) str10 s2\n\
            _column(63) str s3 \"free string\"\n\
            }\n";
        let source = parse_ok(text);
        let columns = source.schema().columns();
        assert_eq!(columns.len(), 8);

        // Storage type of every column, in declaration order.
        let expected_types = [
            VariableType::Byte,
            VariableType::Int,
            VariableType::Long,
            VariableType::Float,
            VariableType::Double,
            VariableType::String,
            VariableType::String,
            VariableType::String,
        ];
        for (index, expected) in (0usize..).zip(expected_types) {
            assert_eq!(columns[index].storage_type(), expected);
        }

        // Read formats.
        assert!(matches!(
            columns[0].input_format(),
            InputFormat::FixedNumeric {
                width: 3,
                decimals: 0,
                style: NumericStyle::Fixed,
            }
        ));
        assert!(matches!(columns[1].input_format(), InputFormat::FreeNumeric));
        assert!(matches!(
            columns[5].input_format(),
            InputFormat::FixedString { width: 20 }
        ));
        assert!(matches!(
            columns[6].input_format(),
            InputFormat::FixedString { width: 10 }
        ));
        assert!(matches!(columns[7].input_format(), InputFormat::FreeString));

        // Labels.
        assert_eq!(columns[0].label(), Some("a byte"));
        assert_eq!(columns[7].label(), Some("free string"));
    }

    /// `lrecl(n)` sets the logical record length.
    #[test]
    fn parses_lrecl() {
        let source = parse_ok(b"dictionary {\nlrecl(80)\n_column(1) v\n}\n");
        let record_length = source.schema().logical_record_length();
        assert_eq!(record_length, Some(80));
    }

    /// `firstlineoffile(n)` sets the first data line.
    #[test]
    fn parses_first_line_of_file() {
        let source = parse_ok(b"dictionary {\nfirstlineoffile(5)\n_column(1) v\n}\n");
        let first_line = source.schema().first_line_of_file();
        assert_eq!(first_line, Some(5));
    }

    /// A second `lrecl` is a hard error naming the directive.
    #[test]
    fn rejects_duplicate_lrecl() {
        let outcome = parse(b"dictionary {\nlrecl(80)\nlrecl(120)\n_column(1) v\n}\n");
        assert!(matches!(
            outcome,
            Err(DctError::DuplicateDirective { directive, .. }) if directive == "lrecl"
        ));
    }

    /// `_skip(n)` moves forward from where the previous field ended.
    #[test]
    fn skip_modifier_advances_cursor_relatively() {
        let text = b"dictionary {\n\
            _column(1) byte b1 %3.0f\n\
            _skip(2) byte b2 %3.0f\n\
            }\n";
        let source = parse_ok(text);
        let columns = source.schema().columns();
        assert_eq!(columns.len(), 2);
        assert_eq!(columns[0].anchor(), ColumnAnchor::Absolute(0));
        assert_eq!(columns[1].anchor(), ColumnAnchor::Absolute(5));
    }

    /// `_column` followed by `_skip` on one line adds the skip to the column.
    #[test]
    fn column_then_skip_combines() {
        let text = b"dictionary {\n\
            _column(10) _skip(3) byte b1 %3.0f\n\
            }\n";
        let source = parse_ok(text);
        let columns = source.schema().columns();
        assert_eq!(columns[0].anchor(), ColumnAnchor::Absolute(12));
    }

    /// `_skip` without an argument skips a single byte.
    #[test]
    fn bare_skip_advances_one_byte() {
        let text = b"dictionary {\n\
            _column(1) byte b1 %3.0f\n\
            _skip byte b2 %3.0f\n\
            }\n";
        let source = parse_ok(text);
        let columns = source.schema().columns();
        assert_eq!(columns[1].anchor(), ColumnAnchor::Absolute(4));
    }

    /// Columns without `_column`/`_skip` start where the previous one ended.
    #[test]
    fn variable_with_no_anchors_uses_running_cursor() {
        let text = b"dictionary {\n\
            _column(1) byte b1 %3.0f\n\
            byte b2 %3.0f\n\
            byte b3 %3.0f\n\
            }\n";
        let source = parse_ok(text);
        let columns = source.schema().columns();
        assert_eq!(columns.len(), 3);

        for (index, start) in (0usize..).zip([0, 3, 6]) {
            assert_eq!(columns[index].anchor(), ColumnAnchor::Absolute(start));
        }
    }

    /// `_newline` moves to the next line and restarts the cursor at zero.
    #[test]
    fn newline_resets_cursor_to_line_start() {
        let text = b"dictionary {\n\
            _column(1) byte b1 %3.0f\n\
            _skip(5) byte b2 %3.0f\n\
            _newline\n\
            _skip(2) int i1 %5.0f\n\
            }\n";
        let source = parse_ok(text);
        let columns = source.schema().columns();

        // (line offset, absolute anchor) for each column in order.
        for (index, (line, start)) in (0usize..).zip([(0, 0), (0, 8), (1, 2)]) {
            assert_eq!(columns[index].line_offset(), line);
            assert_eq!(columns[index].anchor(), ColumnAnchor::Absolute(start));
        }
    }

    /// Bytes after the closing brace mean the data is embedded.
    #[test]
    fn detects_embedded_data() {
        let source = parse_ok(b"dictionary {\n_column(1) v\n}\n42\n");
        assert!(matches!(source, DctSource::Embedded { .. }));
    }

    /// Nothing after the closing brace means the data is external.
    #[test]
    fn detects_external_data() {
        let source = parse_ok(b"dictionary {\n_column(1) v\n}\n");
        assert!(matches!(source, DctSource::External(_)));
    }

    /// CRLF-terminated input parses the same as LF-terminated input.
    #[test]
    fn handles_crlf_line_endings() {
        let source = parse_ok(b"dictionary {\r\n_column(1) v\r\n}\r\n");
        assert_eq!(source.schema().columns().len(), 1);
    }

    /// Running out of input before `}` is reported as an unexpected EOF.
    #[test]
    fn errors_on_missing_closing_brace() {
        let outcome = parse(b"dictionary {\n_column(1) v\n");
        assert!(matches!(outcome, Err(DctError::UnexpectedEofInDictionary)));
    }

    /// A header without the `dictionary` keyword is rejected.
    #[test]
    fn errors_on_missing_dictionary_keyword() {
        let outcome = parse(b"random {\n_column(1) v\n}\n");
        assert!(matches!(
            outcome,
            Err(DctError::InvalidDictionaryHeader { .. })
        ));
    }

    /// A non-numeric `_column` argument is rejected.
    #[test]
    fn errors_on_invalid_column_directive() {
        let outcome = parse(b"dictionary {\n_column(abc) v\n}\n");
        assert!(matches!(
            outcome,
            Err(DctError::InvalidColumnDirective { .. })
        ));
    }

    /// `_column(0)` is rejected.
    #[test]
    fn errors_on_zero_column_value() {
        let outcome = parse(b"dictionary {\n_column(0) v\n}\n");
        assert!(matches!(
            outcome,
            Err(DctError::InvalidColumnDirective { .. })
        ));
    }

    /// An unknown read-format suffix is rejected.
    #[test]
    fn errors_on_invalid_input_format() {
        let outcome = parse(b"dictionary {\n_column(1) byte v %5.2x\n}\n");
        assert!(matches!(outcome, Err(DctError::InvalidReadFormat { .. })));
    }

    /// Columns after `_newline` belong to the next line of the observation.
    #[test]
    fn parses_multi_line_observation() {
        let text = b"dictionary {\n\
            _column(1) byte b1\n\
            _newline\n\
            _column(1) int i1\n\
            _column(5) double d1\n\
            }\n";
        let source = parse_ok(text);
        let schema = source.schema();
        assert_eq!(schema.lines_per_observation(), 2);

        let columns = schema.columns();
        assert_eq!(columns.len(), 3);

        // (line offset, absolute anchor) for each column in order.
        for (index, (line, start)) in (0usize..).zip([(0, 0), (1, 0), (1, 4)]) {
            assert_eq!(columns[index].line_offset(), line);
            assert_eq!(columns[index].anchor(), ColumnAnchor::Absolute(start));
        }
    }

    /// Consecutive `_newline`s each count as a line, even with no columns.
    #[test]
    fn newline_with_no_following_columns_still_advances() {
        let text = b"dictionary {\n\
            _column(1) byte b1\n\
            _newline\n\
            _newline\n\
            _column(1) int i1\n\
            }\n";
        let source = parse_ok(text);
        let schema = source.schema();
        assert_eq!(schema.lines_per_observation(), 3);

        let columns = schema.columns();
        assert_eq!(columns[0].line_offset(), 0);
        assert_eq!(columns[1].line_offset(), 2);
    }

    /// Without `_newline`, an observation spans exactly one line.
    #[test]
    fn single_line_observation_reports_one_line() {
        let source = parse_ok(b"dictionary {\n_column(1) v\n}\n");
        let schema = source.schema();
        assert_eq!(schema.lines_per_observation(), 1);
        assert_eq!(schema.columns()[0].line_offset(), 0);
    }

    /// An unknown directive yields a warning rather than an error.
    #[test]
    fn warns_on_unrecognized_directive() {
        let source = parse_ok(b"dictionary {\n_column(1) v\nfoobar baz\n}\n");
        let flagged = source
            .schema()
            .warnings()
            .iter()
            .any(|warning| matches!(warning, DctWarning::UnrecognizedDirective { .. }));
        assert!(flagged);
    }
}