use std::iter::Peekable;
use std::ops::Range;
use super::column::Column;
use super::column_anchor::ColumnAnchor;
use super::dct_error::{DctError, Result};
use super::dct_warning::DctWarning;
use super::input_format::InputFormat;
use super::line_ending::strip_terminator;
use super::numeric_style::NumericStyle;
use super::schema::Schema;
use super::variable_type::VariableType;
/// Result of feeding one line to the dictionary parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum FeedOutcome {
/// The dictionary is not finished yet; another line should be fed.
NeedMore,
/// The closing `}` line has been consumed (or the parser was already finished).
Done,
}
/// Which part of the dictionary the parser is currently reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Stage {
/// Still collecting header tokens; lasts until a line containing a `{` token validates.
Header,
/// Inside the braces: directive and variable lines are expected.
Body,
/// A line consisting solely of `}` was seen; further lines are not parsed.
Done,
}
/// Incremental, line-at-a-time parser state for a dictionary source.
///
/// A line is placed into the buffer returned by `buffer_mut`, then
/// `feed_buffered_line` is called; once feeding is finished `into_schema`
/// turns the collected data into a `Schema`.
#[derive(Debug)]
pub(super) struct DctSourceState {
/// Holds the line currently being processed; cleared by `buffer_mut`.
buffer: String,
/// Number of lines fed so far; reported in errors and warnings.
line_number: usize,
/// Current parsing stage (header, body or done).
stage: Stage,
/// Header tokens accumulated across lines until the opening `{` is found.
header_tokens: Vec<String>,
/// Columns, directives and warnings gathered from the body.
body_data: BodyData,
/// Zero-based offset at which the next column starts when no explicit
/// `_column(...)` is given; only meaningful while `cursor_is_static` is true.
cursor_offset: usize,
/// True while the cursor position is known at parse time (start of a record
/// line, or after a fixed-width column at an absolute position); false after
/// a free-format or cursor-relative column.
cursor_is_static: bool,
/// Reusable buffer of token byte ranges, refilled by `tokenize` for each line.
scratch_ranges: Vec<Range<usize>>,
/// Data file path from the header's optional `using` clause, if any.
declared_data_path: Option<String>,
}
impl DctSourceState {
#[must_use]
pub(super) fn new() -> Self {
Self {
buffer: String::new(),
line_number: 0,
stage: Stage::Header,
header_tokens: Vec::new(),
body_data: BodyData {
columns: Vec::new(),
logical_record_length: None,
first_line_of_file: None,
lines_per_observation: 0, warnings: Vec::new(),
},
cursor_offset: 0,
cursor_is_static: true,
scratch_ranges: Vec::new(),
declared_data_path: None,
}
}
pub(super) fn buffer_mut(&mut self) -> &mut String {
self.buffer.clear();
&mut self.buffer
}
pub(super) fn feed_buffered_line(&mut self) -> Result<FeedOutcome> {
strip_terminator(&mut self.buffer);
self.line_number += 1;
match self.stage {
Stage::Header => self.feed_header_line(),
Stage::Body => self.feed_body_line(),
Stage::Done => Ok(FeedOutcome::Done),
}
}
#[must_use]
pub(super) fn into_schema(self) -> Schema {
let DctSourceState {
mut body_data,
declared_data_path,
..
} = self;
body_data.lines_per_observation += 1;
let mut warnings = body_data.warnings;
if let Some(path) = &declared_data_path {
let warning = DctWarning::DeclaredPathIgnored { path: path.clone() };
warnings.push(warning);
}
Schema::new(
body_data.columns,
body_data.logical_record_length,
body_data.first_line_of_file,
body_data.lines_per_observation,
declared_data_path,
warnings,
)
}
fn feed_header_line(&mut self) -> Result<FeedOutcome> {
let line = self.buffer.trim_ascii_start();
if line.starts_with('*') {
return Ok(FeedOutcome::NeedMore);
}
tokenize(line, &mut self.scratch_ranges);
let mut found_brace = false;
for range in &self.scratch_ranges {
let token = token(line, range);
if token == "{" {
found_brace = true;
}
self.header_tokens.push(token.to_string());
}
if !found_brace {
return Ok(FeedOutcome::NeedMore);
}
let header_line = self.line_number;
let header_tokens = &self.header_tokens;
let invalid = || DctError::InvalidDictionaryHeader {
line: header_line,
content: header_tokens.join(" "),
};
let mut iter = self.header_tokens.iter().map(String::as_str).peekable();
if matches!(iter.peek(), Some(&"infile")) {
iter.next();
}
if !matches!(iter.next(), Some("dictionary")) {
return Err(invalid());
}
let using_path = parse_using_path(&mut iter).map_err(|()| invalid())?;
if !matches!(iter.next(), Some("{")) {
return Err(invalid());
}
self.declared_data_path = using_path;
self.stage = Stage::Body;
Ok(FeedOutcome::NeedMore)
}
fn feed_body_line(&mut self) -> Result<FeedOutcome> {
let trimmed = self.buffer.trim_ascii();
if trimmed.is_empty() || trimmed.starts_with('*') {
return Ok(FeedOutcome::NeedMore);
}
if trimmed == "}" {
self.stage = Stage::Done;
return Ok(FeedOutcome::Done);
}
tokenize(trimmed, &mut self.scratch_ranges);
let Some(first_range) = self.scratch_ranges.first() else {
return Ok(FeedOutcome::NeedMore);
};
let first = token(trimmed, first_range);
let line_number = self.line_number;
if process_directive(
first,
line_number,
&mut self.body_data,
&mut self.cursor_offset,
&mut self.cursor_is_static,
)? {
return Ok(FeedOutcome::NeedMore);
}
if looks_like_variable_line(first) {
let column = parse_variable_line(
&self.scratch_ranges,
trimmed,
line_number,
self.body_data.lines_per_observation,
&mut self.cursor_offset,
&mut self.cursor_is_static,
)?;
self.body_data.columns.push(column);
return Ok(FeedOutcome::NeedMore);
}
let warning = DctWarning::UnrecognizedDirective {
line: line_number,
content: trimmed.to_string(),
};
self.body_data.warnings.push(warning);
Ok(FeedOutcome::NeedMore)
}
}
/// Everything collected from the dictionary body.
#[derive(Debug)]
struct BodyData {
/// Columns in the order their variable lines appeared.
columns: Vec<Column>,
/// Value of the `lrecl(n)` directive, if one was given.
logical_record_length: Option<usize>,
/// Value of the `firstlineoffile(n)` directive, if one was given.
first_line_of_file: Option<usize>,
/// Number of `_newline` directives seen so far; also used as the line
/// offset of each column within an observation.
lines_per_observation: usize,
/// Non-fatal problems encountered while parsing the body.
warnings: Vec<DctWarning>,
}
/// Splits `line` into tokens and stores their byte ranges in `ranges`.
///
/// `ranges` is cleared first so the same vector can be reused across lines.
/// Tokens are separated by ASCII whitespace. A token that begins with `"`
/// runs up to and including the next `"` (whitespace inside is kept); if no
/// closing quote exists the token extends to the end of the line.
fn tokenize(line: &str, ranges: &mut Vec<Range<usize>>) {
    ranges.clear();
    let bytes = line.as_bytes();
    let len = bytes.len();
    let mut pos = 0;
    loop {
        // Skip the whitespace run in front of the next token.
        pos += bytes[pos..]
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count();
        if pos == len {
            break;
        }

        let start = pos;
        let end = if bytes[start] == b'"' {
            // Quoted token: include the closing quote when there is one.
            match bytes[start + 1..].iter().position(|&byte| byte == b'"') {
                Some(offset) => start + 1 + offset + 1,
                None => len,
            }
        } else {
            start
                + bytes[start..]
                    .iter()
                    .take_while(|byte| !byte.is_ascii_whitespace())
                    .count()
        };
        ranges.push(start..end);
        pos = end;
    }
}
/// Returns the slice of `line` covered by `range`.
///
/// Panics, like any string slice, if the range is out of bounds or does not
/// fall on character boundaries.
fn token<'a>(line: &'a str, range: &Range<usize>) -> &'a str {
    &line[range.clone()]
}
/// Strips one pair of surrounding double quotes from `token`.
///
/// The token is returned unchanged unless it both starts and ends with a
/// distinct `"` character (so a lone `"` is left as is).
#[must_use]
fn unquote(token: &str) -> &str {
    token
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(token)
}
/// Parses an optional `using <path>` clause from the header tokens.
///
/// Returns `Ok(None)` without consuming anything when the next token is not
/// `using`. Otherwise consumes `using` and the following token, which is
/// returned with one pair of surrounding quotes removed. A `using` with no
/// token after it yields `Err(())`.
fn parse_using_path<'a, I: Iterator<Item = &'a str>>(
    iter: &mut Peekable<I>,
) -> std::result::Result<Option<String>, ()> {
    if iter.next_if_eq(&"using").is_none() {
        return Ok(None);
    }
    match iter.next() {
        Some(raw_path) => Ok(Some(unquote(raw_path).to_string())),
        None => Err(()),
    }
}
/// A body directive recognized by `parse_directive`.
enum Directive {
/// `lrecl(n)`.
LogicalRecordLength(usize),
/// `firstlineoffile(n)`.
FirstLineOfFile(usize),
/// `_newline`: following columns belong to the next line of the observation.
Newline,
}
/// Applies `first` to the parser state if it is a recognized directive.
///
/// Returns `Ok(true)` when the token was a directive and has been applied,
/// and `Ok(false)` when it is not a directive (nothing is modified).
/// `_newline` bumps the per-observation line count and resets the cursor to a
/// statically known offset of 0.
///
/// # Errors
///
/// Returns `DctError::DuplicateDirective` when `lrecl(...)` or
/// `firstlineoffile(...)` is given a second time.
fn process_directive(
    first: &str,
    line_number: usize,
    data: &mut BodyData,
    cursor_offset: &mut usize,
    cursor_is_static: &mut bool,
) -> Result<bool> {
    let duplicate = |directive: &str| DctError::DuplicateDirective {
        line: line_number,
        directive: directive.to_string(),
    };

    match parse_directive(first) {
        None => return Ok(false),
        Some(Directive::LogicalRecordLength(length)) => {
            if data.logical_record_length.is_some() {
                return Err(duplicate("lrecl"));
            }
            data.logical_record_length = Some(length);
        }
        Some(Directive::FirstLineOfFile(first_line)) => {
            if data.first_line_of_file.is_some() {
                return Err(duplicate("firstlineoffile"));
            }
            data.first_line_of_file = Some(first_line);
        }
        Some(Directive::Newline) => {
            data.lines_per_observation += 1;
            *cursor_offset = 0;
            *cursor_is_static = true;
        }
    }
    Ok(true)
}
/// Recognizes `_newline`, `lrecl(n)` and `firstlineoffile(n)`.
///
/// Returns `None` for any other token, and also when the parenthesized
/// argument of `lrecl`/`firstlineoffile` does not parse as a `usize`.
fn parse_directive(token: &str) -> Option<Directive> {
    if token == "_newline" {
        Some(Directive::Newline)
    } else if let Some(argument) = directive_argument(token, "lrecl") {
        argument.parse().map(Directive::LogicalRecordLength).ok()
    } else if let Some(argument) = directive_argument(token, "firstlineoffile") {
        argument.parse().map(Directive::FirstLineOfFile).ok()
    } else {
        None
    }
}
/// Reports whether `first` can start a variable line.
///
/// That is the case for a `_column(` or `_skip(` prefix, a bare `_skip`, or
/// any token accepted by `parse_storage_type`.
fn looks_like_variable_line(first: &str) -> bool {
    const ANCHOR_PREFIXES: [&str; 2] = ["_column(", "_skip("];
    if first == "_skip"
        || ANCHOR_PREFIXES
            .iter()
            .any(|prefix| first.starts_with(*prefix))
    {
        return true;
    }
    parse_storage_type(first).is_some()
}
/// Parses a skip modifier: bare `_skip` means 1, `_skip(n)` means `n`.
///
/// Returns `None` for any other token or a non-numeric argument.
fn parse_skip_modifier(token: &str) -> Option<usize> {
    match token {
        "_skip" => Some(1),
        other => directive_argument(other, "_skip").and_then(|count| count.parse().ok()),
    }
}
/// Extracts the argument of a `name(argument)` token.
///
/// The token must be exactly `name`, an opening parenthesis, the argument and
/// a closing parenthesis as its final character. The argument is returned
/// with surrounding ASCII whitespace trimmed; `None` means the shape did not
/// match.
fn directive_argument<'a>(token: &'a str, name: &str) -> Option<&'a str> {
    token
        .strip_prefix(name)
        .and_then(|after_name| after_name.strip_prefix('('))
        .and_then(|after_open| after_open.strip_suffix(')'))
        .map(str::trim_ascii)
}
/// Parses one variable line into a `Column` and advances the cursor.
///
/// The tokens (given as `ranges` into `line`) are read in this order:
/// optional `_column(...)`/`_skip...` anchor, optional storage type, variable
/// name, optional `%` input format, optional quoted label. Any token left
/// over afterwards makes the line invalid. When no input format is given, one
/// is derived from the storage type.
///
/// # Errors
///
/// * `DctError::InvalidColumnDirective` for a malformed line.
/// * `DctError::DictionaryOffsetOverflow` when an offset computation overflows.
/// * Whatever `try_parse_input_format` reports for a bad `%` format.
fn parse_variable_line(
    ranges: &[Range<usize>],
    line: &str,
    line_number: usize,
    line_offset: usize,
    cursor_offset: &mut usize,
    cursor_is_static: &mut bool,
) -> Result<Column> {
    let invalid = || DctError::InvalidColumnDirective {
        line: line_number,
        content: line.trim_ascii().to_string(),
    };
    let overflow = || DctError::DictionaryOffsetOverflow {
        line: line_number,
        content: line.trim_ascii().to_string(),
    };

    let mut tokens = ranges.iter().map(|range| token(line, range)).peekable();

    let anchor = match compute_anchor(&mut tokens, *cursor_offset, *cursor_is_static) {
        Ok(anchor) => anchor,
        Err(OffsetFault::Invalid) => return Err(invalid()),
        Err(OffsetFault::Overflow) => return Err(overflow()),
    };
    let Ok((storage_type, storage_str_width)) = try_parse_storage_type_width(&mut tokens) else {
        return Err(invalid());
    };
    let Ok(name) = try_parse_name(&mut tokens) else {
        return Err(invalid());
    };
    let explicit_format = try_parse_input_format(&mut tokens, line_number)?;
    let label = parse_label(&mut tokens);
    if tokens.peek().is_some() {
        // Trailing tokens after the label are not allowed.
        return Err(invalid());
    }

    let input_format = match explicit_format {
        Some(format) => format,
        None => select_fallback_input_format(storage_type, storage_str_width),
    };
    advance_parser_cursor(
        anchor,
        input_format,
        cursor_offset,
        cursor_is_static,
        overflow,
    )?;
    Ok(Column::new(
        line_offset,
        anchor,
        storage_type,
        name,
        input_format,
        label,
    ))
}
/// Why `compute_anchor` could not produce a column anchor.
enum OffsetFault {
/// The `_column(...)` directive was malformed or named column 0.
Invalid,
/// Adding the skip amount to the base offset overflowed `usize`.
Overflow,
}
fn compute_anchor<'a, I: Iterator<Item = &'a str>>(
iterator: &mut Peekable<I>,
cursor_offset: usize,
cursor_is_static: bool,
) -> std::result::Result<ColumnAnchor, OffsetFault> {
let mut explicit_column: Option<usize> = None;
if let Some(&token) = iterator.peek()
&& token.starts_with("_column(")
{
iterator.next();
let one_based = parse_column_directive(token).ok_or(OffsetFault::Invalid)?;
if one_based < 1 {
return Err(OffsetFault::Invalid);
}
explicit_column = Some(one_based - 1);
}
let mut skip = 0usize;
if let Some(&token) = iterator.peek()
&& let Some(s) = parse_skip_modifier(token)
{
iterator.next();
skip = s;
}
let anchor = match (explicit_column, cursor_is_static) {
(Some(base), _) => {
let resolved = base.checked_add(skip).ok_or(OffsetFault::Overflow)?;
ColumnAnchor::Absolute(resolved)
}
(None, true) => {
let resolved = cursor_offset
.checked_add(skip)
.ok_or(OffsetFault::Overflow)?;
ColumnAnchor::Absolute(resolved)
}
(None, false) => ColumnAnchor::RelativeToCursor { skip },
};
Ok(anchor)
}
/// Updates the cursor after a column has been placed.
///
/// * Absolute anchor with a fixed-width format: the cursor moves to the end
///   of the column and stays statically known.
/// * Absolute anchor with a free format: the cursor is set to the column
///   start and is no longer statically known.
/// * Relative anchor: only the "statically known" flag is cleared.
///
/// # Errors
///
/// Returns the error produced by `overflow` when start plus width overflows;
/// in that case the cursor is left untouched.
fn advance_parser_cursor(
    anchor: ColumnAnchor,
    input_format: InputFormat,
    cursor_offset: &mut usize,
    cursor_is_static: &mut bool,
    overflow: impl FnOnce() -> DctError,
) -> Result<()> {
    let ColumnAnchor::Absolute(start) = anchor else {
        *cursor_is_static = false;
        return Ok(());
    };

    match input_format {
        InputFormat::FixedNumeric { width, .. } | InputFormat::FixedString { width } => {
            *cursor_offset = start.checked_add(width).ok_or_else(overflow)?;
            *cursor_is_static = true;
        }
        InputFormat::FreeNumeric | InputFormat::FreeString => {
            *cursor_offset = start;
            *cursor_is_static = false;
        }
    }
    Ok(())
}
/// Reads an optional storage type from the front of the token stream.
///
/// When the next token is a storage type it is consumed and returned together
/// with its string width (if any). When it is something else, nothing is
/// consumed and the default `(VariableType::Float, None)` is returned.
/// `Err(())` means there are no tokens left at all.
fn try_parse_storage_type_width<'a, I: Iterator<Item = &'a str>>(
    iterator: &mut Peekable<I>,
) -> std::result::Result<(VariableType, Option<usize>), ()> {
    let candidate = iterator.peek().copied().ok_or(())?;
    Ok(match parse_storage_type(candidate) {
        Some(declared) => {
            iterator.next();
            declared
        }
        None => (VariableType::Float, None),
    })
}
/// Consumes the next token as the variable name.
///
/// The token is always consumed. It is rejected with `Err(())` when it is
/// missing or starts with `%`, `"` or `_`, i.e. when it looks like a format,
/// a label or a directive instead of a name.
fn try_parse_name<'a, I: Iterator<Item = &'a str>>(
    iterator: &mut Peekable<I>,
) -> std::result::Result<String, ()> {
    match iterator.next() {
        Some(candidate) if !candidate.starts_with(['%', '"', '_']) => Ok(candidate.to_owned()),
        _ => Err(()),
    }
}
/// Reads an optional `%` input format from the front of the token stream.
///
/// Returns `Ok(None)` without consuming anything when there is no next token
/// or it does not start with `%`. A valid format is consumed and returned.
///
/// # Errors
///
/// Returns `DctError::InvalidReadFormat` (leaving the token unconsumed) when
/// the token starts with `%` but is not a supported format.
fn try_parse_input_format<'a, I: Iterator<Item = &'a str>>(
    iterator: &mut Peekable<I>,
    line_number: usize,
) -> Result<Option<InputFormat>> {
    let candidate = match iterator.peek().copied() {
        Some(text) if text.starts_with('%') => text,
        _ => return Ok(None),
    };
    match parse_input_format(candidate) {
        Some(format) => {
            iterator.next();
            Ok(Some(format))
        }
        None => Err(DctError::InvalidReadFormat {
            line: line_number,
            format: candidate.to_string(),
        }),
    }
}
/// Reads an optional quoted label from the front of the token stream.
///
/// The next token is consumed only when it starts with `"`; it is returned
/// with one pair of surrounding quotes removed. Otherwise `None` is returned
/// and the stream is left untouched.
fn parse_label<'a, I: Iterator<Item = &'a str>>(iterator: &mut Peekable<I>) -> Option<String> {
    let quoted = iterator.next_if(|candidate| candidate.starts_with('"'))?;
    Some(unquote(quoted).to_string())
}
/// Chooses the input format for a variable line that has no `%` format.
///
/// String variables read a fixed-width string when a width is known and a
/// free-format string otherwise; every other storage type reads a free-format
/// number.
fn select_fallback_input_format(
    storage_type: VariableType,
    storage_str_width: Option<usize>,
) -> InputFormat {
    match storage_type {
        VariableType::String => storage_str_width
            .map_or(InputFormat::FreeString, |width| InputFormat::FixedString {
                width,
            }),
        _ => InputFormat::FreeNumeric,
    }
}
/// Parses the numeric argument of a `_column(n)` token.
///
/// Returns `None` when the token is not of that shape or `n` is not a `usize`.
fn parse_column_directive(token: &str) -> Option<usize> {
    let argument = directive_argument(token, "_column")?;
    argument.parse().ok()
}
/// Parses a storage-type keyword.
///
/// Recognizes `byte`, `int`, `long`, `float`, `double`, `str` (string with no
/// declared width) and `str<digits>` (string with the given width). Returns
/// `None` for anything else, so callers can treat the token as something
/// other than a type.
fn parse_storage_type(token: &str) -> Option<(VariableType, Option<usize>)> {
    let parsed = match token {
        "byte" => (VariableType::Byte, None),
        "int" => (VariableType::Int, None),
        "long" => (VariableType::Long, None),
        "float" => (VariableType::Float, None),
        "double" => (VariableType::Double, None),
        "str" => (VariableType::String, None),
        _ => {
            let digits = token.strip_prefix("str")?;
            // `usize::from_str` accepts a leading `+`, which would make
            // `str+12` an alias of `str12`; only plain digits form a width.
            if !digits.bytes().all(|byte| byte.is_ascii_digit()) {
                return None;
            }
            (VariableType::String, Some(digits.parse().ok()?))
        }
    };
    Some(parsed)
}
/// Parses a `%` input format token.
///
/// The token is `%`, an optional width specification, and a final conversion
/// character:
///
/// * `s` — string; `%s` is free-format, `%<width>s` is fixed-width.
/// * `f`, `g`, `e` — numeric in fixed, general or scientific style; with no
///   specification the format is free-format, otherwise it is
///   `<width>[.<decimals>]`.
///
/// Returns `None` for any other conversion character or an unparsable
/// width/decimals specification.
fn parse_input_format(token: &str) -> Option<InputFormat> {
    let body = token.strip_prefix('%')?;
    let mut chars = body.chars();
    let conversion = chars.next_back()?;
    // Everything between `%` and the conversion character.
    let spec = chars.as_str();

    let style = match conversion {
        's' => {
            return if spec.is_empty() {
                Some(InputFormat::FreeString)
            } else {
                spec.parse()
                    .ok()
                    .map(|width| InputFormat::FixedString { width })
            };
        }
        'f' => NumericStyle::Fixed,
        'g' => NumericStyle::General,
        'e' => NumericStyle::Scientific,
        _ => return None,
    };

    if spec.is_empty() {
        return Some(InputFormat::FreeNumeric);
    }

    let (width_text, decimals) = match spec.split_once('.') {
        Some((width_text, decimals_text)) => (width_text, decimals_text.parse::<u8>().ok()?),
        None => (spec, 0),
    };
    let width = width_text.parse().ok()?;
    Some(InputFormat::FixedNumeric {
        width,
        decimals,
        style,
    })
}