use std::borrow::Cow;
use crate::stata::missing_value::MissingValue;
use crate::stata::stata_byte::{DTA_113_MAX_INT8, StataByte};
use crate::stata::stata_double::StataDouble;
use crate::stata::stata_float::StataFloat;
use crate::stata::stata_int::{DTA_113_MAX_INT16, StataInt};
use crate::stata::stata_long::{DTA_113_MAX_INT32, StataLong};
use std::cell::RefCell;
use super::column::Column;
use super::column_anchor::ColumnAnchor;
use super::dct_error::{DctError, Result};
use super::dct_warning::DctWarning;
use super::input_format::InputFormat;
use super::lazy_record::LazyRecord;
use super::line_ending::strip_terminator;
use super::record::Record;
use super::schema::Schema;
use super::value::Value;
use super::variable_type::VariableType;
/// Interior-mutable vector of optional `usize` slots that is handed to [`LazyRecord`] by
/// shared reference.
///
/// `DctReaderState::build_lazy_record` resets it to one `None` slot per schema column before
/// each lazy record is created. Presumably each slot memoizes a column's resolved start
/// offset once it has been computed — confirm against `LazyRecord`.
pub(super) type RelativeOffsetCache = RefCell<Vec<Option<usize>>>;
/// Classification returned by `DctReaderState::finalize_line` after one physical line of an
/// observation has been read into its buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum LineOutcome {
/// At least one byte was read; `strip_terminator` has been applied to the line buffer.
Read,
/// Zero bytes were read while filling the first line (index 0) of an observation.
CleanEof,
/// Zero bytes were read while filling a later line of an observation, i.e. the input ended
/// after the observation had already been started.
PartialObservation,
}
/// Mutable state carried between observations while reading data described by a [`Schema`].
#[derive(Debug)]
pub(super) struct DctReaderState {
/// Description of the columns and of how many physical lines make up one observation.
schema: Schema,
/// One reusable text buffer per physical line of the current observation.
line_buffers: Vec<String>,
/// Counter incremented by `advance_observation`; passed to field parsers and into
/// errors/warnings as the observation identifier.
observation_number: usize,
/// Set once a line read reports zero bytes.
completed: bool,
/// Warnings gathered for the current observation; emptied by `begin_observation`.
warnings: Vec<DctWarning>,
/// When `false`, `build_record` parses fields without a warning sink.
record_warnings: bool,
/// Scratch space for `build_record`: the read cursor of each physical line.
runtime_cursors: Vec<usize>,
/// Cache shared with the `LazyRecord` returned by `build_lazy_record`.
relative_offset_cache: RelativeOffsetCache,
}
impl DctReaderState {
    /// Creates an idle reader state for `schema` with empty buffers and observation counter 0.
    ///
    /// When `record_warnings` is `false`, [`Self::build_record`] parses every field without a
    /// warning sink, so nothing is appended to the warning list.
    #[must_use]
    pub(super) fn new(schema: Schema, record_warnings: bool) -> Self {
        Self {
            schema,
            line_buffers: Vec::new(),
            observation_number: 0,
            completed: false,
            warnings: Vec::new(),
            record_warnings,
            runtime_cursors: Vec::new(),
            relative_offset_cache: RefCell::default(),
        }
    }

    /// Returns the schema this state was created with.
    #[must_use]
    pub(super) fn schema(&self) -> &Schema {
        &self.schema
    }

    /// Returns the warnings collected since the last call to [`Self::begin_observation`].
    #[must_use]
    pub(super) fn warnings(&self) -> &[DctWarning] {
        self.warnings.as_slice()
    }

    /// Returns `true` once a line read has reported zero bytes.
    #[must_use]
    pub(super) fn is_completed(&self) -> bool {
        self.completed
    }

    /// Prepares the state for a new observation and returns how many physical lines it spans.
    ///
    /// Drops the previous observation's warnings, makes sure there is exactly one buffer per
    /// line, and empties every buffer (keeping its allocation).
    pub(super) fn begin_observation(&mut self) -> usize {
        self.warnings.clear();
        let line_count = self.schema.lines_per_observation();
        self.line_buffers.resize_with(line_count, String::new);
        self.line_buffers.iter_mut().for_each(String::clear);
        line_count
    }

    /// Returns the buffer the caller should read line `line_index` into.
    ///
    /// # Panics
    ///
    /// Panics if `line_index` is not a valid index into the current line buffers.
    pub(super) fn line_buffer_mut(&mut self, line_index: usize) -> &mut String {
        &mut self.line_buffers[line_index]
    }

    /// Post-processes line `line_index` after a read that returned `bytes_read` bytes.
    ///
    /// A non-empty read has its terminator stripped and yields [`LineOutcome::Read`]. A
    /// zero-byte read marks the state as completed and is reported as
    /// [`LineOutcome::CleanEof`] on the first line of an observation, or as
    /// [`LineOutcome::PartialObservation`] on any later line.
    pub(super) fn finalize_line(&mut self, line_index: usize, bytes_read: usize) -> LineOutcome {
        if bytes_read != 0 {
            strip_terminator(&mut self.line_buffers[line_index]);
            return LineOutcome::Read;
        }
        self.completed = true;
        match line_index {
            0 => LineOutcome::CleanEof,
            _ => LineOutcome::PartialObservation,
        }
    }

    /// Bumps the observation counter by one.
    pub(super) fn advance_observation(&mut self) {
        self.observation_number += 1;
    }

    /// Builds the error for input that ended in the middle of an observation.
    ///
    /// The reported observation is the counter plus one; `variables_read` is always 0.
    pub(super) fn unexpected_eof_error(&self) -> DctError {
        let observation = self.observation_number + 1;
        DctError::UnexpectedEofInData {
            observation,
            variables_read: 0,
        }
    }

    /// Parses every schema column out of the buffered lines into an eager [`Record`].
    ///
    /// Columns are visited in schema order. Each physical line keeps its own cursor, starting
    /// at 0, which is moved past every field read from that line so that cursor-relative
    /// columns can be resolved.
    ///
    /// # Errors
    ///
    /// Returns the first error produced while resolving a column start or parsing a field.
    pub(super) fn build_record(&mut self) -> Result<Record<'_>> {
        let observation = self.observation_number;
        let collect_warnings = self.record_warnings;
        let schema = &self.schema;
        let lines = &self.line_buffers;
        let warning_sink = &mut self.warnings;
        let cursors = &mut self.runtime_cursors;

        // Every line starts the observation with its cursor at the beginning.
        cursors.clear();
        cursors.resize(schema.lines_per_observation(), 0);

        let mut values = Vec::with_capacity(schema.columns().len());
        for column in schema.columns() {
            let line_index = column.line_offset();
            let line = &lines[line_index];
            let start = resolve_column_start(column, cursors[line_index], observation)?;
            // Reborrow per field so the sink can be handed out again on the next iteration.
            let sink = collect_warnings.then_some(&mut *warning_sink);
            values.push(parse_field(line, start, column, observation, sink)?);
            cursors[line_index] = simulate_read_advance(line, start, column.input_format());
        }
        Ok(Record::new(values))
    }

    /// Creates a [`LazyRecord`] view over the buffered lines.
    ///
    /// The shared offset cache is first reset to one `None` slot per schema column; the
    /// mutable borrow of the cache is released before the record is constructed.
    pub(super) fn build_lazy_record(&self) -> LazyRecord<'_> {
        let slot_count = self.schema.columns().len();
        {
            let mut slots = self.relative_offset_cache.borrow_mut();
            slots.clear();
            slots.resize(slot_count, None);
        }
        LazyRecord::new(
            &self.line_buffers,
            self.schema.columns(),
            self.observation_number,
            &self.relative_offset_cache,
        )
    }
}
/// Computes the byte offset at which column `column_index` starts within `line`.
///
/// The search starts from the closest preceding column on the same physical line that has an
/// absolute anchor (or from the first column with cursor 0 when there is none), replays the
/// cursor movement of every same-line column in between, and finally resolves the target
/// column against the resulting cursor.
///
/// # Errors
///
/// Propagates the offset-overflow error raised while resolving any column start.
///
/// # Panics
///
/// Panics if `column_index` is out of bounds for `columns`.
pub(super) fn resolve_runtime_offset(
    line: &str,
    columns: &[Column],
    column_index: usize,
    observation: usize,
) -> Result<usize> {
    let target = &columns[column_index];
    let target_line = target.line_offset();
    let (walk_start, anchor_cursor) =
        find_nearest_absolute_anchor(&columns, column_index, target_line);
    let cursor = simulate_reading(
        line,
        &columns[walk_start..column_index],
        observation,
        target_line,
        anchor_cursor,
    )?;
    resolve_column_start(target, cursor, observation)
}
/// Searches backwards from `column_index` (exclusive) for the closest column on
/// `target_line` that is anchored at an absolute offset.
///
/// Returns `(index, offset)` of that column, or `(0, 0)` when no such column exists, in which
/// case the caller replays from the first column with the cursor at the start of the line.
fn find_nearest_absolute_anchor(
    columns: &&[Column],
    column_index: usize,
    target_line: usize,
) -> (usize, usize) {
    columns[..column_index]
        .iter()
        .enumerate()
        .rev()
        .filter(|(_, candidate)| candidate.line_offset() == target_line)
        .find_map(|(index, candidate)| {
            if let ColumnAnchor::Absolute(offset) = candidate.anchor() {
                Some((index, offset))
            } else {
                None
            }
        })
        .unwrap_or((0, 0))
}
/// Replays the cursor movement caused by reading `columns` from `line`, starting at `cursor`.
///
/// Columns that live on a physical line other than `target_line` are skipped. Returns the
/// cursor position after the last replayed column.
///
/// # Errors
///
/// Propagates the offset-overflow error raised while resolving a column start.
fn simulate_reading(
    line: &str,
    columns: &[Column],
    observation: usize,
    target_line: usize,
    cursor: usize,
) -> Result<usize> {
    columns
        .iter()
        .filter(|column| column.line_offset() == target_line)
        .try_fold(cursor, |position, column| -> Result<usize> {
            let start = resolve_column_start(column, position, observation)?;
            Ok(simulate_read_advance(line, start, column.input_format()))
        })
}
/// Resolves where `column` begins, given the current read `cursor` of its line.
///
/// An absolute anchor ignores the cursor; a cursor-relative anchor starts `skip` bytes after
/// it.
///
/// # Errors
///
/// Returns the record-offset-overflow error when `cursor + skip` does not fit in `usize`.
fn resolve_column_start(column: &Column, cursor: usize, observation: usize) -> Result<usize> {
    let skip = match column.anchor() {
        ColumnAnchor::Absolute(offset) => return Ok(offset),
        ColumnAnchor::RelativeToCursor { skip } => skip,
    };
    match cursor.checked_add(skip) {
        Some(start) => Ok(start),
        None => Err(record_offset_overflow(column, observation)),
    }
}
/// Returns the byte position of the read cursor after a field of `input_format` that begins
/// at `start` has been consumed from `line`.
///
/// * Fixed-width formats advance by `width` bytes, capped at the end of the line.
/// * Free numeric fields skip leading ASCII whitespace and stop at the next ASCII whitespace
///   (or the end of the line).
/// * Free string fields behave the same, except that a field opening with `"` extends to just
///   past the closing `"`, or to the end of the line when the quote is never closed.
///
/// `start` is clamped to the line length. For the free formats, a start that falls inside a
/// multi-byte UTF-8 character is moved back to the beginning of that character; slicing the
/// line at such a position would otherwise panic on non-ASCII data.
fn simulate_read_advance(line: &str, start: usize, input_format: InputFormat) -> usize {
    // Moves `index` (which must not exceed `line.len()`) back to the closest character
    // boundary. Position 0 is always a boundary, so the loop terminates.
    fn align_to_char_boundary(line: &str, mut index: usize) -> usize {
        while !line.is_char_boundary(index) {
            index -= 1;
        }
        index
    }

    let line_len = line.len();
    let from = start.min(line_len);
    match input_format {
        InputFormat::FixedNumeric { width, .. } | InputFormat::FixedString { width } => {
            // Pure byte arithmetic: nothing is sliced, so no alignment is required here.
            from.saturating_add(width).min(line_len)
        }
        InputFormat::FreeNumeric => {
            let from = align_to_char_boundary(line, from);
            let after = line[from..].trim_ascii_start();
            // Number of whitespace bytes skipped in front of the token.
            let leading = (line_len - from) - after.len();
            let token_end = after
                .find(|c: char| c.is_ascii_whitespace())
                .unwrap_or(after.len());
            from + leading + token_end
        }
        InputFormat::FreeString => {
            let from = align_to_char_boundary(line, from);
            let after = line[from..].trim_ascii_start();
            let leading = (line_len - from) - after.len();
            if let Some(body) = after.strip_prefix('"') {
                let close = body.find('"').unwrap_or(body.len());
                // Step over the closing quote only when there actually is one.
                let closing = usize::from(close < body.len());
                from + leading + 1 + close + closing
            } else {
                let token_end = after
                    .find(|c: char| c.is_ascii_whitespace())
                    .unwrap_or(after.len());
                from + leading + token_end
            }
        }
    }
}
/// Parses the field of `column` that starts at byte `runtime_offset` of `line`.
///
/// Dispatches on the column's input format to the matching fixed/free, numeric/string
/// parser. `warnings`, when present, receives any warnings raised while parsing.
///
/// # Errors
///
/// Propagates the error of the selected parser; free-format strings cannot fail.
pub(super) fn parse_field<'a>(
    line: &'a str,
    runtime_offset: usize,
    column: &Column,
    observation: usize,
    warnings: Option<&mut Vec<DctWarning>>,
) -> Result<Value<'a>> {
    let value = match column.input_format() {
        InputFormat::FreeString => parse_free_string(line, runtime_offset),
        InputFormat::FreeNumeric => {
            parse_free_numeric(line, runtime_offset, column, observation, warnings)?
        }
        InputFormat::FixedString { width } => {
            parse_fixed_string(line, runtime_offset, column, width, observation, warnings)?
        }
        InputFormat::FixedNumeric {
            width, decimals, ..
        } => parse_fixed_numeric(
            line,
            runtime_offset,
            column,
            width,
            decimals,
            observation,
            warnings,
        )?,
    };
    Ok(value)
}
/// Parses a fixed-width numeric field of `width` bytes starting at byte `offset` of `line`.
///
/// The field is clipped to the line and trimmed of ASCII whitespace. A blank field or a lone
/// `.` yields the system missing value for the column's storage type; a blank field that was
/// cut short by the end of the line additionally records a
/// `DctWarning::BlankFieldTreatedAsMissing` when a warning sink is supplied. Any other content
/// is parsed as `f64`, divided by `10^decimals` when `decimals` is non-zero, and coerced to
/// the column's storage type.
///
/// Field edges that fall inside a multi-byte UTF-8 character are moved back to the start of
/// that character; slicing the line there would otherwise panic on non-ASCII data.
///
/// # Errors
///
/// * the record-offset-overflow error when `offset + width` overflows `usize`;
/// * the invalid-numeric error when the text is not a number or cannot be stored.
fn parse_fixed_numeric<'a>(
    line: &str,
    offset: usize,
    column: &Column,
    width: usize,
    decimals: u8,
    observation: usize,
    warnings: Option<&mut Vec<DctWarning>>,
) -> Result<Value<'a>> {
    // Moves `index` (which must not exceed `line.len()`) back to the closest character
    // boundary. Position 0 is always a boundary, so the loop terminates.
    fn align_to_char_boundary(line: &str, mut index: usize) -> usize {
        while !line.is_char_boundary(index) {
            index -= 1;
        }
        index
    }

    let end = offset
        .checked_add(width)
        .ok_or_else(|| record_offset_overflow(column, observation))?;
    let line_len = line.len();
    let truncated = end > line_len;
    // Alignment is monotone, so `field_start <= field_end` still holds afterwards.
    let field_start = align_to_char_boundary(line, offset.min(line_len));
    let field_end = align_to_char_boundary(line, end.min(line_len));
    let trimmed = line[field_start..field_end].trim_ascii();
    if trimmed.is_empty() {
        // Only a blank caused by a short line is worth a warning; a blank inside the line
        // is an ordinary missing value.
        if truncated && let Some(warnings) = warnings {
            let variable = column.name().to_string();
            let warning = DctWarning::BlankFieldTreatedAsMissing {
                variable,
                observation,
            };
            warnings.push(warning);
        }
        let value = missing_value_for(column.storage_type());
        return Ok(value);
    }
    if trimmed == "." {
        let value = missing_value_for(column.storage_type());
        return Ok(value);
    }
    let raw: f64 = trimmed
        .parse()
        .map_err(|_| invalid_numeric(column, observation, trimmed))?;
    // NOTE(review): the implied-decimal shift is applied even when the text already contains
    // an explicit decimal point — confirm this matches the intended `%w.df` semantics.
    let shifted = if decimals == 0 {
        raw
    } else {
        raw / 10f64.powi(i32::from(decimals))
    };
    coerce_numeric(shifted, column, observation, warnings)
}
/// Parses a free-format (whitespace-delimited) numeric field starting at byte `offset`.
///
/// An absent token or a lone `.` yields the system missing value for the column's storage
/// type; anything else is parsed as `f64` and coerced to that storage type.
///
/// # Errors
///
/// Returns the invalid-numeric error when the token is not a number or cannot be stored.
fn parse_free_numeric<'a>(
    line: &str,
    offset: usize,
    column: &Column,
    observation: usize,
    warnings: Option<&mut Vec<DctWarning>>,
) -> Result<Value<'a>> {
    match take_free_token(line, offset) {
        "" | "." => Ok(missing_value_for(column.storage_type())),
        token => {
            let parsed = token
                .parse::<f64>()
                .map_err(|_| invalid_numeric(column, observation, token))?;
            coerce_numeric(parsed, column, observation, warnings)
        }
    }
}
/// Extracts a fixed-width string field of `width` bytes starting at byte `offset` of `line`.
///
/// The field is clipped to the line and trailing ASCII whitespace is removed; leading
/// whitespace is kept. The result borrows from `line`. When the field was cut short by the
/// end of the line and nothing remains, a `DctWarning::BlankFieldTreatedAsMissing` is
/// recorded if a warning sink is supplied.
///
/// Field edges that fall inside a multi-byte UTF-8 character are moved back to the start of
/// that character; slicing the line there would otherwise panic on non-ASCII data.
///
/// # Errors
///
/// Returns the record-offset-overflow error when `offset + width` overflows `usize`.
fn parse_fixed_string<'a>(
    line: &'a str,
    offset: usize,
    column: &Column,
    width: usize,
    observation: usize,
    warnings: Option<&mut Vec<DctWarning>>,
) -> Result<Value<'a>> {
    // Moves `index` (which must not exceed `line.len()`) back to the closest character
    // boundary. Position 0 is always a boundary, so the loop terminates.
    fn align_to_char_boundary(line: &str, mut index: usize) -> usize {
        while !line.is_char_boundary(index) {
            index -= 1;
        }
        index
    }

    let end = offset
        .checked_add(width)
        .ok_or_else(|| record_offset_overflow(column, observation))?;
    let line_len = line.len();
    let truncated = end > line_len;
    // Alignment is monotone, so `field_start <= field_end` still holds afterwards.
    let field_start = align_to_char_boundary(line, offset.min(line_len));
    let field_end = align_to_char_boundary(line, end.min(line_len));
    let trimmed = line[field_start..field_end].trim_ascii_end();
    if truncated
        && trimmed.is_empty()
        && let Some(warnings) = warnings
    {
        let variable = column.name().to_string();
        let warning = DctWarning::BlankFieldTreatedAsMissing {
            variable,
            observation,
        };
        warnings.push(warning);
    }
    Ok(Value::String(Cow::Borrowed(trimmed)))
}
/// Extracts a free-format string field starting at byte `offset` of `line`.
///
/// Leading ASCII whitespace is skipped. A field opening with `"` runs up to (not including)
/// the next `"`, or to the end of the line when the quote is never closed; otherwise the
/// field runs up to the next ASCII whitespace. The result borrows from `line`.
///
/// `offset` is clamped to the line length, and an offset that falls inside a multi-byte
/// UTF-8 character is moved back to the start of that character; slicing the line there
/// would otherwise panic on non-ASCII data.
fn parse_free_string(line: &str, offset: usize) -> Value<'_> {
    let mut from = offset.min(line.len());
    // Position 0 is always a character boundary, so this loop terminates.
    while !line.is_char_boundary(from) {
        from -= 1;
    }
    let after = line[from..].trim_ascii_start();
    let field = match after.strip_prefix('"') {
        Some(body) => {
            let close = body.find('"').unwrap_or(body.len());
            &body[..close]
        }
        None => {
            let end = after
                .find(|c: char| c.is_ascii_whitespace())
                .unwrap_or(after.len());
            &after[..end]
        }
    };
    Value::String(Cow::Borrowed(field))
}
/// Returns the whitespace-delimited token that begins at or after byte `offset` of `line`.
///
/// Leading ASCII whitespace is skipped and the token ends at the next ASCII whitespace or at
/// the end of the line. An empty string is returned when nothing but whitespace remains.
///
/// `offset` is clamped to the line length, and an offset that falls inside a multi-byte
/// UTF-8 character is moved back to the start of that character; slicing the line there
/// would otherwise panic on non-ASCII data.
fn take_free_token(line: &str, offset: usize) -> &str {
    let mut from = offset.min(line.len());
    // Position 0 is always a character boundary, so this loop terminates.
    while !line.is_char_boundary(from) {
        from -= 1;
    }
    let after = line[from..].trim_ascii_start();
    let end = after
        .find(|c: char| c.is_ascii_whitespace())
        .unwrap_or(after.len());
    &after[..end]
}
fn missing_value_for(storage_type: VariableType) -> Value<'static> {
match storage_type {
VariableType::Byte => Value::Byte(StataByte::Missing(MissingValue::System)),
VariableType::Int => Value::Int(StataInt::Missing(MissingValue::System)),
VariableType::Long => Value::Long(StataLong::Missing(MissingValue::System)),
VariableType::Float => Value::Float(StataFloat::Missing(MissingValue::System)),
VariableType::Double => Value::Double(StataDouble::Missing(MissingValue::System)),
VariableType::String => Value::String(Cow::Borrowed("")),
}
}
/// Converts a parsed number into a [`Value`] of the column's storage type.
///
/// Integer storage types go through [`promote_integer`]; `Float` narrows the number to
/// `f32`; `Double` stores it unchanged.
///
/// # Errors
///
/// Returns the invalid-numeric error when `value` is NaN or infinite, or when the column is
/// stored as a string.
fn coerce_numeric<'a>(
    value: f64,
    column: &Column,
    observation: usize,
    warnings: Option<&mut Vec<DctWarning>>,
) -> Result<Value<'a>> {
    if value.is_nan() || value.is_infinite() {
        return Err(invalid_numeric(column, observation, &value.to_string()));
    }
    let coerced = match column.storage_type() {
        VariableType::Double => Value::Double(StataDouble::Present(value)),
        VariableType::Float => Value::Float(StataFloat::Present(f64_to_f32(value))),
        VariableType::String => {
            return Err(invalid_numeric(column, observation, &value.to_string()));
        }
        VariableType::Byte | VariableType::Int | VariableType::Long => {
            return promote_integer(value, column, observation, warnings);
        }
    };
    Ok(coerced)
}
/// Stores `value` in an integer column, widening the storage type when it does not fit.
///
/// The value is rounded to the nearest integer (ties away from zero) and tried against each
/// type of the column's promotion chain in order. When the first type that fits differs from
/// the declared one, a `DctWarning::IntegerPromotion` is recorded if a warning sink is
/// supplied.
///
/// # Errors
///
/// Returns the invalid-numeric error when no type in the chain accepts the value.
fn promote_integer<'a>(
    value: f64,
    column: &Column,
    observation: usize,
    warnings: Option<&mut Vec<DctWarning>>,
) -> Result<Value<'a>> {
    let declared = column.storage_type();
    let rounded = value.round();
    let first_fit = promotion_chain(declared).iter().find_map(|&candidate| {
        fit_integer(rounded, candidate).map(|fitted| (candidate, fitted))
    });
    let Some((stored_as, fitted)) = first_fit else {
        return Err(invalid_numeric(column, observation, &value.to_string()));
    };
    if stored_as != declared
        && let Some(warnings) = warnings
    {
        warnings.push(DctWarning::IntegerPromotion {
            variable: column.name().to_string(),
            observation,
            from: declared,
            to: stored_as,
        });
    }
    Ok(fitted)
}
/// Returns the storage types to try, in order, for a column declared as `declared`.
///
/// Integer types start at themselves and widen through `Int` and `Long` up to `Double`;
/// every other type has an empty chain.
fn promotion_chain(declared: VariableType) -> &'static [VariableType] {
    // The full widening ladder; each integer type's chain is the tail starting at itself.
    static LADDER: [VariableType; 4] = [
        VariableType::Byte,
        VariableType::Int,
        VariableType::Long,
        VariableType::Double,
    ];
    match declared {
        VariableType::Byte => &LADDER[..],
        VariableType::Int => &LADDER[1..],
        VariableType::Long => &LADDER[2..],
        VariableType::Float | VariableType::Double | VariableType::String => &[],
    }
}
/// Wraps `rounded` as a present value of type `target`, if it fits.
///
/// `Byte`, `Int` and `Long` succeed only when the number lies inside the range accepted by
/// the matching `fit_*` helper; `Double` always succeeds. `Float` and `String` are never
/// valid promotion targets and yield `None`.
fn fit_integer<'a>(rounded: f64, target: VariableType) -> Option<Value<'a>> {
    let fitted = match target {
        VariableType::Double => Value::Double(StataDouble::Present(rounded)),
        VariableType::Long => Value::Long(StataLong::Present(fit_i32(rounded)?)),
        VariableType::Int => Value::Int(StataInt::Present(fit_i16(rounded)?)),
        VariableType::Byte => Value::Byte(StataByte::Present(fit_i8(rounded)?)),
        VariableType::Float | VariableType::String => return None,
    };
    Some(fitted)
}
/// Converts `value` to `i8` when it lies in `i8::MIN..=DTA_113_MAX_INT8`; NaN never fits.
///
/// NOTE(review): the lower bound is `i8::MIN`; confirm whether a narrower minimum should
/// apply to Stata bytes.
// The cast cannot truncate a value that passed the range check.
#[allow(clippy::cast_possible_truncation)]
fn fit_i8(value: f64) -> Option<i8> {
    let accepted = f64::from(i8::MIN)..=f64::from(DTA_113_MAX_INT8);
    accepted.contains(&value).then_some(value as i8)
}
/// Converts `value` to `i16` when it lies in `i16::MIN..=DTA_113_MAX_INT16`; NaN never fits.
///
/// NOTE(review): the lower bound is `i16::MIN`; confirm whether a narrower minimum should
/// apply to Stata ints.
// The cast cannot truncate a value that passed the range check.
#[allow(clippy::cast_possible_truncation)]
fn fit_i16(value: f64) -> Option<i16> {
    let accepted = f64::from(i16::MIN)..=f64::from(DTA_113_MAX_INT16);
    accepted.contains(&value).then_some(value as i16)
}
/// Converts `value` to `i32` when it lies in `i32::MIN..=DTA_113_MAX_INT32`; NaN never fits.
///
/// NOTE(review): the lower bound is `i32::MIN`; confirm whether a narrower minimum should
/// apply to Stata longs.
// The cast cannot truncate a value that passed the range check.
#[allow(clippy::cast_possible_truncation)]
fn fit_i32(value: f64) -> Option<i32> {
    let accepted = f64::from(i32::MIN)..=f64::from(DTA_113_MAX_INT32);
    accepted.contains(&value).then_some(value as i32)
}
/// Narrows `value` to single precision with an `as` cast.
///
/// The cast rounds to the nearest representable `f32`; a finite `f64` whose magnitude exceeds
/// the `f32` range becomes an infinity.
///
/// NOTE(review): `coerce_numeric` only rejects non-finite `f64` input, so an out-of-range
/// finite value would be stored as an infinite `Float` — confirm whether that should be an
/// error instead.
// Precision loss is the purpose of this helper, hence the lint is silenced.
#[allow(clippy::cast_possible_truncation)]
fn f64_to_f32(value: f64) -> f32 {
value as f32
}
/// Builds the error reported when `content` cannot be used as a numeric value for `column`
/// in the given `observation`.
fn invalid_numeric(column: &Column, observation: usize, content: &str) -> DctError {
    let variable = column.name().to_string();
    let content = content.to_owned();
    DctError::InvalidNumericValue {
        observation,
        variable,
        content,
    }
}
/// Builds the error reported when computing a byte offset for `column` overflows `usize`
/// in the given `observation`.
fn record_offset_overflow(column: &Column, observation: usize) -> DctError {
    let variable = column.name().to_string();
    DctError::RecordOffsetOverflow {
        observation,
        variable,
    }
}