use std::{
borrow::Cow,
collections::BTreeMap,
fmt::{Debug, Display, Formatter},
io::{Cursor, ErrorKind, Read, Seek, SeekFrom},
ops::Range,
str::from_utf8,
};
use crate::{
data::{ByteStrArray, ByteString, Datum},
dictionary::CategoryLabels,
endian::FromBytes,
format::{DisplayPlain, Format, Type},
identifier::{Error as IdError, Identifier},
sys::{
ProductVersion,
raw::{
Decoder, Error, ErrorDetails, Magic, RawDatum, RawWidth, Record, RecordString,
UntypedDatum, VarTypes, Warning, WarningDetails, read_bytes, read_string, read_vec,
},
serialize_endian,
},
variable::{
Alignment, Attributes, Measure, MissingValueRange, MissingValues, MissingValuesError,
VarType, VarWidth,
},
};
use binrw::{BinRead, BinWrite, Endian, Error as BinError, binrw};
use clap::ValueEnum;
use encoding_rs::Encoding;
use itertools::Itertools;
use serde::{Serialize, Serializer, ser::SerializeTuple};
use thiserror::Error as ThisError;
/// Compression scheme used for case data in a system file.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, ValueEnum)]
pub enum Compression {
    /// Byte-code ("simple") compression.
    Simple,
    /// ZLIB compression (`.zsav` files).
    #[value(name = "zlib", help = "ZLIB space-efficient compression")]
    ZLib,
}
/// Recoverable problems found while reading the file header.
#[derive(ThisError, Debug)]
pub enum HeaderWarning {
    /// The compression bias was neither 0 nor 100.
    #[error("Compression bias is {0} instead of the usual values of 0 or 100.")]
    UnexpectedBias(f64),
}
/// Parsed system file header.
///
/// `S` is the string type: raw byte strings when first read, `String` after
/// decoding with the file's character encoding.
#[derive(Clone, Debug, Serialize)]
pub struct FileHeader<S>
where
    S: Debug + Serialize,
{
    /// Magic number identifying the file variant.
    pub magic: Magic,
    /// Product identification string.
    pub eye_catcher: S,
    /// Layout code (expected to be 2; also used for endian detection).
    pub layout_code: u32,
    /// Claimed number of data elements per case, if in a plausible range.
    pub nominal_case_size: Option<u32>,
    /// Compression scheme, if any.
    pub compression: Option<Compression>,
    /// 1-based index of the weight variable, if present (0 means none).
    pub weight_index: Option<u32>,
    /// Number of cases, if known.
    pub n_cases: Option<u32>,
    /// Compression bias.
    pub bias: f64,
    /// Creation date string.
    pub creation_date: S,
    /// Creation time string.
    pub creation_time: S,
    /// File label.
    pub file_label: S,
    /// Detected byte order of the file.
    #[serde(serialize_with = "serialize_endian")]
    pub endian: Endian,
}
/// On-disk layout of the 176-byte system file header, read verbatim with
/// `binrw` before interpretation.
#[derive(BinRead, BinWrite)]
pub struct RawHeader {
    pub magic: [u8; 4],
    pub eye_catcher: [u8; 60],
    pub layout_code: u32,
    pub nominal_case_size: u32,
    pub compression_code: u32,
    pub weight_index: u32,
    pub n_cases: u32,
    pub bias: f64,
    pub creation_date: [u8; 9],
    pub creation_time: [u8; 8],
    // 3 trailing padding bytes bring the header to 176 bytes.
    #[brw(pad_after = 3)]
    pub file_label: [u8; 64],
}
impl FileHeader<ByteString> {
    /// Reads the 176-byte system file header from `r`, reporting recoverable
    /// problems through `warn`.
    ///
    /// A short read is reported as [ErrorDetails::NotASystemFile] rather than
    /// as an I/O error, since a file shorter than the header cannot be a
    /// system file.
    pub fn read<R>(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result<Self, Error<ErrorDetails>>
    where
        R: Read + Seek,
    {
        let header_bytes = read_vec(r, 176).map_err(|e| {
            Error::new(
                None,
                if e.kind() == ErrorKind::UnexpectedEof {
                    ErrorDetails::NotASystemFile
                } else {
                    e.into()
                },
            )
        })?;
        Self::read_inner(&header_bytes, warn).map_err(|details| Error::new(Some(0..176), details))
    }
    /// Parses `header_bytes` (the full 176-byte header) into a [FileHeader].
    ///
    /// Byte order is detected by parsing the header both big- and
    /// little-endian and keeping whichever yields the expected `layout_code`
    /// of 2.
    fn read_inner(
        header_bytes: &[u8],
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Self, ErrorDetails> {
        // Encrypted system files carry this marker at offset 8.
        if &header_bytes[8..20] == b"ENCRYPTEDSAV" {
            return Err(ErrorDetails::Encrypted);
        }
        // `header_bytes` is exactly the header size, so these cannot fail.
        let be_header = RawHeader::read_be(&mut Cursor::new(&header_bytes)).unwrap();
        let le_header = RawHeader::read_le(&mut Cursor::new(&header_bytes)).unwrap();
        let magic: Magic = be_header
            .magic
            .try_into()
            .map_err(|_| ErrorDetails::NotASystemFile)?;
        let (endian, header) = if be_header.layout_code == 2 {
            (Endian::Big, &be_header)
        } else if le_header.layout_code == 2 {
            (Endian::Little, &le_header)
        } else {
            return Err(ErrorDetails::NotASystemFile);
        };
        // Treat an implausible nominal case size as "not present".
        let nominal_case_size = (1..i32::MAX.cast_unsigned() / 16)
            .contains(&header.nominal_case_size)
            .then_some(header.nominal_case_size);
        let compression = match (magic, header.compression_code) {
            (Magic::Zsav, 2) => Some(Compression::ZLib),
            (Magic::Zsav, code) => return Err(ErrorDetails::InvalidZsavCompression(code)),
            (_, 0) => None,
            (_, 1) => Some(Compression::Simple),
            (_, code) => return Err(ErrorDetails::InvalidSavCompression(code)),
        };
        // A weight index of 0 means no weight variable.
        let weight_index = (header.weight_index > 0).then_some(header.weight_index);
        // Very large values (e.g. all-ones "missing" sentinels) mean unknown.
        let n_cases = (header.n_cases <= u32::MAX / 2).then_some(header.n_cases);
        if header.bias != 100.0 && header.bias != 0.0 {
            // 84..92 is the byte range of `bias` within the header.
            warn(Warning::new(
                Some(84..92),
                HeaderWarning::UnexpectedBias(header.bias),
            ));
        }
        Ok(FileHeader {
            magic,
            layout_code: header.layout_code,
            nominal_case_size,
            compression,
            weight_index,
            n_cases,
            bias: header.bias,
            creation_date: header.creation_date.into(),
            creation_time: header.creation_time.into(),
            eye_catcher: header.eye_catcher.into(),
            file_label: header.file_label.into(),
            endian,
        })
    }
    /// Converts the header's byte strings into `String`s using `decoder`.
    pub fn decode(self, decoder: &mut Decoder) -> FileHeader<String> {
        let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
        let file_label = decoder.decode(&self.file_label).to_string();
        let creation_date = decoder.decode(&self.creation_date).to_string();
        let creation_time = decoder.decode(&self.creation_time).to_string();
        FileHeader {
            eye_catcher,
            weight_index: self.weight_index,
            n_cases: self.n_cases,
            file_label,
            magic: self.magic,
            layout_code: self.layout_code,
            nominal_case_size: self.nominal_case_size,
            compression: self.compression,
            bias: self.bias,
            creation_date,
            creation_time,
            endian: self.endian,
        }
    }
    /// Returns the header's human-readable strings for reporting.
    ///
    /// The first 5 bytes of the eye-catcher are skipped (presumably a fixed
    /// prefix before the product name -- TODO confirm against the format
    /// spec).
    pub fn get_strings(&self) -> Vec<RecordString> {
        vec![
            RecordString::new("Product", &self.eye_catcher.0[5..], false),
            RecordString::new("File Label", &self.file_label, false),
        ]
    }
}
/// A print or write format packed into 32 bits as stored on disk:
/// type in bits 16.., width in bits 8..16, decimals in bits 0..8.
#[derive(Copy, Clone, PartialEq, Eq, Hash, BinRead, BinWrite)]
pub struct RawFormat(
    pub u32,
);
/// Error for a string width too large to encode in a [RawFormat].
#[derive(Copy, Clone, Debug)]
pub struct VeryLongStringError;
impl TryFrom<Format> for RawFormat {
    type Error = VeryLongStringError;
    /// Packs `value` into the on-disk 32-bit representation.
    ///
    /// Fails with [VeryLongStringError] for string widths over 255, which
    /// cannot fit in the 8-bit width field.  For `AHEX`, the stored width is
    /// in hex digits, i.e. twice the string width (capped at 255).
    fn try_from(value: Format) -> Result<Self, Self::Error> {
        let type_ = u16::from(value.type_()) as u32;
        let w = match value.var_width() {
            VarWidth::Numeric => value.w() as u8,
            VarWidth::String(w) if w > 255 => return Err(VeryLongStringError),
            VarWidth::String(w) if value.type_() == Type::AHex => (w * 2).min(255) as u8,
            VarWidth::String(w) => w as u8,
        } as u32;
        let d = value.d() as u32;
        Ok(Self((type_ << 16) | (w << 8) | d))
    }
}
/// Helper that displays a [RawFormat] in human-readable `TYPEw.d` form.
struct RawFormatDisplayMeaning(RawFormat);
impl Display for RawFormatDisplayMeaning {
fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
let type_ = format_name(self.0.0 >> 16);
let w = (self.0.0 >> 8) & 0xff;
let d = self.0.0 & 0xff;
write!(f, "{type_}{w}.{d}")
}
}
impl Debug for RawFormat {
    /// Shows both the raw hex value and its decoded `TYPEw.d` meaning.
    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
        write!(f, "{:06x} ({})", self.0, RawFormatDisplayMeaning(*self))
    }
}
impl Serialize for RawFormat {
    /// Serializes as a 2-tuple of the raw integer and its human-readable
    /// `TYPEw.d` rendering.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let mut tuple = serializer.serialize_tuple(2)?;
        tuple.serialize_element(&self.0)?;
        tuple.serialize_element(&RawFormatDisplayMeaning(*self).to_string())?;
        tuple.end()
    }
}
/// Returns the name of raw format type code `type_` (e.g. `F`, `COMMA`,
/// `DATETIME`), or an `<unknown format N>` placeholder for unrecognized
/// codes.
fn format_name(type_: u32) -> Cow<'static, str> {
    // Code-to-name table; gaps (13, 14, 18, 19, ...) are unassigned codes.
    const NAMES: &[(u32, &str)] = &[
        (1, "A"),
        (2, "AHEX"),
        (3, "COMMA"),
        (4, "DOLLAR"),
        (5, "F"),
        (6, "IB"),
        (7, "PIBHEX"),
        (8, "P"),
        (9, "PIB"),
        (10, "PK"),
        (11, "RB"),
        (12, "RBHEX"),
        (15, "Z"),
        (16, "N"),
        (17, "E"),
        (20, "DATE"),
        (21, "TIME"),
        (22, "DATETIME"),
        (23, "ADATE"),
        (24, "JDATE"),
        (25, "DTIME"),
        (26, "WKDAY"),
        (27, "MONTH"),
        (28, "MOYR"),
        (29, "QYR"),
        (30, "WKYR"),
        (31, "PCT"),
        (32, "DOT"),
        (33, "CCA"),
        (34, "CCB"),
        (35, "CCC"),
        (36, "CCD"),
        (37, "CCE"),
        (38, "EDATE"),
        (39, "SDATE"),
        (40, "MTIME"),
        (41, "YMDHMS"),
    ];
    match NAMES.iter().find(|&&(code, _)| code == type_) {
        Some(&(_, name)) => Cow::Borrowed(name),
        None => Cow::Owned(format!("<unknown format {type_}>")),
    }
}
/// Missing values as read from a variable record, before decoding string
/// values with the file's character encoding.
#[derive(Clone, Debug, Default, Serialize)]
pub struct RawMissingValues {
    /// Individual missing values.
    pub values: Vec<Datum<ByteString>>,
    /// Missing value range (numeric variables only).
    pub range: Option<MissingValueRange>,
}
impl RawMissingValues {
    /// Constructs from individual `values` and an optional `range`.
    pub fn new(values: Vec<Datum<ByteString>>, range: Option<MissingValueRange>) -> Self {
        Self { values, range }
    }
    /// Reads the missing values that follow a variable record.
    ///
    /// `code` is the missing-value code from the variable record: 0 for
    /// none, 1..=3 for that many individual values, -2 for a range, -3 for
    /// a range plus one individual value.  Any other code is a fatal error
    /// for this record.
    fn read<R>(
        r: &mut R,
        offsets: Range<u64>,
        raw_width: RawWidth,
        code: i32,
        endian: Endian,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Self, Error<ErrorDetails>>
    where
        R: Read + Seek,
    {
        let (individual_values, has_range) = match code {
            0 => return Ok(Self::default()),
            1..=3 => (code as usize, false),
            -2 => (0, true),
            -3 => (1, true),
            _ => {
                return Err(Error::new(
                    Some(offsets),
                    ErrorDetails::BadMissingValueCode(code),
                ));
            }
        };
        Self::read_inner(
            r,
            offsets.clone(),
            raw_width,
            individual_values,
            has_range,
            endian,
            warn,
        )
        .map_err(|details| {
            Error::new(
                {
                    // Extend the reported byte range over the 8-byte values
                    // that were (or should have been) read.
                    let n = individual_values + if has_range { 2 } else { 0 };
                    Some(offsets.start..offsets.end + 8 * n as u64)
                },
                details,
            )
        })
    }
    /// Reads the raw 8-byte values (a low/high pair first, if `has_range`)
    /// and interprets them according to `raw_width`.
    ///
    /// Ranges on string variables and missing values on long string
    /// continuations are not allowed; both produce a warning and an empty
    /// result rather than an error.
    fn read_inner<R>(
        r: &mut R,
        offsets: Range<u64>,
        raw_width: RawWidth,
        individual_values: usize,
        has_range: bool,
        endian: Endian,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Self, ErrorDetails>
    where
        R: Read + Seek,
    {
        let mut values = Vec::with_capacity(individual_values);
        let range = if has_range {
            let low = read_bytes::<8, _>(r)?;
            let high = read_bytes::<8, _>(r)?;
            Some((low, high))
        } else {
            None
        };
        for _ in 0..individual_values {
            values.push(read_bytes::<8, _>(r)?);
        }
        match VarWidth::try_from(raw_width) {
            Ok(VarWidth::Numeric) => {
                // Numeric: each 8-byte value is a float in file byte order.
                let values = values
                    .into_iter()
                    .map(|v| Datum::Number(endian.parse(v)))
                    .collect();
                let range = range.map(|(low, high)| {
                    MissingValueRange::new(endian.parse(low), endian.parse(high))
                });
                return Ok(Self::new(values, range));
            }
            Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::new(
                Some(offsets),
                VariableWarning::MissingValueStringRange,
            )),
            Ok(VarWidth::String(width)) => {
                // String: keep at most the first 8 bytes of each value.
                let width = width.min(8) as usize;
                let values = values
                    .into_iter()
                    .map(|value| Datum::String(ByteString::from(&value[..width])))
                    .collect();
                return Ok(Self::new(values, None));
            }
            Err(()) => warn(Warning::new(
                Some(offsets),
                VariableWarning::MissingValueContinuation,
            )),
        }
        Ok(Self::default())
    }
    /// Decodes string missing values using `encoding` to produce the final
    /// [MissingValues].
    pub fn decode(&self, encoding: &'static Encoding) -> Result<MissingValues, MissingValuesError> {
        MissingValues::new(
            self.values
                .iter()
                .map(|datum| datum.clone().with_encoding(encoding))
                .collect(),
            self.range,
        )
    }
}
/// Recoverable problems in a variable record's missing values.
#[derive(ThisError, Debug)]
pub enum VariableWarning {
    #[error("Missing value record with range not allowed for string variable.")]
    MissingValueStringRange,
    #[error("Missing value not allowed for long string continuation")]
    MissingValueContinuation,
}
/// A parsed variable record (record type 2).
///
/// `S` is the string type: raw bytes on first read, `String` after decoding.
#[derive(Clone, Debug, Serialize)]
pub struct VariableRecord<S>
where
    S: Debug + Serialize,
{
    /// Byte range of the record within the file.
    pub offsets: Range<u64>,
    /// Raw width (numeric, string, or long string continuation).
    pub width: RawWidth,
    /// Variable name (short name, up to 8 bytes raw).
    pub name: S,
    /// Print format, still in packed form.
    pub print_format: RawFormat,
    /// Write format, still in packed form.
    pub write_format: RawFormat,
    /// Missing values, still undecoded.
    pub missing_values: RawMissingValues,
    /// Variable label, if the record carried one.
    pub label: Option<S>,
}
/// On-disk fixed-size prefix of a variable record (28 bytes, after the
/// record-type word), read verbatim with `binrw`.
#[derive(BinRead, BinWrite)]
pub struct RawVariableRecord {
    pub width: i32,
    pub has_variable_label: u32,
    pub missing_value_code: i32,
    pub print_format: RawFormat,
    pub write_format: RawFormat,
    pub name: [u8; 8],
}
impl VariableRecord<ByteString> {
    /// Reads one variable record (fixed prefix, optional label, optional
    /// missing values) from `r`, reporting recoverable problems via `warn`.
    pub fn read<R>(
        r: &mut R,
        endian: Endian,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Self, Error<ErrorDetails>>
    where
        R: Read + Seek,
    {
        let start_offset = r.stream_position()?;
        // Byte range of the 28-byte fixed part, used for error reporting.
        let offsets = start_offset..start_offset + 28;
        let raw_record =
            read_vec(r, 28).map_err(|e| Error::new(Some(offsets.clone()), e.into()))?;
        // Cannot fail: the buffer is exactly the record size.
        let raw_record =
            RawVariableRecord::read_options(&mut Cursor::new(&raw_record), endian, ()).unwrap();
        let width: RawWidth = raw_record.width.try_into().map_err(|_| {
            Error::new(
                Some(offsets.clone()),
                ErrorDetails::BadVariableWidth(raw_record.width),
            )
        })?;
        let label = match raw_record.has_variable_label {
            0 => None,
            1 => {
                // Label is a 32-bit length, the text, then padding to a
                // 4-byte boundary.
                let len: u32 = endian.parse(read_bytes(r)?);
                // The label we keep is capped at 65535 bytes.
                // NOTE(review): if `len` exceeds the cap, only `read_len`
                // bytes are consumed while padding is computed from `len`,
                // leaving the stream mispositioned -- confirm `len` cannot
                // exceed 65535 in valid files.
                let read_len = len.min(65535) as usize;
                let label = read_vec(r, read_len)?;
                let padding_bytes = len.next_multiple_of(4) - len;
                let _ = read_vec(r, padding_bytes as usize)?;
                Some(label.into())
            }
            _ => {
                return Err(Error::new(
                    Some(offsets),
                    ErrorDetails::BadVariableLabelCode(raw_record.has_variable_label),
                ));
            }
        };
        let missing_values = RawMissingValues::read(
            r,
            offsets,
            width,
            raw_record.missing_value_code,
            endian,
            warn,
        )?;
        let end_offset = r.stream_position()?;
        Ok(Self {
            offsets: start_offset..end_offset,
            width,
            name: raw_record.name.into(),
            print_format: raw_record.print_format,
            write_format: raw_record.write_format,
            missing_values,
            label,
        })
    }
    /// Decodes the name and label using `decoder`; formats and missing
    /// values are carried over unchanged.
    pub fn decode(self, decoder: &mut Decoder) -> VariableRecord<String> {
        VariableRecord {
            offsets: self.offsets.clone(),
            width: self.width,
            name: decoder.decode(&self.name).to_string(),
            print_format: self.print_format,
            write_format: self.write_format,
            missing_values: self.missing_values,
            label: self
                .label
                .as_ref()
                .map(|label| decoder.decode(label).to_string()),
        }
    }
}
/// Recoverable problems in a value label record.
#[derive(ThisError, Debug)]
pub enum ValueLabelWarning {
    #[error("At least one valid variable index is required but none were specified.")]
    NoVarIndexes,
    #[error("First variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", !var_type)]
    MixedVarTypes {
        var_type: VarType,
        wrong_types: Vec<u32>,
    },
    #[error(
        "One or more variable indexes were not in the valid range [1,{max}] or referred to string continuations: {invalid:?}"
    )]
    InvalidVarIndexes {
        max: usize,
        invalid: Vec<u32>,
    },
}
/// One value/label pair in a value label record.
#[derive(Clone, Debug, Serialize)]
pub struct ValueLabel<D, S>
where
    D: Debug + Serialize,
    S: Debug + Serialize,
{
    /// The labeled value.
    pub datum: D,
    /// Its label.
    pub label: S,
}
/// A parsed value label record (record type 3) together with the variable
/// index record (type 4) that follows it.
#[derive(Clone, Debug, Serialize)]
pub struct ValueLabelRecord<D, S>
where
    D: Debug + Serialize,
    S: Debug + Serialize,
{
    /// Byte range of the record within the file.
    pub offsets: Range<u64>,
    /// The value/label pairs.
    pub labels: Vec<ValueLabel<D, S>>,
    /// 1-based dictionary indexes of the variables the labels apply to.
    pub dict_indexes: Vec<u32>,
    /// Common type of those variables.
    pub var_type: VarType,
}
impl<D, S> ValueLabelRecord<D, S>
where
    D: Debug + Serialize,
    S: Debug + Serialize,
{
    /// Maximum number of value labels accepted in one record.
    pub const MAX_LABELS: u32 = u32::MAX / 8;
    /// Maximum number of variable indexes accepted in one record.
    pub const MAX_INDEXES: u32 = u32::MAX / 8;
}
impl ValueLabelRecord<RawDatum, ByteString> {
    /// Reads a value label record plus the variable index record that must
    /// follow it.
    ///
    /// Returns `Ok(None)` (after warning) if no usable variable indexes
    /// remain; invalid or wrongly-typed indexes are dropped with warnings.
    pub fn read<R: Read + Seek>(
        r: &mut R,
        endian: Endian,
        var_types: &VarTypes,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Option<Self>, Error<ErrorDetails>> {
        let label_offset = r.stream_position()?;
        let n: u32 = endian.parse(read_bytes(r)?);
        if n > Self::MAX_LABELS {
            return Err(Error::new(
                Some(label_offset..label_offset + 4),
                ErrorDetails::BadNumberOfValueLabels {
                    n,
                    max: Self::MAX_LABELS,
                },
            ));
        }
        let mut labels = Vec::new();
        for _ in 0..n {
            // Each label: 8-byte value, 1-byte length, then the label text
            // padded so value+length+text is a multiple of 8 bytes.
            let value = UntypedDatum(read_bytes(r)?);
            let label_len: u8 = endian.parse(read_bytes(r)?);
            let label_len = label_len as usize;
            let padded_len = (label_len + 1).next_multiple_of(8);
            let mut label = read_vec(r, padded_len - 1)?;
            label.truncate(label_len);
            labels.push((value, label.into()));
        }
        let index_offset = r.stream_position()?;
        // A type-4 (variable index) record must follow immediately.
        let rec_type: u32 = endian.parse(read_bytes(r)?);
        if rec_type != 4 {
            return Err(Error::new(
                Some(index_offset..index_offset + 4),
                ErrorDetails::ExpectedVarIndexRecord(rec_type),
            ));
        }
        let n: u32 = endian.parse(read_bytes(r)?);
        let n_offsets = index_offset + 4..index_offset + 8;
        if n > Self::MAX_INDEXES {
            return Err(Error::new(
                Some(n_offsets),
                ErrorDetails::TooManyVarIndexes {
                    n,
                    max: Self::MAX_INDEXES,
                },
            ));
        } else if n == 0 {
            warn(Warning::new(
                Some(n_offsets),
                ValueLabelWarning::NoVarIndexes,
            ));
            return Ok(None);
        }
        let index_offset = r.stream_position()?;
        let mut dict_indexes = Vec::with_capacity(n as usize);
        let mut invalid_indexes = Vec::new();
        for _ in 0..n {
            let index: u32 = endian.parse(read_bytes(r)?);
            if var_types.is_valid_index(index as usize) {
                dict_indexes.push(index);
            } else {
                invalid_indexes.push(index);
            }
        }
        let index_offsets = index_offset..r.stream_position()?;
        if !invalid_indexes.is_empty() {
            warn(Warning::new(
                Some(index_offsets.clone()),
                ValueLabelWarning::InvalidVarIndexes {
                    max: var_types.n_values(),
                    invalid: invalid_indexes,
                },
            ));
        }
        let Some(&first_index) = dict_indexes.first() else {
            // All indexes were invalid.
            return Ok(None);
        };
        // Safe: `first_index` passed `is_valid_index` above.
        let var_type = var_types.var_type_at(first_index as usize).unwrap();
        // Every index must refer to the same variable type as the first;
        // drop the rest with a warning.
        let mut wrong_type_indexes = Vec::new();
        dict_indexes.retain(|&index| {
            if var_types.var_type_at(index as usize) != Some(var_type) {
                wrong_type_indexes.push(index);
                false
            } else {
                true
            }
        });
        if !wrong_type_indexes.is_empty() {
            warn(Warning::new(
                Some(index_offsets),
                ValueLabelWarning::MixedVarTypes {
                    var_type,
                    wrong_types: wrong_type_indexes,
                },
            ));
        }
        // Now that the variable type is known, interpret the raw values.
        let labels = labels
            .into_iter()
            .map(|(value, label)| ValueLabel {
                datum: RawDatum::from_raw(&value, var_type, endian),
                label,
            })
            .collect();
        let end_offset = r.stream_position()?;
        Ok(Some(ValueLabelRecord {
            offsets: label_offset..end_offset,
            labels,
            dict_indexes,
            var_type,
        }))
    }
    /// Decodes the label text using `decoder`; values and indexes are
    /// carried over unchanged.
    pub fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord<RawDatum, String> {
        let labels = self
            .labels
            .iter()
            .map(
                |ValueLabel {
                     datum: value,
                     label,
                 }| ValueLabel {
                    datum: *value,
                    label: decoder.decode(label).to_string(),
                },
            )
            .collect();
        ValueLabelRecord {
            offsets: self.offsets.clone(),
            labels,
            dict_indexes: self.dict_indexes.clone(),
            var_type: self.var_type,
        }
    }
}
/// A document record (record type 6): free-text document lines.
#[derive(Clone, Debug, Serialize)]
pub struct DocumentRecord<S>
where
    S: Debug + Serialize,
{
    /// Byte range of the record within the file.
    pub offsets: Range<u64>,
    /// The document lines.
    pub lines: Vec<S>,
}
/// A raw (undecoded) document line, fixed at [DOC_LINE_LEN] bytes.
pub type RawDocumentLine = ByteStrArray<DOC_LINE_LEN>;
/// Length in bytes of each document line on disk.
pub const DOC_LINE_LEN: usize = 80;
impl DocumentRecord<RawDocumentLine> {
    /// Maximum number of document lines, keeping the total size within
    /// `i32::MAX` bytes.
    pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
    /// Reads a document record: a 32-bit line count followed by that many
    /// fixed-width lines.
    pub fn read<R>(r: &mut R, endian: Endian) -> Result<Self, Error<ErrorDetails>>
    where
        R: Read + Seek,
    {
        let start_offset = r.stream_position()?;
        let n: u32 = endian.parse(read_bytes(r)?);
        let n = n as usize;
        if n > Self::MAX_LINES {
            Err(Error::new(
                Some(start_offset..start_offset + 4),
                ErrorDetails::BadDocumentLength {
                    n,
                    max: Self::MAX_LINES,
                },
            ))
        } else {
            let offsets = start_offset..start_offset.saturating_add((n * DOC_LINE_LEN) as u64);
            let mut lines = Vec::with_capacity(n);
            for _ in 0..n {
                lines.push(ByteStrArray(
                    read_bytes(r).map_err(|e| Error::new(Some(offsets.clone()), e.into()))?,
                ));
            }
            Ok(DocumentRecord { offsets, lines })
        }
    }
    /// Decodes each line using `decoder`.
    pub fn decode(self, decoder: &mut Decoder) -> DocumentRecord<String> {
        DocumentRecord {
            offsets: self.offsets.clone(),
            lines: self
                .lines
                .iter()
                .map(|s| decoder.decode_slice(&s.0).to_string())
                .collect(),
        }
    }
}
/// Expected shape of an extension (type 7) record, used for size checking:
/// element `size` in bytes, element `count`, and a `name` for messages.
/// `None` means "any value accepted".
pub struct ExtensionRecord<'a> {
    pub size: Option<u32>,
    pub count: Option<u32>,
    pub name: &'a str,
}
/// Machine integer info record (extension subtype 3) with its file offsets.
#[derive(Clone, Debug, Serialize)]
pub struct IntegerInfoRecord {
    /// Byte range of the record within the file.
    pub offsets: Range<u64>,
    #[serde(flatten)]
    pub inner: RawIntegerInfoRecord,
}
/// On-disk payload of the machine integer info record: eight 32-bit fields
/// (the version occupies the first three).
#[derive(Clone, Debug, BinRead, BinWrite, Serialize)]
pub struct RawIntegerInfoRecord {
    pub version: ProductVersion,
    pub machine_code: i32,
    pub floating_point_rep: i32,
    pub compression_code: i32,
    pub endianness: i32,
    pub character_code: i32,
}
impl IntegerInfoRecord {
    /// Parses the machine integer info extension record (8 elements of 4
    /// bytes each).
    pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
        ext.check_size(Some(4), Some(8), "integer record")?;
        // Cannot fail: `check_size` guaranteed the payload length.
        let inner =
            RawIntegerInfoRecord::read_options(&mut Cursor::new(ext.data.as_slice()), endian, ())
                .unwrap();
        Ok(Record::IntegerInfo(IntegerInfoRecord {
            offsets: ext.offsets.clone(),
            inner,
        }))
    }
}
impl FloatInfoRecord {
    /// Parses the machine floating-point info extension record (3 elements
    /// of 8 bytes each).
    pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
        ext.check_size(Some(8), Some(3), "floating point record")?;
        // Cannot fail: `check_size` guaranteed the payload length.
        let data = FloatInfoRecord::read_options(&mut Cursor::new(ext.data.as_slice()), endian, ())
            .unwrap();
        Ok(Record::FloatInfo(data))
    }
}
/// Machine floating-point info record: the writer's system-missing value
/// and highest/lowest representable values.
#[derive(Clone, Debug, BinRead, BinWrite, Serialize)]
pub struct FloatInfoRecord {
    pub sysmis: f64,
    pub highest: f64,
    pub lowest: f64,
}
/// Undecoded long variable names record (a tab-separated text payload).
#[derive(Clone, Debug, Serialize)]
pub struct RawLongNamesRecord(
    pub TextRecord,
);
impl RawLongNamesRecord {
    /// Wraps the extension payload as a text record without decoding it.
    pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
        Ok(Record::LongNames(Self(TextRecord::parse(
            extension,
            "long names record",
        )?)))
    }
    /// Decodes the tab-separated `SHORT=LONG` pairs; unparsable pairs are
    /// dropped with a warning.
    pub fn decode(self, decoder: &mut Decoder) -> LongNamesRecord {
        let input = decoder.decode(&self.0.text);
        let mut names = Vec::new();
        for pair in input.split('\t').filter(|s| !s.is_empty()) {
            if let Some(long_name) =
                LongName::parse(pair, decoder).issue_warning(&self.0.offsets, &mut decoder.warn)
            {
                names.push(long_name);
            }
        }
        LongNamesRecord(names)
    }
}
/// An extension record whose payload is uninterpreted text bytes.
#[derive(Clone, Debug, Serialize)]
pub struct TextRecord {
    /// Byte range of the record within the file.
    pub offsets: Range<u64>,
    /// The raw text payload.
    pub text: ByteString,
}
impl TextRecord {
pub fn parse(extension: Extension, name: &'static str) -> Result<TextRecord, WarningDetails> {
extension.check_size(Some(1), None, name)?;
Ok(Self {
offsets: extension.offsets,
text: extension.data.into(),
})
}
}
/// Recoverable problems in a very long string record entry.
#[derive(ThisError, Debug)]
pub enum VeryLongStringWarning {
    #[error("Invalid variable name. {0}")]
    InvalidLongStringName(
        IdError,
    ),
    #[error("Missing delimiter in {0:?}.")]
    VeryLongStringMissingDelimiter(String),
    #[error("Invalid length in {0:?}.")]
    VeryLongStringInvalidLength(
        String,
    ),
}
/// One entry in a very long string record: the short name of the first
/// segment variable and the full string length.
#[derive(Clone, Debug, Serialize)]
pub struct VeryLongString {
    pub short_name: Identifier,
    pub length: u16,
}
impl VeryLongString {
    /// Parses one `NAME=LENGTH` pair from a very long string record,
    /// validating the name as an ordinary identifier under `decoder`'s
    /// encoding.
    pub fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, WarningDetails> {
        match input.split_once('=') {
            None => {
                Err(VeryLongStringWarning::VeryLongStringMissingDelimiter(input.into()).into())
            }
            Some((name, len)) => {
                let short_name = decoder
                    .new_identifier(name)
                    .and_then(Identifier::must_be_ordinary)
                    .map_err(VeryLongStringWarning::InvalidLongStringName)?;
                let length = len
                    .parse()
                    .map_err(|_| VeryLongStringWarning::VeryLongStringInvalidLength(input.into()))?;
                Ok(VeryLongString { short_name, length })
            }
        }
    }
}
/// Undecoded very long strings record (a NUL/tab-delimited text payload).
#[derive(Clone, Debug, Serialize)]
pub struct RawVeryLongStringsRecord(pub TextRecord);
/// Decoded very long strings record.
#[derive(Clone, Debug, Serialize)]
pub struct VeryLongStringsRecord(
    pub Vec<VeryLongString>,
);
impl RawVeryLongStringsRecord {
    /// Wraps the extension payload as a text record without decoding it.
    pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
        Ok(Record::VeryLongStrings(Self(TextRecord::parse(
            extension,
            "very long strings record",
        )?)))
    }
    /// Decodes the NUL-separated, tab-prefixed `NAME=LENGTH` entries;
    /// unparsable entries are dropped with a warning.
    pub fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord {
        let input = decoder.decode(&self.0.text);
        let mut very_long_strings = Vec::new();
        for tuple in input
            .split('\0')
            .map(|s| s.trim_start_matches('\t'))
            .filter(|s| !s.is_empty())
        {
            if let Some(vls) = VeryLongString::parse(decoder, tuple)
                .issue_warning(&self.0.offsets, &mut decoder.warn)
            {
                very_long_strings.push(vls)
            }
        }
        VeryLongStringsRecord(very_long_strings)
    }
}
/// Recoverable problems in a multiple response set record.
#[derive(ThisError, Debug)]
pub enum MultipleResponseWarning {
    #[error("Invalid multiple response set name. {0}")]
    InvalidMrSetName(
        IdError,
    ),
    #[error("Invalid variable name. {0}")]
    InvalidMrSetVariableName(
        IdError,
    ),
    #[error("Invalid multiple dichotomy label type.")]
    InvalidMultipleDichotomyLabelType,
    #[error("Invalid multiple response type.")]
    InvalidMultipleResponseType,
    #[error("Syntax error ({0}).")]
    MultipleResponseSyntaxError(
        &'static str,
    ),
    #[error("Syntax error parsing counted string (missing trailing space).")]
    CountedStringMissingSpace,
    #[error("Syntax error parsing counted string (invalid UTF-8).")]
    CountedStringInvalidUTF8,
    #[error("Syntax error parsing counted string (invalid length {0:?}).")]
    CountedStringInvalidLength(
        String,
    ),
    #[error("Syntax error parsing counted string (length {0:?} goes past end of input).")]
    CountedStringTooLong(
        usize,
    ),
}
/// The kind of a multiple response set.
#[derive(Clone, Debug, Serialize)]
pub enum MultipleResponseType {
    /// Multiple dichotomy set with its counted value and label source.
    MultipleDichotomy {
        value: ByteString,
        labels: CategoryLabels,
    },
    /// Multiple category set.
    MultipleCategory,
}
impl MultipleResponseType {
    /// Parses the type portion of a multiple response set definition,
    /// returning the type and the remaining input.
    ///
    /// The leading byte selects the kind: `C` for multiple category, `D`
    /// for multiple dichotomy with variable labels, `E` for multiple
    /// dichotomy with counted-value labels (followed by " 1 " or " 11 "
    /// selecting the label source, then the counted value).
    fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), WarningDetails> {
        let (mr_type, input) = match input.split_first() {
            Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
            Some((b'D', input)) => {
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy {
                        value,
                        labels: CategoryLabels::VarLabels,
                    },
                    input,
                )
            }
            Some((b'E', input)) => {
                // " 1 " vs " 11 " distinguishes whether the variable label
                // doubles as the set label.
                let (use_var_label_as_mrset_label, input) = if let Some(rest) =
                    input.strip_prefix(b" 1 ")
                {
                    (false, rest)
                } else if let Some(rest) = input.strip_prefix(b" 11 ") {
                    (true, rest)
                } else {
                    return Err(MultipleResponseWarning::InvalidMultipleDichotomyLabelType.into());
                };
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy {
                        value,
                        labels: CategoryLabels::CountedValues {
                            use_var_label_as_mrset_label,
                        },
                    },
                    input,
                )
            }
            _ => return Err(MultipleResponseWarning::InvalidMultipleResponseType.into()),
        };
        Ok((mr_type, input))
    }
}
/// One multiple response set.
///
/// `I` is the identifier type and `S` the string type: raw bytes on first
/// read, [Identifier]/`String` after decoding.
#[derive(Clone, Debug, Serialize)]
pub struct MultipleResponseSet<I, S>
where
    I: Debug + Serialize,
    S: Debug + Serialize,
{
    /// The set's name.
    pub name: I,
    /// The set's label.
    pub label: S,
    /// Kind of set (dichotomy or category).
    pub mr_type: MultipleResponseType,
    /// Short names of the member variables.
    pub short_names: Vec<I>,
}
impl MultipleResponseSet<ByteString, ByteString> {
    /// Parses one set definition (`NAME=TYPE LABEL VAR VAR ...\n`) from
    /// `input`, returning the set and the remaining input (with trailing
    /// newlines consumed).
    fn parse(input: &[u8]) -> Result<(Self, &[u8]), WarningDetails> {
        let Some(equals) = input.iter().position(|&b| b == b'=') else {
            return Err(MultipleResponseWarning::MultipleResponseSyntaxError("missing `=`").into());
        };
        let (name, input) = input.split_at(equals);
        // Safe: `input` starts at the `=` found above.
        let input = input.strip_prefix(b"=").unwrap();
        let (mr_type, input) = MultipleResponseType::parse(input)?;
        let Some(input) = input.strip_prefix(b" ") else {
            return Err(MultipleResponseWarning::MultipleResponseSyntaxError(
                "missing space after multiple response type",
            )
            .into());
        };
        let (label, mut input) = parse_counted_string(input)?;
        // Member variable names: each preceded by a space, terminated by a
        // space or the final newline.
        let mut vars = Vec::new();
        while input.first() != Some(&b'\n') {
            match input.split_first() {
                Some((b' ', rest)) => {
                    let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
                        return Err(MultipleResponseWarning::MultipleResponseSyntaxError(
                            "missing variable name delimiter",
                        )
                        .into());
                    };
                    let (var, rest) = rest.split_at(length);
                    if !var.is_empty() {
                        vars.push(var.into());
                    }
                    input = rest;
                }
                _ => {
                    return Err(MultipleResponseWarning::MultipleResponseSyntaxError(
                        "missing space preceding variable name",
                    )
                    .into());
                }
            }
        }
        // Consume the newline(s) terminating this set.
        while input.first() == Some(&b'\n') {
            input = &input[1..];
        }
        Ok((
            MultipleResponseSet {
                name: name.into(),
                label,
                mr_type,
                short_names: vars,
            },
            input,
        ))
    }
    /// Decodes names and label using `decoder`.
    ///
    /// A bad set name is a hard error for this set; bad member variable
    /// names are dropped individually with warnings.
    fn decode(
        &self,
        offsets: &Range<u64>,
        decoder: &mut Decoder,
    ) -> Result<MultipleResponseSet<Identifier, String>, WarningDetails> {
        let mut short_names = Vec::with_capacity(self.short_names.len());
        for short_name in self.short_names.iter() {
            if let Some(short_name) = decoder
                .decode_identifier(short_name)
                .map_err(MultipleResponseWarning::InvalidMrSetName)
                .issue_warning(offsets, &mut decoder.warn)
            {
                short_names.push(short_name);
            }
        }
        Ok(MultipleResponseSet {
            name: decoder
                .decode_identifier(&self.name)
                .map_err(MultipleResponseWarning::InvalidMrSetVariableName)?,
            label: decoder.decode(&self.label).to_string(),
            mr_type: self.mr_type.clone(),
            short_names,
        })
    }
}
/// A multiple response sets extension record.
#[derive(Clone, Debug, Serialize)]
pub struct MultipleResponseRecord<I, S>
where
    I: Debug + Serialize,
    S: Debug + Serialize,
{
    /// Byte range of the record within the file.
    pub offsets: Range<u64>,
    /// The sets defined by the record.
    pub sets: Vec<MultipleResponseSet<I, S>>,
}
impl MultipleResponseRecord<ByteString, ByteString> {
    /// Parses the extension payload as a sequence of newline-separated
    /// multiple response set definitions.
    pub fn parse(ext: &Extension) -> Result<Record, WarningDetails> {
        ext.check_size(Some(1), None, "multiple response set record")?;
        let mut input = &ext.data[..];
        let mut sets = Vec::new();
        loop {
            // Skip blank lines between set definitions.
            while let Some(suffix) = input.strip_prefix(b"\n") {
                input = suffix;
            }
            if input.is_empty() {
                break;
            }
            let (set, rest) = MultipleResponseSet::parse(input)?;
            sets.push(set);
            input = rest;
        }
        Ok(Record::MultipleResponse(MultipleResponseRecord {
            offsets: ext.offsets.clone(),
            sets,
        }))
    }
}
impl MultipleResponseRecord<ByteString, ByteString> {
    /// Decodes every set using `decoder`, dropping (with a warning) any set
    /// whose decoding fails.
    pub fn decode(self, decoder: &mut Decoder) -> MultipleResponseRecord<Identifier, String> {
        let offsets = self.offsets.clone();
        let mut decoded_sets = Vec::new();
        for raw_set in &self.sets {
            let result = raw_set.decode(&offsets, decoder);
            if let Some(set) = result.issue_warning(&offsets, &mut decoder.warn) {
                decoded_sets.push(set);
            }
        }
        MultipleResponseRecord {
            offsets,
            sets: decoded_sets,
        }
    }
}
/// Parses a counted string -- a decimal length, a space, then that many
/// bytes -- returning the string contents and the remaining input.
fn parse_counted_string(input: &[u8]) -> Result<(ByteString, &[u8]), WarningDetails> {
    let space = input
        .iter()
        .position(|&b| b == b' ')
        .ok_or(MultipleResponseWarning::CountedStringMissingSpace)?;
    let digits = from_utf8(&input[..space])
        .map_err(|_| MultipleResponseWarning::CountedStringInvalidUTF8)?;
    let length = digits
        .parse::<usize>()
        .map_err(|_| MultipleResponseWarning::CountedStringInvalidLength(digits.into()))?;
    match input[space + 1..].split_at_checked(length) {
        Some((string, rest)) => Ok((string.into(), rest)),
        None => Err(MultipleResponseWarning::CountedStringTooLong(length).into()),
    }
}
/// Recoverable problems in a variable display parameter record.
#[derive(ThisError, Debug)]
pub enum VariableDisplayWarning {
    #[error("Record contains {count} items but should contain either {first} or {second}.")]
    InvalidVariableDisplayCount {
        count: usize,
        first: usize,
        second: usize,
    },
    #[error("Invalid variable measurement level value {0}.")]
    InvalidMeasurement(
        u32,
    ),
    #[error("Invalid variable display alignment value {0}.")]
    InvalidAlignment(
        u32,
    ),
}
impl Measure {
    /// Decodes a measurement-level code from a variable display record:
    /// 0 means unspecified, 1..=3 map to the known levels, anything else is
    /// reported as invalid.
    fn try_decode(source: u32) -> Result<Option<Measure>, WarningDetails> {
        let measure = match source {
            0 => None,
            1 => Some(Measure::Nominal),
            2 => Some(Measure::Ordinal),
            3 => Some(Measure::Scale),
            _ => return Err(VariableDisplayWarning::InvalidMeasurement(source).into()),
        };
        Ok(measure)
    }
}
impl Alignment {
    /// Decodes an alignment code from a variable display record: 0..=2 map
    /// to left/right/center, anything else is reported as invalid.
    fn try_decode(source: u32) -> Result<Option<Alignment>, WarningDetails> {
        let alignment = match source {
            0 => Alignment::Left,
            1 => Alignment::Right,
            2 => Alignment::Center,
            _ => return Err(VariableDisplayWarning::InvalidAlignment(source).into()),
        };
        Ok(Some(alignment))
    }
}
/// Display parameters for one variable.
#[derive(Clone, Debug, Serialize)]
pub struct VarDisplay {
    pub measure: Option<Measure>,
    /// Column display width, present only in the 3-items-per-variable form.
    pub width: Option<u32>,
    pub alignment: Option<Alignment>,
}
/// Variable display parameter record, one entry per variable.
#[derive(Clone, Debug, Serialize)]
pub struct VarDisplayRecord(
    pub Vec<VarDisplay>,
);
impl VarDisplayRecord {
    /// Parses the variable display extension record.
    ///
    /// The record holds either 2 items (measure, alignment) or 3 items
    /// (measure, width, alignment) per variable; the item count determines
    /// which form is present.  Invalid measure/alignment codes become
    /// warnings and `None`.
    fn parse(
        ext: &Extension,
        var_types: &VarTypes,
        endian: Endian,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Record, WarningDetails> {
        ext.check_size(Some(4), None, "variable display record")?;
        let n_vars = var_types.n_vars();
        let has_width = if ext.count as usize == 3 * n_vars {
            true
        } else if ext.count as usize == 2 * n_vars {
            false
        } else {
            return Err(VariableDisplayWarning::InvalidVariableDisplayCount {
                count: ext.count as usize,
                first: 2 * n_vars,
                second: 3 * n_vars,
            }
            .into());
        };
        let mut var_displays = Vec::new();
        let mut input = &ext.data[..];
        // NOTE(review): the unwraps assume `ext.data` holds `ext.count`
        // 4-byte items -- presumably guaranteed by the extension reader
        // together with `check_size`; confirm.
        for _ in 0..n_vars {
            let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&ext.offsets, warn)
                .flatten();
            let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
            let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&ext.offsets, warn)
                .flatten();
            var_displays.push(VarDisplay {
                measure,
                width,
                alignment,
            });
        }
        Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
    }
}
/// Recoverable problems in a long string missing values record.
#[derive(ThisError, Debug)]
pub enum LongStringMissingValuesWarning {
    #[error("Value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
    BadValueLength {
        offset: u64,
        value_len: u32,
    },
    #[error("Invalid variable name. {0}")]
    InvalidVariableName(
        IdError,
    ),
}
/// Missing values for one long string variable.
#[derive(Clone, Debug, Serialize)]
pub struct LongStringMissingValues<N>
where
    N: Debug + Serialize,
{
    /// Name of the variable the values apply to.
    pub var_name: N,
    /// The missing values, each 8 bytes.
    pub missing_values: Vec<ByteStrArray<8>>,
}
impl LongStringMissingValues<ByteString> {
    /// Decodes the variable name into an [Identifier] using `decoder`; the
    /// missing values themselves are copied unchanged.
    fn decode(
        &self,
        decoder: &mut Decoder,
    ) -> Result<LongStringMissingValues<Identifier>, IdError> {
        let var_name = decoder.decode_identifier(&self.var_name)?;
        let missing_values = self.missing_values.clone();
        Ok(LongStringMissingValues {
            var_name,
            missing_values,
        })
    }
}
/// A long string missing values extension record.
#[derive(Clone, Debug, Serialize)]
pub struct LongStringMissingValueRecord<N>
where
    N: Debug + Serialize,
{
    /// Byte range of the record within the file.
    pub offsets: Range<u64>,
    /// Per-variable missing value sets.
    pub values: Vec<LongStringMissingValues<N>>,
}
impl LongStringMissingValueRecord<ByteString> {
    /// Parses the long string missing values extension record.
    ///
    /// Each entry is: variable name (length-prefixed string), a 1-byte
    /// count of missing values, a 4-byte value length (expected to be 8),
    /// then the values.  Entries with an unexpected value length are
    /// skipped with a warning.
    pub fn parse(
        ext: &Extension,
        endian: Endian,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Record, WarningDetails> {
        ext.check_size(Some(1), None, "long string missing values record")?;
        let mut input = &ext.data[..];
        let mut missing_value_set = Vec::new();
        while !input.is_empty() {
            let var_name = read_string(&mut input, endian)?;
            let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
            let value_len: u32 = endian.parse(read_bytes(&mut input)?);
            if value_len != 8 {
                // File offset reported for the bad length field.
                // NOTE(review): the `- 8` backs up over bytes just consumed;
                // the value_len field itself is only 4 bytes, so confirm
                // this points at the intended field.
                let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
                warn(Warning::new(
                    Some(ext.offsets.clone()),
                    LongStringMissingValuesWarning::BadValueLength { offset, value_len },
                ));
                // Skip this entry's values and continue with the next one.
                read_vec(&mut input, value_len as usize * n_missing_values as usize)?;
                continue;
            }
            let mut missing_values = Vec::new();
            for i in 0..n_missing_values {
                if i > 0 {
                    // Some writers repeat a 4-byte length (8) before each
                    // value after the first; peek and skip it if present --
                    // presumably a workaround for such files (TODO confirm).
                    let mut peek = input;
                    let number: u32 = endian.parse(read_bytes(&mut peek)?);
                    if number == 8 {
                        input = peek;
                    }
                }
                let value: [u8; 8] = read_bytes(&mut input)?;
                missing_values.push(ByteStrArray(value));
            }
            missing_value_set.push(LongStringMissingValues {
                var_name,
                missing_values,
            });
        }
        Ok(Record::LongStringMissingValues(
            LongStringMissingValueRecord {
                offsets: ext.offsets.clone(),
                values: missing_value_set,
            },
        ))
    }
    /// Decodes each entry's variable name, dropping (with a warning) any
    /// entry whose name is invalid.
    pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord<Identifier> {
        let mut mvs = Vec::with_capacity(self.values.len());
        for mv in self.values.iter() {
            if let Some(mv) = mv
                .decode(decoder)
                .map_err(LongStringMissingValuesWarning::InvalidVariableName)
                .issue_warning(&self.offsets, &mut decoder.warn)
            {
                mvs.push(mv);
            }
        }
        LongStringMissingValueRecord {
            offsets: self.offsets,
            values: mvs,
        }
    }
}
/// A character encoding record (extension subtype 20), holding the name of
/// the file's character encoding.
#[derive(Clone, Debug, Serialize)]
pub struct EncodingRecord(
    pub String,
);
impl EncodingRecord {
    /// Parses a character encoding record (subtype 20), whose payload is
    /// the encoding's name as UTF-8 text.
    pub fn parse(ext: &Extension) -> Result<Record, WarningDetails> {
        ext.check_size(Some(1), None, "encoding record")?;
        let name =
            String::from_utf8(ext.data.clone()).map_err(|_| WarningDetails::BadEncodingName)?;
        Ok(Record::Encoding(EncodingRecord(name)))
    }
}
/// An extended number of cases record (extension subtype 16).
#[derive(Clone, Debug, Serialize)]
pub struct NumberOfCasesRecord {
    /// Nominally 1 (not validated here).
    pub one: u64,
    /// Number of cases, or `None` if the field held the all-ones sentinel.
    pub n_cases: Option<u64>,
}
impl NumberOfCasesRecord {
    /// Parses an extended number of cases record (subtype 16): two u64s,
    /// where a case count of `u64::MAX` means "unknown".
    pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
        ext.check_size(Some(8), Some(2), "extended number of cases record")?;
        let mut input = &ext.data[..];
        let one = endian.parse(read_bytes(&mut input)?);
        let raw_n_cases: u64 = endian.parse(read_bytes(&mut input)?);
        let n_cases = if raw_n_cases == u64::MAX {
            None
        } else {
            Some(raw_n_cases)
        };
        Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
    }
}
/// Warnings issued while parsing a variable sets record.
#[derive(ThisError, Debug)]
pub enum VariableSetWarning {
    /// A variable named in a set is not a valid ordinary identifier.
    #[error("Invalid variable name. {0}")]
    InvalidVariableSetName(
        IdError,
    ),
    /// A variable set line lacks the `=` separating name from variables.
    #[error("Missing name delimiter.")]
    VariableSetMissingEquals,
}
/// An undecoded variable sets record (extension subtype 5), kept as text
/// until the file's encoding is known.
#[derive(Clone, Debug, Serialize)]
pub struct RawVariableSetRecord(TextRecord);
impl RawVariableSetRecord {
    /// Wraps a variable sets extension record (subtype 5) as raw text.
    pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
        let text = TextRecord::parse(extension, "variable sets record")?;
        Ok(Record::VariableSets(Self(text)))
    }
    /// Decodes the text and parses each line as one variable set, issuing
    /// a warning (and skipping the line) when a line does not parse.
    pub fn decode(self, decoder: &mut Decoder) -> VariableSetRecord {
        let text = decoder.decode(&self.0.text);
        let mut sets = Vec::new();
        for line in text.lines() {
            let parsed = VariableSet::parse(line, decoder, &self.0.offsets)
                .issue_warning(&self.0.offsets, &mut decoder.warn);
            if let Some(set) = parsed {
                sets.push(set);
            }
        }
        VariableSetRecord {
            offsets: self.0.offsets,
            sets,
        }
    }
}
/// An undecoded product info record (extension subtype 10), kept as text
/// until the file's encoding is known.
#[derive(Clone, Debug, Serialize)]
pub struct RawProductInfoRecord(pub TextRecord);
impl RawProductInfoRecord {
    /// Wraps a product info extension record (subtype 10) as raw text.
    pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
        let text = TextRecord::parse(extension, "product info record")?;
        Ok(Record::ProductInfo(Self(text)))
    }
    /// Decodes the record text into a [ProductInfoRecord].
    pub fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord {
        let decoded = decoder.decode(&self.0.text);
        ProductInfoRecord(decoded.into())
    }
}
/// Warnings issued while parsing file or variable attribute records.
#[derive(ThisError, Debug)]
pub enum AttributeWarning {
    /// An attribute name is not a valid identifier.
    #[error("Invalid attribute name. {0}")]
    InvalidAttributeName(
        IdError,
    ),
    /// A variable named in a variable attributes record is invalid.
    #[error("Invalid variable name in attribute record. {0}")]
    InvalidAttributeVariableName(
        IdError,
    ),
    /// An attribute lacks the `(` that introduces its values.
    #[error("Attribute record missing left parenthesis, in {0:?}.")]
    AttributeMissingLParen(
        String,
    ),
    /// Input ended before the attribute's value (or closing parenthesis).
    #[error("Attribute for {name}[{}] lacks value.", index + 1)]
    AttributeMissingValue {
        name: Identifier,
        index: usize,
    },
    /// An attribute value was not wrapped in single quotes.
    #[error("Attribute for {name}[{}] missing quotations.", index + 1)]
    AttributeMissingQuotes {
        name: Identifier,
        index: usize,
    },
    /// A variable attribute entry lacks the `:` after the variable name.
    #[error("Variable attribute missing `:`.")]
    VariableAttributeMissingColon,
    /// The same attribute name appeared more than once for a variable.
    #[error("Duplicate attributes for variable {variable}: {}.", attributes.iter().join(", "))]
    DuplicateVariableAttributes {
        variable: Identifier,
        attributes: Vec<Identifier>,
    },
    /// The same attribute name appeared more than once for the file.
    #[error("Duplicate dataset attributes with names: {}.", attributes.iter().join(", "))]
    DuplicateFileAttributes {
        attributes: Vec<Identifier>,
    },
    /// Text remained after the last file attribute.
    #[error("File attributes record contains trailing garbage.")]
    FileAttributesTrailingGarbage,
}
/// A single named attribute with one or more string values.
#[derive(Clone, Debug)]
pub struct Attribute {
    /// The attribute's name.
    pub name: Identifier,
    /// The attribute's values, in order of appearance.
    pub values: Vec<String>,
}
impl Attribute {
    /// Parses one attribute from `input`, returning it together with the
    /// text that follows it.
    ///
    /// The expected form is `name('value'⏎…'value'⏎)`: a name, a left
    /// parenthesis, then one newline-terminated value per line,
    /// conventionally wrapped in single quotes.  A `)` immediately after a
    /// value's newline closes the attribute.
    fn parse<'a>(
        decoder: &mut Decoder,
        offsets: &Range<u64>,
        input: &'a str,
    ) -> Result<(Attribute, &'a str), WarningDetails> {
        let Some((name, mut input)) = input.split_once('(') else {
            return Err(AttributeWarning::AttributeMissingLParen(input.into()).into());
        };
        let name = decoder
            .new_identifier(name)
            .map_err(AttributeWarning::InvalidAttributeName)?;
        let mut values = Vec::new();
        loop {
            // Every value must be newline-terminated; running out of input
            // mid-attribute is an error.
            let Some((value, rest)) = input.split_once('\n') else {
                return Err(AttributeWarning::AttributeMissingValue {
                    name: name.clone(),
                    index: values.len(),
                }
                .into());
            };
            if let Some(stripped) = value
                .strip_prefix('\'')
                .and_then(|value| value.strip_suffix('\''))
            {
                values.push(stripped.into());
            } else {
                // An unquoted value is accepted verbatim, with a warning.
                decoder.warn(Warning::new(
                    Some(offsets.clone()),
                    AttributeWarning::AttributeMissingQuotes {
                        name: name.clone(),
                        index: values.len(),
                    },
                ));
                values.push(value.into());
            }
            // `)` right after the value's newline ends this attribute; the
            // text after it belongs to the caller.
            if let Some(rest) = rest.strip_prefix(')') {
                let attribute = Attribute { name, values };
                return Ok((attribute, rest));
            };
            input = rest;
        }
    }
}
impl Attributes {
    /// Parses a run of attributes from `input`, stopping at end of input or
    /// at `sentinel` (which is consumed).  Returns the attribute map, the
    /// remaining text, and the names of duplicated attributes; a later
    /// occurrence of a name overwrites the earlier one in the map.
    fn parse<'a>(
        decoder: &mut Decoder,
        offsets: &Range<u64>,
        mut input: &'a str,
        sentinel: Option<char>,
    ) -> Result<(Attributes, &'a str, Vec<Identifier>), WarningDetails> {
        let mut attributes = BTreeMap::new();
        let mut duplicates = Vec::new();
        let rest = loop {
            match input.chars().next() {
                None => break input,
                // NOTE(review): `&input[1..]` assumes the sentinel is a
                // single-byte character ('/' at the call site); a multi-byte
                // sentinel would panic here.
                c if c == sentinel => break &input[1..],
                _ => {
                    let (attribute, rest) = Attribute::parse(decoder, offsets, input)?;
                    if attributes.contains_key(&attribute.name) {
                        duplicates.push(attribute.name.clone());
                    }
                    attributes.insert(attribute.name, attribute.values);
                    input = rest;
                }
            }
        };
        Ok((Attributes(attributes), rest, duplicates))
    }
}
/// An undecoded file attributes record (extension subtype 17), kept as text
/// until the file's encoding is known.
#[derive(Clone, Debug, Serialize)]
pub struct RawFileAttributesRecord(TextRecord);
/// Decoded data-file attributes.
#[derive(Clone, Debug, Default, Serialize)]
pub struct FileAttributesRecord(pub Attributes);
impl RawFileAttributesRecord {
    /// Wraps a file attributes extension record (subtype 17) as raw text.
    pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
        let text = TextRecord::parse(extension, "file attributes record")?;
        Ok(Record::FileAttributes(Self(text)))
    }
    /// Decodes and parses the file attributes, warning about duplicate
    /// names and trailing garbage.  A parse failure yields an empty record.
    pub fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord {
        let input = decoder.decode(&self.0.text);
        let parsed = Attributes::parse(decoder, &self.0.offsets, &input, None)
            .issue_warning(&self.0.offsets, &mut decoder.warn);
        let Some((set, rest, duplicates)) = parsed else {
            return FileAttributesRecord::default();
        };
        if !duplicates.is_empty() {
            decoder.warn(Warning::new(
                Some(self.0.offsets.clone()),
                AttributeWarning::DuplicateFileAttributes {
                    attributes: duplicates,
                },
            ));
        }
        if !rest.is_empty() {
            decoder.warn(Warning::new(
                Some(self.0.offsets.clone()),
                AttributeWarning::FileAttributesTrailingGarbage,
            ));
        }
        FileAttributesRecord(set)
    }
}
/// Attributes attached to a single variable.
#[derive(Clone, Debug, Serialize)]
pub struct VarAttributes {
    /// The (long) name of the variable the attributes belong to.
    pub long_var_name: Identifier,
    /// The variable's attributes.
    pub attributes: Attributes,
}
impl VarAttributes {
    /// Parses one variable's attribute set, of the form
    /// `varname:attributes…/`, returning it plus the remaining text.
    /// Duplicate attribute names are reported via a warning.
    fn parse<'a>(
        decoder: &mut Decoder,
        offsets: &Range<u64>,
        input: &'a str,
    ) -> Result<(VarAttributes, &'a str), WarningDetails> {
        let (name_text, rest) = input
            .split_once(':')
            .ok_or(AttributeWarning::VariableAttributeMissingColon)?;
        let long_var_name = decoder
            .new_identifier(name_text)
            .and_then(Identifier::must_be_ordinary)
            .map_err(AttributeWarning::InvalidAttributeVariableName)?;
        let (attributes, rest, duplicates) = Attributes::parse(decoder, offsets, rest, Some('/'))?;
        if !duplicates.is_empty() {
            decoder.warn(Warning::new(
                Some(offsets.clone()),
                AttributeWarning::DuplicateVariableAttributes {
                    variable: long_var_name.clone(),
                    attributes: duplicates,
                },
            ));
        }
        let var_attributes = VarAttributes {
            long_var_name,
            attributes,
        };
        Ok((var_attributes, rest))
    }
}
/// An undecoded variable attributes record (extension subtype 18), kept as
/// text until the file's encoding is known.
#[derive(Clone, Debug, Serialize)]
pub struct RawVariableAttributesRecord(TextRecord);
/// Decoded per-variable attribute sets.
#[derive(Clone, Debug, Serialize)]
pub struct VariableAttributesRecord(pub Vec<VarAttributes>);
impl RawVariableAttributesRecord {
    /// Wraps a variable attributes extension record (subtype 18) as text.
    pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
        let text = TextRecord::parse(extension, "variable attributes record")?;
        Ok(Record::VariableAttributes(Self(text)))
    }
    /// Decodes the text and parses variable attribute sets one after
    /// another.  A set that fails to parse produces a warning and ends
    /// parsing; sets parsed up to that point are kept.
    pub fn decode(self, decoder: &mut Decoder) -> VariableAttributesRecord {
        let decoded = decoder.decode(&self.0.text);
        let mut rest = decoded.as_ref();
        let mut sets = Vec::new();
        loop {
            if rest.is_empty() {
                break;
            }
            match VarAttributes::parse(decoder, &self.0.offsets, rest)
                .issue_warning(&self.0.offsets, &mut decoder.warn)
            {
                Some((var_attributes, leftover)) => {
                    sets.push(var_attributes);
                    rest = leftover;
                }
                None => break,
            }
        }
        VariableAttributesRecord(sets)
    }
}
/// Warnings issued while parsing a long variable names record.
#[derive(ThisError, Debug)]
pub enum LongNameWarning {
    /// An entry lacks the `=` between short and long names.
    #[error("Missing `=` separator.")]
    LongNameMissingEquals,
    /// The short name is not a valid ordinary identifier.
    #[error("Invalid short name. {0}")]
    InvalidShortName(
        IdError,
    ),
    /// The long name is not a valid ordinary identifier.
    #[error("Invalid long name. {0}")]
    InvalidLongName(
        IdError,
    ),
}
/// A mapping from a variable's short (8-byte) name to its long name.
#[derive(Clone, Debug, Serialize)]
pub struct LongName {
    /// The variable's short name.
    pub short_name: Identifier,
    /// The variable's long name.
    pub long_name: Identifier,
}
impl LongName {
    /// Parses one `SHORT=LONG` entry, validating both names as ordinary
    /// identifiers.
    pub fn parse(input: &str, decoder: &Decoder) -> Result<Self, WarningDetails> {
        let (short, long) = input
            .split_once('=')
            .ok_or(LongNameWarning::LongNameMissingEquals)?;
        let short_name = decoder
            .new_identifier(short)
            .and_then(Identifier::must_be_ordinary)
            .map_err(LongNameWarning::InvalidShortName)?;
        let long_name = decoder
            .new_identifier(long)
            .and_then(Identifier::must_be_ordinary)
            .map_err(LongNameWarning::InvalidLongName)?;
        Ok(Self {
            short_name,
            long_name,
        })
    }
}
/// Decoded long variable names record.
#[derive(Clone, Debug, Serialize)]
pub struct LongNamesRecord(pub Vec<LongName>);
/// Decoded product info record.
#[derive(Clone, Debug, Serialize)]
pub struct ProductInfoRecord(pub String);
/// A named set of variables.
#[derive(Clone, Debug, Serialize)]
pub struct VariableSet {
    /// The set's name.
    pub name: String,
    /// The names of the variables in the set.
    pub variable_names: Vec<Identifier>,
}
impl VariableSet {
    /// Parses one `name=var var …` line into a variable set.  Invalid
    /// variable names are dropped with a warning; the set itself is kept.
    fn parse(
        input: &str,
        decoder: &mut Decoder,
        offsets: &Range<u64>,
    ) -> Result<Self, WarningDetails> {
        let Some((name, rest)) = input.split_once('=') else {
            return Err(VariableSetWarning::VariableSetMissingEquals.into());
        };
        let mut variable_names = Vec::new();
        for token in rest.split_ascii_whitespace() {
            let identifier = decoder
                .new_identifier(token)
                .and_then(Identifier::must_be_ordinary)
                .map_err(VariableSetWarning::InvalidVariableSetName)
                .issue_warning(offsets, &mut decoder.warn);
            if let Some(identifier) = identifier {
                variable_names.push(identifier);
            }
        }
        Ok(VariableSet {
            name: name.to_string(),
            variable_names,
        })
    }
}
/// Decoded variable sets record.
#[derive(Clone, Debug, Serialize)]
pub struct VariableSetRecord {
    /// Range of file offsets occupied by the record's data.
    pub offsets: Range<u64>,
    /// The variable sets.
    pub sets: Vec<VariableSet>,
}
trait IssueWarning<T> {
fn issue_warning(self, offsets: &Range<u64>, warn: &mut dyn FnMut(Warning)) -> Option<T>;
}
impl<T, W> IssueWarning<T> for Result<T, W>
where
W: Into<WarningDetails>,
{
fn issue_warning(self, offsets: &Range<u64>, warn: &mut dyn FnMut(Warning)) -> Option<T> {
match self {
Ok(result) => Some(result),
Err(error) => {
warn(Warning::new(Some(offsets.clone()), error.into()));
None
}
}
}
}
/// Warnings about malformed extension records.
#[derive(ThisError, Debug)]
pub enum ExtensionWarning {
    /// The record's data ended prematurely.
    #[error("Unexpected end of data.")]
    UnexpectedEndOfData,
    /// The record's per-element size differs from the size its subtype
    /// requires.
    #[error("{record} has bad size {size} bytes instead of the expected {expected_size}.")]
    BadRecordSize {
        record: &'static str,
        size: u32,
        expected_size: u32,
    },
    /// The record's element count differs from the count its subtype
    /// requires.
    #[error("{record} has bad count {count} instead of the expected {expected_count}.")]
    BadRecordCount {
        record: &'static str,
        count: u32,
        expected_count: u32,
    },
}
/// A raw extension record: subtype, element geometry, and payload bytes.
#[derive(Clone, Debug, Serialize)]
pub struct Extension {
    /// Range of file offsets occupied by the record's payload.
    pub offsets: Range<u64>,
    /// Record subtype, which selects the parser in [Extension::read].
    pub subtype: u32,
    /// Size of each data element in bytes.
    pub size: u32,
    /// Number of data elements.
    pub count: u32,
    /// The payload: `size * count` bytes.
    pub data: Vec<u8>,
}
impl Extension {
    /// Checks this record's declared element `size` and element `count`
    /// against the expected values, when given, reporting the first
    /// mismatch as a warning for record type `name`.
    pub fn check_size(
        &self,
        size: Option<u32>,
        count: Option<u32>,
        name: &'static str,
    ) -> Result<(), WarningDetails> {
        if let Some(expected_size) = size
            && self.size != expected_size
        {
            Err(ExtensionWarning::BadRecordSize {
                record: name,
                size: self.size,
                expected_size,
            }
            .into())
        } else if let Some(expected_count) = count
            && self.count != expected_count
        {
            Err(ExtensionWarning::BadRecordCount {
                record: name,
                count: self.count,
                expected_count,
            }
            .into())
        } else {
            Ok(())
        }
    }
    /// Reads one extension record from `r`: a subtype word, an element
    /// size, an element count, then `size * count` payload bytes, and
    /// dispatches to the subtype-specific parser.
    ///
    /// Returns `Ok(None)` when the subtype parser rejects the payload (the
    /// problem is reported through `warn` instead); unknown subtypes are
    /// preserved verbatim as [Record::OtherExtension].
    pub fn read<R: Read + Seek>(
        r: &mut R,
        endian: Endian,
        var_types: &VarTypes,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Option<Record>, Error<ErrorDetails>> {
        let subtype = endian.parse(read_bytes(r)?);
        let header_offset = r.stream_position()?;
        let size: u32 = endian.parse(read_bytes(r)?);
        let count = endian.parse(read_bytes(r)?);
        // Reject records whose total size overflows u32 before allocating.
        let Some(product) = size.checked_mul(count) else {
            return Err(Error::new(
                Some(header_offset..header_offset + 8),
                ErrorDetails::ExtensionRecordTooLarge {
                    subtype,
                    size,
                    count,
                },
            ));
        };
        let start_offset = r.stream_position()?;
        let data = read_vec(r, product as usize)?;
        let end_offset = start_offset + product as u64;
        let offsets = start_offset..end_offset;
        let extension = Extension {
            offsets: offsets.clone(),
            subtype,
            size,
            count,
            data,
        };
        // Dispatch on subtype.  Subtypes 5, 10, 13, 14, 17, and 18 are
        // text records that take ownership of `extension`; the rest borrow.
        let result = match subtype {
            3 => IntegerInfoRecord::parse(&extension, endian),
            4 => FloatInfoRecord::parse(&extension, endian),
            11 => VarDisplayRecord::parse(&extension, var_types, endian, warn),
            7 | 19 => MultipleResponseRecord::parse(&extension),
            21 => LongStringValueLabelRecord::parse(&extension, endian),
            22 => LongStringMissingValueRecord::parse(&extension, endian, warn),
            20 => EncodingRecord::parse(&extension),
            16 => NumberOfCasesRecord::parse(&extension, endian),
            5 => RawVariableSetRecord::parse(extension),
            10 => RawProductInfoRecord::parse(extension),
            13 => RawLongNamesRecord::parse(extension),
            14 => RawVeryLongStringsRecord::parse(extension),
            17 => RawFileAttributesRecord::parse(extension),
            18 => RawVariableAttributesRecord::parse(extension),
            _ => Ok(Record::OtherExtension(extension)),
        };
        match result {
            Ok(result) => Ok(Some(result)),
            Err(details) => {
                // A subtype parser failure is a warning, not a hard error.
                warn(Warning::new(Some(offsets), details));
                Ok(None)
            }
        }
    }
}
/// Warnings issued while decoding a long string value labels record.
#[derive(ThisError, Debug)]
pub enum LongStringValueLabelWarning {
    /// The variable name in a label set is invalid.
    #[error("Invalid variable name. {0}")]
    InvalidVariableName(
        IdError,
    ),
}
/// Value labels for one long string variable.
#[derive(Clone, Debug, Serialize)]
pub struct LongStringValueLabels<N, S>
where
    S: Debug + Serialize,
{
    /// The variable's name.
    pub var_name: N,
    /// The variable's width in bytes.
    pub width: u32,
    /// `(value, label)` pairs, in order of appearance.
    pub labels: Vec<(ByteString, S)>,
}
impl LongStringValueLabels<ByteString, ByteString> {
    /// Decodes the variable name (trimming trailing padding) into an
    /// [Identifier] and the labels into `String`s.
    fn decode(
        &self,
        decoder: &mut Decoder,
    ) -> Result<LongStringValueLabels<Identifier, String>, WarningDetails> {
        let name_text = decoder.decode(&self.var_name);
        let var_name = Identifier::from_encoding(name_text.trim_end(), decoder.encoding)
            .map_err(LongStringValueLabelWarning::InvalidVariableName)?;
        let labels = self
            .labels
            .iter()
            .map(|(value, label)| (value.clone(), decoder.decode(label).to_string()))
            .collect();
        Ok(LongStringValueLabels {
            var_name,
            width: self.width,
            labels,
        })
    }
}
/// A long string value labels record (extension subtype 21).
#[derive(Clone, Debug, Serialize)]
pub struct LongStringValueLabelRecord<N, S>
where
    N: Debug + Serialize,
    S: Debug + Serialize,
{
    /// Range of file offsets occupied by the record's data.
    pub offsets: Range<u64>,
    /// One label set per long string variable.
    pub labels: Vec<LongStringValueLabels<N, S>>,
}
impl LongStringValueLabelRecord<ByteString, ByteString> {
    /// Parses a long string value labels extension record (subtype 21).
    ///
    /// For each variable: a length-prefixed name, a 4-byte width, a 4-byte
    /// label count, then that many length-prefixed `(value, label)` string
    /// pairs.
    fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
        ext.check_size(Some(1), None, "long string value labels record")?;
        let mut input = &ext.data[..];
        let mut label_set = Vec::new();
        while !input.is_empty() {
            let var_name = read_string(&mut input, endian)?;
            let width: u32 = endian.parse(read_bytes(&mut input)?);
            let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
            let mut labels = Vec::new();
            for _ in 0..n_labels {
                let value = read_string(&mut input, endian)?;
                let label = read_string(&mut input, endian)?;
                labels.push((value, label));
            }
            label_set.push(LongStringValueLabels {
                var_name,
                width,
                labels,
            })
        }
        Ok(Record::LongStringValueLabels(LongStringValueLabelRecord {
            offsets: ext.offsets.clone(),
            labels: label_set,
        }))
    }
    /// Decodes each label set, dropping (with a warning) sets whose
    /// variable name is invalid.
    pub fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord<Identifier, String> {
        let mut labels = Vec::with_capacity(self.labels.len());
        for label in &self.labels {
            match label.decode(decoder) {
                Ok(set) => labels.push(set),
                Err(error) => decoder.warn(Warning::new(Some(self.offsets.clone()), error)),
            }
        }
        LongStringValueLabelRecord {
            offsets: self.offsets,
            labels,
        }
    }
}
/// ZLIB compression header, with the file offset where it was read.
#[derive(Clone, Debug, Serialize)]
pub struct ZHeader {
    /// Offset of the header in the file.
    pub offset: u64,
    #[serde(flatten)]
    pub inner: RawZHeader,
}
/// On-disk form of the ZLIB header: three u64s, 24 bytes total.
#[derive(Clone, Debug, BinRead, BinWrite, Serialize)]
pub struct RawZHeader {
    /// Offset of this header itself (must match where it was read).
    pub zheader_offset: u64,
    /// Offset of the ZLIB trailer.
    pub ztrailer_offset: u64,
    /// Length of the ZLIB trailer in bytes (a positive multiple of 24).
    pub ztrailer_len: u64,
}
impl ZHeader {
    /// Reads and validates the 24-byte ZLIB header (three little-endian
    /// u64s: this header's own offset, the trailer offset, and the trailer
    /// length) at the current position of `r`.
    ///
    /// # Errors
    ///
    /// Fails if the header cannot be read, if its self-reported offset
    /// disagrees with where it was read, if the trailer offset precedes the
    /// header, or if the trailer length is not a positive multiple of 24.
    pub fn read<R>(r: &mut R, endian: Endian) -> Result<ZHeader, Error<ErrorDetails>>
    where
        R: Read + Seek,
    {
        let offset = r.stream_position()?;
        let inner = RawZHeader::read_options(r, endian, ()).map_err(|e| Error {
            offsets: Some(offset..offset + 24),
            details: ZHeaderError::from(e).into(),
        })?;
        if inner.zheader_offset != offset {
            Err(ZHeaderError::UnexpectedZHeaderOffset {
                actual: inner.zheader_offset,
                expected: offset,
            }
            .into())
        } else if inner.ztrailer_offset < offset {
            Err(ZHeaderError::ImpossibleZTrailerOffset(inner.ztrailer_offset).into())
        } else if inner.ztrailer_len < 24 || inner.ztrailer_len % 24 != 0 {
            // The trailer is a 24-byte fixed part plus 24 bytes per block.
            Err(ZHeaderError::InvalidZTrailerLength(inner.ztrailer_len).into())
        } else {
            Ok(ZHeader { offset, inner })
        }
        // The header occupies 24 bytes; this span previously said `+ 12`,
        // which under-covered the fields being validated and disagreed with
        // the read-error span above.
        .map_err(|details| Error::new(Some(offset..offset + 24), details))
    }
}
/// Errors in a ZLIB compression header.
#[derive(ThisError, Debug)]
pub enum ZHeaderError {
    /// The header could not be read at all.
    #[error("{}", DisplayBinError(.0, "ZLIB header"))]
    BinError(#[from] BinError),
    /// The trailer offset precedes the header itself.
    #[error("Impossible ztrailer_offset {0:#x}.")]
    ImpossibleZTrailerOffset(
        u64,
    ),
    /// The header's self-reported offset disagrees with where it was read.
    #[error("zlib_offset is {actual:#x} instead of expected {expected:#x}.")]
    UnexpectedZHeaderOffset {
        actual: u64,
        expected: u64,
    },
    /// The trailer length is not a positive multiple of 24.
    #[error("Invalid ZLIB trailer length {0}.")]
    InvalidZTrailerLength(
        u64,
    ),
}
/// ZLIB trailer, with the file offset where it was read.
#[derive(Clone, Debug, Serialize)]
pub struct ZTrailer {
    /// Offset of the trailer in the file.
    pub offset: u64,
    #[serde(flatten)]
    pub inner: RawZTrailer,
}
/// On-disk form of the ZLIB trailer: a 24-byte fixed part followed by one
/// 24-byte [ZBlock] descriptor per block.
#[binrw]
#[derive(Clone, Debug, Serialize)]
pub struct RawZTrailer {
    /// The negated compression bias from the file header.
    pub int_bias: i64,
    /// Expected to be zero.
    pub zero: u64,
    /// Uncompressed byte size of every block except possibly the last
    /// (0x3ff000 in practice).
    pub block_size: u32,
    // Written as the block count; read back to size `blocks`.
    #[bw(calc(blocks.len() as u32))]
    pub n_blocks: u32,
    #[br(count = n_blocks)]
    pub blocks: Vec<ZBlock>,
}
impl RawZTrailer {
    /// Returns the trailer's on-disk size in bytes: one 24-byte fixed part
    /// plus 24 bytes for each block descriptor.
    #[allow(clippy::len_without_is_empty)]
    pub fn len(&self) -> usize {
        (self.blocks.len() + 1) * 24
    }
}
/// Non-fatal inconsistencies in ZLIB trailer block descriptors.
#[derive(ThisError, Debug)]
pub enum ZlibTrailerWarning {
    /// A non-final block's uncompressed size differs from the trailer's
    /// declared block size.
    #[error(
        "Block descriptor {index} reported block size {actual:#x}, when {expected:#x} was expected."
    )]
    ZlibTrailerBlockWrongSize {
        index: usize,
        actual: u32,
        expected: u32,
    },
    /// The final block's uncompressed size exceeds the trailer's declared
    /// block size.
    #[error(
        "Block descriptor {index} reported block size {actual:#x}, when at most {max_expected:#x} was expected."
    )]
    ZlibTrailerBlockTooBig {
        index: usize,
        actual: u32,
        max_expected: u32,
    },
}
/// One 24-byte ZLIB block descriptor from the trailer.
#[derive(Clone, Debug, BinRead, BinWrite, Serialize)]
pub struct ZBlock {
    /// Offset of this block's data in the uncompressed stream.
    pub uncompressed_ofs: u64,
    /// Offset of this block's data in the compressed file.
    pub compressed_ofs: u64,
    /// Size of the block's data once decompressed.
    pub uncompressed_size: u32,
    /// Size of the block's data as stored in the file.
    pub compressed_size: u32,
}
impl ZBlock {
    /// Returns whether the compressed size is within the worst-case bound
    /// of `uncompressed_size + uncompressed_size / 7 + 11` (zlib's maximum
    /// expansion), treating overflow of the bound as implausible.
    fn has_plausible_sizes(&self) -> bool {
        let overhead = self.uncompressed_size / 7 + 11;
        match self.uncompressed_size.checked_add(overhead) {
            Some(max) => self.compressed_size <= max,
            None => false,
        }
    }
}
/// Adapter that formats a [BinError] together with a description of what
/// was being read, with a shorter message for plain end-of-file.
struct DisplayBinError<'a>(&'a BinError, &'static str);
impl<'a> Display for DisplayBinError<'a> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let DisplayBinError(error, what) = self;
        match error.is_eof() {
            true => write!(f, "Unexpected end-of-file reading {}", what),
            false => write!(f, "Error reading {}: {}", what, error.root_cause()),
        }
    }
}
/// Errors in a ZLIB trailer.
#[derive(ThisError, Debug)]
pub enum ZTrailerError {
    /// The trailer could not be read at all.
    #[error("{}", DisplayBinError(.0, "ZLIB trailer"))]
    BinError(#[from] BinError),
    /// The trailer's integer bias disagrees with the file header's bias.
    #[error(
        "Bias {actual} is not {} as expected from file header.",
        expected.display_plain()
    )]
    WrongZlibTrailerBias {
        actual: i64,
        expected: f64,
    },
    /// The reserved zero field is nonzero.
    #[error("Expected zero field has nonzero value {0}.")]
    WrongZlibTrailerZero(
        u64,
    ),
    /// The declared block size is not the expected 0x3ff000.
    #[error("Unexpected {0:x}-byte block size (expected 0x3ff000).")]
    WrongZlibTrailerBlockSize(
        u32,
    ),
    /// The number of block descriptors disagrees with the trailer length
    /// recorded in the ZLIB header.
    #[error(
        "Block count {n_blocks} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}."
    )]
    BadZlibTrailerNBlocks {
        n_blocks: usize,
        expected_n_blocks: u64,
        ztrailer_len: u64,
    },
    /// A block's uncompressed offset is not contiguous with its
    /// predecessor's.
    #[error(
        "Block descriptor {index} reported uncompressed data offset {actual:#x}, when {expected:#x} was expected."
    )]
    ZlibTrailerBlockWrongUncmpOfs {
        index: usize,
        actual: u64,
        expected: u64,
    },
    /// A block's compressed offset is not contiguous with its
    /// predecessor's.
    #[error(
        "Block descriptor {index} reported compressed data offset {actual:#x}, when {expected:#x} was expected."
    )]
    ZlibTrailerBlockWrongCmpOfs {
        index: usize,
        actual: u64,
        expected: u64,
    },
    /// A block's compressed size exceeds the plausible bound for its
    /// uncompressed size.
    #[error(
        "Block descriptor {index} reports compressed size {compressed_size} and uncompressed size {uncompressed_size}."
    )]
    ZlibExpansion {
        index: usize,
        compressed_size: u32,
        uncompressed_size: u32,
    },
    /// The block descriptors do not account for the data up to the
    /// trailer's own position.
    #[error(
        "ZLIB trailer is at offset {actual:#x} but {expected:#x} would be expected from block descriptors."
    )]
    ZlibTrailerOffsetInconsistency {
        expected: u64,
        actual: u64,
    },
}
impl ZTrailer {
    /// Reads and validates the ZLIB trailer described by `zheader`.
    ///
    /// Seeks to the trailer, checks the fixed fields against the header's
    /// `bias` and expected block count, then walks the block descriptors
    /// verifying that compressed and uncompressed offsets are contiguous
    /// and that sizes are plausible.  Block-size oddities are reported via
    /// `warn`.  Returns `Ok(None)` if the trailer offset cannot be seeked
    /// to (e.g. a truncated file); on success the reader is restored to
    /// its original position.
    pub fn read<R>(
        reader: &mut R,
        endian: Endian,
        bias: f64,
        zheader: &RawZHeader,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Option<ZTrailer>, Error<ErrorDetails>>
    where
        R: Read + Seek,
    {
        let start_offset = reader.stream_position()?;
        if reader
            .seek(SeekFrom::Start(zheader.ztrailer_offset))
            .is_err()
        {
            return Ok(None);
        }
        let inner = RawZTrailer::read_options(reader, endian, ()).map_err(|e| Error {
            offsets: Some(zheader.ztrailer_offset..zheader.ztrailer_offset + zheader.ztrailer_len),
            details: ZTrailerError::from(e).into(),
        })?;
        // Validate the trailer's 24-byte fixed portion.
        // NOTE(review): these error spans are based on `start_offset` (the
        // pre-seek position) while the trailer was read at
        // `zheader.ztrailer_offset` — confirm which is intended.
        if inner.int_bias as f64 != -bias {
            Err(ZTrailerError::WrongZlibTrailerBias {
                actual: inner.int_bias,
                expected: -bias,
            }
            .into())
        } else if inner.zero != 0 {
            Err(ZTrailerError::WrongZlibTrailerZero(inner.zero).into())
        } else if inner.block_size != 0x3ff000 {
            Err(ZTrailerError::WrongZlibTrailerBlockSize(inner.block_size).into())
        } else if let expected_n_blocks = (zheader.ztrailer_len - 24) / 24
            && inner.blocks.len() as u64 != expected_n_blocks
        {
            Err(ZTrailerError::BadZlibTrailerNBlocks {
                n_blocks: inner.blocks.len(),
                expected_n_blocks,
                ztrailer_len: zheader.ztrailer_len,
            }
            .into())
        } else {
            Ok(())
        }
        .map_err(|details| Error::new(Some(start_offset..start_offset + 24), details))?;
        // Walk the descriptors, checking that each block begins where the
        // previous one ended in both the uncompressed and compressed
        // streams.  The first block's data follows the 24-byte ZLIB header.
        let mut expected_uncmp_ofs = zheader.zheader_offset;
        let mut expected_cmp_ofs = zheader.zheader_offset + 24;
        for (index, block) in inner.blocks.iter().enumerate() {
            let block_start = start_offset + 24 + 24 * index as u64;
            let block_offsets = block_start..block_start + 24;
            if block.uncompressed_ofs != expected_uncmp_ofs {
                Err(ZTrailerError::ZlibTrailerBlockWrongUncmpOfs {
                    index,
                    actual: block.uncompressed_ofs,
                    // Fixed: this previously reported `expected_cmp_ofs`,
                    // i.e. the *compressed* offset, in a diagnostic about
                    // the *uncompressed* offset.
                    expected: expected_uncmp_ofs,
                }
                .into())
            } else if block.compressed_ofs != expected_cmp_ofs {
                Err(ZTrailerError::ZlibTrailerBlockWrongCmpOfs {
                    index,
                    actual: block.compressed_ofs,
                    expected: expected_cmp_ofs,
                }
                .into())
            } else if !block.has_plausible_sizes() {
                Err(ZTrailerError::ZlibExpansion {
                    index,
                    compressed_size: block.compressed_size,
                    uncompressed_size: block.uncompressed_size,
                }
                .into())
            } else {
                Ok(())
            }
            .map_err(|details| Error::new(Some(block_offsets.clone()), details))?;
            if index < inner.blocks.len() - 1 {
                // Every block but the last must be exactly `block_size`
                // long when uncompressed.
                if block.uncompressed_size != inner.block_size {
                    warn(Warning::new(
                        Some(block_offsets),
                        ZlibTrailerWarning::ZlibTrailerBlockWrongSize {
                            index,
                            actual: block.uncompressed_size,
                            expected: inner.block_size,
                        },
                    ));
                }
            } else if block.uncompressed_size > inner.block_size {
                // The last block may be shorter, but never longer.
                warn(Warning::new(
                    Some(block_offsets),
                    ZlibTrailerWarning::ZlibTrailerBlockTooBig {
                        index,
                        actual: block.uncompressed_size,
                        max_expected: inner.block_size,
                    },
                ));
            }
            expected_cmp_ofs += block.compressed_size as u64;
            expected_uncmp_ofs += block.uncompressed_size as u64;
        }
        // The compressed data must run right up to the trailer itself.
        if expected_cmp_ofs != zheader.ztrailer_offset {
            return Err(Error::new(
                Some(start_offset..start_offset + 24 + 24 * inner.blocks.len() as u64),
                ZTrailerError::ZlibTrailerOffsetInconsistency {
                    expected: expected_cmp_ofs,
                    actual: zheader.ztrailer_offset,
                }
                .into(),
            ));
        }
        reader.seek(SeekFrom::Start(start_offset))?;
        Ok(Some(ZTrailer {
            offset: zheader.ztrailer_offset,
            inner,
        }))
    }
}