use crate::error::{BytesNear, MarcError, Result};
use crate::record::{Field, Subfield};
use crate::recovery::RecoveryMode;
use crate::validation::IndicatorValidator;
use smallvec::SmallVec;
use std::io::Read;
use std::sync::OnceLock;
/// Returns the shared [`IndicatorValidator`] used for strict indicator checks.
///
/// The validator is constructed with `IndicatorValidator::new` the first time
/// this function is called and the same instance is handed back afterwards.
fn marc21_indicator_validator() -> &'static IndicatorValidator {
    static SHARED_VALIDATOR: OnceLock<IndicatorValidator> = OnceLock::new();
    SHARED_VALIDATOR.get_or_init(IndicatorValidator::new)
}
/// Byte value `0x1D`, named as the record terminator.
/// NOTE(review): not referenced in this part of the file — presumably consumed by readers/writers elsewhere.
pub const RECORD_TERMINATOR: u8 = 0x1D;
/// Byte value `0x1E`; `parse_subfields` stops scanning when it meets this byte.
pub const FIELD_TERMINATOR: u8 = 0x1E;
/// Byte value `0x1F`; introduces each subfield (it is followed by the one-byte subfield code).
pub const SUBFIELD_DELIMITER: u8 = 0x1F;
/// Number of bytes in a record leader, as read by `read_leader_bytes`.
pub const LEADER_LEN: usize = 24;
/// Number of bytes in one directory entry: 3-byte tag + 4-digit length + 5-digit start.
pub const DIRECTORY_ENTRY_LEN: usize = 12;
/// Positional bookkeeping carried through a parse so that errors can report
/// where in the input they occurred.
///
/// The `err_*` constructors on this type copy the relevant fields into the
/// corresponding `MarcError` variant.
#[derive(Debug, Clone, Default)]
pub struct ParseContext {
/// Optional label for the input (copied into errors as `source_name`).
pub source_name: Option<String>,
/// Count of records started so far; incremented by `begin_record`.
/// A value of 0 is reported to errors as "no record index".
pub record_index: usize,
/// Running byte position in the stream; moved forward by `advance`.
pub stream_byte_offset: usize,
/// Value of `stream_byte_offset` at the most recent `begin_record` call.
pub record_start_offset: usize,
/// Control number of the current record, if known; reset by `begin_record`.
pub record_control_number: Option<String>,
/// Three-byte tag of the field currently being parsed; reset by `begin_record`.
pub current_field_tag: Option<[u8; 3]>,
/// Code of the subfield currently being parsed; reset by `begin_record`.
pub current_subfield_code: Option<u8>,
/// Indicator position currently being parsed; reset by `begin_record`.
pub current_indicator_position: Option<u8>,
// Owned copy of the bytes being parsed (see `set_parse_buffer`); used only to
// capture a window of nearby bytes for error reports.
current_buffer: Option<Vec<u8>>,
// Stream offset that index 0 of `current_buffer` corresponds to.
current_buffer_base_offset: Option<usize>,
}
impl ParseContext {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn with_source_name(mut self, name: impl Into<String>) -> Self {
self.source_name = Some(name.into());
self
}
pub fn begin_record(&mut self) {
self.record_index = self.record_index.saturating_add(1);
self.record_start_offset = self.stream_byte_offset;
self.record_control_number = None;
self.current_field_tag = None;
self.current_subfield_code = None;
self.current_indicator_position = None;
self.current_buffer = None;
self.current_buffer_base_offset = None;
}
pub fn advance(&mut self, n: usize) {
self.stream_byte_offset = self.stream_byte_offset.saturating_add(n);
}
#[must_use]
pub fn record_byte_offset(&self) -> usize {
self.stream_byte_offset
.saturating_sub(self.record_start_offset)
}
fn record_index_opt(&self) -> Option<usize> {
if self.record_index == 0 {
None
} else {
Some(self.record_index)
}
}
fn field_tag_as_string(&self) -> Option<String> {
self.current_field_tag
.and_then(|bytes| std::str::from_utf8(&bytes).ok().map(String::from))
}
pub fn set_parse_buffer(&mut self, buffer: &[u8], buffer_start_offset: usize) {
self.current_buffer = Some(buffer.to_vec());
self.current_buffer_base_offset = Some(buffer_start_offset);
}
pub fn clear_parse_buffer(&mut self) {
self.current_buffer = None;
self.current_buffer_base_offset = None;
}
fn capture_bytes_near(&self) -> Option<BytesNear> {
let buffer = self.current_buffer.as_deref()?;
let base = self.current_buffer_base_offset?;
BytesNear::capture(buffer, base, self.stream_byte_offset)
}
#[must_use]
pub fn err_directory_invalid(
&self,
found: Option<&[u8]>,
expected: impl Into<String>,
) -> MarcError {
let found_bytes = found.map(crate::error::truncate_bytes);
MarcError::DirectoryInvalid {
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
record_byte_offset: Some(self.record_byte_offset()),
source_name: self.source_name.clone(),
record_control_number: self.record_control_number.clone(),
field_tag: self.field_tag_as_string(),
found: found_bytes,
expected: Some(expected.into()),
bytes_near: self.capture_bytes_near(),
}
}
#[must_use]
pub fn err_truncated_record(
&self,
expected_length: Option<usize>,
actual_length: Option<usize>,
) -> MarcError {
MarcError::TruncatedRecord {
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
record_byte_offset: Some(self.record_byte_offset()),
source_name: self.source_name.clone(),
record_control_number: self.record_control_number.clone(),
expected_length,
actual_length,
bytes_near: self.capture_bytes_near(),
}
}
#[must_use]
pub fn err_end_of_record_not_found(&self) -> MarcError {
MarcError::EndOfRecordNotFound {
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
record_byte_offset: Some(self.record_byte_offset()),
source_name: self.source_name.clone(),
record_control_number: self.record_control_number.clone(),
bytes_near: self.capture_bytes_near(),
}
}
#[must_use]
pub fn err_invalid_indicator(
&self,
indicator_position: u8,
found: &[u8],
expected: impl Into<String>,
) -> MarcError {
let found_bytes = crate::error::truncate_bytes(found);
MarcError::InvalidIndicator {
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
record_byte_offset: Some(self.record_byte_offset()),
source_name: self.source_name.clone(),
record_control_number: self.record_control_number.clone(),
field_tag: self.field_tag_as_string(),
indicator_position: Some(indicator_position),
found: Some(found_bytes),
expected: Some(expected.into()),
bytes_near: self.capture_bytes_near(),
}
}
#[must_use]
pub fn err_bad_subfield_code(&self, subfield_code: u8) -> MarcError {
MarcError::BadSubfieldCode {
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
record_byte_offset: Some(self.record_byte_offset()),
source_name: self.source_name.clone(),
record_control_number: self.record_control_number.clone(),
field_tag: self.field_tag_as_string(),
subfield_code,
bytes_near: self.capture_bytes_near(),
}
}
#[must_use]
pub fn err_invalid_field(&self, message: impl Into<String>) -> MarcError {
MarcError::InvalidField {
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
record_byte_offset: Some(self.record_byte_offset()),
source_name: self.source_name.clone(),
record_control_number: self.record_control_number.clone(),
field_tag: self.field_tag_as_string(),
message: message.into(),
bytes_near: self.capture_bytes_near(),
}
}
#[must_use]
pub fn err_encoding(&self, message: impl Into<String>) -> MarcError {
MarcError::EncodingError {
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
source_name: self.source_name.clone(),
record_control_number: self.record_control_number.clone(),
field_tag: self.field_tag_as_string(),
message: message.into(),
bytes_near: self.capture_bytes_near(),
}
}
#[must_use]
pub fn err_io(&self, cause: std::io::Error) -> MarcError {
MarcError::IoError {
cause,
record_index: self.record_index_opt(),
byte_offset: Some(self.stream_byte_offset),
source_name: self.source_name.clone(),
}
}
#[must_use]
pub fn err_xml(&self, cause: impl std::error::Error + Send + Sync + 'static) -> MarcError {
MarcError::XmlError {
cause: Box::new(cause),
record_index: self.record_index_opt(),
byte_offset: None,
source_name: self.source_name.clone(),
}
}
#[must_use]
pub fn err_json(&self, cause: serde_json::Error) -> MarcError {
MarcError::JsonError {
cause,
record_index: self.record_index_opt(),
byte_offset: None,
source_name: self.source_name.clone(),
}
}
}
/// Reads exactly [`LEADER_LEN`] bytes from `reader`.
///
/// Returns `Ok(Some(leader))` when the full leader was read and `Ok(None)`
/// when the reader hit end-of-input first. That covers both an empty stream
/// and a leader that was only partially available.
///
/// # Errors
///
/// Any I/O error other than `UnexpectedEof` is converted into the crate
/// error type and returned.
pub fn read_leader_bytes<R: Read>(reader: &mut R) -> Result<Option<[u8; LEADER_LEN]>> {
    let mut leader = [0u8; LEADER_LEN];
    if let Err(io_err) = reader.read_exact(&mut leader) {
        return if io_err.kind() == std::io::ErrorKind::UnexpectedEof {
            Ok(None)
        } else {
            Err(io_err.into())
        };
    }
    Ok(Some(leader))
}
/// Reads the remainder of a record (everything after the leader).
///
/// The buffer is sized to `record_length - LEADER_LEN` (saturating at 0) and
/// filled until it is full or the reader reports end-of-input. The returned
/// tuple is the buffer, which always has the expected length and is
/// zero-filled past the bytes actually read, and the number of bytes read.
///
/// # Errors
///
/// * An I/O error other than `Interrupted` is reported through `ctx.err_io`.
/// * In `RecoveryMode::Strict`, a short read is reported through
///   `ctx.err_truncated_record`. Other modes return the short count instead.
pub fn read_record_data<R: Read>(
    reader: &mut R,
    record_length: usize,
    recovery_mode: RecoveryMode,
    ctx: &ParseContext,
) -> Result<(Vec<u8>, usize)> {
    let expected_len = record_length.saturating_sub(LEADER_LEN);
    let mut data = vec![0u8; expected_len];
    let mut filled = 0;

    while filled < expected_len {
        let chunk = match reader.read(&mut data[filled..]) {
            Ok(count) => count,
            // An interrupted read is retried.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(ctx.err_io(e)),
        };
        if chunk == 0 {
            // End of input before the record was complete.
            break;
        }
        filled += chunk;
    }

    let is_short = filled < expected_len;
    if is_short && recovery_mode == RecoveryMode::Strict {
        return Err(ctx.err_truncated_record(Some(expected_len), Some(filled)));
    }
    Ok((data, filled))
}
/// One decoded directory entry, as produced by `parse_directory_entry`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DirectoryEntry {
/// Field tag, decoded from the first 3 bytes of the entry.
pub tag: String,
/// Field length, decoded from the 4 ASCII digits that follow the tag.
pub length: usize,
/// Field start position, decoded from the final 5 ASCII digits.
/// NOTE(review): presumably relative to the record's base address of data — confirm with callers.
pub start: usize,
}
/// Decodes a single directory entry: 3-byte tag, 4-digit length, 5-digit start.
///
/// Only the first [`DIRECTORY_ENTRY_LEN`] bytes of `entry` are examined;
/// anything after them is ignored.
///
/// # Errors
///
/// Returns an invalid-field error when `entry` is shorter than
/// [`DIRECTORY_ENTRY_LEN`], when the tag bytes are not valid UTF-8, or when
/// the length / start columns contain a non-digit.
pub fn parse_directory_entry(entry: &[u8]) -> Result<DirectoryEntry> {
    if entry.len() < DIRECTORY_ENTRY_LEN {
        return Err(MarcError::invalid_field_msg(format!(
            "Directory entry too short: expected {DIRECTORY_ENTRY_LEN} bytes, got {}",
            entry.len()
        )));
    }

    // Carve the fixed-width columns out of the entry.
    let (tag_bytes, after_tag) = entry.split_at(3);
    let (length_digits, after_length) = after_tag.split_at(4);
    let start_digits = &after_length[..5];

    let tag = match std::str::from_utf8(tag_bytes) {
        Ok(text) => text.to_string(),
        Err(_) => {
            return Err(MarcError::invalid_field_msg(
                "Invalid tag encoding".to_string(),
            ))
        }
    };
    let length = parse_4digits(length_digits)?;
    let start = parse_5digits(start_digits)?;

    Ok(DirectoryEntry { tag, length, start })
}
/// Parses a slice of exactly four ASCII digits into a number.
///
/// # Errors
///
/// Returns an invalid-field error when the slice is not 4 bytes long or
/// contains a byte that is not an ASCII digit.
pub fn parse_4digits(bytes: &[u8]) -> Result<usize> {
    match bytes.len() {
        4 => parse_ascii_digits(bytes),
        _ => Err(MarcError::invalid_field_msg(format!(
            "Expected 4-digit field, got {} bytes",
            bytes.len()
        ))),
    }
}
/// Parses a slice of exactly five ASCII digits into a number.
///
/// # Errors
///
/// Returns an invalid-field error when the slice is not 5 bytes long or
/// contains a byte that is not an ASCII digit.
pub fn parse_5digits(bytes: &[u8]) -> Result<usize> {
    match bytes.len() {
        5 => parse_ascii_digits(bytes),
        _ => Err(MarcError::invalid_field_msg(format!(
            "Expected 5-digit field, got {} bytes",
            bytes.len()
        ))),
    }
}
/// Folds a run of ASCII digits into its base-10 value, most significant digit first.
///
/// An empty slice yields 0. Length is not checked here; the public wrappers
/// in this file validate it before calling.
///
/// # Errors
///
/// Returns an invalid-field error naming the first byte that is not an ASCII digit.
fn parse_ascii_digits(bytes: &[u8]) -> Result<usize> {
    bytes.iter().try_fold(0usize, |value, &byte| {
        if !byte.is_ascii_digit() {
            return Err(MarcError::invalid_field_msg(format!(
                "Invalid numeric field: expected digits, got byte {}",
                byte as char
            )));
        }
        Ok(value * 10 + usize::from(byte - b'0'))
    })
}
/// Reports whether `tag` is a control-field tag, i.e. one of `"000"`..=`"009"`.
///
/// The tag must be exactly three bytes: two `0`s followed by any ASCII digit.
/// Everything else (other numeric tags, alphabetic tags, wrong lengths)
/// returns `false`.
#[must_use]
pub fn is_control_field_tag(tag: &str) -> bool {
    matches!(tag.as_bytes(), [b'0', b'0', last] if last.is_ascii_digit())
}
/// Checks that `tag` can be written into a directory entry's 3-byte tag column.
///
/// A tag is accepted when it is exactly three bytes long and entirely ASCII.
///
/// # Errors
///
/// Returns `MarcError::WriterError` (tagged with `record_index` and
/// `record_control_number`) for any other tag.
pub fn validate_directory_tag(
    tag: &str,
    record_index: Option<usize>,
    record_control_number: Option<&str>,
) -> Result<()> {
    let fits_directory = tag.len() == 3 && tag.is_ascii();
    if !fits_directory {
        return Err(MarcError::WriterError {
            record_index,
            record_control_number: record_control_number.map(String::from),
            message: format!(
                "Field tag {tag:?} is not 3 ASCII bytes (got {} bytes); cannot fit into the ISO 2709 directory entry's tag field",
                tag.len()
            ),
        });
    }
    Ok(())
}
/// Largest value `check_iso2709_size` accepts for a record length or base
/// address: 99,999, the biggest number that fits in five decimal digits.
pub const ISO2709_MAX_FIELD: usize = 99_999;
/// Verifies that a record's total length and base address both fit within
/// [`ISO2709_MAX_FIELD`].
///
/// The record length is checked first, so when both values are too large the
/// error describes the record length.
///
/// # Errors
///
/// Returns `MarcError::WriterError` (tagged with `record_index` and
/// `record_control_number`) naming the value that exceeds the limit.
pub fn check_iso2709_size(
    record_length: usize,
    base_address: usize,
    record_index: Option<usize>,
    record_control_number: Option<&str>,
) -> Result<()> {
    // Both failures share everything except the message text.
    let too_large = |message: String| MarcError::WriterError {
        record_index,
        record_control_number: record_control_number.map(String::from),
        message,
    };

    if record_length > ISO2709_MAX_FIELD {
        Err(too_large(format!(
            "Record length exceeds ISO 2709 limit ({record_length} bytes; max {ISO2709_MAX_FIELD})"
        )))
    } else if base_address > ISO2709_MAX_FIELD {
        Err(too_large(format!(
            "Base address exceeds ISO 2709 limit ({base_address} bytes; max {ISO2709_MAX_FIELD})"
        )))
    } else {
        Ok(())
    }
}
/// How `parse_subfields` reacts to a byte that appears where a subfield
/// delimiter was expected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SubfieldStructureMode {
/// Fail with an invalid-field error ("Expected subfield delimiter").
Strict,
/// Skip the stray byte and keep scanning.
Permissive,
}
/// How `parse_subfields` decodes subfield value bytes into a `String`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Utf8DecodeMode {
/// Decode with `String::from_utf8_lossy` (invalid sequences are replaced, never rejected).
Lossy,
/// Decode with `std::str::from_utf8`; invalid UTF-8 becomes an encoding error.
Strict,
}
/// Whether `parse_data_field` validates the two indicator bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndicatorMode {
/// Accept the indicator bytes as they are, without validation.
Lossy,
/// Require each indicator to be an ASCII digit or space, and additionally to
/// satisfy the per-tag rules when the indicator validator has rules for the tag.
Strict,
}
/// Whether `parse_subfields` validates the byte used as a subfield code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SubfieldCodeMode {
/// Accept any byte as a subfield code.
Lossy,
/// Reject codes that are not ASCII graphic characters (`u8::is_ascii_graphic`).
Strict,
}
/// Bundle of strictness switches consulted by `parse_data_field` and
/// `parse_subfields`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DataFieldParseConfig {
/// Handling of bytes found where a subfield delimiter was expected.
pub structure: SubfieldStructureMode,
/// Handling of invalid UTF-8 in subfield values.
pub utf8: Utf8DecodeMode,
/// Whether indicator bytes are validated.
pub indicator: IndicatorMode,
/// Whether subfield code bytes are validated.
pub subfield_code: SubfieldCodeMode,
}
impl DataFieldParseConfig {
const fn modes_for(
level: crate::ValidationLevel,
) -> (Utf8DecodeMode, IndicatorMode, SubfieldCodeMode) {
match level {
crate::ValidationLevel::Structural => (
Utf8DecodeMode::Lossy,
IndicatorMode::Lossy,
SubfieldCodeMode::Lossy,
),
crate::ValidationLevel::StrictMarc => (
Utf8DecodeMode::Strict,
IndicatorMode::Strict,
SubfieldCodeMode::Strict,
),
}
}
#[must_use]
pub const fn bibliographic(level: crate::ValidationLevel) -> Self {
let (utf8, indicator, subfield_code) = Self::modes_for(level);
Self {
structure: SubfieldStructureMode::Strict,
utf8,
indicator,
subfield_code,
}
}
#[must_use]
pub const fn authority(level: crate::ValidationLevel) -> Self {
let (utf8, indicator, subfield_code) = Self::modes_for(level);
Self {
structure: SubfieldStructureMode::Permissive,
utf8,
indicator,
subfield_code,
}
}
#[must_use]
pub const fn holdings(level: crate::ValidationLevel) -> Self {
let (utf8, indicator, subfield_code) = Self::modes_for(level);
Self {
structure: SubfieldStructureMode::Permissive,
utf8,
indicator,
subfield_code,
}
}
}
/// Parses the bytes of one data field (two indicators followed by subfields)
/// into a [`Field`] with the given `tag`.
///
/// When `config.indicator` is `Strict`, both indicators must be an ASCII
/// digit or space. If the indicator validator has rules for `tag`, each
/// indicator must also satisfy its rule. The generic check runs for both
/// indicators before the per-tag rules are consulted.
///
/// # Errors
///
/// * invalid-field error when fewer than two bytes are supplied;
/// * invalid-indicator error (position 0 or 1) when strict validation fails;
/// * whatever `parse_subfields` reports for the remaining bytes.
#[allow(clippy::inline_always)]
#[inline(always)]
pub fn parse_data_field(
    field_data: &[u8],
    tag: &str,
    config: DataFieldParseConfig,
    ctx: &ParseContext,
) -> Result<Field> {
    let [first, second, subfield_bytes @ ..] = field_data else {
        return Err(ctx.err_invalid_field("Data field too short (needs indicators)"));
    };
    let (i1, i2) = (*first, *second);

    if matches!(config.indicator, IndicatorMode::Strict) {
        // Generic shape check on both indicators first...
        for (position, indicator) in [(0u8, i1), (1u8, i2)] {
            if !is_valid_indicator(indicator) {
                return Err(ctx.err_invalid_indicator(
                    position,
                    &[indicator],
                    "ASCII digit (0-9) or space",
                ));
            }
        }
        // ...then the tag-specific rules, when there are any for this tag.
        if let Some(rules) = marc21_indicator_validator().get_rules(tag) {
            if !rules.indicator1.is_valid(char::from(i1)) {
                return Err(ctx.err_invalid_indicator(0, &[i1], rules.indicator1.expected_human()));
            }
            if !rules.indicator2.is_valid(char::from(i2)) {
                return Err(ctx.err_invalid_indicator(1, &[i2], rules.indicator2.expected_human()));
            }
        }
    }

    let mut field = Field::new(tag.to_string(), char::from(i1), char::from(i2));
    field.subfields = parse_subfields(subfield_bytes, config, ctx)?;
    Ok(field)
}
/// Returns `true` when `b` is an ASCII digit (`0`-`9`) or an ASCII space.
#[inline]
fn is_valid_indicator(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b' ')
}
pub fn parse_subfields(
bytes: &[u8],
config: DataFieldParseConfig,
ctx: &ParseContext,
) -> Result<SmallVec<[Subfield; 4]>> {
let mut subfields: SmallVec<[Subfield; 4]> = SmallVec::new();
let mut pos = 0;
while pos < bytes.len() {
let byte = bytes[pos];
if byte == FIELD_TERMINATOR {
break;
}
if byte != SUBFIELD_DELIMITER {
match config.structure {
SubfieldStructureMode::Strict => {
return Err(ctx.err_invalid_field("Expected subfield delimiter"));
},
SubfieldStructureMode::Permissive => {
pos += 1;
continue;
},
}
}
pos += 1;
if pos >= bytes.len() {
break;
}
let code_byte = bytes[pos];
if config.subfield_code == SubfieldCodeMode::Strict && !code_byte.is_ascii_graphic() {
return Err(ctx.err_bad_subfield_code(code_byte));
}
let code = code_byte as char;
pos += 1;
let mut end = pos;
while end < bytes.len()
&& bytes[end] != SUBFIELD_DELIMITER
&& bytes[end] != FIELD_TERMINATOR
{
end += 1;
}
let value_bytes = &bytes[pos..end];
let value = match config.utf8 {
Utf8DecodeMode::Lossy => String::from_utf8_lossy(value_bytes).to_string(),
Utf8DecodeMode::Strict => std::str::from_utf8(value_bytes)
.map_err(|e| ctx.err_encoding(format!("Invalid UTF-8 in subfield value: {e}")))?
.to_string(),
};
subfields.push(Subfield { code, value });
pos = end;
}
Ok(subfields)
}
// Unit tests for the stream-reading helpers, the fixed-width digit parsers,
// directory-entry decoding and control-tag classification.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
// --- read_leader_bytes ---------------------------------------------------
// An empty stream is a clean end-of-input, not an error.
#[test]
fn read_leader_bytes_eof_returns_none() {
let mut reader = Cursor::new(Vec::<u8>::new());
assert!(matches!(read_leader_bytes(&mut reader), Ok(None)));
}
// A full 24-byte leader is returned unchanged.
#[test]
fn read_leader_bytes_full_returns_bytes() {
let leader = b"00100nam a2200049 i 4500".to_vec();
let mut reader = Cursor::new(leader.clone());
let bytes = read_leader_bytes(&mut reader).unwrap().unwrap();
assert_eq!(&bytes[..], &leader[..]);
}
// Fewer than 24 bytes available is reported the same way as end-of-input.
#[test]
fn read_leader_bytes_partial_treated_as_eof() {
let mut reader = Cursor::new(vec![b'0'; 10]);
assert!(matches!(read_leader_bytes(&mut reader), Ok(None)));
}
// --- read_record_data ----------------------------------------------------
// record_length 100 minus the 24-byte leader leaves 76 bytes to read.
#[test]
fn read_record_data_full_read() {
let mut reader = Cursor::new(vec![b'x'; 76]);
let ctx = ParseContext::new();
let (data, bytes_read) =
read_record_data(&mut reader, 100, RecoveryMode::Strict, &ctx).unwrap();
assert_eq!(data.len(), 76);
assert_eq!(bytes_read, 76);
}
// Strict mode turns a short read into TruncatedRecord, carrying the context's
// record index and source name plus the expected/actual byte counts.
#[test]
fn read_record_data_strict_mode_errors_on_truncation() {
let mut reader = Cursor::new(vec![b'x'; 50]);
let mut ctx = ParseContext::new().with_source_name("test.mrc");
ctx.begin_record();
let err = read_record_data(&mut reader, 100, RecoveryMode::Strict, &ctx)
.expect_err("strict mode should error on truncation");
match err {
MarcError::TruncatedRecord {
record_index,
source_name,
expected_length,
actual_length,
..
} => {
assert_eq!(record_index, Some(1));
assert_eq!(source_name.as_deref(), Some("test.mrc"));
assert_eq!(expected_length, Some(76));
assert_eq!(actual_length, Some(50));
},
other => panic!("expected TruncatedRecord, got {other:?}"),
}
}
// Lenient mode returns the full-size (zero-padded) buffer and the short count.
#[test]
fn read_record_data_lenient_mode_returns_truncated_count() {
let mut reader = Cursor::new(vec![b'x'; 50]);
let ctx = ParseContext::new();
let (data, bytes_read) =
read_record_data(&mut reader, 100, RecoveryMode::Lenient, &ctx).unwrap();
assert_eq!(
data.len(),
76,
"buffer is sized to expected_len, zero-padded"
);
assert_eq!(bytes_read, 50, "actual bytes read short of expected_len");
}
// --- parse_4digits / parse_5digits ---------------------------------------
// Leading zeros are accepted and ignored in the resulting value.
#[test]
fn parse_4digits_valid() {
assert_eq!(parse_4digits(b"0042").unwrap(), 42);
assert_eq!(parse_4digits(b"9999").unwrap(), 9999);
}
// Both too-short and too-long inputs are rejected.
#[test]
fn parse_4digits_wrong_length() {
assert!(parse_4digits(b"42").is_err());
assert!(parse_4digits(b"00042").is_err());
}
#[test]
fn parse_4digits_non_digit() {
assert!(parse_4digits(b"00X2").is_err());
}
#[test]
fn parse_5digits_valid() {
assert_eq!(parse_5digits(b"00042").unwrap(), 42);
assert_eq!(parse_5digits(b"99999").unwrap(), 99999);
}
#[test]
fn parse_5digits_wrong_length() {
assert!(parse_5digits(b"0042").is_err());
assert!(parse_5digits(b"000042").is_err());
}
// --- parse_directory_entry -----------------------------------------------
// Entry layout: tag "245", length "0015", start "00042".
#[test]
fn parse_directory_entry_valid() {
let entry = b"245001500042";
let parsed = parse_directory_entry(entry).unwrap();
assert_eq!(parsed.tag, "245");
assert_eq!(parsed.length, 15);
assert_eq!(parsed.start, 42);
}
#[test]
fn parse_directory_entry_too_short() {
assert!(parse_directory_entry(b"24500").is_err());
}
// Non-digit characters in the length column are rejected.
#[test]
fn parse_directory_entry_invalid_length() {
let entry = b"245XX1500042";
assert!(parse_directory_entry(entry).is_err());
}
// --- is_control_field_tag ------------------------------------------------
#[test]
fn is_control_field_tag_recognizes_control_tags() {
assert!(is_control_field_tag("001"));
assert!(is_control_field_tag("008"));
assert!(is_control_field_tag("009"));
}
// "010" is the first tag that is no longer a control field.
#[test]
fn is_control_field_tag_rejects_data_tags() {
assert!(!is_control_field_tag("010"));
assert!(!is_control_field_tag("245"));
assert!(!is_control_field_tag("999"));
}
#[test]
fn is_control_field_tag_rejects_non_numeric() {
assert!(!is_control_field_tag("LDR"));
assert!(!is_control_field_tag("00A"));
}
#[test]
fn is_control_field_tag_rejects_wrong_length() {
assert!(!is_control_field_tag("01"));
assert!(!is_control_field_tag("0010"));
}
}