marc_record/
lib.rs

//! This crate provides the means to parse MARC21 records. It supports normal MARC21 records
//! using either MARC-8 (for Latin languages) or Unicode and tries to transform as much as
//! possible into strings. It doesn't interpret the field data much, so looking up the meaning of tag numbers will be required.
4//!
5//! Info about the format can be found here: https://www.loc.gov/marc/bibliographic/
6//!
7//! The general structure of a MARC record is as follows:
8//!
//! A file can contain many MARC records. Each record has the following parts:
//! - a leader: a header that contains info about the structure of the record;
//! - a directory: an index of the various fields;
//! - fields, which can either be control fields or data fields.
13//!
14//! All the fields have an identifying tag.
15//!
16//! Control fields simply contain ASCII data.
17//!
18//! Each data field can have a 2-character set of indicators, for which some meaning can be derived.
19//!
20//! They also contain a list of subfields which are identified by a single ASCII character.
21//!
22//! The only entrypoint to the library is the parse_records function:
23//! ```rust
24//! use marc_record::parse_records;
25//!
26//! let binary_data = include_bytes!("../samples/marc8_multiple.mrc");
27//! let records = parse_records(binary_data).unwrap();
28//! assert_eq!(records.len(), 109);
29//! ```
30
31// Support for the MARC-8 encoding format
32pub mod marc8;
33// Parsing rules
34mod parser;
35
36use core::str;
37use std::{
38    borrow::Cow,
39    fmt::{Display, Write},
40    str::Utf8Error,
41};
42
43use marc8::Marc8Decoder;
44use thiserror::Error;
45use winnow::{
46    combinator::repeat,
47    error::{ContextError, ParseError, StrContext},
48    Parser,
49};
50
51/// Parse a set of MARC records from bytes
52//
53/// This requires bytes because a MARC record can have various encodings,
54/// UTF-8 and MARC-8 being the most common ones.
55/// It assumes all the records are valid and complete and will also fail if
56/// any extra content is found.
57///
58/// Right now, the parser enforces a few constraints:
59/// - It only accepts MARC records with a value of 2 for the indicator count;
60/// - It only accepts singular character subfields (described as 2 for the marker);
61/// - It only accepts directory entries using the standard 4-5-0-0 schema
62/// - MARC-8 encoding support is limited to latin character sets
63
64pub fn parse_records(data: &[u8]) -> Result<Vec<Record>, Error> {
65    Ok(repeat(
66        0..,
67        parser::parse_record.context(StrContext::Label("record")),
68    )
69    .parse(data)?)
70}
71
72// Represents a parsing error
73#[derive(thiserror::Error, Debug)]
74pub enum Error {
75    #[error("Failed to parse: {} at byte offset `{}`", reason, offset)]
76    ParseFailed { reason: String, offset: usize },
77}
78
79impl From<ParseError<&[u8], ContextError>> for Error {
80    fn from(value: ParseError<&[u8], ContextError>) -> Self {
81        Self::ParseFailed {
82            reason: value.inner().to_string(),
83            offset: value.offset(),
84        }
85    }
86}
87
88/// A MARC record describes a content or piece of content using a series of fields.
89/// Fields are identified by a three-digit ASCII code (e.g. `001`) and contain
90/// either control or field data. Control fields are identified by being in the 000-099 range
91/// while data fields are all the others.
92/// Control fields contain a single piece of information
93pub struct Record {
94    /// The leader is MARC's way of naming the header info
95    pub leader: Leader,
96    /// The actual control and data fields
97    pub fields: Vec<Field>,
98}
99
100/// The leader is MARC's equivalent of a header. It contains internal bookkeeping info about the record
101/// as well as some information of interest to the applications reading it.
102#[derive(Debug)]
103pub struct Leader {
104    /// Length in bytes of the record
105    pub record_length: u16,
106
107    /// The status of the information, e.g. whether the sender signals it is a new entry or a correction
108    pub status: Status,
109
110    /// The type of content described by the record, will alter which fields *should* be provided.
111    pub record_type: RecordType,
112
113    /// Mostly used to tell whether this is for a single item or a collection
114    pub bibliographical_level: BibliographicalLevel,
115
116    pub control_type: ControlType,
117
118    /// Tells which encoding was used for the text of the variable data fields.
119    pub coding_scheme: CodingScheme,
120
121    /// Byte index of the variable field data, after the leader and the dictionary
122    pub data_base_address: u16,
123
124    /// Represents something like the quality of the data about the entry
125    pub encoding_level: EncodingLevel,
126    pub descriptive_cataloging_form: CatalogingForm,
127    pub multipart_resource_record_level: MultipartResourceRecordLevel,
128}
129
/// The record status carried by the leader.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Status {
    /// Increase in encoding level.
    IncreaseInEncoding,
    /// Corrected or revised record.
    Corrected,
    /// Deleted record.
    Deleted,
    /// New record.
    New,
    /// Increase in encoding level from prepublication.
    IncreaseFromPrepublication,
}
138
/// The type of content a record describes, as declared in the leader.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RecordType {
    LanguageMaterial,
    NotatedMusic,
    ManuscriptNotatedMusic,
    CartographicMaterial,
    ManuscriptCartographicMaterial,
    ProjectedMedium,
    NonmusicalSoundRecording,
    MusicalSoundRecording,
    TwoDimensionalNonprojectableGraphic,
    ComputerFile,
    Kit,
    MixedMaterials,
    ThreeDimensionalArtifact,
    ManuscriptLanguageMaterial,
}
156
/// The bibliographic level declared in the leader, e.g. whether the record
/// describes a single item, a part of one, or a collection.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BibliographicalLevel {
    MonographicComponentPart,
    SerialComponentPart,
    Collection,
    Subunit,
    IntegratingResource,
    Monograph,
    Serial,
    Unknown,
}
168
/// The type of control declared in the leader.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ControlType {
    /// No specific type of control.
    Unspecified,
    /// Archival control.
    Archival,
}
174
/// The character coding scheme used for the text of a record.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CodingScheme {
    /// The MARC-8 Encoding scheme
    Marc8,
    /// Unicode code points encoded as UTF-8
    Ucs,
}
182
183impl CodingScheme {
184    fn decoder(&self) -> Decoder {
185        match self {
186            CodingScheme::Marc8 => Decoder::Marc8(marc8::Marc8Decoder {}),
187            CodingScheme::Ucs => Decoder::Utf8(Utf8Decoder {}),
188        }
189    }
190}
191
/// The encoding level declared in the leader, roughly the completeness of the
/// cataloging information in the record.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EncodingLevel {
    Full,
    FullMaterialNotExamined,
    LessThanFullMaterialNotExamined,
    Abbreviated,
    Core,
    Partial,
    Minimal,
    Prepublication,
    Unknown,
    NotApplicable,
    ObsoleteFull,
    ObsoleteMinimal,
    AddedFromBatch,
}
208
/// The descriptive cataloging form declared in the leader.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CatalogingForm {
    NonIsbd,
    Aacr2,
    IsbdPunctuationOmitted,
    IsbdPunctuationIncluded,
    NonIsbdPunctuationOmitted,
    Unknown,
}
218
/// The multipart resource record level declared in the leader.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MultipartResourceRecordLevel {
    NotApplicable,
    Set,
    PartWithIndependentTitle,
    // NOTE(review): spelled with a lowercase `w`, unlike `PartWithIndependentTitle`;
    // kept as-is because renaming a public variant would break callers.
    PartwithDependentTitle,
}
226
227enum Decoder {
228    Marc8(Marc8Decoder),
229    Utf8(Utf8Decoder),
230}
231
232impl TextDecoder for Decoder {
233    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
234        match self {
235            Decoder::Marc8(marc8_decoder) => marc8_decoder.decode(text),
236            Decoder::Utf8(utf8_decoder) => utf8_decoder.decode(text),
237        }
238    }
239}
240
/// Byte `0x1D` (ASCII group separator), used here as the record separator.
const RECORD_SEPARATOR: u8 = b'\x1D';
/// Byte `0x1E` (ASCII record separator), used here as the field separator.
const FIELD_SEPARATOR: u8 = b'\x1E';
/// Byte `0x1F` (ASCII unit separator), used here as the subfield separator.
const SUBFIELD_SEPARATOR: u8 = b'\x1F';
244
/// One entry of a record's directory, locating a single field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DirectoryEntry {
    /// The three-character tag of the field this entry points to.
    pub tag: [char; 3],
    /// Length of the field.
    pub field_length: usize,
    /// Starting position of the field.
    pub starting_pos: usize,
}
251
252impl DirectoryEntry {
253    fn is_control(&self) -> bool {
254        &self.tag[0..2] == ['0', '0']
255    }
256}
257
258#[derive(Debug, Error)]
259enum DecodeError {
260    #[error("UTF-8 error: {0}")]
261    Utf(Utf8Error),
262    #[error("Unknown char: {0}")]
263    Unknown(u8),
264    #[error("Invalid pair: base `{0}` with combining `{1}`")]
265    InvalidPair(char, char),
266    #[error("Invalid character sequence")]
267    InvalidSequence,
268}
269
270trait TextDecoder {
271    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError>;
272}
273
274// Blanket impl for boxes
275impl<T> TextDecoder for Box<T>
276where
277    T: TextDecoder + Sized,
278{
279    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
280        self.as_ref().decode(text)
281    }
282}
283
/// The three-character tag identifying a field (e.g. `001`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct FieldTag([char; 3]);
286
287impl Display for FieldTag {
288    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
289        f.write_char(self.0[0])?;
290        f.write_char(self.0[1])?;
291        f.write_char(self.0[2])
292    }
293}
294
295#[derive(Debug, PartialEq, Eq)]
296pub enum Field {
297    Control(ControlField),
298    Data(DataField),
299}
300
301impl Field {
302    /// A quick accessor to get the control field variant if the field is actually one.
303    pub fn control(&self) -> Option<&ControlField> {
304        match self {
305            Field::Control(control_field) => Some(control_field),
306            Field::Data(_) => None,
307        }
308    }
309
310    /// A quick accessor to get the data field variant if the field is actually one.
311    pub fn data(&self) -> Option<&DataField> {
312        match self {
313            Field::Control(_) => None,
314            Field::Data(data_field) => Some(data_field),
315        }
316    }
317}
318
319/// The first fields of a MARC record represent control data. Unlike
320/// the variable data fields, they are simple blob of ASCII content, although
321/// some of them are encoded with a specific scheme (for example, some of them are pipe-separated values)
322#[derive(Debug, PartialEq, Eq)]
323pub struct ControlField {
324    pub tag: FieldTag,
325    pub data: String,
326}
327
328/// One of the variable data fields representing the bulk of the information
329/// found in the MARC record. The tag along with the indicators (of which there are typically 2) help
330/// figure out the specific meaning of the data found within the subfields or blocks of content.
331#[derive(Debug, PartialEq, Eq)]
332pub struct DataField {
333    pub tag: FieldTag,
334    pub indicator: Vec<char>,
335    pub subfields: Vec<Subfield>,
336}
337
/// A type representing the function of a block of content
/// within a variable data field. Typically a single character.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SubfieldTag(char);
342
343impl Display for SubfieldTag {
344    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
345        f.write_char(self.0)
346    }
347}
348
349#[derive(Debug, PartialEq, Eq)]
350pub struct Subfield {
351    pub tag: SubfieldTag,
352    pub data: String,
353}
354
/// Stateless decoder for text that is already UTF-8.
struct Utf8Decoder {}
356
357impl TextDecoder for Utf8Decoder {
358    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
359        str::from_utf8(&text)
360            .map(|s| Cow::Borrowed(s))
361            .map_err(|e| DecodeError::Utf(e))
362    }
363}