marc_record/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
//! This crate provides means to parse MARC21 records. It supports normal MARC21 records
//! using either MARC-8 (for Latin languages) or Unicode and tries to transform as much as
//! possible into strings. It doesn't interpret the field data much, so lookup from tag numbers will be required.
//!
//! Info about the format can be found here: <https://www.loc.gov/marc/bibliographic/>
//!
//! The general structure of a MARC record is as follows:
//!
//! A file can contain many MARC records. Each record has the following parts:
//! - a leader: a header that contains info about the structure of the record;
//! - a directory: an index of the various fields;
//! - fields, which can either be control fields or data fields
//!
//! All the fields have an identifying tag.
//!
//! Control fields simply contain ASCII data.
//!
//! Each data field can have a 2-character set of indicators, for which some meaning can be derived.
//!
//! They also contain a list of subfields which are identified by a single ASCII character.
//!
//! The only entrypoint to the library is the parse_records function:
//! ```rust
//! use marc_record::parse_records;
//!
//! let binary_data = include_bytes!("../samples/marc8_multiple.mrc");
//! let records = parse_records(binary_data).unwrap();
//! assert_eq!(records.len(), 109);
//! ```

// Support for the MARC-8 encoding format (public, so callers can reach it directly)
pub mod marc8;
// Parsing rules; kept private, `parse_records` below is the public way in
mod parser;

use core::str;
use std::{
    borrow::Cow,
    fmt::{Display, Write},
    str::Utf8Error,
};

use marc8::Marc8Decoder;
use thiserror::Error;
use winnow::{
    combinator::repeat,
    error::{ContextError, ParseError, StrContext},
    Parser,
};

/// Parses every MARC record contained in `data`.
///
/// The input is raw bytes rather than a string because a MARC record can use
/// various encodings, UTF-8 and MARC-8 being the most common ones.
/// All records are assumed to be valid and complete; parsing also fails if
/// any extra content is found.
///
/// Right now, the parser enforces a few constraints:
/// - It only accepts MARC records with a value of 2 for the indicator count;
/// - It only accepts singular character subfields (described as 2 for the marker);
/// - It only accepts directory entries using the standard 4-5-0-0 schema;
/// - MARC-8 encoding support is limited to Latin character sets.
///
/// # Errors
///
/// Returns [`Error::ParseFailed`], carrying the parser's reason and the byte
/// offset it reported, when the input cannot be parsed.
pub fn parse_records(data: &[u8]) -> Result<Vec<Record>, Error> {
    // Apply the single-record parser zero or more times over the whole input.
    let records: Vec<Record> = repeat(
        0..,
        parser::parse_record.context(StrContext::Label("record")),
    )
    .parse(data)?;

    Ok(records)
}

/// Represents a parsing error
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// The input could not be parsed. `reason` is the parser error rendered as
    /// text and `offset` is the byte offset reported by the parser.
    #[error("Failed to parse: {} at byte offset `{}`", reason, offset)]
    ParseFailed { reason: String, offset: usize },
}

/// Converts a parser failure into the crate's public [`Error`], keeping the
/// rendered inner error and the byte offset at which it was reported.
impl From<ParseError<&[u8], ContextError>> for Error {
    fn from(value: ParseError<&[u8], ContextError>) -> Self {
        let offset = value.offset();
        let reason = value.inner().to_string();

        Self::ParseFailed { reason, offset }
    }
}

/// A MARC record describes a content or piece of content using a series of fields.
///
/// Fields are identified by a three-digit ASCII code (e.g. `001`) and contain
/// either control or field data. Control fields are the ones whose tag starts
/// with `00`; data fields are all the others.
/// Control fields contain a single piece of information.
// `Debug` is derived so that the type is consistent with `Leader` and `Field`,
// which it contains and which both already derive it.
#[derive(Debug)]
pub struct Record {
    /// The leader is MARC's way of naming the header info
    pub leader: Leader,
    /// The actual control and data fields
    pub fields: Vec<Field>,
}

/// The leader is MARC's equivalent of a header. It contains internal bookkeeping info about the record
/// as well as some information of interest to the applications reading it.
#[derive(Debug)]
pub struct Leader {
    /// Length in bytes of the record
    // NOTE(review): the MARC 21 leader stores this value as five decimal digits
    // (up to 99999), which does not fit in `u16` — confirm how the parser
    // handles records longer than 65535 bytes.
    pub record_length: u16,

    /// The status of the information, e.g. whether the sender signals it is a new entry or a correction
    pub status: Status,

    /// The type of content described by the record, will alter which fields *should* be provided.
    pub record_type: RecordType,

    /// Mostly used to tell whether this is for a single item or a collection
    pub bibliographical_level: BibliographicalLevel,

    /// The type of control declared for the record, see [`ControlType`]
    pub control_type: ControlType,

    /// Tells which encoding was used for the text of the variable data fields.
    pub coding_scheme: CodingScheme,

    /// Byte index of the variable field data, after the leader and the directory
    // NOTE(review): also a five-digit value in the MARC 21 leader; same `u16`
    // range concern as `record_length` — confirm against the parser.
    pub data_base_address: u16,

    /// Represents something like the quality of the data about the entry
    pub encoding_level: EncodingLevel,
    /// The descriptive cataloging form declared for the record, see [`CatalogingForm`]
    pub descriptive_cataloging_form: CatalogingForm,
    /// The record level for multipart resources, see [`MultipartResourceRecordLevel`]
    pub multipart_resource_record_level: MultipartResourceRecordLevel,
}

/// Status of the information carried by a record, as exposed by [`Leader::status`]
/// (e.g. whether the sender signals a new entry or a correction).
///
/// The mapping from leader bytes to variants is not part of this file;
/// presumably it lives in the `parser` module.
#[derive(Debug, PartialEq, Eq)]
pub enum Status {
    IncreaseInEncoding,
    Corrected,
    Deleted,
    New,
    IncreaseFromPrepublication,
}

/// The type of content described by a record, as exposed by [`Leader::record_type`].
///
/// The record type influences which fields *should* be provided.
#[derive(Debug, PartialEq, Eq)]
pub enum RecordType {
    LanguageMaterial,
    NotatedMusic,
    ManuscriptNotatedMusic,
    CartographicMaterial,
    ManuscriptCartographicMaterial,
    ProjectedMedium,
    NonmusicalSoundRecording,
    MusicalSoundRecording,
    TwoDimensionalNonprojectableGraphic,
    ComputerFile,
    Kit,
    MixedMaterials,
    ThreeDimensionalArtifact,
    ManuscriptLanguageMaterial,
}

/// Bibliographical level of a record, as exposed by [`Leader::bibliographical_level`].
///
/// Mostly used to tell whether the record is for a single item or a collection.
#[derive(Debug, PartialEq, Eq)]
pub enum BibliographicalLevel {
    MonographicComponentPart,
    SerialComponentPart,
    Collection,
    Subunit,
    IntegratingResource,
    Monograph,
    Serial,
    Unknown,
}

/// Type of control declared for a record, as exposed by [`Leader::control_type`].
#[derive(Debug, PartialEq, Eq)]
pub enum ControlType {
    /// No specific type of control is declared
    Unspecified,
    /// Archival control
    Archival,
}

/// Text encoding used by the variable data fields of a record, as exposed by
/// [`Leader::coding_scheme`].
#[derive(Debug, PartialEq, Eq)]
pub enum CodingScheme {
    /// The MARC-8 Encoding scheme
    Marc8,
    /// Unicode code points encoded as UTF-8
    Ucs,
}

impl CodingScheme {
    /// Builds the text decoder matching this coding scheme.
    fn decoder(&self) -> Decoder {
        match *self {
            Self::Marc8 => Decoder::Marc8(marc8::Marc8Decoder {}),
            Self::Ucs => Decoder::Utf8(Utf8Decoder {}),
        }
    }
}

/// Encoding level of a record, as exposed by [`Leader::encoding_level`].
///
/// Represents something like the quality of the data about the entry.
#[derive(Debug, PartialEq, Eq)]
pub enum EncodingLevel {
    Full,
    FullMaterialNotExamined,
    LessThanFullMaterialNotExamined,
    Abbreviated,
    Core,
    Partial,
    Minimal,
    Prepublication,
    Unknown,
    NotApplicable,
    ObsoleteFull,
    ObsoleteMinimal,
    AddedFromBatch,
}

/// Descriptive cataloging form of a record, as exposed by
/// [`Leader::descriptive_cataloging_form`].
#[derive(Debug, PartialEq, Eq)]
pub enum CatalogingForm {
    NonIsbd,
    Aacr2,
    IsbdPunctuationOmitted,
    IsbdPunctuationIncluded,
    NonIsbdPunctuationOmitted,
    Unknown,
}

/// Record level for multipart resources, as exposed by
/// [`Leader::multipart_resource_record_level`].
#[derive(Debug, PartialEq, Eq)]
pub enum MultipartResourceRecordLevel {
    NotApplicable,
    Set,
    PartWithIndependentTitle,
    // NOTE(review): the lowercase `w` is inconsistent with
    // `PartWithIndependentTitle`; renaming it would break the public API.
    PartwithDependentTitle,
}

/// Closed set of the text decoders the crate can pick from; see
/// `CodingScheme::decoder` for how one is selected.
enum Decoder {
    Marc8(Marc8Decoder),
    Utf8(Utf8Decoder),
}

/// Forwards decoding to whichever concrete decoder the variant wraps.
impl TextDecoder for Decoder {
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
        match self {
            Self::Marc8(inner) => inner.decode(text),
            Self::Utf8(inner) => inner.decode(text),
        }
    }
}

// Structural marker bytes. They are not referenced in the visible part of this
// file; presumably the `parser` module consumes them — confirm there.

/// Byte `0x1D`, named as the separator between records.
const RECORD_SEPARATOR: u8 = 0x1D;
/// Byte `0x1E`, named as the separator between fields.
const FIELD_SEPARATOR: u8 = 0x1E;
/// Byte `0x1F`, named as the separator between subfields.
const SUBFIELD_SEPARATOR: u8 = 0x1F;

/// One entry of a record's directory, the index of the record's fields.
#[derive(Debug)]
pub struct DirectoryEntry {
    /// The three characters of the tag identifying the field
    pub tag: [char; 3],
    /// Length of the field this entry points to
    pub field_length: usize,
    /// Starting position of the field
    // NOTE(review): presumably relative to the start of the variable field
    // data — confirm against the parser.
    pub starting_pos: usize,
}

impl DirectoryEntry {
    /// Tells whether the entry designates a control field, i.e. whether its
    /// tag starts with `00`.
    fn is_control(&self) -> bool {
        matches!(self.tag, ['0', '0', _])
    }
}

/// Reasons why a run of bytes could not be turned into text by a [`TextDecoder`].
#[derive(Debug, Error)]
enum DecodeError {
    /// The bytes are not valid UTF-8; wraps the standard library error.
    #[error("UTF-8 error: {0}")]
    Utf(Utf8Error),
    /// A byte the decoder does not know how to map to a character.
    #[error("Unknown char: {0}")]
    Unknown(u8),
    /// A base character and a combining character that cannot be paired
    /// (base first, combining second).
    #[error("Invalid pair: base `{0}` with combining `{1}`")]
    InvalidPair(char, char),
    /// A sequence of characters that is not valid.
    #[error("Invalid character sequence")]
    InvalidSequence,
}

/// Something able to turn the raw bytes of a field into text.
trait TextDecoder {
    /// Decodes `text` into a string.
    ///
    /// The `Cow` return type lets an implementation hand back a borrow of the
    /// input when no transformation is needed, or an owned string otherwise.
    /// Bytes that cannot be decoded are reported as a [`DecodeError`].
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError>;
}

// Blanket impl: a boxed decoder is itself a decoder, forwarding to the value it owns
impl<T: TextDecoder> TextDecoder for Box<T> {
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
        (**self).decode(text)
    }
}

/// The three-character tag identifying a field (e.g. `001`).
#[derive(Debug, PartialEq, Eq)]
pub struct FieldTag([char; 3]);

/// Renders the tag as its three characters, in order.
impl Display for FieldTag {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(chars) = self;
        chars.iter().try_for_each(|&ch| f.write_char(ch))
    }
}

/// A field of a record: either a control field or a data field.
#[derive(Debug, PartialEq, Eq)]
pub enum Field {
    Control(ControlField),
    Data(DataField),
}

impl Field {
    /// Returns the inner [`ControlField`] when this is a control field, `None` otherwise.
    pub fn control(&self) -> Option<&ControlField> {
        if let Self::Control(inner) = self {
            Some(inner)
        } else {
            None
        }
    }

    /// Returns the inner [`DataField`] when this is a data field, `None` otherwise.
    pub fn data(&self) -> Option<&DataField> {
        if let Self::Data(inner) = self {
            Some(inner)
        } else {
            None
        }
    }
}

/// The first fields of a MARC record represent control data. Unlike
/// the variable data fields, they are a simple blob of ASCII content, although
/// some of them are encoded with a specific scheme (for example, some of them are pipe-separated values).
#[derive(Debug, PartialEq, Eq)]
pub struct ControlField {
    /// The tag identifying the field
    pub tag: FieldTag,
    /// The content of the field, left uninterpreted
    pub data: String,
}

/// One of the variable data fields representing the bulk of the information
/// found in the MARC record. The tag along with the indicators (of which there are typically 2) help
/// figure out the specific meaning of the data found within the subfields or blocks of content.
#[derive(Debug, PartialEq, Eq)]
pub struct DataField {
    /// The tag identifying the field
    pub tag: FieldTag,
    /// The indicator characters of the field (typically 2 of them)
    pub indicator: Vec<char>,
    /// The subfields, i.e. the tagged blocks of content held by the field
    pub subfields: Vec<Subfield>,
}

/// Identifies the function of a block of content within a variable data
/// field. Typically a single character.
#[derive(Debug, PartialEq, Eq)]
pub struct SubfieldTag(char);

/// Renders the tag as its single character.
impl Display for SubfieldTag {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(marker) = *self;
        f.write_char(marker)
    }
}

/// A block of content within a variable data field, identified by its tag.
#[derive(Debug, PartialEq, Eq)]
pub struct Subfield {
    /// The tag telling what the content represents
    pub tag: SubfieldTag,
    /// The content of the subfield as text
    pub data: String,
}

/// Decoder for text that is already encoded as UTF-8.
struct Utf8Decoder {}

impl TextDecoder for Utf8Decoder {
    /// Validates `text` as UTF-8 and returns it as a borrow of the input, so
    /// no copy is made.
    ///
    /// Bytes that are not valid UTF-8 are reported as [`DecodeError::Utf`].
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
        // `text` is already a slice reference: no extra borrow is needed, and
        // the variant constructors can be passed directly as functions.
        str::from_utf8(text)
            .map(Cow::Borrowed)
            .map_err(DecodeError::Utf)
    }
}