// marc_record/lib.rs
//! This crate provides means to parse MARC21 records. It supports normal MARC21 records
//! using either MARC-8 (for Latin languages) or Unicode and tries to transform as much as
//! possible into strings. It doesn't interpret the field data much, so looking up the meaning
//! of tag numbers will be required.
//!
//! Info about the format can be found here: <https://www.loc.gov/marc/bibliographic/>
//!
//! The general structure of a MARC record is as follows:
//!
//! A file can contain many MARC records. Each record has the following parts:
//! - a leader: a header that contains info about the structure of the record;
//! - a directory: an index of the various fields;
//! - fields, which can either be control fields or data fields
//!
//! All the fields have an identifying tag.
//!
//! Control fields simply contain ASCII data.
//!
//! Each data field can have a 2-character set of indicators, for which some meaning can be derived.
//!
//! They also contain a list of subfields which are identified by a single ASCII character.
//!
//! The only entrypoint to the library is the parse_records function:
//! ```rust
//! use marc_record::parse_records;
//!
//! let binary_data = include_bytes!("../samples/marc8_multiple.mrc");
//! let records = parse_records(binary_data).unwrap();
//! assert_eq!(records.len(), 109);
//! ```
/// Support for the MARC-8 encoding format
pub mod marc8;
// Parsing rules (private; `parse_records` below drives `parser::parse_record`)
mod parser;
use core::str;
use std::{
borrow::Cow,
fmt::{Display, Write},
str::Utf8Error,
};
use marc8::Marc8Decoder;
use thiserror::Error;
use winnow::{
combinator::repeat,
error::{ContextError, ParseError, StrContext},
Parser,
};
/// Parses every MARC record found in `data`.
///
/// The input is taken as raw bytes rather than text because a MARC record can
/// use various encodings, UTF-8 and MARC-8 being the most common ones.
/// All the records are assumed to be valid and complete.
///
/// # Errors
///
/// Returns [`Error::ParseFailed`] when a record cannot be parsed, and also when
/// any extra content is found in the input.
///
/// # Limitations
///
/// Right now, the parser enforces a few constraints:
/// - it only accepts MARC records with a value of 2 for the indicator count;
/// - it only accepts single-character subfield tags (described as 2 for the marker);
/// - it only accepts directory entries using the standard 4-5-0-0 schema;
/// - MARC-8 encoding support is limited to Latin character sets.
pub fn parse_records(data: &[u8]) -> Result<Vec<Record>, Error> {
    // Label the per-record parser so failures mention which construct was expected.
    let record = parser::parse_record.context(StrContext::Label("record"));
    // Zero or more records, applied through `Parser::parse` to the whole input.
    let records: Vec<Record> = repeat(0.., record).parse(data)?;
    Ok(records)
}
/// Represents a parsing error
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// The input could not be parsed as a sequence of MARC records.
    ///
    /// `reason` is the textual form of the underlying parser error and `offset`
    /// is the byte offset that error reported.
    #[error("Failed to parse: {} at byte offset `{}`", reason, offset)]
    ParseFailed { reason: String, offset: usize },
}
impl From<ParseError<&[u8], ContextError>> for Error {
fn from(value: ParseError<&[u8], ContextError>) -> Self {
Self::ParseFailed {
reason: value.inner().to_string(),
offset: value.offset(),
}
}
}
/// A MARC record describes a content or piece of content using a series of fields.
///
/// Fields are identified by a three-digit ASCII code (e.g. `001`) and contain
/// either control or field data. Control fields are the ones whose tag starts
/// with `00` (i.e. the `001`-`009` range), while data fields are all the others.
/// Control fields contain a single piece of information.
#[derive(Debug)]
pub struct Record {
    /// The leader is MARC's way of naming the header info
    pub leader: Leader,
    /// The actual control and data fields
    pub fields: Vec<Field>,
}
/// The leader is MARC's equivalent of a header. It contains internal bookkeeping info about the record
/// as well as some information of interest to the applications reading it.
#[derive(Debug)]
pub struct Leader {
    /// Length in bytes of the record
    // NOTE(review): the MARC 21 leader stores this as five decimal digits (up to 99999),
    // which exceeds `u16::MAX` (65535) - confirm how the parser treats longer records.
    pub record_length: u16,
    /// The status of the information, e.g. whether the sender signals it is a new entry or a correction
    pub status: Status,
    /// The type of content described by the record, will alter which fields *should* be provided.
    pub record_type: RecordType,
    /// Mostly used to tell whether this is for a single item or a collection
    pub bibliographical_level: BibliographicalLevel,
    /// The type of control declared by the record; see [`ControlType`].
    pub control_type: ControlType,
    /// Tells which encoding was used for the text of the variable data fields.
    pub coding_scheme: CodingScheme,
    /// Byte index of the variable field data, after the leader and the directory
    // NOTE(review): also a five-digit value in the MARC 21 leader, so the same `u16`
    // range caveat as `record_length` applies - confirm.
    pub data_base_address: u16,
    /// Represents something like the quality of the data about the entry
    pub encoding_level: EncodingLevel,
    /// The descriptive cataloging form declared by the record; see [`CatalogingForm`].
    pub descriptive_cataloging_form: CatalogingForm,
    /// The multipart resource record level declared by the record; see
    /// [`MultipartResourceRecordLevel`].
    pub multipart_resource_record_level: MultipartResourceRecordLevel,
}
/// Status of a record, as carried by its leader.
///
/// Variant names mirror the record-status values of the Library of Congress
/// MARC 21 leader specification. The mapping from the raw leader byte to a
/// variant is done by the parser, outside this definition.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Status {
    /// The encoding level of the record has been increased.
    IncreaseInEncoding,
    /// The record has been corrected or revised.
    Corrected,
    /// The record has been deleted.
    Deleted,
    /// The record is new.
    New,
    /// The encoding level has been increased from a prepublication record.
    IncreaseFromPrepublication,
}
/// The kind of material a record describes, as carried by its leader.
///
/// Variant names mirror the type-of-record values of the Library of Congress
/// MARC 21 leader specification. The mapping from the raw leader byte to a
/// variant is done by the parser, outside this definition.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecordType {
    /// Language material.
    LanguageMaterial,
    /// Notated music.
    NotatedMusic,
    /// Manuscript notated music.
    ManuscriptNotatedMusic,
    /// Cartographic material.
    CartographicMaterial,
    /// Manuscript cartographic material.
    ManuscriptCartographicMaterial,
    /// Projected medium.
    ProjectedMedium,
    /// Nonmusical sound recording.
    NonmusicalSoundRecording,
    /// Musical sound recording.
    MusicalSoundRecording,
    /// Two-dimensional nonprojectable graphic.
    TwoDimensionalNonprojectableGraphic,
    /// Computer file.
    ComputerFile,
    /// Kit.
    Kit,
    /// Mixed materials.
    MixedMaterials,
    /// Three-dimensional artifact.
    ThreeDimensionalArtifact,
    /// Manuscript language material.
    ManuscriptLanguageMaterial,
}
/// Bibliographic level of a record, as carried by its leader.
///
/// Variant names mirror the bibliographic-level values of the Library of
/// Congress MARC 21 leader specification. The mapping from the raw leader byte
/// to a variant is done by the parser, outside this definition.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BibliographicalLevel {
    /// A monographic component part.
    MonographicComponentPart,
    /// A serial component part.
    SerialComponentPart,
    /// A collection.
    Collection,
    /// A subunit.
    Subunit,
    /// An integrating resource.
    IntegratingResource,
    /// A monograph (single item).
    Monograph,
    /// A serial.
    Serial,
    /// The level is not known.
    // NOTE(review): which leader values the parser maps to `Unknown` is not visible here.
    Unknown,
}
/// Type of control declared by a record's leader.
///
/// Variant names mirror the type-of-control values of the Library of Congress
/// MARC 21 leader specification. The mapping from the raw leader byte to a
/// variant is done by the parser, outside this definition.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ControlType {
    /// No type of control is specified.
    Unspecified,
    /// Archival control.
    Archival,
}
/// Character coding scheme used for the text of a record, as declared by its leader.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CodingScheme {
    /// The MARC-8 Encoding scheme
    Marc8,
    /// Unicode code points encoded as UTF-8
    Ucs,
}
impl CodingScheme {
    /// Picks the text decoder matching this coding scheme: the MARC-8 decoder
    /// for [`CodingScheme::Marc8`] and the UTF-8 decoder for [`CodingScheme::Ucs`].
    fn decoder(&self) -> Decoder {
        match *self {
            Self::Marc8 => Decoder::Marc8(marc8::Marc8Decoder {}),
            Self::Ucs => Decoder::Utf8(Utf8Decoder {}),
        }
    }
}
/// Encoding level of a record, as carried by its leader.
///
/// Most variant names mirror the encoding-level values of the Library of
/// Congress MARC 21 leader specification. The mapping from the raw leader byte
/// to a variant is done by the parser, outside this definition.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EncodingLevel {
    /// Full level.
    Full,
    /// Full level, material not examined.
    FullMaterialNotExamined,
    /// Less-than-full level, material not examined.
    LessThanFullMaterialNotExamined,
    /// Abbreviated level.
    Abbreviated,
    /// Core level.
    Core,
    /// Partial (preliminary) level.
    Partial,
    /// Minimal level.
    Minimal,
    /// Prepublication level.
    Prepublication,
    /// The encoding level is unknown.
    Unknown,
    /// The concept of encoding level does not apply.
    NotApplicable,
    /// An obsolete full-level value.
    // NOTE(review): presumably one of the OCLC-defined levels; confirm against the parser.
    ObsoleteFull,
    /// An obsolete minimal-level value.
    // NOTE(review): presumably one of the OCLC-defined levels; confirm against the parser.
    ObsoleteMinimal,
    /// The record was added from a batch process.
    // NOTE(review): presumably one of the OCLC-defined levels; confirm against the parser.
    AddedFromBatch,
}
/// Descriptive cataloging form of a record, as carried by its leader.
///
/// Variant names mirror the descriptive-cataloging-form values of the Library
/// of Congress MARC 21 leader specification. The mapping from the raw leader
/// byte to a variant is done by the parser, outside this definition.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CatalogingForm {
    /// Non-ISBD.
    NonIsbd,
    /// AACR 2.
    Aacr2,
    /// ISBD, punctuation omitted.
    IsbdPunctuationOmitted,
    /// ISBD, punctuation included.
    IsbdPunctuationIncluded,
    /// Non-ISBD, punctuation omitted.
    NonIsbdPunctuationOmitted,
    /// The cataloging form is unknown.
    Unknown,
}
/// Multipart resource record level of a record, as carried by its leader.
///
/// Variant names mirror the multipart-resource-record-level values of the
/// Library of Congress MARC 21 leader specification. The mapping from the raw
/// leader byte to a variant is done by the parser, outside this definition.
///
/// The type is `Copy` so that it can be read out of a borrowed [`Leader`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MultipartResourceRecordLevel {
    /// Not specified or not applicable.
    NotApplicable,
    /// The record describes a set.
    Set,
    /// The record describes a part with an independent title.
    PartWithIndependentTitle,
    /// The record describes a part with a dependent title.
    // The lowercase `w` is part of the public name and is kept for compatibility.
    PartwithDependentTitle,
}
/// The text decoders this file can select, one per [`CodingScheme`] variant
/// (see `CodingScheme::decoder`).
enum Decoder {
    /// Wraps the MARC-8 decoder from the `marc8` module.
    Marc8(Marc8Decoder),
    /// Wraps the UTF-8 decoder defined in this file.
    Utf8(Utf8Decoder),
}
impl TextDecoder for Decoder {
    /// Hands `text` over to whichever concrete decoder this value wraps and
    /// returns its result unchanged.
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
        match *self {
            Self::Marc8(ref codec) => codec.decode(text),
            Self::Utf8(ref codec) => codec.decode(text),
        }
    }
}
// Separator bytes of the MARC binary layout. None of them is referenced in this
// file; presumably they are consumed by the parser module - confirm.
/// Byte `0x1D`, the separator that closes a record.
const RECORD_SEPARATOR: u8 = 0x1D;
/// Byte `0x1E`, the separator that closes a field.
const FIELD_SEPARATOR: u8 = 0x1E;
/// Byte `0x1F`, the separator that introduces a subfield.
const SUBFIELD_SEPARATOR: u8 = 0x1F;
/// One entry of a record's directory, i.e. the index of its fields.
#[derive(Debug)]
pub struct DirectoryEntry {
    /// The three characters of the tag identifying the field.
    pub tag: [char; 3],
    /// Length of the field.
    // NOTE(review): presumably counted in bytes, as declared by the directory - confirm.
    pub field_length: usize,
    /// Starting position of the field.
    // NOTE(review): presumably relative to the base address of the data - confirm.
    pub starting_pos: usize,
}

impl DirectoryEntry {
    /// Tells whether the entry points at a control field, which is the case
    /// when the first two characters of the tag are both `'0'`.
    fn is_control(&self) -> bool {
        matches!(self.tag, ['0', '0', _])
    }
}
/// The ways in which a [`TextDecoder`] can fail.
#[derive(Debug, Error)]
enum DecodeError {
    /// The bytes were not valid UTF-8; wraps the underlying [`Utf8Error`].
    #[error("UTF-8 error: {0}")]
    Utf(Utf8Error),
    /// A byte that the decoder does not know how to map to a character.
    // NOTE(review): not produced in this file; presumably raised by the MARC-8 decoder - confirm.
    #[error("Unknown char: {0}")]
    Unknown(u8),
    /// A base character and a combining character that cannot be put together.
    // NOTE(review): not produced in this file; presumably raised by the MARC-8 decoder - confirm.
    #[error("Invalid pair: base `{0}` with combining `{1}`")]
    InvalidPair(char, char),
    /// The input contained a sequence of characters that is not valid.
    // NOTE(review): not produced in this file; presumably raised by the MARC-8 decoder - confirm.
    #[error("Invalid character sequence")]
    InvalidSequence,
}
/// Turns raw bytes into text according to some character encoding.
trait TextDecoder {
    /// Decodes `text` into a string.
    ///
    /// The result is a [`Cow`], so an implementation may either borrow from the
    /// input or return a newly allocated string. A [`DecodeError`] is returned
    /// when the bytes cannot be decoded.
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError>;
}
// Blanket impl: a boxed decoder decodes exactly like the decoder it owns.
impl<T: TextDecoder> TextDecoder for Box<T> {
    /// Delegates to the boxed decoder.
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
        // `self` is `&Box<T>`: peel off the reference and the box to reach `T`.
        (**self).decode(text)
    }
}
/// The three-character tag identifying a field; displays as those three
/// characters in order (e.g. `245`).
#[derive(Debug, PartialEq, Eq)]
pub struct FieldTag([char; 3]);

impl Display for FieldTag {
    /// Writes the three characters of the tag one after the other, stopping at
    /// the first write error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(chars) = self;
        chars.iter().try_for_each(|&c| f.write_char(c))
    }
}
/// A field of a record: either a control field or a data field.
#[derive(Debug, PartialEq, Eq)]
pub enum Field {
    /// A control field.
    Control(ControlField),
    /// A variable data field.
    Data(DataField),
}

impl Field {
    /// Borrows the inner [`ControlField`], or yields `None` when this is a data field.
    pub fn control(&self) -> Option<&ControlField> {
        if let Field::Control(inner) = self {
            Some(inner)
        } else {
            None
        }
    }

    /// Borrows the inner [`DataField`], or yields `None` when this is a control field.
    pub fn data(&self) -> Option<&DataField> {
        if let Field::Data(inner) = self {
            Some(inner)
        } else {
            None
        }
    }
}
/// The first fields of a MARC record represent control data. Unlike
/// the variable data fields, they are a simple blob of ASCII content, although
/// some of them are encoded with a specific scheme (for example, some of them are pipe-separated values).
#[derive(Debug, PartialEq, Eq)]
pub struct ControlField {
    /// The tag identifying the field.
    pub tag: FieldTag,
    /// The content of the field, left uninterpreted.
    pub data: String,
}
/// One of the variable data fields representing the bulk of the information
/// found in the MARC record. The tag along with the indicators (of which there are typically 2) help
/// figure out the specific meaning of the data found within the subfields or blocks of content.
#[derive(Debug, PartialEq, Eq)]
pub struct DataField {
    /// The tag identifying the field.
    pub tag: FieldTag,
    /// The indicator characters of the field.
    pub indicator: Vec<char>,
    /// The subfields, i.e. the tagged blocks of content of the field.
    pub subfields: Vec<Subfield>,
}
/// Identifies the role of a block of content inside a variable data field.
/// It is a single character and displays as that character.
#[derive(Debug, PartialEq, Eq)]
pub struct SubfieldTag(char);

impl Display for SubfieldTag {
    /// Writes the tag's single character.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(code) = self;
        f.write_char(*code)
    }
}
/// A block of content within a variable data field, together with the tag
/// that identifies it.
#[derive(Debug, PartialEq, Eq)]
pub struct Subfield {
    /// The tag identifying the subfield.
    pub tag: SubfieldTag,
    /// The content of the subfield.
    pub data: String,
}
/// Decoder for text that is already UTF-8 encoded.
struct Utf8Decoder {}

impl TextDecoder for Utf8Decoder {
    /// Validates `text` as UTF-8 and returns it as a borrowed string, without copying.
    ///
    /// Fails with [`DecodeError::Utf`] when the bytes are not valid UTF-8.
    fn decode<'a>(&self, text: &'a [u8]) -> Result<Cow<'a, str>, DecodeError> {
        // `text` is already a slice reference and both wrappers are plain
        // constructors, so no extra borrow or closure is needed.
        str::from_utf8(text)
            .map(Cow::Borrowed)
            .map_err(DecodeError::Utf)
    }
}