Skip to main content

marc_rs/xml/
reader.rs

1use std::io::BufRead;
2
3use quick_xml::events::{BytesStart, Event};
4use quick_xml::Reader;
5
6use crate::error::MarcError;
7
/// Converts MARC-XML data into concatenated ISO2709 bytes, allowing reuse
/// of the entire existing binary pipeline (format detection, encoding, mapping).
///
/// This is a stateless unit type: it carries no data and only serves as a
/// namespace for the associated parsing functions.
#[derive(Debug, Clone, Copy, Default)]
pub struct XmlReader;
11
/// One field of a MARC-XML record, as collected while walking the XML events.
#[derive(Debug, Clone, PartialEq, Eq)]
struct XmlField {
    /// The three raw bytes of the field's `tag` attribute.
    tag: [u8; 3],
    /// The field payload: plain text for control fields, indicators plus
    /// subfields for data fields.
    data: XmlFieldData,
}

/// Payload of an [`XmlField`].
#[derive(Debug, Clone, PartialEq, Eq)]
enum XmlFieldData {
    /// Text content of a `<controlfield>` element.
    Control(String),
    /// Content of a `<datafield>` element: the two indicator bytes and the
    /// `(code, value)` pairs of its subfields, in document order.
    Data { ind1: u8, ind2: u8, subfields: Vec<(u8, String)> },
}
21
22impl XmlReader {
23    /// Parse MARC-XML bytes into a single buffer of concatenated ISO2709 records.
24    pub fn parse(data: &[u8]) -> Result<Vec<u8>, MarcError> {
25        let mut reader = Reader::from_reader(data);
26        reader.config_mut().trim_text(true);
27        let mut buf = Vec::new();
28        let mut output = Vec::new();
29
30        loop {
31            match reader.read_event_into(&mut buf) {
32                Ok(Event::Start(ref e)) if local_name_eq(e, b"record") => {
33                    let iso_bytes = Self::parse_record(&mut reader)?;
34                    output.extend_from_slice(&iso_bytes);
35                }
36                Ok(Event::Eof) => break,
37                Err(e) => return Err(MarcError::Xml(e.to_string())),
38                _ => {}
39            }
40            buf.clear();
41        }
42
43        Ok(output)
44    }
45
46    fn parse_record<R: BufRead>(reader: &mut Reader<R>) -> Result<Vec<u8>, MarcError> {
47        let mut buf = Vec::new();
48        let mut leader = String::new();
49        let mut fields: Vec<XmlField> = Vec::new();
50
51        loop {
52            match reader.read_event_into(&mut buf) {
53                Ok(Event::Start(ref e)) => {
54                    let local = e.local_name();
55                    match local.as_ref() {
56                        b"leader" => {
57                            leader = read_element_text(reader)?;
58                        }
59                        b"controlfield" => {
60                            let tag = parse_tag(e)?;
61                            let text = read_element_text(reader)?;
62                            fields.push(XmlField {
63                                tag,
64                                data: XmlFieldData::Control(text),
65                            });
66                        }
67                        b"datafield" => {
68                            let tag = parse_tag(e)?;
69                            let ind1 = parse_indicator(e, b"ind1");
70                            let ind2 = parse_indicator(e, b"ind2");
71                            let subfields = Self::parse_subfields(reader)?;
72                            fields.push(XmlField {
73                                tag,
74                                data: XmlFieldData::Data { ind1, ind2, subfields },
75                            });
76                        }
77                        _ => {}
78                    }
79                }
80                Ok(Event::End(ref e)) if e.local_name().as_ref() == b"record" => break,
81                Ok(Event::Eof) => return Err(MarcError::Xml("unexpected EOF inside <record>".into())),
82                Err(e) => return Err(MarcError::Xml(e.to_string())),
83                _ => {}
84            }
85            buf.clear();
86        }
87
88        build_iso2709_from_xml(&leader, &fields)
89    }
90
91    fn parse_subfields<R: BufRead>(reader: &mut Reader<R>) -> Result<Vec<(u8, String)>, MarcError> {
92        let mut buf = Vec::new();
93        let mut subfields = Vec::new();
94
95        loop {
96            match reader.read_event_into(&mut buf) {
97                Ok(Event::Start(ref e)) if e.local_name().as_ref() == b"subfield" => {
98                    let code = parse_subfield_code(e);
99                    let text = read_element_text(reader)?;
100                    subfields.push((code, text));
101                }
102                Ok(Event::Empty(ref e)) if e.local_name().as_ref() == b"subfield" => {
103                    let code = parse_subfield_code(e);
104                    subfields.push((code, String::new()));
105                }
106                Ok(Event::End(ref e)) if e.local_name().as_ref() == b"datafield" => break,
107                Ok(Event::Eof) => return Err(MarcError::Xml("unexpected EOF inside <datafield>".into())),
108                Err(e) => return Err(MarcError::Xml(e.to_string())),
109                _ => {}
110            }
111            buf.clear();
112        }
113
114        Ok(subfields)
115    }
116}
117
118// ---------------------------------------------------------------------------
119// Helpers
120// ---------------------------------------------------------------------------
121
122fn local_name_eq(e: &BytesStart, name: &[u8]) -> bool {
123    e.local_name().as_ref() == name
124}
125
126fn read_element_text<R: BufRead>(reader: &mut Reader<R>) -> Result<String, MarcError> {
127    let mut buf = Vec::new();
128    let mut text = String::new();
129    loop {
130        match reader.read_event_into(&mut buf) {
131            Ok(Event::Text(e)) => {
132                text.push_str(&e.unescape().map_err(|e| MarcError::Xml(e.to_string()))?);
133            }
134            Ok(Event::CData(e)) => {
135                text.push_str(&String::from_utf8_lossy(&e));
136            }
137            Ok(Event::End(_)) => break,
138            Ok(Event::Eof) => return Err(MarcError::Xml("unexpected EOF in element text".into())),
139            Err(e) => return Err(MarcError::Xml(e.to_string())),
140            _ => {}
141        }
142        buf.clear();
143    }
144    Ok(text)
145}
146
147fn attr_value(e: &BytesStart, name: &[u8]) -> Option<String> {
148    e.attributes().flatten().find(|a| a.key.as_ref() == name).map(|a| String::from_utf8_lossy(&a.value).into_owned())
149}
150
151fn parse_tag(e: &BytesStart) -> Result<[u8; 3], MarcError> {
152    let val = attr_value(e, b"tag").ok_or_else(|| MarcError::Xml("missing tag attribute".into()))?;
153    let bytes = val.as_bytes();
154    if bytes.len() != 3 {
155        return Err(MarcError::Xml(format!("tag must be exactly 3 characters, got '{val}'")));
156    }
157    let mut tag = [0u8; 3];
158    tag.copy_from_slice(bytes);
159    Ok(tag)
160}
161
162fn parse_indicator(e: &BytesStart, name: &[u8]) -> u8 {
163    attr_value(e, name).and_then(|v| v.bytes().next()).unwrap_or(b' ')
164}
165
166fn parse_subfield_code(e: &BytesStart) -> u8 {
167    attr_value(e, b"code").and_then(|v| v.bytes().next()).unwrap_or(b'a')
168}
169
170/// Builds a valid ISO2709 byte sequence from XML-parsed fields.
171/// The resulting bytes are always UTF-8 encoded (leader byte 9 = 'a').
172fn build_iso2709_from_xml(leader_str: &str, fields: &[XmlField]) -> Result<Vec<u8>, MarcError> {
173    let mut directory: Vec<u8> = Vec::new();
174    let mut field_data: Vec<u8> = Vec::new();
175    let mut offset: usize = 0;
176
177    for field in fields {
178        let mut fb: Vec<u8> = Vec::new();
179        match &field.data {
180            XmlFieldData::Control(text) => {
181                fb.extend_from_slice(text.as_bytes());
182                fb.push(0x1E);
183            }
184            XmlFieldData::Data { ind1, ind2, subfields } => {
185                fb.push(*ind1);
186                fb.push(*ind2);
187                for (code, value) in subfields {
188                    fb.push(0x1F);
189                    fb.push(*code);
190                    fb.extend_from_slice(value.as_bytes());
191                }
192                fb.push(0x1E);
193            }
194        }
195
196        let length = fb.len();
197        directory.extend_from_slice(&field.tag);
198        directory.extend_from_slice(format!("{length:0>4}").as_bytes());
199        directory.extend_from_slice(format!("{offset:0>5}").as_bytes());
200        field_data.extend_from_slice(&fb);
201        offset += length;
202    }
203
204    directory.push(0x1E);
205    let base_address = 24 + directory.len();
206
207    let mut leader_bytes = [b' '; 24];
208    let src = leader_str.as_bytes();
209    let n = src.len().min(24);
210    leader_bytes[..n].copy_from_slice(&src[..n]);
211
212    leader_bytes[9] = b'a';
213    leader_bytes[10] = b'2';
214    leader_bytes[11] = b'2';
215    leader_bytes[20..24].copy_from_slice(b"4500");
216
217    let record_length = base_address + field_data.len() + 1;
218    if record_length > 99999 {
219        return Err(MarcError::InvalidRecord("record too long for ISO2709 leader"));
220    }
221    leader_bytes[0..5].copy_from_slice(format!("{record_length:0>5}").as_bytes());
222    leader_bytes[12..17].copy_from_slice(format!("{base_address:0>5}").as_bytes());
223
224    let mut out = Vec::with_capacity(record_length);
225    out.extend_from_slice(&leader_bytes);
226    out.extend_from_slice(&directory);
227    out.extend_from_slice(&field_data);
228    out.push(0x1D);
229    Ok(out)
230}