marc_rs/
parser.rs

1use crate::encoding::convert_to_utf8;
2use crate::format::{FormatEncoding, MarcFormat};
3use crate::record::{ControlField, DataField, Leader, Record, Subfield};
4
/// Errors reported while decoding MARC data.
///
/// Every variant except [`ParseError::UnexpectedEof`] carries a free-form
/// message that is appended to a fixed, variant-specific prefix when the
/// error is formatted with `Display`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// The record leader could not be decoded.
    InvalidLeader(String),
    /// The record length is not usable.
    InvalidRecordLength(String),
    /// A directory entry or field is malformed.
    InvalidField(String),
    /// Field content could not be converted to UTF-8.
    InvalidEncoding(String),
    /// The input ended before the record was complete.
    UnexpectedEof,
    /// The XML input is malformed.
    InvalidXml(String),
    /// Any failure not covered by the other variants.
    Other(String),
}

impl std::fmt::Display for ParseError {
    /// Writes `"<prefix>: <message>"`, or a fixed sentence for `UnexpectedEof`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pair each message-carrying variant with its prefix; the only
        // variant without a message is written out directly.
        let (label, detail) = match self {
            Self::UnexpectedEof => return f.write_str("Unexpected end of file"),
            Self::InvalidLeader(detail) => ("Invalid leader", detail),
            Self::InvalidRecordLength(detail) => ("Invalid record length", detail),
            Self::InvalidField(detail) => ("Invalid field", detail),
            Self::InvalidEncoding(detail) => ("Invalid encoding", detail),
            Self::InvalidXml(detail) => ("Invalid XML", detail),
            Self::Other(detail) => ("Parse error", detail),
        };
        write!(f, "{}: {}", label, detail)
    }
}

impl std::error::Error for ParseError {}
32
33/// Parse MARC records from bytes
34pub fn parse(data: &[u8], format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
35    match format_encoding.format {
36        MarcFormat::Marc21 => parse_marc21_binary(data, format_encoding),
37        MarcFormat::Unimarc => parse_unimarc_binary(data, format_encoding),
38        MarcFormat::MarcXml => parse_marc_xml(data, format_encoding),
39    }
40}
41
42/// Parse MARC21 binary format
43pub fn parse_marc21_binary(data: &[u8], format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
44    let mut records = Vec::new();
45    let mut offset = 0;
46
47    while offset < data.len() {
48        if data.len() - offset < 24 {
49            break; // Not enough data for a leader
50        }
51
52        let leader = Leader::from_bytes(&data[offset..offset + 24]).map_err(|e| ParseError::InvalidLeader(e))?;
53
54        let record_length = leader.record_length as usize;
55        if record_length == 0 || record_length > data.len() - offset {
56            return Err(ParseError::InvalidRecordLength(format!(
57                "Record length {} exceeds available data {}",
58                record_length,
59                data.len() - offset
60            )));
61        }
62
63        let record_data = &data[offset..offset + record_length];
64        let record = parse_single_marc21_record(record_data, &leader, format_encoding)?;
65        records.push(record);
66
67        offset += record_length;
68    }
69
70    Ok(records)
71}
72
73/// Parse a single MARC21 record
74fn parse_single_marc21_record(data: &[u8], leader: &Leader, format_encoding: FormatEncoding) -> Result<Record, ParseError> {
75    if data.len() < leader.base_address_of_data as usize {
76        return Err(ParseError::UnexpectedEof);
77    }
78
79    let base_address = leader.base_address_of_data as usize;
80    let directory = &data[24..base_address];
81    let data_area = &data[base_address..];
82
83    let mut control_fields = Vec::new();
84    let mut data_fields = Vec::new();
85
86    let mut dir_offset = 0;
87    while dir_offset + 12 <= directory.len() {
88        let tag_bytes = &directory[dir_offset..dir_offset + 3];
89        let tag = std::str::from_utf8(tag_bytes).map_err(|e| ParseError::InvalidField(format!("Invalid tag: {}", e)))?;
90
91        let length_bytes = &directory[dir_offset + 3..dir_offset + 7];
92        let length = std::str::from_utf8(length_bytes)
93            .map_err(|e| ParseError::InvalidField(format!("Invalid length: {}", e)))?
94            .parse::<usize>()
95            .map_err(|e| ParseError::InvalidField(format!("Invalid length number: {}", e)))?;
96
97        let start_bytes = &directory[dir_offset + 7..dir_offset + 12];
98        let start = std::str::from_utf8(start_bytes)
99            .map_err(|e| ParseError::InvalidField(format!("Invalid start: {}", e)))?
100            .parse::<usize>()
101            .map_err(|e| ParseError::InvalidField(format!("Invalid start number: {}", e)))?;
102
103        if start + length > data_area.len() {
104            return Err(ParseError::InvalidField(format!(
105                "Field extends beyond data area: start={}, length={}, data_len={}",
106                start,
107                length,
108                data_area.len()
109            )));
110        }
111
112        let field_data = &data_area[start..start + length];
113
114        if tag < "010" {
115            // Control field
116            let value = convert_to_utf8(field_data, format_encoding.encoding).map_err(|e| ParseError::InvalidEncoding(e))?;
117            control_fields.push(ControlField { tag: tag.to_string(), value });
118        } else {
119            // Data field
120            if field_data.is_empty() {
121                dir_offset += 12;
122                continue;
123            }
124
125            let ind1 = field_data[0] as char;
126            let ind2 = field_data[1] as char;
127            let subfield_data = &field_data[2..];
128
129            let mut subfields = Vec::new();
130            let mut i = 0;
131            while i < subfield_data.len() {
132                if subfield_data[i] == 0x1F {
133                    // Subfield delimiter
134                    i += 1;
135                    if i >= subfield_data.len() {
136                        break;
137                    }
138                    let code = subfield_data[i] as char;
139                    i += 1;
140
141                    let value_start = i;
142                    while i < subfield_data.len() && subfield_data[i] != 0x1F && subfield_data[i] != 0x1E {
143                        i += 1;
144                    }
145
146                    let value_bytes = &subfield_data[value_start..i];
147                    let value = convert_to_utf8(value_bytes, format_encoding.encoding).map_err(|e| ParseError::InvalidEncoding(e))?;
148
149                    subfields.push(Subfield { code, value });
150                } else {
151                    i += 1;
152                }
153            }
154
155            data_fields.push(DataField {
156                tag: tag.to_string(),
157                ind1,
158                ind2,
159                subfields,
160            });
161        }
162
163        dir_offset += 12;
164    }
165
166    Ok(Record {
167        leader: leader.clone(),
168        control_fields,
169        data_fields,
170    })
171}
172
173/// Parse UNIMARC binary format
174pub fn parse_unimarc_binary(data: &[u8], format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
175    // UNIMARC uses the same binary structure as MARC21
176    // The main differences are in field definitions and content
177    parse_marc21_binary(data, format_encoding)
178}
179
180/// Parse MARC XML format
181pub fn parse_marc_xml(data: &[u8], _format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
182    use quick_xml::events::Event;
183    use quick_xml::Reader;
184
185    let mut reader = Reader::from_str(std::str::from_utf8(data).map_err(|e| ParseError::InvalidXml(format!("Invalid UTF-8: {}", e)))?);
186    reader.trim_text(true);
187
188    let mut records = Vec::new();
189    let mut buf = Vec::new();
190    let mut current_record: Option<Record> = None;
191    let mut current_field: Option<DataField> = None;
192    let mut current_subfield: Option<Subfield> = None;
193    let mut current_tag = String::new();
194    let mut current_value = String::new();
195    let mut in_collection = false;
196
197    loop {
198        match reader.read_event_into(&mut buf) {
199            Ok(Event::Start(e)) => match e.name().as_ref() {
200                b"collection" => {
201                    in_collection = true;
202                }
203                b"record" => {
204                    current_record = Some(Record {
205                        leader: Leader {
206                            record_length: 0,
207                            record_status: ' ',
208                            record_type: ' ',
209                            bibliographic_level: ' ',
210                            type_of_control: ' ',
211                            character_coding_scheme: ' ',
212                            indicator_count: 2,
213                            subfield_code_count: 2,
214                            base_address_of_data: 0,
215                            encoding_level: ' ',
216                            descriptive_cataloging_form: ' ',
217                            multipart_resource_record_level: ' ',
218                            length_of_length_of_field_portion: 4,
219                            length_of_starting_character_position_portion: 5,
220                            length_of_implementation_defined_portion: 0,
221                            undefined: ' ',
222                        },
223                        control_fields: Vec::new(),
224                        data_fields: Vec::new(),
225                    });
226                }
227                b"leader" => {
228                    current_value.clear();
229                }
230                b"controlfield" => {
231                    current_tag = String::from_utf8_lossy(
232                        e.attributes()
233                            .find(|a| a.as_ref().unwrap().key.as_ref() == b"tag")
234                            .ok_or_else(|| ParseError::InvalidXml("Missing tag attribute".to_string()))?
235                            .as_ref()
236                            .unwrap()
237                            .value
238                            .as_ref(),
239                    )
240                    .to_string();
241                    current_value.clear();
242                }
243                b"datafield" => {
244                    let tag = String::from_utf8_lossy(
245                        e.attributes()
246                            .find(|a| a.as_ref().unwrap().key.as_ref() == b"tag")
247                            .ok_or_else(|| ParseError::InvalidXml("Missing tag attribute".to_string()))?
248                            .as_ref()
249                            .unwrap()
250                            .value
251                            .as_ref(),
252                    )
253                    .to_string();
254
255                    let ind1 = e
256                        .attributes()
257                        .find(|a| a.as_ref().unwrap().key.as_ref() == b"ind1")
258                        .map(|a| String::from_utf8_lossy(a.as_ref().unwrap().value.as_ref()).chars().next().unwrap_or(' '))
259                        .unwrap_or(' ');
260
261                    let ind2 = e
262                        .attributes()
263                        .find(|a| a.as_ref().unwrap().key.as_ref() == b"ind2")
264                        .map(|a| String::from_utf8_lossy(a.as_ref().unwrap().value.as_ref()).chars().next().unwrap_or(' '))
265                        .unwrap_or(' ');
266
267                    current_field = Some(DataField {
268                        tag,
269                        ind1,
270                        ind2,
271                        subfields: Vec::new(),
272                    });
273                }
274                b"subfield" => {
275                    let code = String::from_utf8_lossy(
276                        e.attributes()
277                            .find(|a| a.as_ref().unwrap().key.as_ref() == b"code")
278                            .ok_or_else(|| ParseError::InvalidXml("Missing code attribute".to_string()))?
279                            .as_ref()
280                            .unwrap()
281                            .value
282                            .as_ref(),
283                    )
284                    .chars()
285                    .next()
286                    .ok_or_else(|| ParseError::InvalidXml("Empty code attribute".to_string()))?;
287                    current_subfield = Some(Subfield { code, value: String::new() });
288                    current_value.clear();
289                }
290                _ => {}
291            },
292            Ok(Event::Text(e)) => {
293                current_value = e.unescape().unwrap_or_default().to_string();
294            }
295            Ok(Event::End(e)) => {
296                match e.name().as_ref() {
297                    b"record" => {
298                        if let Some(record) = current_record.take() {
299                            records.push(record);
300                        }
301                    }
302                    b"leader" => {
303                        if let Some(ref mut record) = current_record {
304                            // Parse leader from string (24 bytes)
305                            if current_value.len() >= 24 {
306                                let leader_bytes = current_value.as_bytes()[..24].to_vec();
307                                record.leader = Leader::from_bytes(&leader_bytes).map_err(|e| ParseError::InvalidLeader(e))?;
308                            }
309                        }
310                    }
311                    b"controlfield" => {
312                        if let Some(ref mut record) = current_record {
313                            record.control_fields.push(ControlField {
314                                tag: current_tag.clone(),
315                                value: current_value.clone(),
316                            });
317                        }
318                        current_tag.clear();
319                        current_value.clear();
320                    }
321                    b"datafield" => {
322                        if let Some(field) = current_field.take() {
323                            if let Some(ref mut record) = current_record {
324                                record.data_fields.push(field);
325                            }
326                        }
327                    }
328                    b"subfield" => {
329                        if let Some(subfield) = current_subfield.take() {
330                            if let Some(ref mut field) = current_field {
331                                field.subfields.push(Subfield {
332                                    code: subfield.code,
333                                    value: current_value.clone(),
334                                });
335                            }
336                        }
337                        current_value.clear();
338                    }
339                    _ => {}
340                }
341            }
342            Ok(Event::Eof) => break,
343            Err(e) => {
344                return Err(ParseError::InvalidXml(format!("XML parsing error: {}", e)));
345            }
346            _ => {}
347        }
348        buf.clear();
349    }
350
351    // If we have a single record outside a collection
352    if !in_collection && records.is_empty() {
353        if let Some(record) = current_record {
354            records.push(record);
355        }
356    }
357
358    Ok(records)
359}