1use crate::encoding::convert_to_utf8;
2use crate::format::{FormatEncoding, MarcFormat};
3use crate::record::{ControlField, DataField, Leader, Record, Subfield};
4
/// Errors reported by the MARC parsers in this module.
///
/// Every variant except [`ParseError::UnexpectedEof`] carries a free-form
/// message that is appended to a fixed prefix when the error is displayed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// The record leader could not be decoded.
    InvalidLeader(String),
    /// The record length announced by the leader is unusable.
    InvalidRecordLength(String),
    /// A directory entry or field is malformed.
    InvalidField(String),
    /// Field content could not be converted to UTF-8.
    InvalidEncoding(String),
    /// The input ended before the structure being read was complete.
    UnexpectedEof,
    /// The XML input is malformed or lacks a required attribute.
    InvalidXml(String),
    /// Any failure that does not fit the other variants.
    Other(String),
}

impl std::fmt::Display for ParseError {
    /// Writes `"<prefix>: <message>"`, or a fixed sentence for `UnexpectedEof`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (label, detail) = match self {
            // The only variant without a message; emit it and stop here.
            Self::UnexpectedEof => return f.write_str("Unexpected end of file"),
            Self::InvalidLeader(detail) => ("Invalid leader", detail),
            Self::InvalidRecordLength(detail) => ("Invalid record length", detail),
            Self::InvalidField(detail) => ("Invalid field", detail),
            Self::InvalidEncoding(detail) => ("Invalid encoding", detail),
            Self::InvalidXml(detail) => ("Invalid XML", detail),
            Self::Other(detail) => ("Parse error", detail),
        };
        write!(f, "{}: {}", label, detail)
    }
}

// No source or backtrace is attached; the default trait methods are enough.
impl std::error::Error for ParseError {}
32
33pub fn parse(data: &[u8], format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
35 match format_encoding.format {
36 MarcFormat::Marc21 => parse_marc21_binary(data, format_encoding),
37 MarcFormat::Unimarc => parse_unimarc_binary(data, format_encoding),
38 MarcFormat::MarcXml => parse_marc_xml(data, format_encoding),
39 }
40}
41
42pub fn parse_marc21_binary(data: &[u8], format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
44 let mut records = Vec::new();
45 let mut offset = 0;
46
47 while offset < data.len() {
48 if data.len() - offset < 24 {
49 break; }
51
52 let leader = Leader::from_bytes(&data[offset..offset + 24]).map_err(|e| ParseError::InvalidLeader(e))?;
53
54 let record_length = leader.record_length as usize;
55 if record_length == 0 || record_length > data.len() - offset {
56 return Err(ParseError::InvalidRecordLength(format!(
57 "Record length {} exceeds available data {}",
58 record_length,
59 data.len() - offset
60 )));
61 }
62
63 let record_data = &data[offset..offset + record_length];
64 let record = parse_single_marc21_record(record_data, &leader, format_encoding)?;
65 records.push(record);
66
67 offset += record_length;
68 }
69
70 Ok(records)
71}
72
73fn parse_single_marc21_record(data: &[u8], leader: &Leader, format_encoding: FormatEncoding) -> Result<Record, ParseError> {
75 if data.len() < leader.base_address_of_data as usize {
76 return Err(ParseError::UnexpectedEof);
77 }
78
79 let base_address = leader.base_address_of_data as usize;
80 let directory = &data[24..base_address];
81 let data_area = &data[base_address..];
82
83 let mut control_fields = Vec::new();
84 let mut data_fields = Vec::new();
85
86 let mut dir_offset = 0;
87 while dir_offset + 12 <= directory.len() {
88 let tag_bytes = &directory[dir_offset..dir_offset + 3];
89 let tag = std::str::from_utf8(tag_bytes).map_err(|e| ParseError::InvalidField(format!("Invalid tag: {}", e)))?;
90
91 let length_bytes = &directory[dir_offset + 3..dir_offset + 7];
92 let length = std::str::from_utf8(length_bytes)
93 .map_err(|e| ParseError::InvalidField(format!("Invalid length: {}", e)))?
94 .parse::<usize>()
95 .map_err(|e| ParseError::InvalidField(format!("Invalid length number: {}", e)))?;
96
97 let start_bytes = &directory[dir_offset + 7..dir_offset + 12];
98 let start = std::str::from_utf8(start_bytes)
99 .map_err(|e| ParseError::InvalidField(format!("Invalid start: {}", e)))?
100 .parse::<usize>()
101 .map_err(|e| ParseError::InvalidField(format!("Invalid start number: {}", e)))?;
102
103 if start + length > data_area.len() {
104 return Err(ParseError::InvalidField(format!(
105 "Field extends beyond data area: start={}, length={}, data_len={}",
106 start,
107 length,
108 data_area.len()
109 )));
110 }
111
112 let field_data = &data_area[start..start + length];
113
114 if tag < "010" {
115 let value = convert_to_utf8(field_data, format_encoding.encoding).map_err(|e| ParseError::InvalidEncoding(e))?;
117 control_fields.push(ControlField { tag: tag.to_string(), value });
118 } else {
119 if field_data.is_empty() {
121 dir_offset += 12;
122 continue;
123 }
124
125 let ind1 = field_data[0] as char;
126 let ind2 = field_data[1] as char;
127 let subfield_data = &field_data[2..];
128
129 let mut subfields = Vec::new();
130 let mut i = 0;
131 while i < subfield_data.len() {
132 if subfield_data[i] == 0x1F {
133 i += 1;
135 if i >= subfield_data.len() {
136 break;
137 }
138 let code = subfield_data[i] as char;
139 i += 1;
140
141 let value_start = i;
142 while i < subfield_data.len() && subfield_data[i] != 0x1F && subfield_data[i] != 0x1E {
143 i += 1;
144 }
145
146 let value_bytes = &subfield_data[value_start..i];
147 let value = convert_to_utf8(value_bytes, format_encoding.encoding).map_err(|e| ParseError::InvalidEncoding(e))?;
148
149 subfields.push(Subfield { code, value });
150 } else {
151 i += 1;
152 }
153 }
154
155 data_fields.push(DataField {
156 tag: tag.to_string(),
157 ind1,
158 ind2,
159 subfields,
160 });
161 }
162
163 dir_offset += 12;
164 }
165
166 Ok(Record {
167 leader: leader.clone(),
168 control_fields,
169 data_fields,
170 })
171}
172
173pub fn parse_unimarc_binary(data: &[u8], format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
175 parse_marc21_binary(data, format_encoding)
178}
179
180pub fn parse_marc_xml(data: &[u8], _format_encoding: FormatEncoding) -> Result<Vec<Record>, ParseError> {
182 use quick_xml::events::Event;
183 use quick_xml::Reader;
184
185 let mut reader = Reader::from_str(std::str::from_utf8(data).map_err(|e| ParseError::InvalidXml(format!("Invalid UTF-8: {}", e)))?);
186 reader.trim_text(true);
187
188 let mut records = Vec::new();
189 let mut buf = Vec::new();
190 let mut current_record: Option<Record> = None;
191 let mut current_field: Option<DataField> = None;
192 let mut current_subfield: Option<Subfield> = None;
193 let mut current_tag = String::new();
194 let mut current_value = String::new();
195 let mut in_collection = false;
196
197 loop {
198 match reader.read_event_into(&mut buf) {
199 Ok(Event::Start(e)) => match e.name().as_ref() {
200 b"collection" => {
201 in_collection = true;
202 }
203 b"record" => {
204 current_record = Some(Record {
205 leader: Leader {
206 record_length: 0,
207 record_status: ' ',
208 record_type: ' ',
209 bibliographic_level: ' ',
210 type_of_control: ' ',
211 character_coding_scheme: ' ',
212 indicator_count: 2,
213 subfield_code_count: 2,
214 base_address_of_data: 0,
215 encoding_level: ' ',
216 descriptive_cataloging_form: ' ',
217 multipart_resource_record_level: ' ',
218 length_of_length_of_field_portion: 4,
219 length_of_starting_character_position_portion: 5,
220 length_of_implementation_defined_portion: 0,
221 undefined: ' ',
222 },
223 control_fields: Vec::new(),
224 data_fields: Vec::new(),
225 });
226 }
227 b"leader" => {
228 current_value.clear();
229 }
230 b"controlfield" => {
231 current_tag = String::from_utf8_lossy(
232 e.attributes()
233 .find(|a| a.as_ref().unwrap().key.as_ref() == b"tag")
234 .ok_or_else(|| ParseError::InvalidXml("Missing tag attribute".to_string()))?
235 .as_ref()
236 .unwrap()
237 .value
238 .as_ref(),
239 )
240 .to_string();
241 current_value.clear();
242 }
243 b"datafield" => {
244 let tag = String::from_utf8_lossy(
245 e.attributes()
246 .find(|a| a.as_ref().unwrap().key.as_ref() == b"tag")
247 .ok_or_else(|| ParseError::InvalidXml("Missing tag attribute".to_string()))?
248 .as_ref()
249 .unwrap()
250 .value
251 .as_ref(),
252 )
253 .to_string();
254
255 let ind1 = e
256 .attributes()
257 .find(|a| a.as_ref().unwrap().key.as_ref() == b"ind1")
258 .map(|a| String::from_utf8_lossy(a.as_ref().unwrap().value.as_ref()).chars().next().unwrap_or(' '))
259 .unwrap_or(' ');
260
261 let ind2 = e
262 .attributes()
263 .find(|a| a.as_ref().unwrap().key.as_ref() == b"ind2")
264 .map(|a| String::from_utf8_lossy(a.as_ref().unwrap().value.as_ref()).chars().next().unwrap_or(' '))
265 .unwrap_or(' ');
266
267 current_field = Some(DataField {
268 tag,
269 ind1,
270 ind2,
271 subfields: Vec::new(),
272 });
273 }
274 b"subfield" => {
275 let code = String::from_utf8_lossy(
276 e.attributes()
277 .find(|a| a.as_ref().unwrap().key.as_ref() == b"code")
278 .ok_or_else(|| ParseError::InvalidXml("Missing code attribute".to_string()))?
279 .as_ref()
280 .unwrap()
281 .value
282 .as_ref(),
283 )
284 .chars()
285 .next()
286 .ok_or_else(|| ParseError::InvalidXml("Empty code attribute".to_string()))?;
287 current_subfield = Some(Subfield { code, value: String::new() });
288 current_value.clear();
289 }
290 _ => {}
291 },
292 Ok(Event::Text(e)) => {
293 current_value = e.unescape().unwrap_or_default().to_string();
294 }
295 Ok(Event::End(e)) => {
296 match e.name().as_ref() {
297 b"record" => {
298 if let Some(record) = current_record.take() {
299 records.push(record);
300 }
301 }
302 b"leader" => {
303 if let Some(ref mut record) = current_record {
304 if current_value.len() >= 24 {
306 let leader_bytes = current_value.as_bytes()[..24].to_vec();
307 record.leader = Leader::from_bytes(&leader_bytes).map_err(|e| ParseError::InvalidLeader(e))?;
308 }
309 }
310 }
311 b"controlfield" => {
312 if let Some(ref mut record) = current_record {
313 record.control_fields.push(ControlField {
314 tag: current_tag.clone(),
315 value: current_value.clone(),
316 });
317 }
318 current_tag.clear();
319 current_value.clear();
320 }
321 b"datafield" => {
322 if let Some(field) = current_field.take() {
323 if let Some(ref mut record) = current_record {
324 record.data_fields.push(field);
325 }
326 }
327 }
328 b"subfield" => {
329 if let Some(subfield) = current_subfield.take() {
330 if let Some(ref mut field) = current_field {
331 field.subfields.push(Subfield {
332 code: subfield.code,
333 value: current_value.clone(),
334 });
335 }
336 }
337 current_value.clear();
338 }
339 _ => {}
340 }
341 }
342 Ok(Event::Eof) => break,
343 Err(e) => {
344 return Err(ParseError::InvalidXml(format!("XML parsing error: {}", e)));
345 }
346 _ => {}
347 }
348 buf.clear();
349 }
350
351 if !in_collection && records.is_empty() {
353 if let Some(record) = current_record {
354 records.push(record);
355 }
356 }
357
358 Ok(records)
359}