1use crate::error::ParseError;
5use quick_xml::{events::Event, Reader};
6use std::collections::HashSet;
7use std::io::BufRead;
8
/// Streaming XML extractor that collects the content of a chosen set of elements.
///
/// Configure it with the builder-style methods on the type, then run one of the
/// `extract_*` methods over a buffered reader.
#[derive(Clone, Debug)]
pub struct SelectiveParser {
    /// Local (prefix-less) element names whose content is collected.
    target_fields: HashSet<String>,
    /// `true` requires an exact name match; `false` ignores ASCII case.
    case_sensitive: bool,
    /// Nesting depth beyond which start tags are no longer inspected; `0` disables the limit.
    max_depth: usize,
}
19
/// Outcome of a selective extraction pass.
#[derive(Clone, Debug)]
pub struct SelectiveResult {
    /// Collected values, keyed by element name, in the order they were encountered.
    pub values: std::collections::HashMap<String, Vec<String>>,
    /// Number of start and empty elements seen while scanning.
    pub elements_processed: usize,
    /// Reader position (in bytes) when scanning finished.
    pub bytes_processed: usize,
    /// Wall-clock time spent in the extraction call.
    pub duration: std::time::Duration,
}
32
33impl SelectiveParser {
34 pub fn new(target_fields: Vec<String>) -> Self {
36 Self {
37 target_fields: target_fields.into_iter().collect(),
38 case_sensitive: false,
39 max_depth: 0,
40 }
41 }
42
43 pub fn for_isrcs() -> Self {
45 Self::new(vec![
46 "ISRC".to_string(),
47 "SoundRecordingId".to_string(),
48 "ResourceId".to_string(),
49 ])
50 }
51
52 pub fn for_release_metadata() -> Self {
54 Self::new(vec![
55 "ReleaseId".to_string(),
56 "ReleaseReference".to_string(),
57 "TitleText".to_string(),
58 "DisplayArtist".to_string(),
59 "ReleaseDate".to_string(),
60 ])
61 }
62
63 pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
65 self.case_sensitive = case_sensitive;
66 self
67 }
68
69 pub fn max_depth(mut self, max_depth: usize) -> Self {
71 self.max_depth = max_depth;
72 self
73 }
74
75 pub fn extract_isrcs<R: BufRead>(&mut self, reader: R) -> Result<Vec<String>, ParseError> {
77 let result = self.extract_fields(reader)?;
78
79 let mut isrcs = Vec::new();
80
81 for field_name in &["ISRC", "SoundRecordingId", "ResourceId"] {
83 if let Some(values) = result.values.get(*field_name) {
84 for value in values {
85 let isrc = if value.contains(':') {
87 value.split(':').nth(1).unwrap_or(value).to_string()
88 } else {
89 value.clone()
90 };
91
92 if self.is_valid_isrc(&isrc) {
94 isrcs.push(isrc);
95 }
96 }
97 }
98 }
99
100 isrcs.sort();
101 isrcs.dedup();
102 Ok(isrcs)
103 }
104
105 pub fn extract_fields<R: BufRead>(&mut self, reader: R) -> Result<SelectiveResult, ParseError> {
107 let start_time = std::time::Instant::now();
108 let mut xml_reader = Reader::from_reader(reader);
109 xml_reader.config_mut().trim_text(true);
110
111 let mut values: std::collections::HashMap<String, Vec<String>> =
112 std::collections::HashMap::new();
113 let mut buf = Vec::new();
114 let mut current_field = None::<String>;
115 let mut depth = 0;
116 let mut elements_processed = 0;
117
118 loop {
119 match xml_reader.read_event_into(&mut buf) {
120 Ok(Event::Start(ref e)) => {
121 depth += 1;
122 elements_processed += 1;
123
124 if self.max_depth > 0 && depth > self.max_depth {
126 buf.clear();
127 continue;
128 }
129
130 let element_name = self.extract_element_name(e.name().as_ref())?;
131
132 if self.is_target_field(&element_name) {
134 current_field = Some(element_name);
135 }
136 }
137 Ok(Event::End(_)) => {
138 depth = depth.saturating_sub(1);
139 current_field = None;
140 }
141 Ok(Event::Empty(ref e)) => {
142 elements_processed += 1;
143
144 let element_name = self.extract_element_name(e.name().as_ref())?;
145
146 if self.is_target_field(&element_name) {
148 if let Ok(attributes) = e.attributes().collect::<Result<Vec<_>, _>>() {
149 for attr in attributes {
150 let attr_value = String::from_utf8_lossy(&attr.value);
151 self.add_value(&mut values, &element_name, attr_value.to_string());
152 }
153 }
154 }
155 }
156 Ok(Event::Text(ref e)) => {
157 if let Some(ref field_name) = current_field {
158 let current_pos = xml_reader.buffer_position() as usize;
160 let text = crate::utf8_utils::handle_text_node(e, current_pos)?;
161
162 let text_content = text.trim();
163 if !text_content.is_empty() {
164 self.add_value(&mut values, field_name, text_content.to_string());
165 }
166 }
167 }
168 Ok(Event::CData(ref e)) => {
169 if let Some(ref field_name) = current_field {
170 let text = String::from_utf8_lossy(e);
171 let text_content = text.trim();
172 if !text_content.is_empty() {
173 self.add_value(&mut values, field_name, text_content.to_string());
174 }
175 }
176 }
177 Ok(Event::Eof) => break,
178 Err(e) => {
179 return Err(ParseError::XmlError(format!("XML parsing error: {}", e)));
180 }
181 _ => {} }
183 buf.clear();
184 }
185
186 Ok(SelectiveResult {
187 values,
188 elements_processed,
189 bytes_processed: xml_reader.buffer_position() as usize,
190 duration: start_time.elapsed(),
191 })
192 }
193
194 pub fn extract_isrcs_fast<R: BufRead>(
196 &mut self,
197 mut reader: R,
198 ) -> Result<Vec<String>, ParseError> {
199 let mut isrcs = Vec::new();
200 let mut buffer = Vec::new();
201
202 reader
204 .read_to_end(&mut buffer)
205 .map_err(|e| ParseError::IoError(format!("Failed to read input: {}", e)))?;
206
207 let content = std::str::from_utf8(&buffer).map_err(|e| ParseError::InvalidUtf8 {
209 message: format!("UTF-8 decoding error at position 0: {}", e),
210 })?;
211
212 self.extract_isrcs_from_content(content, &mut isrcs);
214
215 isrcs.sort_unstable();
217 isrcs.dedup();
218
219 Ok(isrcs)
220 }
221
222 fn extract_isrcs_from_content(&self, content: &str, isrcs: &mut Vec<String>) {
224 let mut pos = 0;
226 let content_len = content.len();
227
228 while pos < content_len {
229 if let Some(isrc_pos) = self.find_next_isrc_tag(content, pos) {
231 pos = isrc_pos;
232
233 if let Some((isrc, next_pos)) = self.extract_isrc_at_position(content, pos) {
235 if self.is_valid_isrc(&isrc) {
236 isrcs.push(isrc);
237 }
238 pos = next_pos;
239 } else {
240 pos += 1;
241 }
242 } else {
243 break;
244 }
245 }
246 }
247
248 fn find_next_isrc_tag(&self, content: &str, start_pos: usize) -> Option<usize> {
250 let search_slice = &content[start_pos..];
251
252 let patterns = [
254 "<ISRC>",
255 "<ern:ISRC>",
256 "<SoundRecordingId",
257 "<ern:SoundRecordingId",
258 ];
259
260 let mut min_pos: Option<usize> = None;
261 for &pattern in &patterns {
262 if let Some(found_pos) = search_slice.find(pattern) {
263 let absolute_pos = start_pos + found_pos;
264 min_pos =
265 Some(min_pos.map_or(absolute_pos, |current: usize| current.min(absolute_pos)));
266 }
267 }
268
269 min_pos
270 }
271
272 fn extract_isrc_at_position(&self, content: &str, pos: usize) -> Option<(String, usize)> {
274 let remaining = &content[pos..];
275
276 if remaining.starts_with("<ISRC>") {
278 return self.extract_between_tags(content, pos, "<ISRC>", "</ISRC>");
279 }
280 if remaining.starts_with("<ern:ISRC>") {
281 return self.extract_between_tags(content, pos, "<ern:ISRC>", "</ern:ISRC>");
282 }
283
284 if remaining.starts_with("<SoundRecordingId")
286 || remaining.starts_with("<ern:SoundRecordingId")
287 {
288 if let Some(tag_end) = remaining.find('>') {
290 let opening_tag = &remaining[..=tag_end];
291
292 if opening_tag.contains("Namespace=\"ISRC\"")
294 || opening_tag.contains("Namespace='ISRC'")
295 {
296 let content_start = pos + tag_end + 1;
297
298 let closing_tag = if remaining.starts_with("<ern:") {
300 "</ern:SoundRecordingId>"
301 } else {
302 "</SoundRecordingId>"
303 };
304
305 if let Some(closing_pos) = content[content_start..].find(closing_tag) {
306 let content_end = content_start + closing_pos;
307 let isrc = content[content_start..content_end].trim().to_string();
308 return Some((isrc, content_end + closing_tag.len()));
309 }
310 }
311 }
312 }
313
314 None
315 }
316
317 fn extract_between_tags(
319 &self,
320 content: &str,
321 pos: usize,
322 open_tag: &str,
323 close_tag: &str,
324 ) -> Option<(String, usize)> {
325 let content_start = pos + open_tag.len();
326
327 if let Some(content_end_rel) = content[content_start..].find(close_tag) {
328 let content_end = content_start + content_end_rel;
329 let extracted = content[content_start..content_end].trim().to_string();
330 Some((extracted, content_end + close_tag.len()))
331 } else {
332 None
333 }
334 }
335
336 fn is_target_field(&self, name: &str) -> bool {
338 if self.case_sensitive {
339 self.target_fields.contains(name)
340 } else {
341 self.target_fields
342 .iter()
343 .any(|field| field.eq_ignore_ascii_case(name))
344 }
345 }
346
347 fn extract_element_name(&self, qname: &[u8]) -> Result<String, ParseError> {
349 let name_str = std::str::from_utf8(qname).map_err(|_| ParseError::IoError(
350 "Invalid UTF-8 in element name".to_string(),
351 ))?;
352
353 let local_name = if let Some(colon_pos) = name_str.find(':') {
355 &name_str[colon_pos + 1..]
356 } else {
357 name_str
358 };
359
360 Ok(local_name.to_string())
361 }
362
363 fn add_value(
365 &self,
366 values: &mut std::collections::HashMap<String, Vec<String>>,
367 field_name: &str,
368 value: String,
369 ) {
370 values
371 .entry(field_name.to_string())
372 .or_default()
373 .push(value);
374 }
375
376 fn is_valid_isrc(&self, isrc: &str) -> bool {
378 if isrc.len() != 12 {
385 return false;
386 }
387
388 let chars: Vec<char> = isrc.chars().collect();
389
390 if !chars[0].is_ascii_alphabetic() || !chars[1].is_ascii_alphabetic() {
392 return false;
393 }
394
395 for &ch in &chars[2..5] {
397 if !ch.is_ascii_alphanumeric() {
398 return false;
399 }
400 }
401
402 if !chars[5].is_ascii_digit() || !chars[6].is_ascii_digit() {
404 return false;
405 }
406
407 for &ch in &chars[7..12] {
409 if !ch.is_ascii_digit() {
410 return false;
411 }
412 }
413
414 true
415 }
416}
417
418#[cfg(test)]
419mod tests {
420 use super::*;
421 use std::io::Cursor;
422
423 #[test]
424 fn test_isrc_validation() {
425 let parser = SelectiveParser::for_isrcs();
426
427 assert!(parser.is_valid_isrc("USRC17607839"));
428 assert!(parser.is_valid_isrc("GBUM71505078"));
429 assert!(parser.is_valid_isrc("FRUM71200001"));
430
431 assert!(!parser.is_valid_isrc("USRC1760783")); assert!(!parser.is_valid_isrc("USRC176078391")); assert!(!parser.is_valid_isrc("12RC17607839")); assert!(!parser.is_valid_isrc("USRC1760783A")); }
436
437 #[test]
438 fn test_selective_isrc_extraction() {
439 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
440 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
441 <ern:ResourceList>
442 <ern:SoundRecording>
443 <ern:SoundRecordingId Namespace="ISRC">USRC17607839</ern:SoundRecordingId>
444 <ern:ReferenceTitle>
445 <ern:TitleText>Test Track</ern:TitleText>
446 </ern:ReferenceTitle>
447 </ern:SoundRecording>
448 <ern:SoundRecording>
449 <ern:SoundRecordingId Namespace="ISRC">GBUM71505078</ern:SoundRecordingId>
450 <ern:ReferenceTitle>
451 <ern:TitleText>Another Track</ern:TitleText>
452 </ern:ReferenceTitle>
453 </ern:SoundRecording>
454 </ern:ResourceList>
455 </ern:NewReleaseMessage>"#;
456
457 let cursor = Cursor::new(xml.as_bytes());
458 let mut parser = SelectiveParser::for_isrcs();
459
460 let isrcs = parser.extract_isrcs(cursor).expect("Should extract ISRCs");
461
462 assert_eq!(isrcs.len(), 2);
463 assert!(isrcs.contains(&"USRC17607839".to_string()));
464 assert!(isrcs.contains(&"GBUM71505078".to_string()));
465 }
466
467 #[test]
468 fn test_fast_isrc_extraction() {
469 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
470 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
471 <ern:ResourceList>
472 <ern:SoundRecording>
473 <ISRC>USRC17607839</ISRC>
474 <ern:ReferenceTitle>
475 <ern:TitleText>Test Track</ern:TitleText>
476 </ern:ReferenceTitle>
477 </ern:SoundRecording>
478 </ern:ResourceList>
479 </ern:NewReleaseMessage>"#;
480
481 let cursor = Cursor::new(xml.as_bytes());
482 let mut parser = SelectiveParser::for_isrcs();
483
484 let isrcs = parser
485 .extract_isrcs_fast(cursor)
486 .expect("Should extract ISRCs");
487
488 assert_eq!(isrcs.len(), 1);
489 assert_eq!(isrcs[0], "USRC17607839");
490 }
491
492 #[test]
493 fn test_selective_field_extraction() {
494 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
495 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
496 <ern:ReleaseList>
497 <ern:Release>
498 <ern:ReleaseId>REL001</ern:ReleaseId>
499 <ern:ReleaseReference>R001</ern:ReleaseReference>
500 <ern:ReferenceTitle>
501 <ern:TitleText>My Album</ern:TitleText>
502 </ern:ReferenceTitle>
503 </ern:Release>
504 </ern:ReleaseList>
505 </ern:NewReleaseMessage>"#;
506
507 let cursor = Cursor::new(xml.as_bytes());
508 let mut parser = SelectiveParser::for_release_metadata();
509
510 let result = parser
511 .extract_fields(cursor)
512 .expect("Should extract fields");
513
514 assert!(result.values.contains_key("ReleaseId"));
515 assert!(result.values.contains_key("ReleaseReference"));
516 assert!(result.values.contains_key("TitleText"));
517
518 assert_eq!(result.values["ReleaseId"][0], "REL001");
519 assert_eq!(result.values["ReleaseReference"][0], "R001");
520 assert_eq!(result.values["TitleText"][0], "My Album");
521
522 println!("Extraction results: {:#?}", result);
523 }
524
525 #[test]
526 fn test_performance_comparison() {
527 let mut xml = String::from(
529 r#"<?xml version="1.0" encoding="UTF-8"?>
530 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
531 <ern:ResourceList>"#,
532 );
533
534 for i in 0..1000 {
535 xml.push_str(&format!(
536 r#"
537 <ern:SoundRecording>
538 <ern:SoundRecordingId Namespace="ISRC">USRC{:08}</ern:SoundRecordingId>
539 <ern:ReferenceTitle>
540 <ern:TitleText>Test Track {}</ern:TitleText>
541 </ern:ReferenceTitle>
542 </ern:SoundRecording>"#,
543 17600000 + i,
544 i
545 ));
546 }
547
548 xml.push_str("</ern:ResourceList></ern:NewReleaseMessage>");
549
550 let cursor1 = Cursor::new(xml.as_bytes());
552 let mut parser1 = SelectiveParser::for_isrcs();
553 let start1 = std::time::Instant::now();
554 let isrcs1 = parser1
555 .extract_isrcs(cursor1)
556 .expect("Standard extraction should work");
557 let duration1 = start1.elapsed();
558
559 let cursor2 = Cursor::new(xml.as_bytes());
561 let mut parser2 = SelectiveParser::for_isrcs();
562 let start2 = std::time::Instant::now();
563 let isrcs2 = parser2
564 .extract_isrcs_fast(cursor2)
565 .expect("Fast extraction should work");
566 let duration2 = start2.elapsed();
567
568 println!(
569 "Standard extraction: {} ISRCs in {:?}",
570 isrcs1.len(),
571 duration1
572 );
573 println!("Fast extraction: {} ISRCs in {:?}", isrcs2.len(), duration2);
574
575 assert_eq!(isrcs1.len(), 1000);
577 assert_eq!(isrcs2.len(), 1000);
578
579 println!(
581 "Fast extraction speedup: {:.2}x",
582 duration1.as_nanos() as f64 / duration2.as_nanos() as f64
583 );
584 }
585}