1use crate::error::ParseError;
5use quick_xml::{events::Event, Reader};
6use std::collections::HashSet;
7use std::io::BufRead;
8
/// Settings for pulling a chosen set of elements out of an XML document
/// without building a full model of it.
#[derive(Debug, Clone)]
pub struct SelectiveParser {
    /// Element names (namespace prefix removed) whose content is collected.
    target_fields: HashSet<String>,
    /// `true` compares element names exactly; `false` ignores ASCII case.
    case_sensitive: bool,
    /// Deepest nesting level at which start tags are still examined;
    /// `0` disables the limit.
    max_depth: usize,
}
19
/// What a single `SelectiveParser::extract_fields` call produced.
#[derive(Debug, Clone)]
pub struct SelectiveResult {
    /// Collected values, keyed by the element's local name as spelled in the
    /// document; each element name maps to its values in encounter order.
    pub values: std::collections::HashMap<String, Vec<String>>,
    /// How many start tags and self-closing tags were seen.
    pub elements_processed: usize,
    /// Byte position of the XML reader when extraction stopped.
    pub bytes_processed: usize,
    /// Time elapsed between entering and leaving the extraction call.
    pub duration: std::time::Duration,
}
32
33impl SelectiveParser {
34 pub fn new(target_fields: Vec<String>) -> Self {
36 Self {
37 target_fields: target_fields.into_iter().collect(),
38 case_sensitive: false,
39 max_depth: 0,
40 }
41 }
42
43 pub fn for_isrcs() -> Self {
45 Self::new(vec![
46 "ISRC".to_string(),
47 "SoundRecordingId".to_string(),
48 "ResourceId".to_string(),
49 ])
50 }
51
52 pub fn for_release_metadata() -> Self {
54 Self::new(vec![
55 "ReleaseId".to_string(),
56 "ReleaseReference".to_string(),
57 "TitleText".to_string(),
58 "DisplayArtist".to_string(),
59 "ReleaseDate".to_string(),
60 ])
61 }
62
63 pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
65 self.case_sensitive = case_sensitive;
66 self
67 }
68
69 pub fn max_depth(mut self, max_depth: usize) -> Self {
71 self.max_depth = max_depth;
72 self
73 }
74
75 pub fn extract_isrcs<R: BufRead>(&mut self, reader: R) -> Result<Vec<String>, ParseError> {
77 let result = self.extract_fields(reader)?;
78
79 let mut isrcs = Vec::new();
80
81 for field_name in &["ISRC", "SoundRecordingId", "ResourceId"] {
83 if let Some(values) = result.values.get(*field_name) {
84 for value in values {
85 let isrc = if value.contains(':') {
87 value.split(':').nth(1).unwrap_or(value).to_string()
88 } else {
89 value.clone()
90 };
91
92 if self.is_valid_isrc(&isrc) {
94 isrcs.push(isrc);
95 }
96 }
97 }
98 }
99
100 isrcs.sort();
101 isrcs.dedup();
102 Ok(isrcs)
103 }
104
105 pub fn extract_fields<R: BufRead>(&mut self, reader: R) -> Result<SelectiveResult, ParseError> {
107 let start_time = std::time::Instant::now();
108 let mut xml_reader = Reader::from_reader(reader);
109 xml_reader.config_mut().trim_text(true);
110
111 let mut values: std::collections::HashMap<String, Vec<String>> =
112 std::collections::HashMap::new();
113 let mut buf = Vec::new();
114 let mut current_field = None::<String>;
115 let mut depth = 0;
116 let mut elements_processed = 0;
117
118 loop {
119 match xml_reader.read_event_into(&mut buf) {
120 Ok(Event::Start(ref e)) => {
121 depth += 1;
122 elements_processed += 1;
123
124 if self.max_depth > 0 && depth > self.max_depth {
126 buf.clear();
127 continue;
128 }
129
130 let element_name = self.extract_element_name(e.name().as_ref())?;
131
132 if self.is_target_field(&element_name) {
134 current_field = Some(element_name);
135 }
136 }
137 Ok(Event::End(_)) => {
138 depth = depth.saturating_sub(1);
139 current_field = None;
140 }
141 Ok(Event::Empty(ref e)) => {
142 elements_processed += 1;
143
144 let element_name = self.extract_element_name(e.name().as_ref())?;
145
146 if self.is_target_field(&element_name) {
148 if let Ok(attributes) = e.attributes().collect::<Result<Vec<_>, _>>() {
149 for attr in attributes {
150 let attr_value = String::from_utf8_lossy(&attr.value);
151 self.add_value(&mut values, &element_name, attr_value.to_string());
152 }
153 }
154 }
155 }
156 Ok(Event::Text(ref e)) => {
157 if let Some(ref field_name) = current_field {
158 let current_pos = xml_reader.buffer_position() as usize;
160 let text = crate::utf8_utils::handle_text_node(e, current_pos)?;
161
162 let text_content = text.trim();
163 if !text_content.is_empty() {
164 self.add_value(&mut values, field_name, text_content.to_string());
165 }
166 }
167 }
168 Ok(Event::CData(ref e)) => {
169 if let Some(ref field_name) = current_field {
170 let text = String::from_utf8_lossy(e);
171 let text_content = text.trim();
172 if !text_content.is_empty() {
173 self.add_value(&mut values, field_name, text_content.to_string());
174 }
175 }
176 }
177 Ok(Event::Eof) => break,
178 Err(e) => {
179 return Err(ParseError::XmlError {
180 message: format!("XML parsing error: {}", e),
181 location: crate::error::ErrorLocation {
182 line: 0,
183 column: 0,
184 byte_offset: Some(xml_reader.buffer_position() as usize),
185 path: "selective_parser".to_string(),
186 },
187 });
188 }
189 _ => {} }
191 buf.clear();
192 }
193
194 Ok(SelectiveResult {
195 values,
196 elements_processed,
197 bytes_processed: xml_reader.buffer_position() as usize,
198 duration: start_time.elapsed(),
199 })
200 }
201
202 pub fn extract_isrcs_fast<R: BufRead>(
204 &mut self,
205 mut reader: R,
206 ) -> Result<Vec<String>, ParseError> {
207 let mut isrcs = Vec::new();
208 let mut buffer = Vec::new();
209
210 reader
212 .read_to_end(&mut buffer)
213 .map_err(|e| ParseError::Io {
214 message: format!("Failed to read input: {}", e),
215 })?;
216
217 let content = std::str::from_utf8(&buffer).map_err(|e| ParseError::InvalidUtf8 {
219 position: 0,
220 error: e.to_string(),
221 })?;
222
223 self.extract_isrcs_from_content(content, &mut isrcs);
225
226 isrcs.sort_unstable();
228 isrcs.dedup();
229
230 Ok(isrcs)
231 }
232
233 fn extract_isrcs_from_content(&self, content: &str, isrcs: &mut Vec<String>) {
235 let mut pos = 0;
237 let content_len = content.len();
238
239 while pos < content_len {
240 if let Some(isrc_pos) = self.find_next_isrc_tag(content, pos) {
242 pos = isrc_pos;
243
244 if let Some((isrc, next_pos)) = self.extract_isrc_at_position(content, pos) {
246 if self.is_valid_isrc(&isrc) {
247 isrcs.push(isrc);
248 }
249 pos = next_pos;
250 } else {
251 pos += 1;
252 }
253 } else {
254 break;
255 }
256 }
257 }
258
259 fn find_next_isrc_tag(&self, content: &str, start_pos: usize) -> Option<usize> {
261 let search_slice = &content[start_pos..];
262
263 let patterns = [
265 "<ISRC>",
266 "<ern:ISRC>",
267 "<SoundRecordingId",
268 "<ern:SoundRecordingId",
269 ];
270
271 let mut min_pos: Option<usize> = None;
272 for &pattern in &patterns {
273 if let Some(found_pos) = search_slice.find(pattern) {
274 let absolute_pos = start_pos + found_pos;
275 min_pos =
276 Some(min_pos.map_or(absolute_pos, |current: usize| current.min(absolute_pos)));
277 }
278 }
279
280 min_pos
281 }
282
283 fn extract_isrc_at_position(&self, content: &str, pos: usize) -> Option<(String, usize)> {
285 let remaining = &content[pos..];
286
287 if remaining.starts_with("<ISRC>") {
289 return self.extract_between_tags(content, pos, "<ISRC>", "</ISRC>");
290 }
291 if remaining.starts_with("<ern:ISRC>") {
292 return self.extract_between_tags(content, pos, "<ern:ISRC>", "</ern:ISRC>");
293 }
294
295 if remaining.starts_with("<SoundRecordingId")
297 || remaining.starts_with("<ern:SoundRecordingId")
298 {
299 if let Some(tag_end) = remaining.find('>') {
301 let opening_tag = &remaining[..=tag_end];
302
303 if opening_tag.contains("Namespace=\"ISRC\"")
305 || opening_tag.contains("Namespace='ISRC'")
306 {
307 let content_start = pos + tag_end + 1;
308
309 let closing_tag = if remaining.starts_with("<ern:") {
311 "</ern:SoundRecordingId>"
312 } else {
313 "</SoundRecordingId>"
314 };
315
316 if let Some(closing_pos) = content[content_start..].find(closing_tag) {
317 let content_end = content_start + closing_pos;
318 let isrc = content[content_start..content_end].trim().to_string();
319 return Some((isrc, content_end + closing_tag.len()));
320 }
321 }
322 }
323 }
324
325 None
326 }
327
328 fn extract_between_tags(
330 &self,
331 content: &str,
332 pos: usize,
333 open_tag: &str,
334 close_tag: &str,
335 ) -> Option<(String, usize)> {
336 let content_start = pos + open_tag.len();
337
338 if let Some(content_end_rel) = content[content_start..].find(close_tag) {
339 let content_end = content_start + content_end_rel;
340 let extracted = content[content_start..content_end].trim().to_string();
341 Some((extracted, content_end + close_tag.len()))
342 } else {
343 None
344 }
345 }
346
347 fn is_target_field(&self, name: &str) -> bool {
349 if self.case_sensitive {
350 self.target_fields.contains(name)
351 } else {
352 self.target_fields
353 .iter()
354 .any(|field| field.eq_ignore_ascii_case(name))
355 }
356 }
357
358 fn extract_element_name(&self, qname: &[u8]) -> Result<String, ParseError> {
360 let name_str = std::str::from_utf8(qname).map_err(|_| ParseError::Io {
361 message: "Invalid UTF-8 in element name".to_string(),
362 })?;
363
364 let local_name = if let Some(colon_pos) = name_str.find(':') {
366 &name_str[colon_pos + 1..]
367 } else {
368 name_str
369 };
370
371 Ok(local_name.to_string())
372 }
373
374 fn add_value(
376 &self,
377 values: &mut std::collections::HashMap<String, Vec<String>>,
378 field_name: &str,
379 value: String,
380 ) {
381 values
382 .entry(field_name.to_string())
383 .or_default()
384 .push(value);
385 }
386
387 fn is_valid_isrc(&self, isrc: &str) -> bool {
389 if isrc.len() != 12 {
396 return false;
397 }
398
399 let chars: Vec<char> = isrc.chars().collect();
400
401 if !chars[0].is_ascii_alphabetic() || !chars[1].is_ascii_alphabetic() {
403 return false;
404 }
405
406 for &ch in &chars[2..5] {
408 if !ch.is_ascii_alphanumeric() {
409 return false;
410 }
411 }
412
413 if !chars[5].is_ascii_digit() || !chars[6].is_ascii_digit() {
415 return false;
416 }
417
418 for &ch in &chars[7..12] {
420 if !ch.is_ascii_digit() {
421 return false;
422 }
423 }
424
425 true
426 }
427}
428
429#[cfg(test)]
430mod tests {
431 use super::*;
432 use std::io::Cursor;
433
434 #[test]
435 fn test_isrc_validation() {
436 let parser = SelectiveParser::for_isrcs();
437
438 assert!(parser.is_valid_isrc("USRC17607839"));
439 assert!(parser.is_valid_isrc("GBUM71505078"));
440 assert!(parser.is_valid_isrc("FRUM71200001"));
441
442 assert!(!parser.is_valid_isrc("USRC1760783")); assert!(!parser.is_valid_isrc("USRC176078391")); assert!(!parser.is_valid_isrc("12RC17607839")); assert!(!parser.is_valid_isrc("USRC1760783A")); }
447
448 #[test]
449 fn test_selective_isrc_extraction() {
450 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
451 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
452 <ern:ResourceList>
453 <ern:SoundRecording>
454 <ern:SoundRecordingId Namespace="ISRC">USRC17607839</ern:SoundRecordingId>
455 <ern:ReferenceTitle>
456 <ern:TitleText>Test Track</ern:TitleText>
457 </ern:ReferenceTitle>
458 </ern:SoundRecording>
459 <ern:SoundRecording>
460 <ern:SoundRecordingId Namespace="ISRC">GBUM71505078</ern:SoundRecordingId>
461 <ern:ReferenceTitle>
462 <ern:TitleText>Another Track</ern:TitleText>
463 </ern:ReferenceTitle>
464 </ern:SoundRecording>
465 </ern:ResourceList>
466 </ern:NewReleaseMessage>"#;
467
468 let cursor = Cursor::new(xml.as_bytes());
469 let mut parser = SelectiveParser::for_isrcs();
470
471 let isrcs = parser.extract_isrcs(cursor).expect("Should extract ISRCs");
472
473 assert_eq!(isrcs.len(), 2);
474 assert!(isrcs.contains(&"USRC17607839".to_string()));
475 assert!(isrcs.contains(&"GBUM71505078".to_string()));
476 }
477
478 #[test]
479 fn test_fast_isrc_extraction() {
480 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
481 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
482 <ern:ResourceList>
483 <ern:SoundRecording>
484 <ISRC>USRC17607839</ISRC>
485 <ern:ReferenceTitle>
486 <ern:TitleText>Test Track</ern:TitleText>
487 </ern:ReferenceTitle>
488 </ern:SoundRecording>
489 </ern:ResourceList>
490 </ern:NewReleaseMessage>"#;
491
492 let cursor = Cursor::new(xml.as_bytes());
493 let mut parser = SelectiveParser::for_isrcs();
494
495 let isrcs = parser
496 .extract_isrcs_fast(cursor)
497 .expect("Should extract ISRCs");
498
499 assert_eq!(isrcs.len(), 1);
500 assert_eq!(isrcs[0], "USRC17607839");
501 }
502
503 #[test]
504 fn test_selective_field_extraction() {
505 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
506 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
507 <ern:ReleaseList>
508 <ern:Release>
509 <ern:ReleaseId>REL001</ern:ReleaseId>
510 <ern:ReleaseReference>R001</ern:ReleaseReference>
511 <ern:ReferenceTitle>
512 <ern:TitleText>My Album</ern:TitleText>
513 </ern:ReferenceTitle>
514 </ern:Release>
515 </ern:ReleaseList>
516 </ern:NewReleaseMessage>"#;
517
518 let cursor = Cursor::new(xml.as_bytes());
519 let mut parser = SelectiveParser::for_release_metadata();
520
521 let result = parser
522 .extract_fields(cursor)
523 .expect("Should extract fields");
524
525 assert!(result.values.contains_key("ReleaseId"));
526 assert!(result.values.contains_key("ReleaseReference"));
527 assert!(result.values.contains_key("TitleText"));
528
529 assert_eq!(result.values["ReleaseId"][0], "REL001");
530 assert_eq!(result.values["ReleaseReference"][0], "R001");
531 assert_eq!(result.values["TitleText"][0], "My Album");
532
533 println!("Extraction results: {:#?}", result);
534 }
535
536 #[test]
537 fn test_performance_comparison() {
538 let mut xml = String::from(
540 r#"<?xml version="1.0" encoding="UTF-8"?>
541 <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
542 <ern:ResourceList>"#,
543 );
544
545 for i in 0..1000 {
546 xml.push_str(&format!(
547 r#"
548 <ern:SoundRecording>
549 <ern:SoundRecordingId Namespace="ISRC">USRC{:08}</ern:SoundRecordingId>
550 <ern:ReferenceTitle>
551 <ern:TitleText>Test Track {}</ern:TitleText>
552 </ern:ReferenceTitle>
553 </ern:SoundRecording>"#,
554 17600000 + i,
555 i
556 ));
557 }
558
559 xml.push_str("</ern:ResourceList></ern:NewReleaseMessage>");
560
561 let cursor1 = Cursor::new(xml.as_bytes());
563 let mut parser1 = SelectiveParser::for_isrcs();
564 let start1 = std::time::Instant::now();
565 let isrcs1 = parser1
566 .extract_isrcs(cursor1)
567 .expect("Standard extraction should work");
568 let duration1 = start1.elapsed();
569
570 let cursor2 = Cursor::new(xml.as_bytes());
572 let mut parser2 = SelectiveParser::for_isrcs();
573 let start2 = std::time::Instant::now();
574 let isrcs2 = parser2
575 .extract_isrcs_fast(cursor2)
576 .expect("Fast extraction should work");
577 let duration2 = start2.elapsed();
578
579 println!(
580 "Standard extraction: {} ISRCs in {:?}",
581 isrcs1.len(),
582 duration1
583 );
584 println!("Fast extraction: {} ISRCs in {:?}", isrcs2.len(), duration2);
585
586 assert_eq!(isrcs1.len(), 1000);
588 assert_eq!(isrcs2.len(), 1000);
589
590 println!(
592 "Fast extraction speedup: {:.2}x",
593 duration1.as_nanos() as f64 / duration2.as_nanos() as f64
594 );
595 }
596}