ddex_parser/parser/
selective_parser.rs

1// src/parser/selective_parser.rs
2//! Fast selective parsing for extracting specific fields like ISRCs
3
4use crate::error::ParseError;
5use quick_xml::{events::Event, Reader};
6use std::collections::HashSet;
7use std::io::BufRead;
8
9/// High-performance selective parser for extracting specific fields
10#[derive(Debug, Clone)]
11pub struct SelectiveParser {
12    /// Target fields to extract (e.g., "ISRC", "ReleaseId", etc.)
13    target_fields: HashSet<String>,
14    /// Enable case-sensitive matching
15    case_sensitive: bool,
16    /// Maximum depth to search (0 = unlimited)
17    max_depth: usize,
18}
19
20/// Result of selective field extraction
21#[derive(Debug, Clone)]
22pub struct SelectiveResult {
23    /// Extracted values mapped by field name
24    pub values: std::collections::HashMap<String, Vec<String>>,
25    /// Total elements processed
26    pub elements_processed: usize,
27    /// Bytes processed
28    pub bytes_processed: usize,
29    /// Parse duration
30    pub duration: std::time::Duration,
31}
32
33impl SelectiveParser {
34    /// Create a new selective parser targeting specific fields
35    pub fn new(target_fields: Vec<String>) -> Self {
36        Self {
37            target_fields: target_fields.into_iter().collect(),
38            case_sensitive: false,
39            max_depth: 0,
40        }
41    }
42
43    /// Create a parser specifically for ISRC extraction
44    pub fn for_isrcs() -> Self {
45        Self::new(vec![
46            "ISRC".to_string(),
47            "SoundRecordingId".to_string(),
48            "ResourceId".to_string(),
49        ])
50    }
51
52    /// Create a parser for release metadata extraction
53    pub fn for_release_metadata() -> Self {
54        Self::new(vec![
55            "ReleaseId".to_string(),
56            "ReleaseReference".to_string(),
57            "TitleText".to_string(),
58            "DisplayArtist".to_string(),
59            "ReleaseDate".to_string(),
60        ])
61    }
62
63    /// Set case sensitivity for field matching
64    pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
65        self.case_sensitive = case_sensitive;
66        self
67    }
68
69    /// Set maximum depth to search
70    pub fn max_depth(mut self, max_depth: usize) -> Self {
71        self.max_depth = max_depth;
72        self
73    }
74
75    /// Extract ISRCs from XML with maximum performance
76    pub fn extract_isrcs<R: BufRead>(&mut self, reader: R) -> Result<Vec<String>, ParseError> {
77        let result = self.extract_fields(reader)?;
78
79        let mut isrcs = Vec::new();
80
81        // Collect ISRCs from all possible field names
82        for field_name in &["ISRC", "SoundRecordingId", "ResourceId"] {
83            if let Some(values) = result.values.get(*field_name) {
84                for value in values {
85                    // Extract ISRC from value (might be in format "Namespace:Value")
86                    let isrc = if value.contains(':') {
87                        value.split(':').nth(1).unwrap_or(value).to_string()
88                    } else {
89                        value.clone()
90                    };
91
92                    // Validate ISRC format (12 characters: CCXXXYYNNNNN)
93                    if self.is_valid_isrc(&isrc) {
94                        isrcs.push(isrc);
95                    }
96                }
97            }
98        }
99
100        isrcs.sort();
101        isrcs.dedup();
102        Ok(isrcs)
103    }
104
105    /// Extract all targeted fields from XML
106    pub fn extract_fields<R: BufRead>(&mut self, reader: R) -> Result<SelectiveResult, ParseError> {
107        let start_time = std::time::Instant::now();
108        let mut xml_reader = Reader::from_reader(reader);
109        xml_reader.config_mut().trim_text(true);
110
111        let mut values: std::collections::HashMap<String, Vec<String>> =
112            std::collections::HashMap::new();
113        let mut buf = Vec::new();
114        let mut current_field = None::<String>;
115        let mut depth = 0;
116        let mut elements_processed = 0;
117
118        loop {
119            match xml_reader.read_event_into(&mut buf) {
120                Ok(Event::Start(ref e)) => {
121                    depth += 1;
122                    elements_processed += 1;
123
124                    // Check depth limit
125                    if self.max_depth > 0 && depth > self.max_depth {
126                        buf.clear();
127                        continue;
128                    }
129
130                    let element_name = self.extract_element_name(e.name().as_ref())?;
131
132                    // Check if this is a target field
133                    if self.is_target_field(&element_name) {
134                        current_field = Some(element_name);
135                    }
136                }
137                Ok(Event::End(_)) => {
138                    depth = depth.saturating_sub(1);
139                    current_field = None;
140                }
141                Ok(Event::Empty(ref e)) => {
142                    elements_processed += 1;
143
144                    let element_name = self.extract_element_name(e.name().as_ref())?;
145
146                    // For self-closing elements, check attributes
147                    if self.is_target_field(&element_name) {
148                        if let Ok(attributes) = e.attributes().collect::<Result<Vec<_>, _>>() {
149                            for attr in attributes {
150                                let attr_value = String::from_utf8_lossy(&attr.value);
151                                self.add_value(&mut values, &element_name, attr_value.to_string());
152                            }
153                        }
154                    }
155                }
156                Ok(Event::Text(ref e)) => {
157                    if let Some(ref field_name) = current_field {
158                        // Use utf8_utils for proper UTF-8 handling
159                        let current_pos = xml_reader.buffer_position() as usize;
160                        let text = crate::utf8_utils::handle_text_node(e, current_pos)?;
161
162                        let text_content = text.trim();
163                        if !text_content.is_empty() {
164                            self.add_value(&mut values, field_name, text_content.to_string());
165                        }
166                    }
167                }
168                Ok(Event::CData(ref e)) => {
169                    if let Some(ref field_name) = current_field {
170                        let text = String::from_utf8_lossy(e);
171                        let text_content = text.trim();
172                        if !text_content.is_empty() {
173                            self.add_value(&mut values, field_name, text_content.to_string());
174                        }
175                    }
176                }
177                Ok(Event::Eof) => break,
178                Err(e) => {
179                    return Err(ParseError::XmlError {
180                        message: format!("XML parsing error: {}", e),
181                        location: crate::error::ErrorLocation {
182                            line: 0,
183                            column: 0,
184                            byte_offset: Some(xml_reader.buffer_position() as usize),
185                            path: "selective_parser".to_string(),
186                        },
187                    });
188                }
189                _ => {} // Skip other events for maximum speed
190            }
191            buf.clear();
192        }
193
194        Ok(SelectiveResult {
195            values,
196            elements_processed,
197            bytes_processed: xml_reader.buffer_position() as usize,
198            duration: start_time.elapsed(),
199        })
200    }
201
202    /// Ultra-fast ISRC extraction using pattern matching (10x+ faster than XML parsing)
203    pub fn extract_isrcs_fast<R: BufRead>(
204        &mut self,
205        mut reader: R,
206    ) -> Result<Vec<String>, ParseError> {
207        let mut isrcs = Vec::new();
208        let mut buffer = Vec::new();
209
210        // Read entire content for fast scanning
211        reader
212            .read_to_end(&mut buffer)
213            .map_err(|e| ParseError::Io {
214                message: format!("Failed to read input: {}", e),
215            })?;
216
217        // Convert to string for faster pattern matching
218        let content = std::str::from_utf8(&buffer).map_err(|e| ParseError::InvalidUtf8 {
219            position: 0,
220            error: e.to_string(),
221        })?;
222
223        // Ultra-fast pattern matching for ISRC tags
224        self.extract_isrcs_from_content(content, &mut isrcs);
225
226        // Remove duplicates and sort
227        isrcs.sort_unstable();
228        isrcs.dedup();
229
230        Ok(isrcs)
231    }
232
233    /// Extract ISRCs from content using fastest possible pattern matching
234    fn extract_isrcs_from_content(&self, content: &str, isrcs: &mut Vec<String>) {
235        // Look for ISRC patterns, handling both direct ISRC tags and SoundRecordingId with ISRC namespace
236        let mut pos = 0;
237        let content_len = content.len();
238
239        while pos < content_len {
240            // Look for any potential ISRC container tags
241            if let Some(isrc_pos) = self.find_next_isrc_tag(content, pos) {
242                pos = isrc_pos;
243
244                // Extract ISRC value from this position
245                if let Some((isrc, next_pos)) = self.extract_isrc_at_position(content, pos) {
246                    if self.is_valid_isrc(&isrc) {
247                        isrcs.push(isrc);
248                    }
249                    pos = next_pos;
250                } else {
251                    pos += 1;
252                }
253            } else {
254                break;
255            }
256        }
257    }
258
259    /// Find next potential ISRC tag position
260    fn find_next_isrc_tag(&self, content: &str, start_pos: usize) -> Option<usize> {
261        let search_slice = &content[start_pos..];
262
263        // Patterns to look for (ordered by likelihood)
264        let patterns = [
265            "<ISRC>",
266            "<ern:ISRC>",
267            "<SoundRecordingId",
268            "<ern:SoundRecordingId",
269        ];
270
271        let mut min_pos: Option<usize> = None;
272        for &pattern in &patterns {
273            if let Some(found_pos) = search_slice.find(pattern) {
274                let absolute_pos = start_pos + found_pos;
275                min_pos =
276                    Some(min_pos.map_or(absolute_pos, |current: usize| current.min(absolute_pos)));
277            }
278        }
279
280        min_pos
281    }
282
283    /// Extract ISRC value at a specific position
284    fn extract_isrc_at_position(&self, content: &str, pos: usize) -> Option<(String, usize)> {
285        let remaining = &content[pos..];
286
287        // Handle direct ISRC tags
288        if remaining.starts_with("<ISRC>") {
289            return self.extract_between_tags(content, pos, "<ISRC>", "</ISRC>");
290        }
291        if remaining.starts_with("<ern:ISRC>") {
292            return self.extract_between_tags(content, pos, "<ern:ISRC>", "</ern:ISRC>");
293        }
294
295        // Handle SoundRecordingId with Namespace="ISRC"
296        if remaining.starts_with("<SoundRecordingId")
297            || remaining.starts_with("<ern:SoundRecordingId")
298        {
299            // Find the closing > of the opening tag
300            if let Some(tag_end) = remaining.find('>') {
301                let opening_tag = &remaining[..=tag_end];
302
303                // Check if this has Namespace="ISRC"
304                if opening_tag.contains("Namespace=\"ISRC\"")
305                    || opening_tag.contains("Namespace='ISRC'")
306                {
307                    let content_start = pos + tag_end + 1;
308
309                    // Find the closing tag
310                    let closing_tag = if remaining.starts_with("<ern:") {
311                        "</ern:SoundRecordingId>"
312                    } else {
313                        "</SoundRecordingId>"
314                    };
315
316                    if let Some(closing_pos) = content[content_start..].find(closing_tag) {
317                        let content_end = content_start + closing_pos;
318                        let isrc = content[content_start..content_end].trim().to_string();
319                        return Some((isrc, content_end + closing_tag.len()));
320                    }
321                }
322            }
323        }
324
325        None
326    }
327
328    /// Extract content between opening and closing tags
329    fn extract_between_tags(
330        &self,
331        content: &str,
332        pos: usize,
333        open_tag: &str,
334        close_tag: &str,
335    ) -> Option<(String, usize)> {
336        let content_start = pos + open_tag.len();
337
338        if let Some(content_end_rel) = content[content_start..].find(close_tag) {
339            let content_end = content_start + content_end_rel;
340            let extracted = content[content_start..content_end].trim().to_string();
341            Some((extracted, content_end + close_tag.len()))
342        } else {
343            None
344        }
345    }
346
347    /// Check if element name matches target fields
348    fn is_target_field(&self, name: &str) -> bool {
349        if self.case_sensitive {
350            self.target_fields.contains(name)
351        } else {
352            self.target_fields
353                .iter()
354                .any(|field| field.eq_ignore_ascii_case(name))
355        }
356    }
357
358    /// Extract element name from QName (strips namespace prefix)
359    fn extract_element_name(&self, qname: &[u8]) -> Result<String, ParseError> {
360        let name_str = std::str::from_utf8(qname).map_err(|_| ParseError::Io {
361            message: "Invalid UTF-8 in element name".to_string(),
362        })?;
363
364        // Strip namespace prefix if present
365        let local_name = if let Some(colon_pos) = name_str.find(':') {
366            &name_str[colon_pos + 1..]
367        } else {
368            name_str
369        };
370
371        Ok(local_name.to_string())
372    }
373
374    /// Add value to results, handling duplicates
375    fn add_value(
376        &self,
377        values: &mut std::collections::HashMap<String, Vec<String>>,
378        field_name: &str,
379        value: String,
380    ) {
381        values
382            .entry(field_name.to_string())
383            .or_default()
384            .push(value);
385    }
386
387    /// Validate ISRC format (basic validation)
388    fn is_valid_isrc(&self, isrc: &str) -> bool {
389        // ISRC format: CCXXXYYNNNNN (12 characters)
390        // CC = Country code (2 letters)
391        // XXX = Registrant code (3 alphanumeric)
392        // YY = Year (2 digits)
393        // NNNNN = Designation code (5 digits)
394
395        if isrc.len() != 12 {
396            return false;
397        }
398
399        let chars: Vec<char> = isrc.chars().collect();
400
401        // Check country code (first 2 chars should be letters)
402        if !chars[0].is_ascii_alphabetic() || !chars[1].is_ascii_alphabetic() {
403            return false;
404        }
405
406        // Check registrant code (chars 2-4 should be alphanumeric)
407        for &ch in &chars[2..5] {
408            if !ch.is_ascii_alphanumeric() {
409                return false;
410            }
411        }
412
413        // Check year (chars 5-6 should be digits)
414        if !chars[5].is_ascii_digit() || !chars[6].is_ascii_digit() {
415            return false;
416        }
417
418        // Check designation code (chars 7-11 should be digits)
419        for &ch in &chars[7..12] {
420            if !ch.is_ascii_digit() {
421                return false;
422            }
423        }
424
425        true
426    }
427}
428
429#[cfg(test)]
430mod tests {
431    use super::*;
432    use std::io::Cursor;
433
434    #[test]
435    fn test_isrc_validation() {
436        let parser = SelectiveParser::for_isrcs();
437
438        assert!(parser.is_valid_isrc("USRC17607839"));
439        assert!(parser.is_valid_isrc("GBUM71505078"));
440        assert!(parser.is_valid_isrc("FRUM71200001"));
441
442        assert!(!parser.is_valid_isrc("USRC1760783")); // Too short
443        assert!(!parser.is_valid_isrc("USRC176078391")); // Too long
444        assert!(!parser.is_valid_isrc("12RC17607839")); // Invalid country code
445        assert!(!parser.is_valid_isrc("USRC1760783A")); // Invalid designation code
446    }
447
448    #[test]
449    fn test_selective_isrc_extraction() {
450        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
451        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
452            <ern:ResourceList>
453                <ern:SoundRecording>
454                    <ern:SoundRecordingId Namespace="ISRC">USRC17607839</ern:SoundRecordingId>
455                    <ern:ReferenceTitle>
456                        <ern:TitleText>Test Track</ern:TitleText>
457                    </ern:ReferenceTitle>
458                </ern:SoundRecording>
459                <ern:SoundRecording>
460                    <ern:SoundRecordingId Namespace="ISRC">GBUM71505078</ern:SoundRecordingId>
461                    <ern:ReferenceTitle>
462                        <ern:TitleText>Another Track</ern:TitleText>
463                    </ern:ReferenceTitle>
464                </ern:SoundRecording>
465            </ern:ResourceList>
466        </ern:NewReleaseMessage>"#;
467
468        let cursor = Cursor::new(xml.as_bytes());
469        let mut parser = SelectiveParser::for_isrcs();
470
471        let isrcs = parser.extract_isrcs(cursor).expect("Should extract ISRCs");
472
473        assert_eq!(isrcs.len(), 2);
474        assert!(isrcs.contains(&"USRC17607839".to_string()));
475        assert!(isrcs.contains(&"GBUM71505078".to_string()));
476    }
477
478    #[test]
479    fn test_fast_isrc_extraction() {
480        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
481        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
482            <ern:ResourceList>
483                <ern:SoundRecording>
484                    <ISRC>USRC17607839</ISRC>
485                    <ern:ReferenceTitle>
486                        <ern:TitleText>Test Track</ern:TitleText>
487                    </ern:ReferenceTitle>
488                </ern:SoundRecording>
489            </ern:ResourceList>
490        </ern:NewReleaseMessage>"#;
491
492        let cursor = Cursor::new(xml.as_bytes());
493        let mut parser = SelectiveParser::for_isrcs();
494
495        let isrcs = parser
496            .extract_isrcs_fast(cursor)
497            .expect("Should extract ISRCs");
498
499        assert_eq!(isrcs.len(), 1);
500        assert_eq!(isrcs[0], "USRC17607839");
501    }
502
503    #[test]
504    fn test_selective_field_extraction() {
505        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
506        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
507            <ern:ReleaseList>
508                <ern:Release>
509                    <ern:ReleaseId>REL001</ern:ReleaseId>
510                    <ern:ReleaseReference>R001</ern:ReleaseReference>
511                    <ern:ReferenceTitle>
512                        <ern:TitleText>My Album</ern:TitleText>
513                    </ern:ReferenceTitle>
514                </ern:Release>
515            </ern:ReleaseList>
516        </ern:NewReleaseMessage>"#;
517
518        let cursor = Cursor::new(xml.as_bytes());
519        let mut parser = SelectiveParser::for_release_metadata();
520
521        let result = parser
522            .extract_fields(cursor)
523            .expect("Should extract fields");
524
525        assert!(result.values.contains_key("ReleaseId"));
526        assert!(result.values.contains_key("ReleaseReference"));
527        assert!(result.values.contains_key("TitleText"));
528
529        assert_eq!(result.values["ReleaseId"][0], "REL001");
530        assert_eq!(result.values["ReleaseReference"][0], "R001");
531        assert_eq!(result.values["TitleText"][0], "My Album");
532
533        println!("Extraction results: {:#?}", result);
534    }
535
536    #[test]
537    fn test_performance_comparison() {
538        // Generate larger test data
539        let mut xml = String::from(
540            r#"<?xml version="1.0" encoding="UTF-8"?>
541        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
542            <ern:ResourceList>"#,
543        );
544
545        for i in 0..1000 {
546            xml.push_str(&format!(
547                r#"
548                <ern:SoundRecording>
549                    <ern:SoundRecordingId Namespace="ISRC">USRC{:08}</ern:SoundRecordingId>
550                    <ern:ReferenceTitle>
551                        <ern:TitleText>Test Track {}</ern:TitleText>
552                    </ern:ReferenceTitle>
553                </ern:SoundRecording>"#,
554                17600000 + i,
555                i
556            ));
557        }
558
559        xml.push_str("</ern:ResourceList></ern:NewReleaseMessage>");
560
561        // Test standard extraction
562        let cursor1 = Cursor::new(xml.as_bytes());
563        let mut parser1 = SelectiveParser::for_isrcs();
564        let start1 = std::time::Instant::now();
565        let isrcs1 = parser1
566            .extract_isrcs(cursor1)
567            .expect("Standard extraction should work");
568        let duration1 = start1.elapsed();
569
570        // Test fast extraction
571        let cursor2 = Cursor::new(xml.as_bytes());
572        let mut parser2 = SelectiveParser::for_isrcs();
573        let start2 = std::time::Instant::now();
574        let isrcs2 = parser2
575            .extract_isrcs_fast(cursor2)
576            .expect("Fast extraction should work");
577        let duration2 = start2.elapsed();
578
579        println!(
580            "Standard extraction: {} ISRCs in {:?}",
581            isrcs1.len(),
582            duration1
583        );
584        println!("Fast extraction: {} ISRCs in {:?}", isrcs2.len(), duration2);
585
586        // Both methods should find the same ISRCs
587        assert_eq!(isrcs1.len(), 1000);
588        assert_eq!(isrcs2.len(), 1000);
589
590        // Fast method should be faster (though results may vary in debug mode)
591        println!(
592            "Fast extraction speedup: {:.2}x",
593            duration1.as_nanos() as f64 / duration2.as_nanos() as f64
594        );
595    }
596}