// ddex_parser/parser/selective_parser.rs

// src/parser/selective_parser.rs
//! Fast selective parsing for extracting specific fields such as ISRCs.
3
4use crate::error::ParseError;
5use quick_xml::{events::Event, Reader};
6use std::collections::HashSet;
7use std::io::BufRead;
8
/// Streaming extractor that pulls the text of a chosen set of XML elements
/// without building a full document model.
#[derive(Debug, Clone)]
pub struct SelectiveParser {
    /// Local element names whose content should be collected
    /// (for example `"ISRC"` or `"ReleaseId"`).
    target_fields: HashSet<String>,
    /// `true` requires an exact name match; `false` compares names while
    /// ignoring ASCII case.
    case_sensitive: bool,
    /// Deepest element nesting level that is inspected; `0` means no limit.
    max_depth: usize,
}
19
/// Outcome of one selective extraction pass.
#[derive(Debug, Clone)]
pub struct SelectiveResult {
    /// Collected values, keyed by the element name they were found under;
    /// each key holds its values in the order they were encountered.
    pub values: std::collections::HashMap<String, Vec<String>>,
    /// Number of elements (opening and self-closing tags) seen during the pass.
    pub elements_processed: usize,
    /// Reader position, in bytes, when the pass finished.
    pub bytes_processed: usize,
    /// Wall-clock time the pass took.
    pub duration: std::time::Duration,
}
32
33impl SelectiveParser {
34    /// Create a new selective parser targeting specific fields
35    pub fn new(target_fields: Vec<String>) -> Self {
36        Self {
37            target_fields: target_fields.into_iter().collect(),
38            case_sensitive: false,
39            max_depth: 0,
40        }
41    }
42
43    /// Create a parser specifically for ISRC extraction
44    pub fn for_isrcs() -> Self {
45        Self::new(vec![
46            "ISRC".to_string(),
47            "SoundRecordingId".to_string(),
48            "ResourceId".to_string(),
49        ])
50    }
51
52    /// Create a parser for release metadata extraction
53    pub fn for_release_metadata() -> Self {
54        Self::new(vec![
55            "ReleaseId".to_string(),
56            "ReleaseReference".to_string(),
57            "TitleText".to_string(),
58            "DisplayArtist".to_string(),
59            "ReleaseDate".to_string(),
60        ])
61    }
62
63    /// Set case sensitivity for field matching
64    pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
65        self.case_sensitive = case_sensitive;
66        self
67    }
68
69    /// Set maximum depth to search
70    pub fn max_depth(mut self, max_depth: usize) -> Self {
71        self.max_depth = max_depth;
72        self
73    }
74
75    /// Extract ISRCs from XML with maximum performance
76    pub fn extract_isrcs<R: BufRead>(&mut self, reader: R) -> Result<Vec<String>, ParseError> {
77        let result = self.extract_fields(reader)?;
78
79        let mut isrcs = Vec::new();
80
81        // Collect ISRCs from all possible field names
82        for field_name in &["ISRC", "SoundRecordingId", "ResourceId"] {
83            if let Some(values) = result.values.get(*field_name) {
84                for value in values {
85                    // Extract ISRC from value (might be in format "Namespace:Value")
86                    let isrc = if value.contains(':') {
87                        value.split(':').nth(1).unwrap_or(value).to_string()
88                    } else {
89                        value.clone()
90                    };
91
92                    // Validate ISRC format (12 characters: CCXXXYYNNNNN)
93                    if self.is_valid_isrc(&isrc) {
94                        isrcs.push(isrc);
95                    }
96                }
97            }
98        }
99
100        isrcs.sort();
101        isrcs.dedup();
102        Ok(isrcs)
103    }
104
105    /// Extract all targeted fields from XML
106    pub fn extract_fields<R: BufRead>(&mut self, reader: R) -> Result<SelectiveResult, ParseError> {
107        let start_time = std::time::Instant::now();
108        let mut xml_reader = Reader::from_reader(reader);
109        xml_reader.config_mut().trim_text(true);
110
111        let mut values: std::collections::HashMap<String, Vec<String>> =
112            std::collections::HashMap::new();
113        let mut buf = Vec::new();
114        let mut current_field = None::<String>;
115        let mut depth = 0;
116        let mut elements_processed = 0;
117
118        loop {
119            match xml_reader.read_event_into(&mut buf) {
120                Ok(Event::Start(ref e)) => {
121                    depth += 1;
122                    elements_processed += 1;
123
124                    // Check depth limit
125                    if self.max_depth > 0 && depth > self.max_depth {
126                        buf.clear();
127                        continue;
128                    }
129
130                    let element_name = self.extract_element_name(e.name().as_ref())?;
131
132                    // Check if this is a target field
133                    if self.is_target_field(&element_name) {
134                        current_field = Some(element_name);
135                    }
136                }
137                Ok(Event::End(_)) => {
138                    depth = depth.saturating_sub(1);
139                    current_field = None;
140                }
141                Ok(Event::Empty(ref e)) => {
142                    elements_processed += 1;
143
144                    let element_name = self.extract_element_name(e.name().as_ref())?;
145
146                    // For self-closing elements, check attributes
147                    if self.is_target_field(&element_name) {
148                        if let Ok(attributes) = e.attributes().collect::<Result<Vec<_>, _>>() {
149                            for attr in attributes {
150                                let attr_value = String::from_utf8_lossy(&attr.value);
151                                self.add_value(&mut values, &element_name, attr_value.to_string());
152                            }
153                        }
154                    }
155                }
156                Ok(Event::Text(ref e)) => {
157                    if let Some(ref field_name) = current_field {
158                        // Use utf8_utils for proper UTF-8 handling
159                        let current_pos = xml_reader.buffer_position() as usize;
160                        let text = crate::utf8_utils::handle_text_node(e, current_pos)?;
161
162                        let text_content = text.trim();
163                        if !text_content.is_empty() {
164                            self.add_value(&mut values, field_name, text_content.to_string());
165                        }
166                    }
167                }
168                Ok(Event::CData(ref e)) => {
169                    if let Some(ref field_name) = current_field {
170                        let text = String::from_utf8_lossy(e);
171                        let text_content = text.trim();
172                        if !text_content.is_empty() {
173                            self.add_value(&mut values, field_name, text_content.to_string());
174                        }
175                    }
176                }
177                Ok(Event::Eof) => break,
178                Err(e) => {
179                    return Err(ParseError::XmlError(format!("XML parsing error: {}", e)));
180                }
181                _ => {} // Skip other events for maximum speed
182            }
183            buf.clear();
184        }
185
186        Ok(SelectiveResult {
187            values,
188            elements_processed,
189            bytes_processed: xml_reader.buffer_position() as usize,
190            duration: start_time.elapsed(),
191        })
192    }
193
194    /// Ultra-fast ISRC extraction using pattern matching (10x+ faster than XML parsing)
195    pub fn extract_isrcs_fast<R: BufRead>(
196        &mut self,
197        mut reader: R,
198    ) -> Result<Vec<String>, ParseError> {
199        let mut isrcs = Vec::new();
200        let mut buffer = Vec::new();
201
202        // Read entire content for fast scanning
203        reader
204            .read_to_end(&mut buffer)
205            .map_err(|e| ParseError::IoError(format!("Failed to read input: {}", e)))?;
206
207        // Convert to string for faster pattern matching
208        let content = std::str::from_utf8(&buffer).map_err(|e| ParseError::InvalidUtf8 {
209            message: format!("UTF-8 decoding error at position 0: {}", e),
210        })?;
211
212        // Ultra-fast pattern matching for ISRC tags
213        self.extract_isrcs_from_content(content, &mut isrcs);
214
215        // Remove duplicates and sort
216        isrcs.sort_unstable();
217        isrcs.dedup();
218
219        Ok(isrcs)
220    }
221
222    /// Extract ISRCs from content using fastest possible pattern matching
223    fn extract_isrcs_from_content(&self, content: &str, isrcs: &mut Vec<String>) {
224        // Look for ISRC patterns, handling both direct ISRC tags and SoundRecordingId with ISRC namespace
225        let mut pos = 0;
226        let content_len = content.len();
227
228        while pos < content_len {
229            // Look for any potential ISRC container tags
230            if let Some(isrc_pos) = self.find_next_isrc_tag(content, pos) {
231                pos = isrc_pos;
232
233                // Extract ISRC value from this position
234                if let Some((isrc, next_pos)) = self.extract_isrc_at_position(content, pos) {
235                    if self.is_valid_isrc(&isrc) {
236                        isrcs.push(isrc);
237                    }
238                    pos = next_pos;
239                } else {
240                    pos += 1;
241                }
242            } else {
243                break;
244            }
245        }
246    }
247
248    /// Find next potential ISRC tag position
249    fn find_next_isrc_tag(&self, content: &str, start_pos: usize) -> Option<usize> {
250        let search_slice = &content[start_pos..];
251
252        // Patterns to look for (ordered by likelihood)
253        let patterns = [
254            "<ISRC>",
255            "<ern:ISRC>",
256            "<SoundRecordingId",
257            "<ern:SoundRecordingId",
258        ];
259
260        let mut min_pos: Option<usize> = None;
261        for &pattern in &patterns {
262            if let Some(found_pos) = search_slice.find(pattern) {
263                let absolute_pos = start_pos + found_pos;
264                min_pos =
265                    Some(min_pos.map_or(absolute_pos, |current: usize| current.min(absolute_pos)));
266            }
267        }
268
269        min_pos
270    }
271
272    /// Extract ISRC value at a specific position
273    fn extract_isrc_at_position(&self, content: &str, pos: usize) -> Option<(String, usize)> {
274        let remaining = &content[pos..];
275
276        // Handle direct ISRC tags
277        if remaining.starts_with("<ISRC>") {
278            return self.extract_between_tags(content, pos, "<ISRC>", "</ISRC>");
279        }
280        if remaining.starts_with("<ern:ISRC>") {
281            return self.extract_between_tags(content, pos, "<ern:ISRC>", "</ern:ISRC>");
282        }
283
284        // Handle SoundRecordingId with Namespace="ISRC"
285        if remaining.starts_with("<SoundRecordingId")
286            || remaining.starts_with("<ern:SoundRecordingId")
287        {
288            // Find the closing > of the opening tag
289            if let Some(tag_end) = remaining.find('>') {
290                let opening_tag = &remaining[..=tag_end];
291
292                // Check if this has Namespace="ISRC"
293                if opening_tag.contains("Namespace=\"ISRC\"")
294                    || opening_tag.contains("Namespace='ISRC'")
295                {
296                    let content_start = pos + tag_end + 1;
297
298                    // Find the closing tag
299                    let closing_tag = if remaining.starts_with("<ern:") {
300                        "</ern:SoundRecordingId>"
301                    } else {
302                        "</SoundRecordingId>"
303                    };
304
305                    if let Some(closing_pos) = content[content_start..].find(closing_tag) {
306                        let content_end = content_start + closing_pos;
307                        let isrc = content[content_start..content_end].trim().to_string();
308                        return Some((isrc, content_end + closing_tag.len()));
309                    }
310                }
311            }
312        }
313
314        None
315    }
316
317    /// Extract content between opening and closing tags
318    fn extract_between_tags(
319        &self,
320        content: &str,
321        pos: usize,
322        open_tag: &str,
323        close_tag: &str,
324    ) -> Option<(String, usize)> {
325        let content_start = pos + open_tag.len();
326
327        if let Some(content_end_rel) = content[content_start..].find(close_tag) {
328            let content_end = content_start + content_end_rel;
329            let extracted = content[content_start..content_end].trim().to_string();
330            Some((extracted, content_end + close_tag.len()))
331        } else {
332            None
333        }
334    }
335
336    /// Check if element name matches target fields
337    fn is_target_field(&self, name: &str) -> bool {
338        if self.case_sensitive {
339            self.target_fields.contains(name)
340        } else {
341            self.target_fields
342                .iter()
343                .any(|field| field.eq_ignore_ascii_case(name))
344        }
345    }
346
347    /// Extract element name from QName (strips namespace prefix)
348    fn extract_element_name(&self, qname: &[u8]) -> Result<String, ParseError> {
349        let name_str = std::str::from_utf8(qname).map_err(|_| ParseError::IoError(
350            "Invalid UTF-8 in element name".to_string(),
351        ))?;
352
353        // Strip namespace prefix if present
354        let local_name = if let Some(colon_pos) = name_str.find(':') {
355            &name_str[colon_pos + 1..]
356        } else {
357            name_str
358        };
359
360        Ok(local_name.to_string())
361    }
362
363    /// Add value to results, handling duplicates
364    fn add_value(
365        &self,
366        values: &mut std::collections::HashMap<String, Vec<String>>,
367        field_name: &str,
368        value: String,
369    ) {
370        values
371            .entry(field_name.to_string())
372            .or_default()
373            .push(value);
374    }
375
376    /// Validate ISRC format (basic validation)
377    fn is_valid_isrc(&self, isrc: &str) -> bool {
378        // ISRC format: CCXXXYYNNNNN (12 characters)
379        // CC = Country code (2 letters)
380        // XXX = Registrant code (3 alphanumeric)
381        // YY = Year (2 digits)
382        // NNNNN = Designation code (5 digits)
383
384        if isrc.len() != 12 {
385            return false;
386        }
387
388        let chars: Vec<char> = isrc.chars().collect();
389
390        // Check country code (first 2 chars should be letters)
391        if !chars[0].is_ascii_alphabetic() || !chars[1].is_ascii_alphabetic() {
392            return false;
393        }
394
395        // Check registrant code (chars 2-4 should be alphanumeric)
396        for &ch in &chars[2..5] {
397            if !ch.is_ascii_alphanumeric() {
398                return false;
399            }
400        }
401
402        // Check year (chars 5-6 should be digits)
403        if !chars[5].is_ascii_digit() || !chars[6].is_ascii_digit() {
404            return false;
405        }
406
407        // Check designation code (chars 7-11 should be digits)
408        for &ch in &chars[7..12] {
409            if !ch.is_ascii_digit() {
410                return false;
411            }
412        }
413
414        true
415    }
416}
417
418#[cfg(test)]
419mod tests {
420    use super::*;
421    use std::io::Cursor;
422
423    #[test]
424    fn test_isrc_validation() {
425        let parser = SelectiveParser::for_isrcs();
426
427        assert!(parser.is_valid_isrc("USRC17607839"));
428        assert!(parser.is_valid_isrc("GBUM71505078"));
429        assert!(parser.is_valid_isrc("FRUM71200001"));
430
431        assert!(!parser.is_valid_isrc("USRC1760783")); // Too short
432        assert!(!parser.is_valid_isrc("USRC176078391")); // Too long
433        assert!(!parser.is_valid_isrc("12RC17607839")); // Invalid country code
434        assert!(!parser.is_valid_isrc("USRC1760783A")); // Invalid designation code
435    }
436
437    #[test]
438    fn test_selective_isrc_extraction() {
439        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
440        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
441            <ern:ResourceList>
442                <ern:SoundRecording>
443                    <ern:SoundRecordingId Namespace="ISRC">USRC17607839</ern:SoundRecordingId>
444                    <ern:ReferenceTitle>
445                        <ern:TitleText>Test Track</ern:TitleText>
446                    </ern:ReferenceTitle>
447                </ern:SoundRecording>
448                <ern:SoundRecording>
449                    <ern:SoundRecordingId Namespace="ISRC">GBUM71505078</ern:SoundRecordingId>
450                    <ern:ReferenceTitle>
451                        <ern:TitleText>Another Track</ern:TitleText>
452                    </ern:ReferenceTitle>
453                </ern:SoundRecording>
454            </ern:ResourceList>
455        </ern:NewReleaseMessage>"#;
456
457        let cursor = Cursor::new(xml.as_bytes());
458        let mut parser = SelectiveParser::for_isrcs();
459
460        let isrcs = parser.extract_isrcs(cursor).expect("Should extract ISRCs");
461
462        assert_eq!(isrcs.len(), 2);
463        assert!(isrcs.contains(&"USRC17607839".to_string()));
464        assert!(isrcs.contains(&"GBUM71505078".to_string()));
465    }
466
467    #[test]
468    fn test_fast_isrc_extraction() {
469        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
470        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
471            <ern:ResourceList>
472                <ern:SoundRecording>
473                    <ISRC>USRC17607839</ISRC>
474                    <ern:ReferenceTitle>
475                        <ern:TitleText>Test Track</ern:TitleText>
476                    </ern:ReferenceTitle>
477                </ern:SoundRecording>
478            </ern:ResourceList>
479        </ern:NewReleaseMessage>"#;
480
481        let cursor = Cursor::new(xml.as_bytes());
482        let mut parser = SelectiveParser::for_isrcs();
483
484        let isrcs = parser
485            .extract_isrcs_fast(cursor)
486            .expect("Should extract ISRCs");
487
488        assert_eq!(isrcs.len(), 1);
489        assert_eq!(isrcs[0], "USRC17607839");
490    }
491
492    #[test]
493    fn test_selective_field_extraction() {
494        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
495        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
496            <ern:ReleaseList>
497                <ern:Release>
498                    <ern:ReleaseId>REL001</ern:ReleaseId>
499                    <ern:ReleaseReference>R001</ern:ReleaseReference>
500                    <ern:ReferenceTitle>
501                        <ern:TitleText>My Album</ern:TitleText>
502                    </ern:ReferenceTitle>
503                </ern:Release>
504            </ern:ReleaseList>
505        </ern:NewReleaseMessage>"#;
506
507        let cursor = Cursor::new(xml.as_bytes());
508        let mut parser = SelectiveParser::for_release_metadata();
509
510        let result = parser
511            .extract_fields(cursor)
512            .expect("Should extract fields");
513
514        assert!(result.values.contains_key("ReleaseId"));
515        assert!(result.values.contains_key("ReleaseReference"));
516        assert!(result.values.contains_key("TitleText"));
517
518        assert_eq!(result.values["ReleaseId"][0], "REL001");
519        assert_eq!(result.values["ReleaseReference"][0], "R001");
520        assert_eq!(result.values["TitleText"][0], "My Album");
521
522        println!("Extraction results: {:#?}", result);
523    }
524
525    #[test]
526    fn test_performance_comparison() {
527        // Generate larger test data
528        let mut xml = String::from(
529            r#"<?xml version="1.0" encoding="UTF-8"?>
530        <ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
531            <ern:ResourceList>"#,
532        );
533
534        for i in 0..1000 {
535            xml.push_str(&format!(
536                r#"
537                <ern:SoundRecording>
538                    <ern:SoundRecordingId Namespace="ISRC">USRC{:08}</ern:SoundRecordingId>
539                    <ern:ReferenceTitle>
540                        <ern:TitleText>Test Track {}</ern:TitleText>
541                    </ern:ReferenceTitle>
542                </ern:SoundRecording>"#,
543                17600000 + i,
544                i
545            ));
546        }
547
548        xml.push_str("</ern:ResourceList></ern:NewReleaseMessage>");
549
550        // Test standard extraction
551        let cursor1 = Cursor::new(xml.as_bytes());
552        let mut parser1 = SelectiveParser::for_isrcs();
553        let start1 = std::time::Instant::now();
554        let isrcs1 = parser1
555            .extract_isrcs(cursor1)
556            .expect("Standard extraction should work");
557        let duration1 = start1.elapsed();
558
559        // Test fast extraction
560        let cursor2 = Cursor::new(xml.as_bytes());
561        let mut parser2 = SelectiveParser::for_isrcs();
562        let start2 = std::time::Instant::now();
563        let isrcs2 = parser2
564            .extract_isrcs_fast(cursor2)
565            .expect("Fast extraction should work");
566        let duration2 = start2.elapsed();
567
568        println!(
569            "Standard extraction: {} ISRCs in {:?}",
570            isrcs1.len(),
571            duration1
572        );
573        println!("Fast extraction: {} ISRCs in {:?}", isrcs2.len(), duration2);
574
575        // Both methods should find the same ISRCs
576        assert_eq!(isrcs1.len(), 1000);
577        assert_eq!(isrcs2.len(), 1000);
578
579        // Fast method should be faster (though results may vary in debug mode)
580        println!(
581            "Fast extraction speedup: {:.2}x",
582            duration1.as_nanos() as f64 / duration2.as_nanos() as f64
583        );
584    }
585}
// ddex_parser/parser/selective_parser.rs