use crate::error::ParseError;
use quick_xml::{events::Event, Reader};
use std::collections::HashSet;
use std::io::BufRead;
/// Streaming extractor that collects the content of selected XML elements.
///
/// Elements are matched by local name, i.e. with any `prefix:` stripped.
#[derive(Debug, Clone)]
pub struct SelectiveParser {
/// Local names of the elements whose content is collected.
target_fields: HashSet<String>,
/// When `false`, element names are compared ASCII-case-insensitively.
case_sensitive: bool,
/// Deepest start-tag nesting level considered for matching; `0` disables the limit.
max_depth: usize,
}
/// Outcome of a `SelectiveParser::extract_fields` run.
#[derive(Debug, Clone)]
pub struct SelectiveResult {
/// Collected values keyed by element local name as spelled in the document.
/// Values are text/CDATA content, or attribute values for empty elements,
/// appended in the order they were encountered.
pub values: std::collections::HashMap<String, Vec<String>>,
/// Number of start tags and empty-element tags encountered.
pub elements_processed: usize,
/// Position reported by the XML reader when parsing finished.
pub bytes_processed: usize,
/// Time elapsed between entering and leaving the extraction call.
pub duration: std::time::Duration,
}
impl SelectiveParser {
    /// Element local names whose text may carry an ISRC.
    const ISRC_FIELDS: [&'static str; 3] = ["ISRC", "SoundRecordingId", "ResourceId"];

    /// Creates a parser that collects the content of the given element names.
    ///
    /// The parser starts out case-insensitive and without a depth limit; use
    /// `case_sensitive()` and `max_depth()` to change that.
    pub fn new(target_fields: Vec<String>) -> Self {
        Self {
            target_fields: target_fields.into_iter().collect(),
            case_sensitive: false,
            max_depth: 0,
        }
    }

    /// Creates a parser preconfigured for the elements that may carry an ISRC.
    pub fn for_isrcs() -> Self {
        Self::new(
            Self::ISRC_FIELDS
                .iter()
                .map(|name| (*name).to_string())
                .collect(),
        )
    }

    /// Creates a parser preconfigured for common release-level metadata elements.
    pub fn for_release_metadata() -> Self {
        let fields = [
            "ReleaseId",
            "ReleaseReference",
            "TitleText",
            "DisplayArtist",
            "ReleaseDate",
        ];
        Self::new(fields.iter().map(|name| (*name).to_string()).collect())
    }

    /// Builder-style setter: choose whether element names must match exactly.
    pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
        self.case_sensitive = case_sensitive;
        self
    }

    /// Builder-style setter: limit matching to elements at most `max_depth`
    /// levels deep (`0` means unlimited).
    pub fn max_depth(mut self, max_depth: usize) -> Self {
        self.max_depth = max_depth;
        self
    }

    /// Parses `reader` as XML and returns the sorted, de-duplicated ISRCs found
    /// in the ISRC-bearing elements.
    ///
    /// A value of the form `prefix:value[:...]` is reduced to the segment
    /// between the first and second colon before validation. Values that do
    /// not look like an ISRC are dropped.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`extract_fields`](Self::extract_fields).
    pub fn extract_isrcs<R: BufRead>(&mut self, reader: R) -> Result<Vec<String>, ParseError> {
        let result = self.extract_fields(reader)?;

        // `extract_fields` keys its map by the name as spelled in the document,
        // so in case-insensitive mode the key may differ in case from the
        // canonical field name; compare with the same rule used for matching.
        let case_sensitive = self.case_sensitive;
        let is_isrc_field = |key: &str| {
            Self::ISRC_FIELDS.iter().any(|field| {
                if case_sensitive {
                    *field == key
                } else {
                    field.eq_ignore_ascii_case(key)
                }
            })
        };

        let mut isrcs: Vec<String> = result
            .values
            .iter()
            .filter(|(key, _)| is_isrc_field(key.as_str()))
            .flat_map(|(_, values)| values.iter())
            .map(|value| value.split(':').nth(1).unwrap_or(value))
            .filter(|candidate| self.is_valid_isrc(candidate))
            .map(str::to_owned)
            .collect();

        isrcs.sort_unstable();
        isrcs.dedup();
        Ok(isrcs)
    }

    /// Streams `reader` as XML and collects the content of every target element.
    ///
    /// * Text and CDATA are recorded for the most recently opened target
    ///   element until the next end tag of any element is seen.
    /// * For an empty target element (`<X a="1"/>`) the attribute values are
    ///   recorded instead.
    /// * Start tags and empty-element tags nested deeper than `max_depth` are
    ///   counted but never matched.
    ///
    /// # Errors
    ///
    /// Returns `ParseError::XmlError` when the XML reader fails, and propagates
    /// errors from element-name decoding and text decoding.
    pub fn extract_fields<R: BufRead>(&mut self, reader: R) -> Result<SelectiveResult, ParseError> {
        let start_time = std::time::Instant::now();
        let mut xml_reader = Reader::from_reader(reader);
        xml_reader.config_mut().trim_text(true);

        let mut values: std::collections::HashMap<String, Vec<String>> =
            std::collections::HashMap::new();
        let mut buf = Vec::new();
        let mut current_field: Option<String> = None;
        let mut depth: usize = 0;
        let mut elements_processed: usize = 0;

        loop {
            match xml_reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    depth += 1;
                    elements_processed += 1;
                    if self.within_max_depth(depth) {
                        let element_name = self.extract_element_name(e.name().as_ref())?;
                        if self.is_target_field(&element_name) {
                            current_field = Some(element_name);
                        }
                    }
                }
                Ok(Event::End(_)) => {
                    depth = depth.saturating_sub(1);
                    current_field = None;
                }
                Ok(Event::Empty(ref e)) => {
                    elements_processed += 1;
                    // `<X/>` is equivalent to `<X></X>`, i.e. one level below the
                    // current depth, so it is subject to the same depth limit.
                    if self.within_max_depth(depth + 1) {
                        let element_name = self.extract_element_name(e.name().as_ref())?;
                        if self.is_target_field(&element_name) {
                            // Best effort: if any attribute is malformed, the
                            // element contributes no values at all.
                            if let Ok(attributes) = e.attributes().collect::<Result<Vec<_>, _>>() {
                                for attr in attributes {
                                    let attr_value = String::from_utf8_lossy(&attr.value);
                                    self.add_value(
                                        &mut values,
                                        &element_name,
                                        attr_value.into_owned(),
                                    );
                                }
                            }
                        }
                    }
                }
                Ok(Event::Text(ref e)) => {
                    if let Some(ref field_name) = current_field {
                        let current_pos = xml_reader.buffer_position() as usize;
                        let text = crate::utf8_utils::handle_text_node(e, current_pos)?;
                        let text_content = text.trim();
                        if !text_content.is_empty() {
                            self.add_value(&mut values, field_name, text_content.to_string());
                        }
                    }
                }
                Ok(Event::CData(ref e)) => {
                    if let Some(ref field_name) = current_field {
                        let text = String::from_utf8_lossy(e);
                        let text_content = text.trim();
                        if !text_content.is_empty() {
                            self.add_value(&mut values, field_name, text_content.to_string());
                        }
                    }
                }
                Ok(Event::Eof) => break,
                Err(e) => {
                    return Err(ParseError::XmlError(format!("XML parsing error: {}", e)));
                }
                _ => {}
            }
            buf.clear();
        }

        Ok(SelectiveResult {
            values,
            elements_processed,
            bytes_processed: xml_reader.buffer_position() as usize,
            duration: start_time.elapsed(),
        })
    }

    /// Extracts ISRCs by plain substring scanning instead of XML parsing.
    ///
    /// The whole input is read into memory and searched for `<ISRC>`,
    /// `<ern:ISRC>` and `SoundRecordingId` elements that declare
    /// `Namespace="ISRC"`. The result is sorted and de-duplicated.
    ///
    /// # Errors
    ///
    /// Returns `ParseError::IoError` when reading fails and
    /// `ParseError::InvalidUtf8` when the input is not valid UTF-8.
    pub fn extract_isrcs_fast<R: BufRead>(
        &mut self,
        mut reader: R,
    ) -> Result<Vec<String>, ParseError> {
        let mut buffer = Vec::new();
        reader
            .read_to_end(&mut buffer)
            .map_err(|e| ParseError::IoError(format!("Failed to read input: {}", e)))?;

        let content = std::str::from_utf8(&buffer).map_err(|e| ParseError::InvalidUtf8 {
            // Report where decoding actually stopped rather than a fixed offset.
            message: format!(
                "UTF-8 decoding error at position {}: {}",
                e.valid_up_to(),
                e
            ),
        })?;

        let mut isrcs = Vec::new();
        self.extract_isrcs_from_content(content, &mut isrcs);
        isrcs.sort_unstable();
        isrcs.dedup();
        Ok(isrcs)
    }

    /// Walks `content` tag by tag and appends every valid ISRC to `isrcs`.
    fn extract_isrcs_from_content(&self, content: &str, isrcs: &mut Vec<String>) {
        let mut pos = 0;
        while let Some(tag_pos) = self.find_next_isrc_tag(content, pos) {
            pos = match self.extract_isrc_at_position(content, tag_pos) {
                Some((isrc, next_pos)) => {
                    if self.is_valid_isrc(&isrc) {
                        isrcs.push(isrc);
                    }
                    next_pos
                }
                // Nothing usable at this tag: step past its `<` (one byte) and go on.
                None => tag_pos + 1,
            };
        }
    }

    /// Returns the byte offset of the first ISRC-bearing opening tag at or
    /// after `start_pos`, or `None` when there is none (or `start_pos` is not
    /// a valid offset into `content`).
    fn find_next_isrc_tag(&self, content: &str, start_pos: usize) -> Option<usize> {
        const TAG_PREFIXES: [&str; 4] = [
            "<ISRC>",
            "<ern:ISRC>",
            "<SoundRecordingId",
            "<ern:SoundRecordingId",
        ];

        let haystack = content.get(start_pos..)?;
        // Every prefix starts with `<`, so a single pass over the `<` positions
        // finds the earliest match without rescanning the input per prefix.
        haystack
            .match_indices('<')
            .map(|(offset, _)| offset)
            .find(|&offset| {
                TAG_PREFIXES
                    .iter()
                    .any(|prefix| haystack[offset..].starts_with(*prefix))
            })
            .map(|offset| start_pos + offset)
    }

    /// Reads the element that starts at byte offset `pos`.
    ///
    /// Returns the trimmed element text together with the offset just past the
    /// closing tag, or `None` when the tag at `pos` is not an ISRC carrier, is
    /// self-closing, or has no closing tag.
    fn extract_isrc_at_position(&self, content: &str, pos: usize) -> Option<(String, usize)> {
        let remaining = content.get(pos..)?;

        if remaining.starts_with("<ISRC>") {
            return self.extract_between_tags(content, pos, "<ISRC>", "</ISRC>");
        }
        if remaining.starts_with("<ern:ISRC>") {
            return self.extract_between_tags(content, pos, "<ern:ISRC>", "</ern:ISRC>");
        }

        let closing_tag = if remaining.starts_with("<ern:SoundRecordingId") {
            "</ern:SoundRecordingId>"
        } else if remaining.starts_with("<SoundRecordingId") {
            "</SoundRecordingId>"
        } else {
            return None;
        };

        let tag_end = remaining.find('>')?;
        let opening_tag = &remaining[..=tag_end];
        let declares_isrc = opening_tag.contains("Namespace=\"ISRC\"")
            || opening_tag.contains("Namespace='ISRC'");
        if !declares_isrc {
            return None;
        }
        // A self-closing tag has no text; searching for a closing tag from here
        // would consume (and lose) whatever element follows.
        if opening_tag.ends_with("/>") {
            return None;
        }

        let content_start = pos + tag_end + 1;
        let closing_pos = content[content_start..].find(closing_tag)?;
        let content_end = content_start + closing_pos;
        let isrc = content[content_start..content_end].trim().to_string();
        Some((isrc, content_end + closing_tag.len()))
    }

    /// Returns the trimmed text between `open_tag` (which starts at `pos`) and
    /// the next `close_tag`, plus the offset just past that closing tag.
    fn extract_between_tags(
        &self,
        content: &str,
        pos: usize,
        open_tag: &str,
        close_tag: &str,
    ) -> Option<(String, usize)> {
        let content_start = pos + open_tag.len();
        let relative_end = content.get(content_start..)?.find(close_tag)?;
        let content_end = content_start + relative_end;
        let extracted = content[content_start..content_end].trim().to_string();
        Some((extracted, content_end + close_tag.len()))
    }

    /// Whether an element at nesting level `depth` is within the configured
    /// limit (`max_depth == 0` means no limit).
    fn within_max_depth(&self, depth: usize) -> bool {
        self.max_depth == 0 || depth <= self.max_depth
    }

    /// Whether `name` is one of the configured target fields, honouring the
    /// `case_sensitive` setting.
    fn is_target_field(&self, name: &str) -> bool {
        if self.case_sensitive {
            self.target_fields.contains(name)
        } else {
            self.target_fields
                .iter()
                .any(|field| field.eq_ignore_ascii_case(name))
        }
    }

    /// Decodes a qualified element name and strips everything up to and
    /// including the first `:` (the namespace prefix).
    ///
    /// # Errors
    ///
    /// Returns `ParseError::IoError` when the name is not valid UTF-8.
    fn extract_element_name(&self, qname: &[u8]) -> Result<String, ParseError> {
        let name_str = std::str::from_utf8(qname)
            .map_err(|_| ParseError::IoError("Invalid UTF-8 in element name".to_string()))?;
        let local_name = match name_str.find(':') {
            Some(colon_pos) => &name_str[colon_pos + 1..],
            None => name_str,
        };
        Ok(local_name.to_string())
    }

    /// Appends `value` to the list stored under `field_name`, creating the
    /// list on first use.
    fn add_value(
        &self,
        values: &mut std::collections::HashMap<String, Vec<String>>,
        field_name: &str,
        value: String,
    ) {
        values
            .entry(field_name.to_string())
            .or_default()
            .push(value);
    }

    /// Checks the compact 12-character ISRC shape: two ASCII letters, three
    /// ASCII alphanumerics, then seven ASCII digits.
    ///
    /// Works on bytes so that non-ASCII input is rejected instead of causing
    /// an out-of-range index when byte length and character count differ.
    fn is_valid_isrc(&self, isrc: &str) -> bool {
        let bytes = isrc.as_bytes();
        bytes.len() == 12
            && bytes[..2].iter().all(u8::is_ascii_alphabetic)
            && bytes[2..5].iter().all(u8::is_ascii_alphanumeric)
            && bytes[5..].iter().all(u8::is_ascii_digit)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Well-formed ISRCs are accepted; wrong length, leading digits and a
    /// trailing letter are rejected.
    #[test]
    fn test_isrc_validation() {
        let validator = SelectiveParser::for_isrcs();

        let accepted = ["USRC17607839", "GBUM71505078", "FRUM71200001"];
        for candidate in accepted.iter() {
            assert!(validator.is_valid_isrc(candidate));
        }

        let rejected = [
            "USRC1760783",
            "USRC176078391",
            "12RC17607839",
            "USRC1760783A",
        ];
        for candidate in rejected.iter() {
            assert!(!validator.is_valid_isrc(candidate));
        }
    }

    /// The XML-parsing path finds both namespaced `SoundRecordingId` ISRCs.
    #[test]
    fn test_selective_isrc_extraction() {
        let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
<ern:ResourceList>
<ern:SoundRecording>
<ern:SoundRecordingId Namespace="ISRC">USRC17607839</ern:SoundRecordingId>
<ern:ReferenceTitle>
<ern:TitleText>Test Track</ern:TitleText>
</ern:ReferenceTitle>
</ern:SoundRecording>
<ern:SoundRecording>
<ern:SoundRecordingId Namespace="ISRC">GBUM71505078</ern:SoundRecordingId>
<ern:ReferenceTitle>
<ern:TitleText>Another Track</ern:TitleText>
</ern:ReferenceTitle>
</ern:SoundRecording>
</ern:ResourceList>
</ern:NewReleaseMessage>"#;

        let input = Cursor::new(document.as_bytes());
        let mut extractor = SelectiveParser::for_isrcs();
        let found = extractor.extract_isrcs(input).expect("Should extract ISRCs");

        assert_eq!(found.len(), 2);
        for expected in ["USRC17607839", "GBUM71505078"].iter() {
            assert!(found.contains(&(*expected).to_string()));
        }
    }

    /// The substring-scanning path finds a bare `<ISRC>` element.
    #[test]
    fn test_fast_isrc_extraction() {
        let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
<ern:ResourceList>
<ern:SoundRecording>
<ISRC>USRC17607839</ISRC>
<ern:ReferenceTitle>
<ern:TitleText>Test Track</ern:TitleText>
</ern:ReferenceTitle>
</ern:SoundRecording>
</ern:ResourceList>
</ern:NewReleaseMessage>"#;

        let input = Cursor::new(document.as_bytes());
        let mut extractor = SelectiveParser::for_isrcs();
        let found = extractor
            .extract_isrcs_fast(input)
            .expect("Should extract ISRCs");

        assert_eq!(found.len(), 1);
        assert_eq!(found[0], "USRC17607839");
    }

    /// Release metadata fields are collected under their local names.
    #[test]
    fn test_selective_field_extraction() {
        let document = r#"<?xml version="1.0" encoding="UTF-8"?>
<ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
<ern:ReleaseList>
<ern:Release>
<ern:ReleaseId>REL001</ern:ReleaseId>
<ern:ReleaseReference>R001</ern:ReleaseReference>
<ern:ReferenceTitle>
<ern:TitleText>My Album</ern:TitleText>
</ern:ReferenceTitle>
</ern:Release>
</ern:ReleaseList>
</ern:NewReleaseMessage>"#;

        let input = Cursor::new(document.as_bytes());
        let mut extractor = SelectiveParser::for_release_metadata();
        let result = extractor
            .extract_fields(input)
            .expect("Should extract fields");

        let expectations = [
            ("ReleaseId", "REL001"),
            ("ReleaseReference", "R001"),
            ("TitleText", "My Album"),
        ];
        // Presence first, then content, so a missing key is reported as such.
        for (field, _) in expectations.iter() {
            assert!(result.values.contains_key(*field));
        }
        for (field, expected) in expectations.iter() {
            assert_eq!(result.values[*field][0], *expected);
        }

        println!("Extraction results: {:#?}", result);
    }

    /// Both extraction paths agree on a 1000-recording document; timings are
    /// printed for information only.
    #[test]
    fn test_performance_comparison() {
        let mut document = String::from(
            r#"<?xml version="1.0" encoding="UTF-8"?>
<ern:NewReleaseMessage xmlns:ern="http://ddex.net/xml/ern/43">
<ern:ResourceList>"#,
        );
        for index in 0..1000 {
            let recording = format!(
                r#"
<ern:SoundRecording>
<ern:SoundRecordingId Namespace="ISRC">USRC{:08}</ern:SoundRecordingId>
<ern:ReferenceTitle>
<ern:TitleText>Test Track {}</ern:TitleText>
</ern:ReferenceTitle>
</ern:SoundRecording>"#,
                17600000 + index,
                index
            );
            document.push_str(&recording);
        }
        document.push_str("</ern:ResourceList></ern:NewReleaseMessage>");

        let standard_input = Cursor::new(document.as_bytes());
        let mut standard_parser = SelectiveParser::for_isrcs();
        let standard_timer = std::time::Instant::now();
        let standard_isrcs = standard_parser
            .extract_isrcs(standard_input)
            .expect("Standard extraction should work");
        let standard_elapsed = standard_timer.elapsed();

        let fast_input = Cursor::new(document.as_bytes());
        let mut fast_parser = SelectiveParser::for_isrcs();
        let fast_timer = std::time::Instant::now();
        let fast_isrcs = fast_parser
            .extract_isrcs_fast(fast_input)
            .expect("Fast extraction should work");
        let fast_elapsed = fast_timer.elapsed();

        println!(
            "Standard extraction: {} ISRCs in {:?}",
            standard_isrcs.len(),
            standard_elapsed
        );
        println!(
            "Fast extraction: {} ISRCs in {:?}",
            fast_isrcs.len(),
            fast_elapsed
        );

        assert_eq!(standard_isrcs.len(), 1000);
        assert_eq!(fast_isrcs.len(), 1000);

        let speedup = standard_elapsed.as_nanos() as f64 / fast_elapsed.as_nanos() as f64;
        println!("Fast extraction speedup: {:.2}x", speedup);
    }
}