use crate::error::{EncodingErrorMode, MediaInfoError, Result};
use crate::track::{AttributeValue, Track};
use quick_xml::Reader;
use quick_xml::events::Event;
use std::fmt::Write;
/// Converts `bytes` to an owned `String`, handling invalid UTF-8 according to `mode`.
///
/// * `Strict` fails on the first invalid sequence.
/// * `Replace` substitutes U+FFFD via `String::from_utf8_lossy`.
/// * `Ignore` drops every invalid byte.
/// * `BackslashReplace` writes each invalid byte as `\xNN` (two lowercase hex digits).
/// * `XmlCharRefReplace` writes each invalid byte as a decimal character reference.
///
/// # Errors
///
/// Only `Strict` can fail; it returns `MediaInfoError::XmlParseError` carrying the
/// underlying UTF-8 error message.
fn bytes_to_string(bytes: &[u8], mode: EncodingErrorMode) -> Result<String> {
    let decoded = match mode {
        EncodingErrorMode::Strict => {
            // The only fallible mode: report the decoding error instead of patching it up.
            return match std::str::from_utf8(bytes) {
                Ok(text) => Ok(text.to_string()),
                Err(e) => Err(MediaInfoError::XmlParseError(format!(
                    "UTF-8 encoding error: {}",
                    e
                ))),
            };
        }
        EncodingErrorMode::Replace => String::from_utf8_lossy(bytes).into_owned(),
        EncodingErrorMode::Ignore => decode_with_handler(bytes, |_, _| {}),
        EncodingErrorMode::BackslashReplace => decode_with_handler(bytes, |bad_byte, sink| {
            // Writing into a `String` cannot fail, so the `fmt::Result` is discarded.
            let _ = write!(sink, "\\x{:02x}", bad_byte);
        }),
        EncodingErrorMode::XmlCharRefReplace => decode_with_handler(bytes, |bad_byte, sink| {
            let _ = write!(sink, "&#{};", bad_byte);
        }),
    };
    Ok(decoded)
}
/// Decodes `bytes` as UTF-8, copying valid runs verbatim and calling `on_invalid`
/// once for every byte that is part of an invalid sequence.
///
/// The handler receives the offending byte and the output built so far, so it can
/// append a replacement (or nothing at all).
fn decode_with_handler<F>(bytes: &[u8], mut on_invalid: F) -> String
where
    F: FnMut(u8, &mut String),
{
    bytes
        .utf8_chunks()
        .fold(String::new(), |mut decoded, piece| {
            decoded.push_str(piece.valid());
            for bad_byte in piece.invalid().iter().copied() {
                on_invalid(bad_byte, &mut decoded);
            }
            decoded
        })
}
/// Parses MediaInfo XML text into tracks using strict UTF-8 handling.
///
/// Convenience wrapper around [`parse_xml_with_encoding`] with
/// `EncodingErrorMode::Strict`.
// NOTE(review): the reason for `allow(dead_code)` is not visible here — presumably the
// function is only called from tests in some build configurations; confirm.
#[allow(dead_code)]
pub(crate) fn parse_xml(xml: &str) -> Result<Vec<Track>> {
    let strict = EncodingErrorMode::Strict;
    parse_xml_with_encoding(xml, strict)
}
/// Parses MediaInfo XML text into tracks, decoding element names, attribute values and
/// CDATA content according to `encoding_mode`.
///
/// # Errors
///
/// Forwards every error produced by the underlying event-based parser unchanged.
pub fn parse_xml_with_encoding(xml: &str, encoding_mode: EncodingErrorMode) -> Result<Vec<Track>> {
    // `xml` is already a `&str`, so no byte-level decoding step is needed here.
    parse_xml_str_with_encoding(xml, encoding_mode)
}
/// Parses MediaInfo XML supplied as raw bytes.
///
/// The whole buffer is first converted to text with `encoding_mode` (so invalid UTF-8 is
/// rejected, replaced or dropped before any XML parsing happens), and the resulting text
/// is then parsed with the same mode.
///
/// # Errors
///
/// Returns the decoding error when the bytes cannot be converted under `encoding_mode`,
/// or any error reported while parsing the decoded text.
pub fn parse_xml_bytes_with_encoding(
    xml_bytes: &[u8],
    encoding_mode: EncodingErrorMode,
) -> Result<Vec<Track>> {
    bytes_to_string(xml_bytes, encoding_mode)
        .and_then(|decoded| parse_xml_str_with_encoding(&decoded, encoding_mode))
}
fn parse_xml_str_with_encoding(xml: &str, encoding_mode: EncodingErrorMode) -> Result<Vec<Track>> {
let mut tracks = Vec::new();
let mut reader = Reader::from_str(xml);
reader.trim_text(false);
let mut inside_file = false;
let mut current_track: Option<Track> = None;
let mut current_element_name: Option<String> = None;
let mut current_element_has_text = false;
let mut current_element_active = false;
let mut current_element_text: Option<String> = None;
let mut track_depth: usize = 0;
let mut repeated_attributes: Vec<(String, String)> = Vec::new();
let mut seen_element = false;
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
let name_bytes = e.name();
let tag_name = bytes_to_string(name_bytes.as_ref(), encoding_mode)?;
seen_element = true;
match tag_name.as_str() {
"File" => {
inside_file = true;
}
"track" if inside_file => {
let mut track_type = String::new();
for attr in e.attributes() {
let attr = attr?;
if attr.key.as_ref() == b"type" {
track_type = bytes_to_string(&attr.value, encoding_mode)?;
break;
}
}
current_track = Some(Track::new(track_type));
repeated_attributes.clear();
current_element_name = None;
current_element_has_text = false;
current_element_active = false;
current_element_text = None;
track_depth = 0;
}
_ if current_track.is_some() => {
track_depth += 1;
if track_depth == 1 {
current_element_name = Some(normalize_attribute_name(&tag_name));
current_element_has_text = false;
current_element_active = true;
current_element_text = Some(String::new());
} else if track_depth == 2 {
current_element_active = false;
}
}
_ => {}
}
}
Ok(Event::Empty(ref e)) => {
let name_bytes = e.name();
let tag_name = bytes_to_string(name_bytes.as_ref(), encoding_mode)?;
seen_element = true;
match tag_name.as_str() {
"File" => {}
"track" if inside_file => {
let mut track_type = String::new();
for attr in e.attributes() {
let attr = attr?;
if attr.key.as_ref() == b"type" {
track_type = bytes_to_string(&attr.value, encoding_mode)?;
break;
}
}
let mut track = Track::new(track_type);
process_integer_conversion(&mut track, &repeated_attributes);
tracks.push(track);
repeated_attributes.clear();
current_track = None;
track_depth = 0;
current_element_name = None;
current_element_has_text = false;
current_element_active = false;
current_element_text = None;
}
_ if current_track.is_some() => {
track_depth += 1;
if track_depth == 1 {
let element_name = normalize_attribute_name(&tag_name);
if let Some(ref mut track) = current_track {
apply_attribute_value(
track,
&element_name,
None,
&mut repeated_attributes,
);
}
}
track_depth = track_depth.saturating_sub(1);
if track_depth == 0 {
current_element_name = None;
current_element_has_text = false;
current_element_active = false;
current_element_text = None;
}
}
_ => {}
}
}
Ok(Event::Text(ref e)) => {
if current_track.is_some()
&& current_element_name.is_some()
&& track_depth == 1
&& current_element_active
{
let unescaped = e.unescape()?;
let text = bytes_to_string(unescaped.as_bytes(), encoding_mode)?;
if !text.is_empty() {
current_element_has_text = true;
if let Some(ref mut buffer) = current_element_text {
buffer.push_str(&text);
}
}
}
}
Ok(Event::CData(ref e)) => {
if current_track.is_some()
&& current_element_name.is_some()
&& track_depth == 1
&& current_element_active
{
let text = bytes_to_string(e.as_ref(), encoding_mode)?;
if !text.is_empty() {
current_element_has_text = true;
if let Some(ref mut buffer) = current_element_text {
buffer.push_str(&text);
}
}
}
}
Ok(Event::End(ref e)) => {
let name_bytes = e.name();
let tag_name = bytes_to_string(name_bytes.as_ref(), encoding_mode)?;
seen_element = true;
match tag_name.as_str() {
"track" => {
if let Some(mut track) = current_track.take() {
process_integer_conversion(&mut track, &repeated_attributes);
tracks.push(track);
}
repeated_attributes.clear();
track_depth = 0;
current_element_name = None;
current_element_has_text = false;
current_element_active = false;
current_element_text = None;
}
"File" => {
inside_file = false;
}
_ => {
if track_depth == 1
&& let (Some(track), Some(element_name)) =
(&mut current_track, ¤t_element_name)
{
if current_element_has_text {
let text = current_element_text.take().unwrap_or_default();
apply_attribute_value(
track,
element_name,
Some(text),
&mut repeated_attributes,
);
} else {
apply_attribute_value(
track,
element_name,
None,
&mut repeated_attributes,
);
}
}
track_depth = track_depth.saturating_sub(1);
if track_depth == 0 {
current_element_name = None;
current_element_has_text = false;
current_element_active = false;
current_element_text = None;
}
}
}
}
Ok(Event::Eof) => break,
Err(e) => return Err(MediaInfoError::xml_parse_error(e.to_string())),
_ => {}
}
buf.clear();
}
if !seen_element {
return Err(MediaInfoError::xml_parse_error("Invalid XML".to_string()));
}
Ok(tracks)
}
/// Turns an XML element name into a track attribute key.
///
/// The name is lowercased, surrounding whitespace is trimmed, and then leading/trailing
/// underscores are removed (in that order). The resulting key `id` is renamed to
/// `track_id`.
fn normalize_attribute_name(name: &str) -> String {
    let lowered = name.to_lowercase();
    match lowered.trim().trim_matches('_') {
        "id" => String::from("track_id"),
        key => key.to_owned(),
    }
}
/// Stores one parsed child element on `track`.
///
/// * `value == None` (element without text): the attribute is set to null, but only when
///   the track has no entry for `element_name` yet.
/// * `value == Some(text)` and the attribute is new: the text is stored as a string.
/// * `value == Some(text)` and the attribute already exists: the text is appended to the
///   `other_<element_name>` list, and the `(element_name, other key)` pair is remembered
///   once in `repeated_attributes` for later integer conversion.
fn apply_attribute_value(
    track: &mut Track,
    element_name: &str,
    value: Option<String>,
    repeated_attributes: &mut Vec<(String, String)>,
) {
    let already_present = track.get(element_name).is_some();

    let Some(text) = value else {
        if !already_present {
            track.set_null(element_name.to_string());
        }
        return;
    };

    if !already_present {
        track.set_string(element_name.to_string(), text);
        return;
    }

    // Repeated occurrence: keep the first value as primary, collect the rest separately.
    let other_key = format!("other_{}", element_name);
    append_other_value(track, &other_key, text);
    let already_tracked = repeated_attributes.iter().any(|(k, _)| k == element_name);
    if !already_tracked {
        repeated_attributes.push((element_name.to_string(), other_key));
    }
}
/// Appends `value` to the list stored under `other_key`, creating or upgrading the entry
/// as needed.
///
/// An existing list is extended in place. An existing string or integer becomes the first
/// element of a new two-element list. A null or missing entry becomes a one-element list.
fn append_other_value(track: &mut Track, other_key: &str, value: String) {
    // Fast path: the entry is already a list.
    if let Some(AttributeValue::List(items)) = track.attributes_mut().get_mut(other_key) {
        items.push(value);
        return;
    }

    let replacement = match track.attributes().get(other_key) {
        Some(AttributeValue::String(previous)) => vec![previous.clone(), value],
        Some(AttributeValue::Int(number)) => vec![number.to_string(), value],
        Some(AttributeValue::Null) | None => vec![value],
        // Lists were handled by the fast path above; nothing left to do.
        Some(AttributeValue::List(_)) => return,
    };
    track.set_list(other_key.to_string(), replacement);
}
/// Promotes repeated attributes to integers where possible.
///
/// For each `(primary_key, other_key)` pair whose primary value is a string:
///
/// * if the primary string parses as `i64`, it is replaced by that integer;
/// * otherwise the first entry of the `other_key` list that parses as `i64` becomes the
///   primary value, and the former primary string is appended to the list (the integer's
///   own textual entry stays in the list).
///
/// Pairs whose primary value is not a string, or whose other value is not a list, are
/// left untouched.
fn process_integer_conversion(track: &mut Track, repeated_attributes: &[(String, String)]) {
    for (primary_key, other_key) in repeated_attributes {
        let Some(AttributeValue::String(primary_text)) = track.get(primary_key) else {
            continue;
        };
        let primary_text = primary_text.clone();

        if let Ok(number) = primary_text.parse::<i64>() {
            track.set_int(primary_key.clone(), number);
            continue;
        }

        let Some(AttributeValue::List(others)) = track.get(other_key) else {
            continue;
        };
        let Some(number) = others
            .iter()
            .find_map(|candidate| candidate.parse::<i64>().ok())
        else {
            continue;
        };

        // Swap roles: the integer becomes primary, the old primary text joins the list.
        let mut updated_others = others.clone();
        updated_others.push(primary_text);
        track.set_int(primary_key.clone(), number);
        track.set_list(other_key.clone(), updated_others);
    }
}
/// Unit tests for name normalization, byte decoding modes and XML-to-track parsing.
#[cfg(test)]
mod tests {
    use super::*;

    // --- attribute name normalization -------------------------------------------------

    #[test]
    fn test_normalize_attribute_name() {
        assert_eq!(normalize_attribute_name("Duration"), "duration");
        assert_eq!(normalize_attribute_name("ID"), "track_id");
        assert_eq!(normalize_attribute_name("id"), "track_id");
        assert_eq!(normalize_attribute_name("_Format_"), "format");
        assert_eq!(normalize_attribute_name("_StreamSize_"), "streamsize");
        assert_eq!(normalize_attribute_name(" Format "), "format");
        assert_eq!(normalize_attribute_name(" Stream_Size "), "stream_size");
        assert_eq!(normalize_attribute_name("Bit_Rate"), "bit_rate");
    }

    // --- basic document shapes --------------------------------------------------------

    #[test]
    fn test_parse_simple_xml() {
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Format>Matroska</Format>
<Duration>3000</Duration>
</track>
</File>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].track_type(), "General");
        assert_eq!(tracks[0].get_string("format"), Some("Matroska"));
    }

    #[test]
    fn test_parse_new_format_xml() {
        let xml = r#"<?xml version="1.0"?>
<MediaInfo>
<File>
<track type="General">
<Format>Matroska</Format>
</track>
<track type="Video">
<Format>AVC</Format>
</track>
</File>
</MediaInfo>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 2);
        assert_eq!(tracks[0].track_type(), "General");
        assert_eq!(tracks[1].track_type(), "Video");
    }

    // --- repeated attributes and integer promotion ------------------------------------

    #[test]
    fn test_parse_repeated_attributes() {
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Duration>3000</Duration>
<Duration>3 s 0 ms</Duration>
<Duration>00:00:03.000</Duration>
</track>
</File>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_int("duration"), Some(3000));
        let other = tracks[0].get_list("other_duration").unwrap();
        assert!(other.contains(&"3 s 0 ms".to_string()));
        assert!(other.contains(&"00:00:03.000".to_string()));
    }

    #[test]
    fn test_parse_repeated_attributes_swap() {
        // The integer is not the first occurrence; it must still become the primary value.
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Duration>3 s 0 ms</Duration>
<Duration>3000</Duration>
<Duration>00:00:03.000</Duration>
</track>
</File>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_int("duration"), Some(3000));
        let other = tracks[0].get_list("other_duration").unwrap();
        assert!(other.contains(&"3 s 0 ms".to_string()));
    }

    #[test]
    fn test_parse_xml_with_track_id() {
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="Video">
<ID>1</ID>
<Format>AVC</Format>
</track>
</File>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("track_id"), Some("1"));
    }

    #[test]
    fn test_parse_invalid_xml() {
        // Plain text contains no element at all and is rejected.
        let xml = "not valid xml";
        let result = parse_xml(xml);
        assert!(result.is_err());
    }

    // --- bytes_to_string: one test per encoding error mode ----------------------------

    #[test]
    fn test_bytes_to_string_strict_valid() {
        let bytes = b"Hello, World!";
        let result = bytes_to_string(bytes, EncodingErrorMode::Strict);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "Hello, World!");
    }

    #[test]
    fn test_bytes_to_string_strict_invalid() {
        let bytes = &[0xff, 0xfe, 0x00, 0x01];
        let result = bytes_to_string(bytes, EncodingErrorMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_bytes_to_string_replace_invalid() {
        let bytes = &[0x48, 0x65, 0x6c, 0x6c, 0x6f, 0xff, 0xfe];
        let result = bytes_to_string(bytes, EncodingErrorMode::Replace);
        assert!(result.is_ok());
        let s = result.unwrap();
        assert!(s.contains('\u{FFFD}'));
        assert!(s.starts_with("Hello"));
    }

    #[test]
    fn test_bytes_to_string_ignore_invalid() {
        let bytes = &[0x48, 0x65, 0x6c, 0x6c, 0x6f, 0xff, 0xfe];
        let result = bytes_to_string(bytes, EncodingErrorMode::Ignore);
        assert!(result.is_ok());
        let s = result.unwrap();
        assert!(!s.contains('\u{FFFD}'));
        assert_eq!(s, "Hello");
    }

    #[test]
    fn test_bytes_to_string_backslash_replace_invalid() {
        let bytes = &[0x48, 0x69, 0xff, 0xfe];
        let result = bytes_to_string(bytes, EncodingErrorMode::BackslashReplace);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "Hi\\xff\\xfe");
    }

    #[test]
    fn test_bytes_to_string_xmlcharrefreplace_invalid() {
        let bytes = &[0x4f, 0x4b, 0xff];
        let result = bytes_to_string(bytes, EncodingErrorMode::XmlCharRefReplace);
        assert!(result.is_ok());
        // The invalid byte 0xFF is written out as a literal decimal character reference.
        assert_eq!(result.unwrap(), "OK&#255;");
    }

    // --- parsing valid text under each mode -------------------------------------------

    #[test]
    fn test_parse_xml_with_encoding_replace() {
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Format>MKV</Format>
</track>
</File>"#;
        let tracks = parse_xml_with_encoding(xml, EncodingErrorMode::Replace).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("format"), Some("MKV"));
    }

    #[test]
    fn test_parse_xml_with_encoding_strict() {
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Format>MP4</Format>
</track>
</File>"#;
        let tracks = parse_xml_with_encoding(xml, EncodingErrorMode::Strict).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("format"), Some("MP4"));
    }

    #[test]
    fn test_parse_xml_with_encoding_ignore() {
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Format>AVI</Format>
</track>
</File>"#;
        let tracks = parse_xml_with_encoding(xml, EncodingErrorMode::Ignore).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("format"), Some("AVI"));
    }

    // --- parsing raw bytes that contain invalid UTF-8 ---------------------------------

    #[test]
    fn test_parse_xml_bytes_with_encoding_strict_invalid() {
        let xml_bytes = b"<?xml version=\"1.0\"?><File><track type=\"General\"><Title>Bad\xFF</Title></track></File>";
        let result = parse_xml_bytes_with_encoding(xml_bytes, EncodingErrorMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_xml_bytes_with_encoding_replace() {
        let xml_bytes = b"<?xml version=\"1.0\"?><File><track type=\"General\"><Title>Bad\xFF</Title></track></File>";
        let tracks = parse_xml_bytes_with_encoding(xml_bytes, EncodingErrorMode::Replace).unwrap();
        assert_eq!(tracks.len(), 1);
        let title = tracks[0].get_string("title").unwrap();
        assert!(title.contains('\u{FFFD}'));
    }

    #[test]
    fn test_parse_xml_bytes_with_encoding_ignore() {
        let xml_bytes = b"<?xml version=\"1.0\"?><File><track type=\"General\"><Title>Bad\xFF</Title></track></File>";
        let tracks = parse_xml_bytes_with_encoding(xml_bytes, EncodingErrorMode::Ignore).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("title"), Some("Bad"));
    }

    // --- text handling details --------------------------------------------------------

    #[test]
    fn test_cdata_is_parsed_as_text() {
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Description><![CDATA[CDATA with <tags>]]></Description>
</track>
</File>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(
            tracks[0].get_string("description"),
            Some("CDATA with <tags>")
        );
    }

    #[test]
    fn test_tracks_outside_file_are_ignored() {
        let xml = r#"<?xml version="1.0"?>
<MediaInfo>
<track type="General"><Format>Outside</Format></track>
<File>
<track type="General"><Format>Inside</Format></track>
</File>
</MediaInfo>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("format"), Some("Inside"));
    }

    #[test]
    fn test_xml_entities_are_unescaped_exactly_once() {
        // Named entity, doubly-escaped entity and numeric character reference. The doubly
        // escaped one must come out still singly escaped, proving there is no second pass.
        let xml = r#"<?xml version="1.0"?>
<File>
<track type="General">
<Title>A &amp; B</Title>
<Album>&amp;amp;</Album>
<Comment>x &#38; y</Comment>
</track>
</File>"#;
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("title"), Some("A & B"));
        assert_eq!(tracks[0].get_string("album"), Some("&amp;"));
        assert_eq!(tracks[0].get_string("comment"), Some("x & y"));
    }

    #[test]
    fn test_whitespace_only_text_is_preserved() {
        let xml = "<?xml version=\"1.0\"?>\n\
                   <File>\n\
                   <track type=\"General\">\n\
                   <Format> </Format>\n\
                   <Album>\t</Album>\n\
                   <Comment> hi </Comment>\n\
                   </track>\n\
                   </File>";
        let tracks = parse_xml(xml).unwrap();
        assert_eq!(tracks.len(), 1);
        assert_eq!(tracks[0].get_string("format"), Some(" "));
        assert_eq!(tracks[0].get_string("album"), Some("\t"));
        assert_eq!(tracks[0].get_string("comment"), Some(" hi "));
    }
}