edit_xml/utils/
encoding.rs

1use std::borrow::Cow;
2
3use quick_xml::escape::{EscapeError, ParseCharRefError};
4
5#[inline]
6fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
7    match src.as_bytes().first().copied() {
8        // We should not allow sign numbers, but u32::from_str_radix will accept `+`.
9        // We also handle `-` to be consistent in returned errors
10        Some(b'+') | Some(b'-') => Err(ParseCharRefError::UnexpectedSign),
11        _ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
12    }
13}
14fn parse_number(num: &str) -> Result<char, ParseCharRefError> {
15    let code = if let Some(hex) = num.strip_prefix('x') {
16        from_str_radix(hex, 16)?
17    } else {
18        from_str_radix(num, 10)?
19    };
20    if code == 0 {
21        return Err(ParseCharRefError::IllegalCharacter(code));
22    }
23    match std::char::from_u32(code) {
24        Some(c) => Ok(c),
25        None => Err(ParseCharRefError::InvalidCodepoint(code)),
26    }
27}
28/// Will unescape the given string and ignore any unknown entities
29pub fn unescape_with_and_ignore<'input, 'entity, F>(
30    raw: &'input str,
31    mut resolve_entity: F,
32) -> Result<Cow<'input, str>, EscapeError>
33where
34    // the lifetime of the output comes from a capture or is `'static`
35    F: FnMut(&str) -> Option<&'entity str>,
36{
37    let bytes = raw.as_bytes();
38    let mut unescaped = None;
39    let mut last_end = 0;
40    let mut iter = memchr::Memchr2::new(b'&', b';', bytes);
41    while let Some(start) = iter.by_ref().find(|p| bytes[*p] == b'&') {
42        match iter.next() {
43            Some(end) if bytes[end] == b';' => {
44                // append valid data
45                if unescaped.is_none() {
46                    unescaped = Some(String::with_capacity(raw.len()));
47                }
48                let unescaped = unescaped.as_mut().expect("initialized");
49                unescaped.push_str(&raw[last_end..start]);
50
51                // search for character correctness
52                let pat = &raw[start + 1..end];
53                if let Some(entity) = pat.strip_prefix('#') {
54                    let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
55                    unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
56                } else if let Some(value) = resolve_entity(pat) {
57                    unescaped.push_str(value);
58                } else {
59                    tracing::warn!("Unknown entity: {:?}", pat);
60                    unescaped.push_str(&raw[start..=end]);
61                }
62
63                last_end = end + 1;
64            }
65            _ => return Err(EscapeError::UnterminatedEntity(start..raw.len())),
66        }
67    }
68
69    if let Some(mut unescaped) = unescaped {
70        if let Some(raw) = raw.get(last_end..) {
71            unescaped.push_str(raw);
72        }
73        Ok(Cow::Owned(unescaped))
74    } else {
75        Ok(Cow::Borrowed(raw))
76    }
77}
78pub fn unescape_with<'input, 'entity, F>(
79    raw: &'input str,
80    resolve_entity: F,
81) -> Result<Cow<'input, str>, EscapeError>
82where
83    // the lifetime of the output comes from a capture or is `'static`
84    F: FnMut(&str) -> Option<&'entity str>,
85{
86    #[cfg(feature = "soft-fail-unescape")]
87    {
88        unescape_with_and_ignore(raw, resolve_entity)
89    }
90    #[cfg(not(feature = "soft-fail-unescape"))]
91    {
92        quick_xml::escape::unescape_with(raw, resolve_entity)
93    }
94}
95
#[cfg(test)]
mod tests {

    /// Parses the `bugs/oslash/oslash.xml` fixture with relaxed read options and
    /// prints every child of its `developers` element.
    ///
    /// NOTE(review): the fixture presumably contains an entity that strict
    /// unescaping rejects (the name suggests `&oslash;`) — confirm against the file.
    #[cfg(any(feature = "soft-fail-unescape", feature = "escape-html"))]
    #[test]
    fn oslash() -> anyhow::Result<()> {
        use std::fs::read_to_string;

        use anyhow::Context;

        use crate::{utils::tests, Document, ReadOptions};

        tests::setup_logger();

        let xml_path = tests::test_dir()
            .join("bugs")
            .join("oslash")
            .join("oslash.xml");
        // Fail with a readable message when the fixture is missing.
        anyhow::ensure!(xml_path.exists(), "File not found: {:?}", xml_path);
        let xml = read_to_string(xml_path).context("Failed to read file")?;

        let document = Document::parse_str_with_opts(&xml, ReadOptions::relaxed()).unwrap();
        let root = document
            .root_element()
            .context("Root Element not found")?;
        let developers = root
            .find(&document, "developers")
            .context("Developers Element not found")?;

        for child in developers.children(&document) {
            println!("{:#?}", child.debug(&document));
        }
        println!("Parse Successful");
        Ok(())
    }
}