// anyxml 0.12.0
//
// A fully spec-conformant XML library
// Documentation
use crate::{
    error::XMLError,
    sax::{InputSource, ParserSpec, SAXHandler, XMLReader, error::fatal_error},
};

impl<'a, Spec: ParserSpec<Reader = InputSource<'a>>, H: SAXHandler + ?Sized> XMLReader<Spec, H> {
    /// Reports whether `c` is accepted by `self.version.is_char`.
    ///
    /// NOTE(review): presumably this is the `Char` production of the XML version
    /// being parsed — confirm against the version type.
    pub(crate) fn is_char(&self, c: char) -> bool {
        self.version.is_char(c)
    }

    /// Reports whether `c` is accepted by `self.version.is_name_start_char`.
    ///
    /// NOTE(review): presumably this is the `NameStartChar` production of the XML
    /// version being parsed — confirm against the version type.
    pub(crate) fn is_name_start_char(&self, c: char) -> bool {
        self.version.is_name_start_char(c)
    }

    /// Reports whether `c` is accepted by `self.version.is_name_char`.
    ///
    /// NOTE(review): presumably this is the `NameChar` production of the XML
    /// version being parsed — confirm against the version type.
    pub(crate) fn is_name_char(&self, c: char) -> bool {
        self.version.is_name_char(c)
    }

    /// Reports whether `c` is accepted by `self.version.is_whitespace`.
    ///
    /// NOTE(review): presumably this is the `S` (white space) production of the
    /// XML version being parsed — confirm against the version type.
    pub(crate) fn is_whitespace(&self, c: char) -> bool {
        self.version.is_whitespace(c)
    }

    /// Consumes consecutive whitespace characters from the source and keeps the
    /// locator in sync.
    ///
    /// A space or a tab moves the column forward by one. `\n`, `\r` and the pair
    /// `\r\n` each count as a single line break: the column is reset to 1 and the
    /// line number is incremented.
    ///
    /// Returns the number of whitespace characters skipped, where a `\r\n` pair
    /// is counted once.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while peeking at or reading from the source.
    ///
    /// # Panics
    ///
    /// Panics through `unimplemented!()` if `is_whitespace` accepts a character
    /// other than space, tab, `\n` or `\r`.
    pub(crate) fn skip_whitespaces(&mut self) -> Result<usize, XMLError> {
        let mut count = 0;
        loop {
            let Some(ws) = self.source.peek_char()? else {
                break;
            };
            if !self.is_whitespace(ws) {
                break;
            }
            self.source.next_char()?;

            match ws {
                '\x20' | '\t' => self.locator.update_column(|col| col + 1),
                '\n' | '\r' => {
                    // A line feed directly after a carriage return belongs to the
                    // same line break, so swallow it here.
                    if ws == '\r' && self.source.peek_char()?.is_some_and(|next| next == '\n') {
                        self.source.next_char()?;
                    }
                    self.locator.set_column(1);
                    self.locator.update_line(|line| line + 1);
                }
                _ => unimplemented!(),
            }
            count += 1;
        }

        Ok(count)
    }

    /// Reads an Nmtoken from the source and appends it to `buffer`.
    ///
    /// Characters are consumed for as long as `self.version.is_name_char` accepts
    /// them; the column is advanced by one per consumed character.
    ///
    /// # Errors
    ///
    /// Reports a fatal error and returns `XMLError::ParserEmptyName` when no
    /// character was appended. Errors from the source are propagated.
    pub(crate) fn parse_nmtoken(&mut self, buffer: &mut String) -> Result<(), XMLError> {
        let start = buffer.len();
        loop {
            let Some(ch) = self.source.next_char_if(|c| self.version.is_name_char(c))? else {
                break;
            };
            buffer.push(ch);
            self.locator.update_column(|col| col + 1);
        }

        if buffer.len() != start {
            return Ok(());
        }
        fatal_error!(self, ParserEmptyName, "Nmtoken is empty.");
        Err(XMLError::ParserEmptyName)
    }

    /// Reads a Name from the source and appends it to `buffer`.
    ///
    /// The first character must satisfy `self.version.is_name_start_char`; every
    /// following character is consumed while `self.version.is_name_char` accepts
    /// it. The column is advanced by one per consumed character.
    ///
    /// # Errors
    ///
    /// Reports a fatal error and returns `XMLError::ParserEmptyName` when the
    /// next character cannot start a name. Errors from the source are propagated.
    pub(crate) fn parse_name(&mut self, buffer: &mut String) -> Result<(), XMLError> {
        let first = match self
            .source
            .next_char_if(|c| self.version.is_name_start_char(c))?
        {
            Some(ch) => ch,
            None => {
                fatal_error!(self, ParserEmptyName, "Name is empty.");
                return Err(XMLError::ParserEmptyName);
            }
        };
        buffer.push(first);
        self.locator.update_column(|col| col + 1);

        loop {
            let Some(ch) = self.source.next_char_if(|c| self.version.is_name_char(c))? else {
                break;
            };
            buffer.push(ch);
            self.locator.update_column(|col| col + 1);
        }

        Ok(())
    }

    /// Reads an NCName (a name without `':'`) from the source and appends it to
    /// `buffer`.
    ///
    /// Even if NCName is empty, no error will be reported: when the next
    /// character cannot start an NCName, `buffer` and the locator are left
    /// untouched and `Ok(())` is returned. Callers detect the empty case by
    /// comparing `buffer.len()` before and after the call.
    ///
    /// The implementation scans the raw bytes returned by
    /// `self.source.content_bytes()` for runs of ASCII name characters and copies
    /// them in bulk. Any other character is checked one at a time against
    /// `self.version`, with `':'` always excluded.
    ///
    /// The column is accumulated in a local counter and written back to the
    /// locator once at the end.
    ///
    /// # Errors
    ///
    /// Propagates errors from `grow`, `next_char_if` and `peek_char`.
    fn parse_ncname_allow_empty(&mut self, buffer: &mut String) -> Result<(), XMLError> {
        // NOTE(review): `grow` presumably loads more input into the byte buffer —
        // confirm against `InputSource`.
        if self.source.content_bytes().is_empty() {
            self.source.grow()?;
        }
        let mut col = self.locator.column();
        let content = self.source.content_bytes();
        // Length of the leading run of ASCII letters and '_' (the whole buffer if
        // nothing else is found). Every byte in that run may start a name.
        let rem = content
            .iter()
            .position(|b| !b.is_ascii_alphabetic() && *b != b'_')
            .unwrap_or(content.len());
        if rem > 0 {
            buffer.push_str(unsafe {
                // SAFETY:
                // `content[..rem]` contains only ASCII alphabetic or '_',
                // so UTF-8 validation won't fail.
                std::str::from_utf8_unchecked(&content[..rem])
            });
            self.source.advance(rem);
            // The run is pure ASCII, so its byte count equals its character count.
            col += rem;
        } else {
            // Slow path for the first character: it is either non-ASCII or not a
            // name start at all. ':' is rejected because this is an NCName.
            let Some(c) = self
                .source
                .next_char_if(|c| self.version.is_name_start_char(c) && c != ':')?
            else {
                // Nothing was consumed, so the locator does not need updating.
                return Ok(());
            };
            buffer.push(c);
            col += 1;
        }
        self.source.grow()?;

        // Consume the remaining name characters, alternating between a bulk copy
        // of ASCII bytes and a single decoded character.
        while let content = self.source.content_bytes()
            && !content.is_empty()
        {
            match content
                .iter()
                .position(|b| !b.is_ascii_alphanumeric() && !matches!(b, b'-' | b'.' | b'_'))
            {
                Some(rem) if rem > 0 => {
                    buffer.push_str(unsafe {
                        // SAFETY:
                        // `content[..rem]` contains only ASCII alphanumeric , b'-', b'.' or '_',
                        // so UTF-8 validation won't fail.
                        std::str::from_utf8_unchecked(&content[..rem])
                    });
                    // The byte that stopped the scan is ASCII but is not an
                    // alphanumeric, '-', '.' or '_'. The only other ASCII name
                    // character in XML is ':', which an NCName excludes, so the
                    // name ends here.
                    let end = content[rem] < 0x80;
                    self.source.advance(rem);
                    col += rem;
                    if end {
                        break;
                    }
                }
                None => {
                    buffer.push_str(unsafe {
                        // SAFETY:
                        // `content` contains only ASCII alphanumeric , b'-', b'.' or '_',
                        // so UTF-8 validation won't fail.
                        std::str::from_utf8_unchecked(content)
                    });
                    let len = content.len();
                    self.source.advance(len);
                    col += len;
                }
                // `Some(0)`: the very first byte is not an ASCII name byte; let the
                // character-level check below decide.
                _ => {}
            }

            // Decode one full character and test it against the version's name
            // character rule (still excluding ':').
            if let Some(c) = self.source.peek_char()?
                && self.version.is_name_char(c)
                && c != ':'
            {
                buffer.push(c);
                col += 1;
                self.source.advance(c.len_utf8());
            } else {
                break;
            }
            self.source.grow()?;
        }

        self.locator.set_column(col);
        Ok(())
    }

    /// Reads a non-empty NCName from the source and appends it to `buffer`.
    ///
    /// # Errors
    ///
    /// Reports a fatal error and returns `XMLError::ParserEmptyName` when nothing
    /// was appended. Errors from the underlying parse are propagated.
    pub(crate) fn parse_ncname(&mut self, buffer: &mut String) -> Result<(), XMLError> {
        let start = buffer.len();
        self.parse_ncname_allow_empty(buffer)?;
        if buffer.len() != start {
            return Ok(());
        }
        fatal_error!(self, ParserEmptyName, "Name is empty.");
        Err(XMLError::ParserEmptyName)
    }

    /// Reads a QName (`prefix:local` or just `local`) from the source and appends
    /// it, including the `':'` separator if present, to `buffer`.
    ///
    /// On success, returns the byte length of the prefix that was appended, or
    /// `0` when the name has no prefix.
    ///
    /// # Errors
    ///
    /// A fatal error is reported and an `Err` is returned in these cases:
    ///
    /// - `XMLError::ParserEmptyQName`: neither a name nor `':'` was found.
    /// - `XMLError::ParserEmptyQNameLocalPart`: nothing follows the `':'`.
    /// - `XMLError::ParserEmptyQNamePrefix`: the name starts with `':'` but has a
    ///   local part. The local part is still appended to `buffer`.
    ///
    /// Errors from the source are propagated.
    pub(crate) fn parse_qname(&mut self, buffer: &mut String) -> Result<usize, XMLError> {
        let start = buffer.len();
        self.parse_ncname_allow_empty(buffer)?;
        let prefix_len = buffer.len() - start;

        if self.source.next_char_if(|c| c == ':')?.is_none() {
            // No separator: whatever was read is an unprefixed name.
            if prefix_len == 0 {
                fatal_error!(self, ParserEmptyQName, "QName is empty.");
                return Err(XMLError::ParserEmptyQName);
            }
            return Ok(0);
        }

        if prefix_len == 0 {
            // Report now, but keep parsing so the local part is still consumed.
            fatal_error!(
                self,
                ParserEmptyQNamePrefix,
                "':' is found in QName, but its prefix is empty."
            );
        }
        buffer.push(':');
        self.locator.update_column(|col| col + 1);

        let local_start = buffer.len();
        self.parse_ncname_allow_empty(buffer)?;

        if buffer.len() == local_start {
            fatal_error!(
                self,
                ParserEmptyQNameLocalPart,
                "':' is found in QName, but its local part is empty."
            );
            return Err(XMLError::ParserEmptyQNameLocalPart);
        }
        if prefix_len == 0 {
            return Err(XMLError::ParserEmptyQNamePrefix);
        }
        Ok(prefix_len)
    }

    /// Checks that `nmtoken` is a non-empty string made only of name characters.
    ///
    /// # Errors
    ///
    /// - `XMLError::ParserEmptyNmtoken` if `nmtoken` is empty.
    /// - `XMLError::ParserInvalidNameChar` if any character is rejected by
    ///   `is_name_char`.
    pub fn validate_nmtoken(&self, nmtoken: &str) -> Result<(), XMLError> {
        if nmtoken.is_empty() {
            return Err(XMLError::ParserEmptyNmtoken);
        }
        if nmtoken.chars().all(|ch| self.is_name_char(ch)) {
            Ok(())
        } else {
            Err(XMLError::ParserInvalidNameChar)
        }
    }

    /// Checks that `name` is a name start character followed by zero or more
    /// name characters.
    ///
    /// # Errors
    ///
    /// - `XMLError::ParserEmptyName` if `name` is empty.
    /// - `XMLError::ParserInvalidNameStartChar` if the first character is
    ///   rejected by `is_name_start_char`.
    /// - `XMLError::ParserInvalidNameChar` if a later character is rejected by
    ///   `is_name_char`.
    pub fn validate_name(&self, name: &str) -> Result<(), XMLError> {
        let mut chars = name.chars();
        let Some(first) = chars.next() else {
            return Err(XMLError::ParserEmptyName);
        };
        if !self.is_name_start_char(first) {
            return Err(XMLError::ParserInvalidNameStartChar);
        }

        if chars.all(|ch| self.is_name_char(ch)) {
            Ok(())
        } else {
            Err(XMLError::ParserInvalidNameChar)
        }
    }

    /// Checks that `name` is a valid name that contains no `':'`.
    ///
    /// # Errors
    ///
    /// - `XMLError::ParserEmptyNCName` if `name` is empty.
    /// - `XMLError::ParserInvalidNCNameStartChar` if the first character is `':'`
    ///   or is rejected by `is_name_start_char`.
    /// - `XMLError::ParserInvalidNCNameChar` if a later character is `':'` or is
    ///   rejected by `is_name_char`.
    pub fn validate_ncname(&self, name: &str) -> Result<(), XMLError> {
        let mut chars = name.chars();
        let Some(first) = chars.next() else {
            return Err(XMLError::ParserEmptyNCName);
        };
        if first == ':' || !self.is_name_start_char(first) {
            return Err(XMLError::ParserInvalidNCNameStartChar);
        }

        if chars.any(|ch| ch == ':' || !self.is_name_char(ch)) {
            Err(XMLError::ParserInvalidNCNameChar)
        } else {
            Ok(())
        }
    }

    /// Checks that `name` is a QName: either a single NCName, or two NCNames
    /// joined by exactly one `':'`.
    ///
    /// # Errors
    ///
    /// - `XMLError::ParserEmptyQName` if `name` is empty.
    /// - `XMLError::ParserEmptyQNamePrefix` if `name` starts with `':'`.
    /// - `XMLError::ParserInvalidNCNameStartChar` if the first character is
    ///   rejected by `is_name_start_char`.
    /// - `XMLError::ParserInvalidQNameSeparator` if the first part is followed by
    ///   a character that is neither a name character nor `':'`.
    /// - `XMLError::ParserEmptyQNameLocalPart` if nothing follows the `':'`.
    /// - Any error from `validate_ncname` applied to the part after the `':'`.
    pub fn validate_qname(&self, mut name: &str) -> Result<(), XMLError> {
        // Classify the first character.
        let mut chars = name.chars();
        match chars.next() {
            None => return Err(XMLError::ParserEmptyQName),
            Some(':') => return Err(XMLError::ParserEmptyQNamePrefix),
            Some(first) if !self.is_name_start_char(first) => {
                return Err(XMLError::ParserInvalidNCNameStartChar);
            }
            Some(_) => {}
        }
        name = chars.as_str();

        // Find where the first NCName stops; if it never stops, this is an
        // UnprefixedName.
        let Some(stop) = name.find(|ch: char| ch == ':' || !self.is_name_char(ch)) else {
            return Ok(());
        };
        name = &name[stop..];

        match name.strip_prefix(':') {
            None => Err(XMLError::ParserInvalidQNameSeparator),
            Some("") => Err(XMLError::ParserEmptyQNameLocalPart),
            Some(local) => self.validate_ncname(local),
        }
    }

    /// Checks that `nmtokens` is a list of Nmtokens separated by single spaces.
    ///
    /// # Errors
    ///
    /// - `XMLError::ParserEmptyNmtokens` if `nmtokens` is empty.
    /// - Otherwise, the first error produced by `validate_nmtoken` for any item,
    ///   as reported through `validate_names`.
    pub fn validate_nmtokens(&self, nmtokens: &str) -> Result<(), XMLError> {
        match nmtokens {
            "" => Err(XMLError::ParserEmptyNmtokens),
            list => self.validate_names(list, |token| self.validate_nmtoken(token)),
        }
    }

    /// Splits `names` on the space character (U+0020) and runs `name_validation`
    /// on every piece, stopping at the first failure.
    ///
    /// Consecutive, leading or trailing spaces produce empty pieces, which are
    /// passed to `name_validation` like any other piece.
    ///
    /// # Errors
    ///
    /// - `XMLError::ParserEmptyNames` if `names` is empty.
    /// - Otherwise, the first error returned by `name_validation`.
    pub fn validate_names(
        &self,
        names: &str,
        name_validation: impl Fn(&str) -> Result<(), XMLError>,
    ) -> Result<(), XMLError> {
        if names.is_empty() {
            return Err(XMLError::ParserEmptyNames);
        }
        names
            .split('\x20')
            .try_for_each(|item| name_validation(item))
    }
}