// htmlite 0.12.0
//
// An HTML manipulation toolkit
// Documentation
use std::sync::Arc;

// Basically a Vec<char> with convenience methods for operations defined in the spec.
pub(crate) struct InputStream {
    input: Arc<String>,
    pos: usize,
    eof: bool,
    current_codepoint_size: u8,
    checkpoints: Vec<Checkpoint>,
}

struct Checkpoint {
    current_codepoint_size: u8,
    byte_offset: usize,
    eof: bool,
}

impl InputStream {
    pub(crate) fn new(input: &str) -> InputStream {
        // TODO: Do this normalization elsewhere,
        let normalized = input.replace("\r\n", "\n").replace("\r", "\n");
        InputStream {
            input: Arc::new(normalized),
            pos: 0,
            eof: false,
            current_codepoint_size: 0,
            checkpoints: Vec::new(),
        }
    }

    pub(crate) fn byte_offset(&self) -> usize {
        self.pos
    }

    // Look ahead `n` codepoints.
    // Returns an empty string if there are not enough codepoints left.
    pub(crate) fn lookahead(&mut self, n: usize) -> &str {
        debug_assert!(n > 0, "lookahead must be greater than 0");

        let mut offset = 0;

        for codepoint in self.remaining().chars().take(n) {
            offset += codepoint.len_utf8();
        }

        let peeked = &self.remaining()[..offset];

        if peeked.chars().count() < n {
            return "";
        }

        peeked
    }

    pub(crate) fn advance(&mut self, step: usize) {
        for _ in 0..step {
            let _ = self.consume();
        }
    }

    // https://html.spec.whatwg.org/multipage/parsing.html#next-input-character
    pub(crate) fn peek(&self) -> Option<char> {
        self.remaining().chars().next()
    }

    // For supporting the "reconsume" operation
    // See: https://html.spec.whatwg.org/multipage/parsing.html#reconsume
    pub(crate) fn reconsume(&mut self) {
        if self.pos == 0 {
            // If we haven't consumed anything yet, there is nothing do.
            return;
        };

        if self.eof {
            // If we are already consumed the end of the input, reconsuming should not do anything.
            // We want to continue giving you eof.
            return;
        }

        if self.current_codepoint_size == 0 {
            panic!("can't reconsume twice in a row")
        }

        self.pos -= self.current_codepoint_size as usize;
        self.current_codepoint_size = 0;
    }

    // The "consume next input character" operation
    pub(crate) fn consume(&mut self) -> Option<char> {
        let Some(next) = self.remaining().chars().next() else {
            self.eof = true;
            self.current_codepoint_size = 0;
            return None;
        };

        self.pos += next.len_utf8();
        self.current_codepoint_size = next.len_utf8() as u8;

        Some(next)
    }

    // Save the current position so we can later rewind to it
    pub(crate) fn mark(&mut self) {
        self.checkpoints.push(Checkpoint {
            byte_offset: self.pos,
            current_codepoint_size: self.current_codepoint_size,
            eof: self.eof,
        });
    }

    // The spec does not require a rewind operation explicitely.
    // But in practice, it is needed when tokenizing named character references.
    pub(crate) fn rewind(&mut self) {
        let Some(checkpoint) = self.checkpoints.pop() else {
            panic!("can't rewind input stream without available checkpoints")
        };

        self.eof = checkpoint.eof;
        self.pos = checkpoint.byte_offset;
        self.current_codepoint_size = checkpoint.current_codepoint_size;
    }

    pub(crate) fn get_input(&self) -> Arc<String> {
        self.input.clone()
    }

    fn remaining(&self) -> &str {
        &self.input[self.pos..]
    }
}

// Unit tests for `InputStream`: consume/reconsume semantics, lookahead over
// multi-byte codepoints, and mark/rewind checkpoints.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn consuming_an_empty_stream() {
        let mut stream = InputStream::new("");

        assert_eq!(stream.consume(), None);
        assert_eq!(stream.consume(), None);
        stream.reconsume();
        assert_eq!(stream.consume(), None);
    }

    #[test]
    fn reconsuming() {
        let mut stream = InputStream::new("abc");
        assert_eq!(stream.consume(), Some('a'));
        assert_eq!(stream.consume(), Some('b'));
        assert_eq!(stream.consume(), Some('c'));
        stream.reconsume();
        assert_eq!(stream.consume(), Some('c'));
        assert_eq!(stream.consume(), None);
    }

    #[test]
    fn reconsuming_before_anything_has_been_consumed() {
        // Reconsuming before anything has been consumed should have no effect
        let mut stream = InputStream::new("abc");
        stream.reconsume();
        stream.reconsume();
        assert_eq!(stream.consume(), Some('a'));
    }

    #[test]
    fn can_reconsume_twice_after_exactly_one_call_to_consume() {
        // The first reconsume brings us back to position 0, where the second
        // one is a no-op rather than a panic.
        let mut stream = InputStream::new("abc");
        assert_eq!(stream.consume(), Some('a'));
        stream.reconsume();
        stream.reconsume();
        assert_eq!(stream.consume(), Some('a'));
        assert_eq!(stream.consume(), Some('b'));
    }

    #[test]
    fn reconsuming_after_end_of_stream_has_been_reached() {
        // Once eof has been consumed, reconsuming must keep yielding eof.
        let mut s = InputStream::new("a");
        assert_eq!(s.consume(), Some('a'));
        assert_eq!(s.consume(), None);
        s.reconsume();
        s.reconsume();
        s.reconsume();
        assert_eq!(s.consume(), None);
    }

    #[test]
    #[should_panic = "can't reconsume twice in a row"]
    fn reconsuming_twice_in_row_in_the_middle_of_the_input() {
        let mut s = InputStream::new("ab");
        assert_eq!(s.consume(), Some('a'));
        assert_eq!(s.consume(), Some('b'));
        s.reconsume();
        s.reconsume();
    }

    // Despite the name, the contract checked here is that `lookahead` returns an
    // empty string when fewer than `n` *codepoints* are left. Each emoji below
    // is a single codepoint spanning several bytes.
    #[test]
    fn lookahead_returns_none_when_there_is_not_enough_bytes() {
        let mut s = InputStream::new("🙈🙉🙊");

        assert_eq!(s.lookahead(3), "🙈🙉🙊");

        assert_eq!(s.consume(), Some('🙈'));

        assert_eq!(s.lookahead(3), "");
        assert_eq!(s.lookahead(2), "🙉🙊");

        assert_eq!(s.consume(), Some('🙉'));

        assert_eq!(s.lookahead(2), "");
        assert_eq!(s.lookahead(1), "🙊");

        assert_eq!(s.consume(), Some('🙊'));

        assert_eq!(s.lookahead(1), "");
    }

    #[test]
    fn rewinding_to_the_start_of_the_stream() {
        let mut s = InputStream::new("abc");

        s.mark();
        assert_eq!(s.consume(), Some('a'));
        assert_eq!(s.consume(), Some('b'));
        assert_eq!(s.consume(), Some('c'));
        assert_eq!(s.consume(), None);

        // Rewinding must also clear the eof state reached above.
        s.rewind();
        assert_eq!(s.consume(), Some('a'));
        assert_eq!(s.consume(), Some('b'));
        assert_eq!(s.consume(), Some('c'));
        assert_eq!(s.consume(), None);
    }

    #[test]
    fn rewinding_after_reconsuming() {
        let mut s = InputStream::new("abc");
        assert_eq!(s.consume(), Some('a'));
        assert_eq!(s.consume(), Some('b'));
        s.reconsume();
        // The checkpoint is taken just before 'b', after it was reconsumed.
        s.mark();

        assert_eq!(s.consume(), Some('b'));
        assert_eq!(s.consume(), Some('c'));
        assert_eq!(s.consume(), None);

        s.rewind();

        assert_eq!(s.consume(), Some('b'));
        assert_eq!(s.consume(), Some('c'));
        assert_eq!(s.consume(), None);
    }
}