// html5ever 0.18.0
//
// High-performance browser-grade HTML5 parser
// Documentation
// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::{Tokenizer, TokenSink};
use buffer_queue::BufferQueue;
use data;
use tendril::StrTendril;
use util::str::{is_ascii_alnum};

use std::char::from_u32;
use std::borrow::Cow::Borrowed;

pub use self::Status::*;
use self::State::*;

//§ tokenizing-character-references
/// The outcome of tokenizing a single character reference.
///
/// A reference yields zero, one, or two characters; `num_chars` says how
/// many leading entries of `chars` are meaningful.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}

/// What a single call to `CharRefTokenizer::step` accomplished.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Status {
    /// The step could not obtain the input character it needed.
    /// NOTE(review): presumably the caller should supply more input (or
    /// call `end_of_file`) before stepping again -- confirm against caller.
    Stuck,
    /// The state machine advanced; no result is available yet.
    Progress,
    /// A result has been stored and can be taken with `get_result`.
    Done,
}

/// Internal states of the character reference state machine.
#[derive(Debug)]
enum State {
    /// Initial state: nothing after the `&` has been examined yet.
    Begin,
    /// A `#` has been consumed; next we look for an optional `x`/`X`.
    Octothorpe,
    /// Reading digits of a numeric reference in the given base (10 or 16).
    Numeric(u32), // base
    /// At least one digit was read and a non-digit was seen; an optional
    /// trailing `;` is consumed here before the number is converted.
    NumericSemicolon,
    /// Accumulating a name and matching it against `data::NAMED_ENTITIES`.
    Named,
    /// The name cannot match any entity, but we keep consuming ASCII
    /// alphanumerics to find out whether it ends in `;` (a parse error).
    BogusName,
}

/// State machine that tokenizes exactly one character reference.
///
/// Drive it with `step` until it reports `Done` (or call `end_of_file`),
/// then take the outcome with `get_result`.
pub struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// The "additional allowed character", if any. Its presence is also
    /// used as the signal that we are inside an attribute value.
    addnl_allowed: Option<char>,
    /// The finished result; `None` until tokenization is complete.
    result: Option<CharRef>,

    /// Value accumulated so far for a numeric reference (wrapping arithmetic).
    num: u32,
    /// Set once the accumulated value has exceeded 0x10FFFF; the value in
    /// `num` is no longer trusted after that.
    num_too_big: bool,
    /// Whether at least one digit of a numeric reference has been consumed.
    seen_digit: bool,
    /// The `x` or `X` that was consumed after `#`, kept so that it can be
    /// pushed back onto the input if no digits follow.
    hex_marker: Option<char>,

    /// Characters consumed so far for a named reference; created when the
    /// `Named` state is entered and taken when the name is unconsumed.
    name_buf_opt: Option<StrTendril>,
    /// Most recent full match from `data::NAMED_ENTITIES`, as a pair of
    /// code points (the second is 0 when the entity is a single character).
    name_match: Option<(u32, u32)>,
    /// Length of the name buffer at the time `name_match` was recorded;
    /// used as a slice offset into the buffer.
    name_len: usize,
}

impl CharRefTokenizer {
    /// Creates a tokenizer in its initial state.
    ///
    /// NB: an additional allowed character is assumed to be present iff the
    /// reference is being tokenized inside an attribute value.
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
        CharRefTokenizer {
            // Overall progress.
            state: Begin,
            result: None,
            addnl_allowed: addnl_allowed,

            // Numeric reference bookkeeping.
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,

            // Named reference bookkeeping.
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    /// Returns the finished result, consuming the tokenizer (it can only
    /// tokenize a single character reference).
    ///
    /// # Panics
    ///
    /// Panics if no result has been produced yet.
    pub fn get_result(self) -> CharRef {
        let finished = self.result;
        finished.expect("get_result called before done")
    }

    /// Shared access to the name buffer; panics if it has not been created.
    fn name_buf(&self) -> &StrTendril {
        let buf = self.name_buf_opt.as_ref();
        buf.expect("name_buf missing in named character reference")
    }

    /// Mutable access to the name buffer; panics if it has not been created.
    fn name_buf_mut(&mut self) -> &mut StrTendril {
        let buf = self.name_buf_opt.as_mut();
        buf.expect("name_buf missing in named character reference")
    }

    /// Records a result containing no characters and reports `Done`.
    fn finish_none(&mut self) -> Status {
        let empty = CharRef {
            num_chars: 0,
            chars: ['\0', '\0'],
        };
        self.result = Some(empty);
        Done
    }

    /// Records a result containing the single character `c` and reports `Done`.
    fn finish_one(&mut self, c: char) -> Status {
        let single = CharRef {
            num_chars: 1,
            chars: [c, '\0'],
        };
        self.result = Some(single);
        Done
    }
}

impl CharRefTokenizer {
    /// Runs one step of the state machine, dispatching on the current state.
    ///
    /// Once a result has been recorded, every further call simply reports
    /// `Done` without touching the input.
    pub fn step<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        if self.result.is_none() {
            debug!("char ref tokenizer stepping in state {:?}", self.state);
            return match self.state {
                Begin => self.do_begin(tokenizer, input),
                Octothorpe => self.do_octothorpe(tokenizer, input),
                Numeric(base) => self.do_numeric(tokenizer, input, base),
                NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
                Named => self.do_named(tokenizer, input),
                BogusName => self.do_bogus_name(tokenizer, input),
            };
        }

        // Already finished.
        Done
    }

    /// Handles the `Begin` state: decides, from the next (peeked) character,
    /// whether this is no reference at all, a numeric one, or a named one.
    fn do_begin<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        let next = unwrap_or_return!(tokenizer.peek(input), Stuck);

        // These characters (and the additional allowed character, if any)
        // mean there is no character reference here.
        let not_a_reference = match next {
            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => true,
            other => Some(other) == self.addnl_allowed,
        };
        if not_a_reference {
            return self.finish_none();
        }

        if next == '#' {
            tokenizer.discard_char(input);
            self.state = Octothorpe;
        } else {
            // Leave the character in the input; the `Named` state consumes it.
            self.state = Named;
            self.name_buf_opt = Some(StrTendril::new());
        }
        Progress
    }

    /// Handles the state after `#`: an `x`/`X` selects hexadecimal (and is
    /// consumed and remembered); anything else selects decimal and is left
    /// in the input.
    fn do_octothorpe<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        let is_hex_marker = c == 'x' || c == 'X';

        if is_hex_marker {
            tokenizer.discard_char(input);
            self.hex_marker = Some(c);
            self.state = Numeric(16);
        } else {
            self.hex_marker = None;
            self.state = Numeric(10);
        }

        Progress
    }

    /// Handles the `Numeric` state: consumes one digit in `base` and folds
    /// it into the accumulated value, or moves on when a non-digit is seen.
    fn do_numeric<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue,
            base: u32)
            -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);

        if let Some(digit) = c.to_digit(base) {
            tokenizer.discard_char(input);
            let scaled = self.num.wrapping_mul(base);
            if scaled > 0x10FFFF {
                // We might overflow, and the character is definitely invalid.
                // We still parse digits and semicolon, but don't use the result.
                self.num_too_big = true;
            }
            self.num = scaled.wrapping_add(digit);
            self.seen_digit = true;
            return Progress;
        }

        if self.seen_digit {
            // Digits are over; look for the terminating semicolon next.
            self.state = NumericSemicolon;
            Progress
        } else {
            // No digits at all: put back what was consumed after the `&`.
            self.unconsume_numeric(tokenizer, input)
        }
    }

    /// Handles the `NumericSemicolon` state: consumes a trailing `;` if
    /// present (otherwise reports a parse error) and finishes the number.
    fn do_numeric_semicolon<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        if c == ';' {
            tokenizer.discard_char(input);
        } else {
            tokenizer.emit_error(Borrowed("Semicolon missing after numeric character reference"));
        }
        self.finish_numeric(tokenizer)
    }

    /// Gives up on a numeric reference that has no digits: pushes the `#`
    /// (and the hex marker, if one was consumed) back onto the front of the
    /// input, reports a parse error, and finishes with no characters.
    fn unconsume_numeric<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        let mut pushed_back = StrTendril::from_char('#');
        if let Some(marker) = self.hex_marker {
            pushed_back.push_char(marker);
        }
        input.push_front(pushed_back);

        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    /// Converts the accumulated number into the resulting character,
    /// reporting a parse error for values that are replaced or flagged.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        // Inclusive range test.
        fn within(n: u32, lo: u32, hi: u32) -> bool {
            lo <= n && n <= hi
        }

        let n = self.num;

        // Out of range, NUL, or a surrogate: replaced by U+FFFD.
        let unrepresentable = self.num_too_big
            || n > 0x10FFFF
            || n == 0x00
            || within(n, 0xD800, 0xDFFF);

        let (c, error) = if unrepresentable {
            ('\u{fffd}', true)
        } else if within(n, 0x80, 0x9F) {
            // Always an error; some values have a replacement in the table.
            match data::C1_REPLACEMENTS[(n - 0x80) as usize] {
                Some(replacement) => (replacement, true),
                None => (conv(n), true),
            }
        } else {
            // Kept as-is, but flagged as an error for these values.
            let flagged = within(n, 0x01, 0x08)
                || n == 0x0B
                || within(n, 0x0D, 0x1F)
                || n == 0x7F
                || within(n, 0xFDD0, 0xFDEF)
                || (n & 0xFFFE) == 0xFFFE;
            (conv(n), flagged)
        };

        if error {
            let msg = format_if!(tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}", n);
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    /// Handles the `Named` state: consumes one character, appends it to the
    /// name buffer, and checks the buffer against the entity table.
    fn do_named<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);

        let lookup = data::NAMED_ENTITIES.get(&self.name_buf()[..]).cloned();
        let entry = match lookup {
            // We have either a full match or a prefix of one.
            Some(entry) => entry,
            // Can't continue the match.
            None => return self.finish_named(tokenizer, input, Some(c)),
        };

        if entry.0 != 0 {
            // We have a full match, but there might be a longer one to come.
            self.name_match = Some(entry);
            self.name_len = self.name_buf().len();
        }
        // Otherwise we just have a prefix match.
        Progress
    }

    /// Reports a parse error for an invalid named reference. The form of
    /// the message is selected by `tokenizer.opts.exact_errors`; the detailed
    /// form includes the current contents of the name buffer.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
        let exact = tokenizer.opts.exact_errors;
        let msg = format_if!(exact,
            "Invalid character reference",
            "Invalid character reference &{}", self.name_buf());
        tokenizer.emit_error(msg);
    }

    /// Takes the name buffer and pushes everything in it back onto the
    /// front of the input. Panics if the buffer is absent.
    fn unconsume_name(&mut self, input: &mut BufferQueue) {
        let consumed = self.name_buf_opt.take().unwrap();
        input.push_front(consumed);
    }

    /// Concludes a named reference once the name can no longer be extended.
    ///
    /// `end_char` is the character that stopped the match, or `None` when
    /// called at end of file.
    fn finish_named<Sink: TokenSink>(&mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue,
            end_char: Option<char>) -> Status {
        let (c1, c2) = match self.name_match {
            Some(code_points) => code_points,

            // No entity ever matched.
            None => {
                if let Some(c) = end_char {
                    if is_ascii_alnum(c) {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    }

                    // Check length because &; is not a parse error.
                    if c == ';' && self.name_buf().len() > 1 {
                        self.emit_name_error(tokenizer);
                    }
                }
                self.unconsume_name(input);
                return self.finish_none();
            }
        };

        // We have a complete match, but we may have consumed
        // additional characters into self.name_buf.  Usually
        // at least one, but several in cases like
        //
        //     &not    => match for U+00AC
        //     &noti   => valid prefix for &notin
        //     &notit  => can't continue match

        let name_len = self.name_len;
        assert!(name_len > 0);

        let (last_matched, next_after) = {
            let buf = self.name_buf();
            let last = buf[name_len-1..].chars().next().unwrap();
            // There might not be a next character after the match, if
            // we had a full match and then hit EOF.
            let next = buf[name_len..].chars().next();
            (last, next)
        };

        // "If the character reference is being consumed as part of an
        // attribute, and the last character matched is not a U+003B
        // SEMICOLON character (;), and the next character is either a
        // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
        // character, then, for historical reasons, all the characters
        // that were matched after the U+0026 AMPERSAND character (&)
        // must be unconsumed, and nothing is returned. However, if
        // this next character is in fact a U+003D EQUALS SIGN
        // character (=), then this is a parse error"

        let in_attribute = self.addnl_allowed.is_some();
        let unconsume_all = if last_matched == ';' {
            false
        } else if in_attribute && next_after == Some('=') {
            tokenizer.emit_error(Borrowed("Equals sign after character reference in attribute"));
            true
        } else if in_attribute && next_after.map_or(false, |c| is_ascii_alnum(c)) {
            true
        } else {
            tokenizer.emit_error(Borrowed("Character reference does not end with semicolon"));
            false
        };

        if unconsume_all {
            self.unconsume_name(input);
            return self.finish_none();
        }

        // Give back only what was consumed beyond the matched name.
        let leftover = StrTendril::from_slice(&self.name_buf()[name_len..]);
        input.push_front(leftover);

        let first = from_u32(c1).unwrap();
        let second = from_u32(c2).unwrap();
        self.result = Some(CharRef {
            chars: [first, second],
            num_chars: if c2 == 0 { 1 } else { 2 },
        });
        Done
    }

    /// Handles the `BogusName` state: keeps consuming ASCII alphanumerics;
    /// on any other character, reports an error if it is `;`, then puts the
    /// whole name back and finishes with no characters.
    fn do_bogus_name<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);

        if is_ascii_alnum(c) {
            return Progress;
        }
        if c == ';' {
            self.emit_name_error(tokenizer);
        }

        self.unconsume_name(input);
        self.finish_none()
    }

    /// Forces the tokenizer to a result when no more input will arrive,
    /// handling whichever state it is currently in. Loops until a result
    /// has been recorded.
    pub fn end_of_file<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue) {
        while self.result.is_none() {
            match self.state {
                Begin => {
                    let _ = self.finish_none();
                }

                Octothorpe => {
                    // Only the `#` was consumed; give it back.
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    let _ = self.finish_none();
                }

                Numeric(_) if !self.seen_digit => {
                    let _ = self.unconsume_numeric(tokenizer, input);
                }

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    let _ = self.finish_numeric(tokenizer);
                }

                Named => {
                    let _ = self.finish_named(tokenizer, input, None);
                }

                BogusName => {
                    self.unconsume_name(input);
                    let _ = self.finish_none();
                }
            }
        }
    }
}