use super::{
error::{CompilationError, CompilationErrorKind as ErrorKind, ErrorHandler},
util::{non_whitespace, VStr},
Name, Position, SourceLocation,
};
use rustc_hash::FxHashSet;
use std::{iter::FusedIterator, str::Bytes};
#[cfg(feature = "serde")]
use serde::Serialize;
/// An attribute scanned from a tag, together with the source spans it occupies.
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct Attribute<'a> {
/// Attribute name exactly as it was scanned from the source.
pub name: Name<'a>,
/// Attribute value; `None` when no value was scanned for this attribute.
pub value: Option<AttributeValue<'a>>,
/// Span covering only the attribute name.
pub name_loc: SourceLocation,
/// Span covering the whole attribute, starting at its name.
pub location: SourceLocation,
}
impl<'a> Attribute<'a> {
    /// Returns `true` when the attribute has no value, or when its value
    /// contains nothing but whitespace.
    pub fn has_empty_val(&self) -> bool {
        match &self.value {
            None => true,
            Some(val) => !val.content.contains(non_whitespace),
        }
    }
}
/// The value part of an [`Attribute`].
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct AttributeValue<'a> {
/// Value text (without the surrounding quotes for quoted values).
pub content: VStr<'a>,
/// Source span recorded while the value was scanned.
pub location: SourceLocation,
}
/// A scanned tag: its name, its attributes and whether it ended with `/>`.
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct Tag<'a> {
/// Tag name as written in the source.
pub name: Name<'a>,
/// Attributes in source order; duplicated names are reported and dropped by the scanner.
pub attributes: Vec<Attribute<'a>>,
/// `true` when the tag was closed with `/>` rather than `>`.
pub self_closing: bool,
}
/// A token produced by the scanner.
#[cfg_attr(feature = "serde", derive(Serialize))]
pub enum Token<'a> {
/// A start tag together with its attributes.
StartTag(Tag<'a>),
// `EndTag` carries the tag name, `Text` carries character data, and
// `Comment` carries the comment body without its delimiters.
EndTag(Name<'a>), Text(VStr<'a>), Comment(&'a str),
// `Interpolation` carries the source found between the interpolation delimiters.
Interpolation(&'a str), }
/// Wraps a string slice in a [`Token::Text`] using `VStr::raw`, i.e. without
/// running any decoding step on it.
impl<'a> From<&'a str> for Token<'a> {
    fn from(decoded: &'a str) -> Self {
        let text = VStr::raw(decoded);
        Self::Text(text)
    }
}
/// Configuration for a [`Scanner`].
#[derive(Clone)]
pub struct ScanOption {
/// Opening and closing interpolation delimiters, in that order.
pub delimiters: (String, String),
/// Called with a start tag's name to pick the text mode used for the
/// content that follows that tag.
pub get_text_mode: fn(&str) -> TextMode,
}
impl Default for ScanOption {
fn default() -> Self {
Self {
delimiters: ("{{".into(), "}}".into()),
get_text_mode: |_| TextMode::Data,
}
}
}
/// Lets a consumer tell the token source whether it is currently inside the
/// HTML namespace, which decides how `<![CDATA[` is scanned.
pub trait FlagCDataNs {
/// Sets whether the scanner should consider itself inside the HTML namespace.
fn set_is_in_html(&mut self, flag: bool);
/// Reports whether the consumer needs to keep the namespace flag up to date.
/// The implementation in this file returns `true` while the remaining source
/// still contains `<![CDATA[`.
fn need_flag_hint(&self) -> bool;
}
/// Exposes position information of a token source.
pub trait Locatable {
/// Position of the next unscanned character.
fn current_position(&self) -> Position;
/// Position recorded at the start of the most recently produced token.
fn last_position(&self) -> Position;
/// Builds a location that starts at `start` and ends at the current position.
fn get_location_from(&self, start: Position) -> SourceLocation;
}
/// The scanning mode applied to the content that follows a start tag.
#[derive(PartialEq, Eq)]
pub enum TextMode {
/// Tags, comments and interpolations are all recognised.
Data,
/// Interpolations are recognised; everything else up to the matching end
/// tag is text.
RcData,
/// Everything up to the matching end tag is text.
RawText,
}
/// Reusable scanner configuration from which token streams are created.
pub struct Scanner {
// Options cloned into every token stream created by `scan`.
option: ScanOption,
// First character of the opening interpolation delimiter, cached at construction.
delimiter_first_char: char,
}
impl Scanner {
    /// Creates a scanner from `option`.
    ///
    /// # Panics
    ///
    /// Panics when the opening interpolation delimiter is empty.
    pub fn new(option: ScanOption) -> Self {
        let delimiter_first_char = option
            .delimiters
            .0
            .chars()
            .next()
            .expect("interpolation delimiter cannot be empty");
        Self {
            option,
            delimiter_first_char,
        }
    }

    /// Returns a token stream over `source`. Errors met while scanning are
    /// reported to `err_handle`. The stream starts in [`TextMode::Data`] and
    /// assumes the HTML namespace.
    pub fn scan<'a, E>(&self, source: &'a str, err_handle: E) -> impl TokenSource<'a>
    where
        E: ErrorHandler,
    {
        let option = self.option.clone();
        let delimiter_first_char = self.delimiter_first_char;
        Tokens {
            source,
            err_handle,
            option,
            delimiter_first_char,
            position: Default::default(),
            last_pos: Default::default(),
            mode: TextMode::Data,
            last_start_tag_name: None,
            is_in_html_namespace: true,
        }
    }
}
/// Iterator state of one scan: yields [`Token`]s from the remaining source.
pub struct Tokens<'a, E: ErrorHandler> {
// Remaining, not yet scanned input; shrinks as the scanner moves forward.
source: &'a str,
// Receiver of all errors emitted while scanning.
err_handle: E,
// Position of the first character of `source`.
position: Position,
// Position captured at the start of the latest `next` call.
last_pos: Position,
// Mode used to scan the next token.
mode: TextMode,
/// Options this stream was created with.
pub option: ScanOption,
// Name of the last start tag that switched the mode away from `Data`;
// used to find the matching end tag in RCDATA/RAWTEXT content.
last_start_tag_name: Option<&'a str>,
// When `true`, `<![CDATA[` is reported as an error and scanned as a bogus comment.
is_in_html_namespace: bool,
// First character of the opening interpolation delimiter.
delimiter_first_char: char,
}
impl<'a, C: ErrorHandler> Tokens<'a, C> {
/// Scans the next token in [`TextMode::Data`].
///
/// Looks for the next `<` or the first character of the opening
/// interpolation delimiter. Anything before that candidate is returned as
/// text; a candidate at the very start is dispatched to tag or interpolation
/// scanning. A lone delimiter character that does not start a full delimiter
/// is treated as text and the search continues behind it.
fn scan_data(&mut self) -> Token<'a> {
    debug_assert!(self.mode == TextMode::Data);
    debug_assert!(!self.source.is_empty());
    let d = self.delimiter_first_char;
    let mut offset = 0;
    while let Some(i) = self.source[offset..].find(&['<', d][..]) {
        // `find` ran on a sub-slice: translate its result back into an index
        // of `self.source`. Using the relative index directly cut text at the
        // wrong place and could loop forever on input such as `{<div>`.
        let index = offset + i;
        if index != 0 {
            return self.scan_text(index);
        } else if self.source.starts_with('<') {
            return self.scan_tag_open();
        } else if self.source.starts_with(&self.option.delimiters.0) {
            return self.scan_interpolation();
        } else {
            // The source starts with a lone `d`; skip the whole character so
            // the next slice stays on a char boundary.
            offset = d.len_utf8();
        }
    }
    // No tag or interpolation candidate left: the rest is text.
    self.scan_text(self.source.len())
}
/// Consumes `size` bytes of source and returns them as a decoded text token.
/// `size` must be non-zero.
fn scan_text(&mut self, size: usize) -> Token<'a> {
    debug_assert!(matches!(self.mode, TextMode::Data | TextMode::RcData));
    debug_assert_ne!(size, 0);
    let raw = self.move_by(size);
    let decoded = self.decode_text(raw, false);
    Token::Text(decoded)
}
/// Scans an interpolation that starts at the current position.
///
/// Returns the source between the opening and closing delimiters. When the
/// closing delimiter is missing, the rest of the source is consumed, a
/// `MissingInterpolationEnd` error is emitted and everything after the
/// opening delimiter is returned.
fn scan_interpolation(&mut self) -> Token<'a> {
    let (open_len, close_len) = {
        let (open, close) = &self.option.delimiters;
        debug_assert!(self.source.starts_with(open.as_str()));
        (open.len(), close.len())
    };
    // Search behind the opening delimiter so the closing one can never match
    // inside it, and use the real delimiter length instead of assuming 2.
    let close_at = self.source[open_len..]
        .find(self.option.delimiters.1.as_str())
        .map(|i| i + open_len);
    match close_at {
        Some(end) => {
            let src = &self.move_by(end)[open_len..];
            self.move_by(close_len);
            Token::Interpolation(src)
        }
        None => {
            let src = self.move_by(self.source.len());
            self.emit_error(ErrorKind::MissingInterpolationEnd);
            Token::Interpolation(&src[open_len..])
        }
    }
}
/// Dispatches on what follows a `<`: end tag, comment-like construct,
/// `<?` bogus comment, or start tag. A `<` that cannot open a tag is
/// reported and returned as the text `<`.
fn scan_tag_open(&mut self) -> Token<'a> {
    let src = self.source;
    if src.starts_with("</") {
        return self.scan_end_tag_open();
    }
    if src.starts_with("<!") {
        return self.scan_comment_and_like();
    }
    if src.starts_with("<?") {
        self.emit_error(ErrorKind::UnexpectedQuestionMarkInsteadOfTagName);
        return self.scan_bogus_comment();
    }
    // Either a real start tag or a stray `<` that is emitted as text.
    let kind = if src.len() == 1 {
        ErrorKind::EofBeforeTagName
    } else if !src[1..].starts_with(ascii_alpha) {
        ErrorKind::InvalidFirstCharacterOfTagName
    } else {
        return self.scan_start_tag();
    };
    self.move_by(1);
    self.emit_error(kind);
    Token::from("<")
}
/// Scans a start tag and switches the text mode according to
/// `option.get_text_mode`. When the new mode is not `Data`, the tag name is
/// remembered so the matching end tag can be located later.
fn scan_start_tag(&mut self) -> Token<'a> {
    debug_assert!(self.source.starts_with('<'));
    self.move_by(1);
    let tag = self.scan_tag_name();
    self.mode = (self.option.get_text_mode)(tag.name);
    if self.mode != TextMode::Data {
        self.last_start_tag_name = Some(tag.name);
    }
    Token::StartTag(tag)
}
/// Scans a tag name, its attributes and the closing `>` / `/>`.
/// Emits `EofInTag` when the source ends before the tag is closed.
fn scan_tag_name(&mut self) -> Tag<'a> {
    debug_assert!(self.source.starts_with(ascii_alpha));
    let name_len = scan_tag_name_length(self.source.bytes());
    debug_assert!(name_len > 0);
    let name = self.move_by(name_len);
    let attributes = self.scan_attributes();
    let at_eof = self.source.is_empty();
    if at_eof {
        self.emit_error(ErrorKind::EofInTag);
    }
    // Only try to consume the closer when there is something left to scan.
    let self_closing = !at_eof && self.scan_close_start_tag();
    Tag {
        name,
        attributes,
        self_closing,
    }
}
/// Scans attributes until the tag is about to close (or the source ends).
/// An attribute whose name was already seen is reported as
/// `DuplicateAttribute` and dropped.
fn scan_attributes(&mut self) -> Vec<Attribute<'a>> {
    let mut seen = FxHashSet::default();
    let mut attributes = Vec::new();
    loop {
        self.skip_whitespace();
        if self.is_about_to_close_tag() {
            break;
        }
        if self.did_skip_slash_in_tag() {
            continue;
        }
        let attr = self.scan_attribute();
        // `insert` returns false when the name is already present.
        if seen.insert(attr.name) {
            attributes.push(attr);
        } else {
            self.emit_error(ErrorKind::DuplicateAttribute);
        }
    }
    attributes
}
/// Scans one attribute: its name and, when an `=` follows, its value.
fn scan_attribute(&mut self) -> Attribute<'a> {
    debug_assert!(!self.source.is_empty());
    let begin = self.current_position();
    let name = self.scan_attr_name();
    let name_loc = self.get_location_from(begin.clone());
    self.skip_whitespace();
    // The checks run in this order on purpose: `did_skip_slash_in_tag`
    // consumes input and must not run on an empty source.
    let without_value = self.is_about_to_close_tag()
        || self.did_skip_slash_in_tag()
        || !self.source.starts_with('=');
    let value = if without_value {
        None
    } else {
        // Step over the `=` sign.
        self.move_by(1);
        self.scan_attr_value()
    };
    let location = self.get_location_from(begin);
    Attribute {
        name,
        value,
        name_loc,
        location,
    }
}
/// Returns `true` when the source is exhausted or starts with `>` or `/>`.
fn is_about_to_close_tag(&self) -> bool {
    matches!(
        self.source.as_bytes(),
        [] | [b'>', ..] | [b'/', b'>', ..]
    )
}
/// Consumes a stray `/` inside a tag and reports `UnexpectedSolidusInTag`.
/// Returns whether a slash was skipped.
fn did_skip_slash_in_tag(&mut self) -> bool {
    debug_assert!(!self.source.is_empty());
    if !self.source.starts_with('/') {
        return false;
    }
    self.move_by(1);
    self.emit_error(ErrorKind::UnexpectedSolidusInTag);
    true
}
/// Scans an attribute name. A leading `=` is reported and kept as part of
/// the name; `<`, `"` and `'` inside the name are reported as
/// `UnexpectedCharacterInAttributeName`.
fn scan_attr_name(&mut self) -> &'a str {
    debug_assert!(is_valid_name_char(self.source.as_bytes()[0]));
    let leading_equals = self.source.starts_with('=');
    if leading_equals {
        self.emit_error(ErrorKind::UnexpectedEqualsSignBeforeAttributeName);
    }
    let offset = usize::from(leading_equals);
    let count = self.source.as_bytes()[offset..]
        .iter()
        .take_while(|&&byte| semi_valid_attr_name(byte))
        .count();
    let name = self.move_by(offset + count);
    if name.chars().any(|c| matches!(c, '<' | '"' | '\'')) {
        self.emit_error(ErrorKind::UnexpectedCharacterInAttributeName);
    }
    name
}
/// Scans the value after an `=`. Returns `None` (after reporting
/// `MissingAttributeValue`) when the tag closes immediately, or when the
/// quoted/unquoted scanner yields no value.
fn scan_attr_value(&mut self) -> Option<AttributeValue<'a>> {
    self.skip_whitespace();
    if self.source.starts_with('>') {
        self.emit_error(ErrorKind::MissingAttributeValue);
        return None;
    }
    let start = self.current_position();
    let first = self.source.chars().next();
    let content = match first {
        Some(quote) if quote == '"' || quote == '\'' => self.scan_quoted_attr_value(quote)?,
        _ => self.scan_unquoted_attr_value()?,
    };
    let location = self.get_location_from(start);
    Some(AttributeValue { content, location })
}
/// Scans a value wrapped in `quote`. Without a closing quote the rest of
/// the source becomes the value; `None` is returned only when nothing
/// follows the opening quote. Reports
/// `MissingWhitespaceBetweenAttributes` when another attribute follows
/// the value directly.
fn scan_quoted_attr_value(&mut self, quote: char) -> Option<VStr<'a>> {
    debug_assert!(self.source.starts_with(quote));
    self.move_by(1);
    let raw = match self.source.find(quote) {
        // Empty value: only the closing quote needs to be consumed.
        Some(0) => {
            self.move_by(1);
            ""
        }
        Some(end) => {
            let val = self.move_by(end);
            self.move_by(1);
            val
        }
        None if self.source.is_empty() => return None,
        None => self.move_by(self.source.len()),
    };
    // Short-circuit order matters: the later checks consume input.
    let separated = self.is_about_to_close_tag()
        || self.did_skip_slash_in_tag()
        || self.skip_whitespace() != 0;
    if !separated {
        self.emit_error(ErrorKind::MissingWhitespaceBetweenAttributes);
    }
    Some(self.decode_text(raw, true))
}
fn scan_unquoted_attr_value(&mut self) -> Option<VStr<'a>> {
let val_len = self
.source
.bytes()
.take_while(semi_valid_unquoted_attr_value)
.count();
if val_len == 0 {
debug_assert!(self.source.is_empty());
return None;
}
let src = self.move_by(val_len);
if src.contains(&['"', '\'', '<', '=', '`'][..]) {
self.emit_error(ErrorKind::UnexpectedCharacterInUnquotedAttributeValue);
}
Some(self.decode_text(src, true))
}
/// Consumes the `>` or `/>` that ends a tag and returns whether the tag
/// was self-closing.
fn scan_close_start_tag(&mut self) -> bool {
    debug_assert!(!self.source.is_empty());
    let self_closing = self.source.starts_with("/>");
    if self_closing {
        self.move_by(2);
    } else {
        debug_assert!(self.source.starts_with('>'));
        self.move_by(1);
    }
    self_closing
}
/// Handles input that starts with `</` by looking at the byte after it:
/// end of input, `>`, an ASCII letter (a real end tag) or anything else
/// (a bogus comment).
fn scan_end_tag_open(&mut self) -> Token<'a> {
    debug_assert!(self.source.starts_with("</"));
    let after_slash = self.source.as_bytes().get(2).copied();
    match after_slash {
        None => {
            self.emit_error(ErrorKind::EofBeforeTagName);
            Token::from(self.move_by(2))
        }
        Some(b'>') => {
            self.emit_error(ErrorKind::MissingEndTagName);
            self.move_by(3);
            Token::from("")
        }
        Some(byte) if byte.is_ascii_alphabetic() => self.scan_end_tag(),
        Some(_) => {
            self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
            self.scan_bogus_comment()
        }
    }
}
fn scan_end_tag(&mut self) -> Token<'a> {
debug_assert!(self.source.starts_with("</"));
self.move_by(2);
let tag = self.scan_tag_name();
if !tag.attributes.is_empty() {
self.emit_error(ErrorKind::EndTagWithAttributes);
}
if tag.self_closing {
self.emit_error(ErrorKind::EndTagWithTrailingSolidus);
}
self.mode = TextMode::Data;
Token::EndTag(tag.name)
}
/// Handles input that starts with `<!`: a comment, `<!DOCTYPE`, a CDATA
/// section, or anything else. CDATA is only scanned as such outside the
/// HTML namespace; otherwise it is reported and scanned as a bogus comment.
fn scan_comment_and_like(&mut self) -> Token<'a> {
    let src = self.source;
    if src.starts_with("<!--") {
        return self.scan_comment();
    }
    if src.starts_with("<!DOCTYPE") {
        return self.scan_bogus_comment();
    }
    let is_cdata = src.starts_with("<![CDATA[");
    if is_cdata && !self.is_in_html_namespace {
        return self.scan_cdata();
    }
    let kind = if is_cdata {
        ErrorKind::CDataInHtmlContent
    } else {
        ErrorKind::IncorrectlyOpenedComment
    };
    self.emit_error(kind);
    self.scan_bogus_comment()
}
/// Scans a `<!-- ... -->` comment and consumes its terminator.
/// Reports `EofInComment` when the terminator is missing and
/// `IncorrectlyClosedComment` when the comment ends with `--!>`.
fn scan_comment(&mut self) -> Token<'a> {
    debug_assert!(self.source.starts_with("<!--"));
    let comment_text = self.scan_comment_text();
    let rest = self.source;
    if rest.is_empty() {
        self.emit_error(ErrorKind::EofInComment);
        return Token::Comment(comment_text);
    }
    let terminator_len = if rest.starts_with("--!>") {
        self.emit_error(ErrorKind::IncorrectlyClosedComment);
        4
    } else {
        debug_assert!(rest.starts_with("-->"));
        3
    };
    self.move_by(terminator_len);
    Token::Comment(comment_text)
}
/// Consumes `<!--` and the comment body, returning the body.
///
/// On return the scanner sits on the comment terminator (`-->` or `--!>`),
/// or at the end of the source when the comment is unterminated. Comments
/// closed right after the opener (`<!-->`, `<!--->`) are reported as
/// `AbruptClosingOfEmptyComment` and yield an empty body. A `<!--` inside
/// the body is reported as `NestedComment`.
fn scan_comment_text(&mut self) -> &'a str {
    debug_assert!(self.source.starts_with("<!--"));
    // The comment ends at whichever terminator comes first. Preferring `-->`
    // unconditionally would swallow content behind an earlier `--!>`.
    let comment_end = match (self.source.find("-->"), self.source.find("--!>")) {
        (Some(normal), Some(bang)) => Some(normal.min(bang)),
        (normal, bang) => normal.or(bang),
    };
    let text = if let Some(end) = comment_end {
        debug_assert!(end >= 2, "first two chars must be <!");
        if end <= 3 {
            // The terminator overlaps the opener: there is no body at all.
            self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
            self.move_by(end);
            return "";
        }
        // Skip `<!--`; `end` was measured before the skip, hence `end - 4`.
        self.move_by(4);
        &self.source[..end - 4]
    } else {
        self.move_by(4);
        self.source
    };
    // Walk through the body so that nested-comment errors are reported at
    // the position right after each inner `<!--`.
    let mut s = text;
    while let Some(i) = s.find("<!--") {
        self.move_by(i + 4);
        if !self.source.is_empty() {
            self.emit_error(ErrorKind::NestedComment);
        }
        s = &s[i + 4..];
    }
    if !s.is_empty() {
        self.move_by(s.len());
    }
    text
}
/// Scans a bogus comment: everything up to and including the next `>` (or
/// the end of the source). The returned body skips the first two
/// characters, or only the `<` when the input starts with `<?`.
#[cold]
#[inline(never)]
fn scan_bogus_comment(&mut self) -> Token<'a> {
    let s = self.source;
    debug_assert! {
        s.starts_with("<!") || s.starts_with("<?") ||
        (
            s.starts_with("</") &&
            s[2..].starts_with(|c| {
                !matches!(c, 'a'..='z'|'A'..='Z'|'>')
            })
        )
    };
    let start = if s.starts_with("<?") { 1 } else { 2 };
    // `text_end` excludes the `>`, `consumed` includes it.
    let (text_end, consumed) = match s.find('>') {
        Some(end) => (end, end + 1),
        None => (s.len(), s.len()),
    };
    self.move_by(consumed);
    Token::Comment(&s[start..text_end])
}
/// Scans a `<![CDATA[ ... ]]>` section and returns its content as text.
/// Reports `EofInCdata` when the closing `]]>` is missing.
#[cold]
#[inline(never)]
fn scan_cdata(&mut self) -> Token<'a> {
    debug_assert!(self.source.starts_with("<![CDATA["));
    self.move_by(9);
    let end = self.source.find("]]>").unwrap_or(self.source.len());
    // An empty section (`<![CDATA[]]>` or `<![CDATA[` at EOF) has nothing to
    // consume; `move_by` requires a non-zero size.
    let text = if end == 0 { "" } else { self.move_by(end) };
    if self.source.is_empty() {
        self.emit_error(ErrorKind::EofInCdata);
    } else {
        debug_assert!(self.source.starts_with("]]>"));
        self.move_by(3);
    }
    Token::from(text)
}
/// Scans RAWTEXT content: everything up to the matching end tag becomes one
/// raw text token. The mode goes back to `Data`; when there is no content
/// before the end tag, scanning continues in data mode right away.
fn scan_rawtext(&mut self) -> Token<'a> {
    debug_assert!(self.mode == TextMode::RawText);
    debug_assert!(!self.source.is_empty());
    let end = self.find_appropriate_end();
    self.mode = TextMode::Data;
    if end == 0 {
        return self.scan_data();
    }
    Token::from(self.move_by(end))
}
/// Scans RCDATA content: interpolations are recognised, everything else up
/// to the matching end tag is text. The mode returns to `Data` once no
/// interpolation is left before the end tag.
fn scan_rcdata(&mut self) -> Token<'a> {
    debug_assert!(self.mode == TextMode::RcData);
    debug_assert!(!self.source.is_empty());
    if self.source.starts_with(self.option.delimiters.0.as_str()) {
        return self.scan_interpolation();
    }
    let end = self.find_appropriate_end();
    let next_interpolation = self.source.find(self.option.delimiters.0.as_str());
    match next_interpolation {
        // Text before an interpolation that is still inside the element.
        Some(start) if start < end => {
            debug_assert_ne!(start, 0);
            return self.scan_text(start);
        }
        _ => {}
    }
    self.mode = TextMode::Data;
    if end == 0 {
        self.scan_data()
    } else {
        self.scan_text(end)
    }
}
/// Finds the byte index of the end tag that closes the current
/// RAWTEXT/RCDATA element.
///
/// A match is `</` followed by the remembered start tag name (compared
/// ASCII case-insensitively) and then a byte that cannot continue a tag
/// name. Returns the length of the source when no such end tag exists.
///
/// # Panics
///
/// Panics when no start tag name has been recorded.
fn find_appropriate_end(&self) -> usize {
    let tag_name = self
        .last_start_tag_name
        .expect("RAWTEXT/RCDATA must appear inside a tag");
    let name = tag_name.as_bytes();
    // Compare on bytes: `name_end` may fall inside a multi-byte character,
    // where slicing the `str` itself would panic.
    let bytes = self.source.as_bytes();
    for (i, _) in self.source.match_indices("</") {
        let name_start = i + 2;
        let name_end = name_start + name.len();
        if name_end >= bytes.len() {
            // Later candidates start even further right, so none can fit.
            break;
        }
        let is_appropriate_end = bytes[name_start..name_end].eq_ignore_ascii_case(name);
        let terminated = !is_valid_name_char(bytes[name_end]);
        if is_appropriate_end && terminated {
            return i;
        }
    }
    bytes.len()
}
}
impl<'a, C: ErrorHandler> Tokens<'a, C> {
/// Reports `error_kind` to the error handler with a zero-width location at
/// the current position.
fn emit_error(&self, error_kind: ErrorKind) {
    let here = self.current_position();
    let location = self.get_location_from(here);
    self.err_handle
        .on_error(CompilationError::new(error_kind).with_location(location));
}
/// Wraps `src` in a `VStr` and runs its `decode` step; `is_attr` is passed
/// through to tell `decode` whether the text is an attribute value.
// NOTE(review): presumably `decode` handles character references — confirm
// against `VStr::decode`, which is defined outside this file.
fn decode_text(&self, src: &'a str, is_attr: bool) -> VStr<'a> {
*VStr::raw(src).decode(is_attr)
}
/// Advances the scanner by `size` bytes and returns the consumed slice.
///
/// Updates the current position: `offset` grows by the number of consumed
/// characters, `line` by the number of consumed newlines. `column` grows by
/// the consumed character count when no newline was consumed; otherwise it
/// is the number of characters from the last consumed newline (inclusive)
/// to the end of the consumed slice.
///
/// `size` must be non-zero and lie on a character boundary.
fn move_by(&mut self, size: usize) -> &'a str {
    debug_assert!(size > 0, "scanner must move forward");
    let source = self.source;
    let (ret, rest) = source.split_at(size);
    self.source = rest;
    let mut lines = 0;
    for byte in ret.bytes() {
        if byte == b'\n' {
            lines += 1;
        }
    }
    // `Option<usize>` replaces the former `-1` sentinel and its truncating
    // `usize as i32` cast.
    let last_new_line_pos = ret.rfind('\n');
    let offset = ret.chars().count();
    let pos = &mut self.position;
    pos.offset += offset;
    pos.line += lines;
    pos.column = match last_new_line_pos {
        Some(newline) => ret[newline..].chars().count() as u32,
        None => pos.column + offset as u32,
    };
    ret
}
/// Consumes leading whitespace and returns how many bytes were skipped.
fn skip_whitespace(&mut self) -> usize {
    let len = match self.source.find(non_whitespace) {
        Some(first_non_ws) => first_non_ws,
        None => self.source.len(),
    };
    if len > 0 {
        self.move_by(len);
    }
    len
}
}
/// Returns `true` for ASCII letters (`a`-`z`, `A`-`Z`).
#[inline]
fn ascii_alpha(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z')
}
/// Returns `true` when `c` may continue an attribute name: any name byte
/// except `=`.
#[inline]
fn semi_valid_attr_name(c: u8) -> bool {
    c != b'=' && is_valid_name_char(c)
}
/// Returns `true` when the byte may be part of an unquoted attribute value,
/// i.e. it is neither ASCII whitespace nor `>`.
#[inline]
fn semi_valid_unquoted_attr_value(byte: &u8) -> bool {
    !(byte.is_ascii_whitespace() || *byte == b'>')
}
/// Returns `true` when `c` may be part of a tag or attribute name: it is
/// not ASCII whitespace, `/` or `>`.
#[inline]
fn is_valid_name_char(c: u8) -> bool {
    !(c.is_ascii_whitespace() || matches!(c, b'/' | b'>'))
}
/// Returns the byte length of the tag name at the start of `bytes`, or `0`
/// when the first byte is not an ASCII letter.
///
/// # Panics
///
/// Panics when `bytes` is empty.
fn scan_tag_name_length(mut bytes: Bytes<'_>) -> usize {
    let head = bytes.next();
    debug_assert!(head.is_some());
    match head.unwrap() {
        first if first.is_ascii_alphabetic() => {
            // One for the first letter plus every following name byte.
            1 + bytes.take_while(|&b| is_valid_name_char(b)).count()
        }
        _ => 0,
    }
}
impl<'a, C: ErrorHandler> Iterator for Tokens<'a, C> {
    type Item = Token<'a>;

    /// Produces the next token, or `None` once the source is exhausted.
    /// The position at the start of the token is remembered in `last_pos`.
    fn next(&mut self) -> Option<Self::Item> {
        if self.source.is_empty() {
            return None;
        }
        self.last_pos = self.current_position();
        let token = match self.mode {
            TextMode::Data => self.scan_data(),
            TextMode::RcData => self.scan_rcdata(),
            TextMode::RawText => self.scan_rawtext(),
        };
        Some(token)
    }
}
// `next` returns `None` whenever the source is empty and nothing refills the
// source afterwards, so the iterator keeps returning `None` once finished.
impl<'a, C: ErrorHandler> FusedIterator for Tokens<'a, C> {}
impl<'a, C: ErrorHandler> FlagCDataNs for Tokens<'a, C> {
    /// Records whether the scanner is inside the HTML namespace.
    fn set_is_in_html(&mut self, in_html: bool) {
        self.is_in_html_namespace = in_html;
    }

    /// Returns `true` while the remaining source still contains `<![CDATA[`.
    fn need_flag_hint(&self) -> bool {
        self.source.find("<![CDATA[").is_some()
    }
}
impl<'a, C: ErrorHandler> Locatable for Tokens<'a, C> {
    /// Position of the next unscanned character.
    fn current_position(&self) -> Position {
        self.position.clone()
    }

    /// Position at which the most recently produced token started.
    fn last_position(&self) -> Position {
        debug_assert! {
            self.position.offset == 0 ||
            self.last_pos.offset < self.position.offset
        };
        self.last_pos.clone()
    }

    /// Location spanning from `start` to the current position.
    fn get_location_from(&self, start: Position) -> SourceLocation {
        SourceLocation {
            start,
            end: self.current_position(),
        }
    }
}
/// A fused iterator of [`Token`]s that also exposes position information
/// and accepts the CDATA namespace flag.
pub trait TokenSource<'a>: FusedIterator<Item = Token<'a>> + FlagCDataNs + Locatable {}
// `Tokens` implements every supertrait above, so it is a `TokenSource`.
impl<'a, C> TokenSource<'a> for Tokens<'a, C> where C: ErrorHandler {}
#[cfg(test)]
pub mod test {
    use super::{super::error::test::TestErrorHandler, *};

    /// Scans `s` with the default [`ScanOption`].
    pub fn base_scan(s: &str) -> impl TokenSource {
        scan_with_opt(s, ScanOption::default())
    }

    /// Scans `s` with the given options, reporting errors to `TestErrorHandler`.
    fn scan_with_opt(s: &str, opt: ScanOption) -> impl TokenSource {
        let scanner = Scanner::new(opt);
        scanner.scan(s, TestErrorHandler)
    }

    /// A single `{` is not an interpolation delimiter, so the whole input
    /// must come back as one text token.
    #[test]
    fn test_single_delimiter() {
        let tokens: Vec<_> = base_scan("{ test }").collect();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0],
            Token::Text(VStr {
                raw: "{ test }",
                ..
            })
        ));
    }
}