use lazy_static::lazy_static;
use std::collections::HashMap;
use crate::err::ProcessingResult;
use crate::proc::checkpoint::Checkpoint;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::proc::entity::maybe_normalise_entity;
use crate::gen::codepoints::{DIGIT, WHITESPACE, ATTR_QUOTE, DOUBLE_QUOTE, SINGLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR};
/// Reports whether `next_char` is flagged by the `DIGIT` lookup or is a literal `;`.
///
/// Callers use the answer to decide whether an encoded replacement has to be
/// followed by an explicit `;` before `next_char` is emitted after it.
fn entity_requires_semicolon(next_char: u8) -> bool {
    if DIGIT[next_char] {
        return true;
    }
    next_char == b';'
}
lazy_static! {
static ref ENCODED: HashMap<u8, &'static [u8]> = {
let mut m = HashMap::<u8, &'static [u8]>::new();
m.insert(b'\'', b"'");
m.insert(b'"', b""");
m.insert(b'>', b">");
m.insert(b'\x09', b"	");
m.insert(b'\x0a', b"
");
m.insert(b'\x0c', b"");
m.insert(b'\x0d', b"
");
m.insert(b'\x20', b" ");
m
};
}
/// Classification of one step of the attribute value scan in `process_attr_value`.
#[derive(Clone, Copy)]
enum CharType {
/// Initial value of the "previous character" tracker, before any character is handled.
Start,
/// The scan reached a character matched by the delimiter lookup; the value is over.
End,
/// Any byte that is neither a quote nor flagged by the `WHITESPACE` lookup.
Normal(u8),
/// A byte flagged by the `WHITESPACE` lookup (or the single space emitted when
/// a whitespace run is collapsed).
Whitespace(u8),
/// The byte `'`.
SingleQuote,
/// The byte `"`.
DoubleQuote,
}
impl CharType {
    /// Classifies a raw byte: the two quote bytes map to their dedicated variants,
    /// bytes flagged by `WHITESPACE` become `Whitespace`, and all others `Normal`.
    /// Never produces `Start` or `End`.
    fn from_char(c: u8) -> CharType {
        if c == b'"' {
            CharType::DoubleQuote
        } else if c == b'\'' {
            CharType::SingleQuote
        } else if WHITESPACE[c] {
            CharType::Whitespace(c)
        } else {
            CharType::Normal(c)
        }
    }

    /// True only for the `Start` marker.
    fn is_start(&self) -> bool {
        matches!(self, CharType::Start)
    }

    /// True only for the `End` marker.
    fn is_end(&self) -> bool {
        matches!(self, CharType::End)
    }
}
/// How an attribute value is delimited in the output.
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum DelimiterType {
/// Wrapped in `"`.
Double,
/// Wrapped in `'`.
Single,
/// No surrounding quotes.
Unquoted,
}
/// Counters gathered while an attribute value is scanned, used afterwards to
/// compute the output length under each delimiter choice.
struct Metrics {
/// Number of `"` bytes written.
count_double_quotation: usize,
/// Sum of the encoded lengths of those `"` bytes, plus one for each that is
/// directly followed by a character requiring a terminating `;`.
total_double_quote_encoded_length: usize,
/// Number of `'` bytes written.
count_single_quotation: usize,
/// Sum of the encoded lengths of those `'` bytes, plus one for each that is
/// directly followed by a character requiring a terminating `;`.
total_single_quote_encoded_length: usize,
/// Number of whitespace bytes written.
count_whitespace: usize,
/// Sum of the encoded lengths of those whitespace bytes, plus one for each that
/// is directly followed by a character requiring a terminating `;`.
total_whitespace_encoded_length: usize,
}
impl Metrics {
    /// Length of `raw_val` if written without quotes: every whitespace byte is
    /// swapped for its encoded form, a leading quote byte is encoded (plus a `;`
    /// when the second byte demands one), and a trailing `>` is encoded.
    fn unquoted_len(&self, raw_val: &[u8]) -> usize {
        let needs_semicolon = match raw_val.get(1) {
            Some(&second) => entity_requires_semicolon(second),
            None => false,
        };
        let leading_cost = match raw_val.first() {
            Some(&quote) if quote == b'"' || quote == b'\'' => {
                ENCODED[&quote].len() + needs_semicolon as usize
            }
            _ => 0,
        };
        let trailing_cost = match raw_val.last() {
            Some(b'>') => ENCODED[&b'>'].len(),
            _ => 0,
        };

        let mut len = raw_val.len() - self.count_whitespace + self.total_whitespace_encoded_length;
        // Each encoded edge byte trades one raw byte for its encoded cost.
        if leading_cost > 0 {
            len = len - 1 + leading_cost;
        }
        if trailing_cost > 0 {
            len = len - 1 + trailing_cost;
        }
        len
    }

    /// Length of a `raw_len`-byte value wrapped in `'`, with every inner `'` encoded.
    fn single_quoted_len(&self, raw_len: usize) -> usize {
        let encoded_body = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
        // Two extra bytes for the opening and closing quote.
        encoded_body + 2
    }

    /// Length of a `raw_len`-byte value wrapped in `"`, with every inner `"` encoded.
    fn double_quoted_len(&self, raw_len: usize) -> usize {
        let encoded_body = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
        // Two extra bytes for the opening and closing quote.
        encoded_body + 2
    }

    /// Picks the delimiter giving the shortest output and returns it with that length.
    ///
    /// Ties are resolved in favour of the earlier candidate in the order
    /// double, single, unquoted.
    fn get_optimal_delimiter_type(&self, raw_val: &[u8]) -> (DelimiterType, usize) {
        let candidates = [
            (DelimiterType::Double, self.double_quoted_len(raw_val.len())),
            (DelimiterType::Single, self.single_quoted_len(raw_val.len())),
            (DelimiterType::Unquoted, self.unquoted_len(raw_val)),
        ];
        let mut best = candidates[0];
        for &candidate in &candidates[1..] {
            // Strict comparison keeps the earlier candidate on equal lengths.
            if candidate.1 < best.1 {
                best = candidate;
            }
        }
        best
    }
}
/// Consumes an attribute value from `proc` without writing anything to the output.
///
/// An optional opening quote is discarded, then everything up to the matching
/// terminator. For a quoted value the closing quote is required.
///
/// # Errors
///
/// Fails when the value was opened with a quote and the same quote does not follow.
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
    let opening_quote = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
    // The lookup of bytes that terminate the value depends on how it was opened.
    let terminator = match opening_quote {
        None => NOT_UNQUOTED_ATTR_VAL_CHAR,
        Some(b'"') => DOUBLE_QUOTE,
        Some(b'\'') => SINGLE_QUOTE,
        Some(_) => unreachable!(),
    };
    proc.m(WhileNotInLookup(terminator), Discard);
    if let Some(quote) = opening_quote {
        proc.m(IsChar(quote), Discard).require("attribute value closing quote")?;
    }
    Ok(())
}
/// Result of `process_attr_value`.
pub struct ProcessedAttrValue {
/// Delimiter style chosen for the written value (`Unquoted` when nothing was written).
pub delimiter: DelimiterType,
/// Range of the output written for the value, or `None` when that range is empty.
pub value: Option<ProcessorRange>,
}
/// Writes the whitespace byte `c` to `proc` and records it in `metrics`: one more
/// whitespace byte, and its `ENCODED` length added to the encoded total.
fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metrics) {
    proc.write(c);
    metrics.count_whitespace += 1;
    let encoded_len = ENCODED[&c].len();
    metrics.total_whitespace_encoded_length += encoded_len;
}
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let start = Checkpoint::new(proc);
let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_lookup = match src_delimiter {
Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => SINGLE_QUOTE,
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
let mut metrics = Metrics {
count_double_quotation: 0,
total_double_quote_encoded_length: 0,
count_single_quotation: 0,
total_single_quote_encoded_length: 0,
count_whitespace: 0,
total_whitespace_encoded_length: 0,
};
let mut currently_in_whitespace = false;
let mut last_char_type: CharType = CharType::Start;
loop {
let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
CharType::from_char(proc.skip()?)
} else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
CharType::End
} else {
CharType::from_char(proc.skip()?)
};
if should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = char_type {
currently_in_whitespace = true;
continue;
};
if currently_in_whitespace && !(last_char_type.is_start() || char_type.is_end()) {
last_char_type = CharType::Whitespace(b' ');
handle_whitespace_char_type(b' ', proc, &mut metrics);
};
currently_in_whitespace = false;
};
match char_type {
CharType::Start => unreachable!(),
CharType::End => {
break;
}
CharType::Whitespace(c) => {
handle_whitespace_char_type(c, proc, &mut metrics);
}
CharType::SingleQuote => {
proc.write(b'\'');
metrics.count_single_quotation += 1;
metrics.total_single_quote_encoded_length += ENCODED[&b'\''].len();
}
CharType::DoubleQuote => {
proc.write(b'\"');
metrics.count_double_quotation += 1;
metrics.total_double_quote_encoded_length += ENCODED[&b'"'].len();
}
CharType::Normal(c) => {
proc.write(c);
if entity_requires_semicolon(c) {
match last_char_type {
CharType::SingleQuote => metrics.total_single_quote_encoded_length += 1,
CharType::DoubleQuote => metrics.total_double_quote_encoded_length += 1,
CharType::Whitespace(_) => metrics.total_whitespace_encoded_length += 1,
_ => {}
};
};
}
};
last_char_type = char_type;
};
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
};
let minimum_value = start.written_range(proc);
if minimum_value.empty() {
return Ok(ProcessedAttrValue {
delimiter: DelimiterType::Unquoted,
value: None,
});
};
let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(&proc[minimum_value]);
let optimal_delimiter_char = match optimal_delimiter {
DelimiterType::Double => Some(b'"'),
DelimiterType::Single => Some(b'\''),
_ => None,
};
proc.reserve_output(optimal_len - minimum_value.len());
let optimal_slice = &mut proc[start.get_written_range_since(optimal_len)];
let mut write = optimal_slice.len() - 1;
if let Some(c) = optimal_delimiter_char {
optimal_slice[write] = c;
write -= 1;
};
for read in (0..minimum_value.len()).rev() {
let is_first = read == 0;
let is_last = read == minimum_value.len() - 1;
let c = optimal_slice[read];
let should_encode = match (c, optimal_delimiter, is_first, is_last) {
(b'>', DelimiterType::Unquoted, _, true) => true,
(c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
(c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
(b'\'', DelimiterType::Single, _, _) => true,
(b'"', DelimiterType::Double, _, _) => true,
_ => false,
};
if should_encode {
let should_add_semicolon = !is_last && entity_requires_semicolon(optimal_slice[write + 1]);
let encoded = ENCODED[&c];
write -= encoded.len() + should_add_semicolon as usize - 1;
optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);
if should_add_semicolon {
optimal_slice[write + encoded.len()] = b';';
};
} else {
optimal_slice[write] = c;
};
if is_first {
break;
};
write -= 1;
};
if let Some(c) = optimal_delimiter_char {
optimal_slice[0] = c;
};
Ok(ProcessedAttrValue {
delimiter: optimal_delimiter,
value: Some(start.written_range(proc)).filter(|r| !r.empty()),
})
}