use crate::data::{EntityItem, ENTITIES};
use lazy_static::lazy_static;
#[cfg(target_arch = "wasm32")]
use num_derive::*;
#[cfg(target_arch = "wasm32")]
use num_traits::FromPrimitive;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
#[cfg(target_arch = "wasm32")]
use wasm_bindgen::prelude::*;
/// Entity table re-ordered by entity name; filled by `sort_entities` and
/// searched by `Entity::decode`.
type SortedEntity = Vec<EntityItem>;
/// Maps the first byte of an entity name to a pair of indices. Once
/// `sort_entities` has finished, the pair is the `(start, end)` half-open range
/// of names beginning with that byte inside the name-sorted table.
type Positions = HashMap<u8, (usize, usize)>;
/// A typed `None` for the `exclude_fn` argument of `encode_char`, used when no
/// character should be excluded from named-entity lookup.
pub const NOOP: Option<&dyn Fn(char) -> bool> = None::<&dyn Fn(char) -> bool>;
lazy_static! {
static ref IS_SORTED: AtomicBool = AtomicBool::new(false);
static ref DECODE_ENTITIES: Mutex<SortedEntity> = Mutex::new(vec![]);
static ref FIRST_POSITION: Mutex<Positions> = Mutex::new(HashMap::new());
static ref SPECIAL_CHARS: HashMap<char, &'static str> = {
let mut map = HashMap::new();
map.insert('>', ">");
map.insert('<', "<");
map.insert('"', """);
map.insert('\'', "'");
map.insert('&', "&");
map
};
}
pub fn encode_char<F>(ch: char, encode_type: EncodeType, exclude_fn: Option<F>) -> String
where
F: Fn(char) -> bool,
{
use EncodeType::*;
let encode_type = encode_type as u8;
let char_code = ch as u32;
let mut result = String::with_capacity(5);
if encode_type & (Named as u8) > 0 {
let mut should_find_name = true;
if let Some(exclude_fn) = exclude_fn {
if exclude_fn(ch) {
should_find_name = false;
}
}
if should_find_name {
let finded = (&ENTITIES[..]).binary_search_by_key(&char_code, |&(_, code)| code);
if let Ok(index) = finded {
let mut first_index = index;
loop {
if first_index > 0 {
let next_index = first_index - 1;
let (_, cur_char_code) = ENTITIES[next_index];
if cur_char_code != char_code {
break;
}
first_index -= 1;
} else {
break;
}
}
let (entity, _) = ENTITIES[first_index];
result.push('&');
result.push_str(entity);
result.push(';');
return result;
}
}
}
if encode_type & (Hex as u8) > 0 {
let hex = format!("&#x{:x};", char_code);
result.push_str(&hex);
return result;
}
if encode_type & (Decimal as u8) > 0 {
let dec = format!("&#{};", char_code);
result.push_str(&dec);
return result;
}
result.push(ch);
result
}
/// Selects which characters `encode` should turn into entities.
#[cfg_attr(
    target_arch = "wasm32",
    wasm_bindgen,
    derive(Clone, Copy, FromPrimitive, PartialEq, PartialOrd)
)]
pub enum EntitySet {
    /// Encode nothing.
    Empty = 0,
    /// Encode every character.
    All = 1,
    /// Encode characters outside the ASCII range.
    NoASCII = 2,
    /// Encode the characters listed in `SPECIAL_CHARS`.
    SpecialChars = 4,
    /// Encode both of the above groups.
    SpecialCharsAndNoASCII = 6,
}

/// The default set covers special characters as well as non-ASCII ones.
impl Default for EntitySet {
    fn default() -> Self {
        Self::SpecialCharsAndNoASCII
    }
}
impl EntitySet {
pub fn filter(&self, ch: &char, encode_type: EncodeType) -> (bool, Option<String>) {
use EntitySet::*;
match self {
SpecialChars => {
let encode_type = encode_type as u8;
if let Some(&v) = SPECIAL_CHARS.get(ch) {
if (encode_type & EncodeType::Named as u8) > 0 {
return (true, Some(v.into()));
}
return (true, None);
}
(false, None)
}
NoASCII => (*ch as u32 > 0x80, None),
SpecialCharsAndNoASCII => {
let (need_encode, result) = EntitySet::NoASCII.filter(ch, encode_type);
if need_encode {
return (need_encode, result);
}
EntitySet::SpecialChars.filter(ch, encode_type)
}
All => (true, None),
Empty => (false, None),
}
}
pub fn contains(&self, ch: &char) -> bool {
let (flag, _) = self.filter(ch, EncodeType::Decimal);
flag
}
}
#[cfg(target_arch = "wasm32")]
impl EntitySet {
    /// Returns the numeric discriminant of this set.
    fn value(&self) -> u8 {
        *self as u8
    }
}
/// Converts a raw discriminant into an `EntitySet`; values that do not match
/// any variant become `EntitySet::Empty`.
#[cfg(target_arch = "wasm32")]
impl From<u8> for EntitySet {
    fn from(orig: u8) -> Self {
        match Self::from_u8(orig) {
            Some(set) => set,
            None => EntitySet::Empty,
        }
    }
}
/// Encodes every character of `content` that belongs to `entity_set`, using the
/// entity form selected by `encode_type`. Characters outside the set are copied
/// through unchanged.
pub fn encode(content: &str, entity_set: EntitySet, encode_type: EncodeType) -> String {
    let mut output = String::with_capacity(content.len() + 5);
    for ch in content.chars() {
        match entity_set.filter(&ch, encode_type) {
            // The set already supplied the finished entity text.
            (true, Some(ready_made)) => output.push_str(&ready_made),
            (true, None) => output.push_str(&encode_char(ch, encode_type, NOOP)),
            (false, _) => output.push(ch),
        }
    }
    output
}
pub fn encode_default(content: &str) -> String {
encode(content, Default::default(), Default::default())
}
/// Encodes the characters of `content` for which `filter_fn` returns `true`.
///
/// `encode_type` selects the entity form. `exclude_fn`, when present, is passed
/// on to `encode_char`, where a `true` result for a character skips the
/// named-entity lookup for it.
pub fn encode_filter<F, C>(
    content: &str,
    filter_fn: F,
    encode_type: EncodeType,
    exclude_fn: Option<C>,
) -> String
where
    F: Fn(char) -> bool,
    C: Fn(char) -> bool,
{
    // Borrow the predicate once; `Option<&C>` can be handed out per character.
    let exclude = exclude_fn.as_ref();
    let mut output = String::with_capacity(content.len() + 5);
    content.chars().for_each(|ch| {
        if !filter_fn(ch) {
            output.push(ch);
            return;
        }
        output.push_str(&encode_char(ch, encode_type, exclude));
    });
    output
}
/// Encodes `content` with a per-character decision: `encoder` returns
/// `Some(encode_type)` to encode the character in that form, or `None` to keep
/// it as it is.
pub fn encode_with<F>(content: &str, encoder: F) -> String
where
    F: Fn(char) -> Option<EncodeType>,
{
    let mut output = String::with_capacity(content.len() + 5);
    for ch in content.chars() {
        match encoder(ch) {
            Some(kind) => output.push_str(&encode_char(ch, kind, NOOP)),
            None => output.push(ch),
        }
    }
    output
}
/// Builds the decode tables: a copy of `ENTITIES` ordered by entity name, and
/// for every first byte of a name the half-open index range that names starting
/// with that byte occupy in the ordered copy. Both are stored in the
/// `DECODE_ENTITIES` / `FIRST_POSITION` statics.
///
/// Panics if an entity in `ENTITIES` has an empty name.
fn sort_entities() {
    let mut by_name: SortedEntity = Vec::with_capacity(ENTITIES.len());
    let mut ranges: Positions = HashMap::new();
    for &item in &ENTITIES[..] {
        let lead = item.0.as_bytes()[0];
        binary_insert(&mut by_name, item);
        // First pass: slot `.0` temporarily counts the names per leading byte.
        ranges.entry(lead).or_insert((0, 0)).0 += 1;
    }

    // Second pass: walk the leading bytes in ascending order and turn each
    // count into a `(start, end)` range of the name-sorted table.
    let mut leads: Vec<u8> = ranges.keys().cloned().collect();
    leads.sort_unstable();
    let mut start: usize = 0;
    for lead in leads {
        if let Some(range) = ranges.get_mut(&lead) {
            let end = start + range.0;
            *range = (start, end);
            start = end;
        }
    }

    // Take the locks in the same order as `Entity::decode` (entities first,
    // then positions). Acquiring them the other way round lets a concurrent
    // decode hold one lock while this function holds the other, and neither
    // thread can make progress.
    let mut entities = DECODE_ENTITIES.lock().unwrap();
    let mut positions = FIRST_POSITION.lock().unwrap();
    *entities = by_name;
    *positions = ranges;
}
/// Inserts `cur` into `sorted` so that the vector stays ordered by entity name.
/// When an entry with the same name is already present, `cur` is placed at the
/// index of the entry the search found.
fn binary_insert(sorted: &mut SortedEntity, cur: EntityItem) {
    let wanted = cur.0;
    // Searching an empty vector yields `Err(0)`, so no special case is needed.
    let slot = match sorted.binary_search_by(|&(name, _)| name.cmp(wanted)) {
        Ok(at) | Err(at) => at,
    };
    sorted.insert(slot, cur);
}
/// Parsing state of an `Entity`: which kind of reference the characters seen
/// after the leading `&` belong to.
#[derive(PartialEq, Eq)]
pub enum EntityIn {
/// Only the `&` has been seen; the kind is not known yet.
/// (The spelling is part of the public name.)
Unkown,
/// A named reference: the character after `&` was an ASCII letter.
Named,
/// A hexadecimal numeric reference: `#` followed by `x` or `X`.
Hex,
/// A decimal numeric reference: `#` followed by a digit.
Decimal,
/// `#` has been seen; the next character decides between `Hex` and `Decimal`.
HexOrDecimal,
}
/// Bit flags selecting the entity form produced by `encode_char`. When several
/// bits are set they are tried in the order named, hexadecimal, decimal.
#[cfg_attr(
    target_arch = "wasm32",
    wasm_bindgen,
    derive(FromPrimitive, PartialEq, PartialOrd)
)]
#[derive(Copy, Clone)]
pub enum EncodeType {
    /// No bit set: the character is left as it is.
    Ignore = 0,
    /// Named entity.
    Named = 0b00001,
    /// Hexadecimal numeric reference.
    Hex = 0b00010,
    /// Decimal numeric reference.
    Decimal = 0b00100,
    /// Named entity, falling back to hexadecimal.
    NamedOrHex = 0b00011,
    /// Named entity, falling back to decimal.
    NamedOrDecimal = 0b00101,
}

/// By default a named entity is preferred, with a decimal reference as fallback.
impl Default for EncodeType {
    fn default() -> Self {
        Self::NamedOrDecimal
    }
}
#[cfg(target_arch = "wasm32")]
impl EncodeType {
    /// Returns the flag bits of this encode type.
    fn value(&self) -> u8 {
        *self as u8
    }
}
/// Converts raw flag bits into an `EncodeType`; values that do not match any
/// variant become `EncodeType::Ignore`.
#[cfg(target_arch = "wasm32")]
impl From<u8> for EncodeType {
    fn from(orig: u8) -> Self {
        match Self::from_u8(orig) {
            Some(kind) => kind,
            None => EncodeType::Ignore,
        }
    }
}
pub fn decode_chars(chars: Vec<char>) -> Vec<char> {
let mut result: Vec<char> = Vec::with_capacity(chars.len());
let mut entity: Entity = Entity::new();
let mut is_in_entity: bool = false;
for ch in chars {
if !is_in_entity {
if entity.add(ch) {
is_in_entity = true;
} else {
result.push(ch);
}
} else {
let is_wrong_entity = !entity.add(ch);
if is_wrong_entity || entity.is_end {
result.extend(entity.get_chars());
if is_wrong_entity {
result.push(ch);
}
is_in_entity = false;
entity = Entity::new();
}
}
}
if is_in_entity {
result.extend(entity.get_chars());
}
result
}
/// Decodes the HTML entities in `content` and returns the resulting text.
pub fn decode(content: &str) -> String {
    let decoded = decode_chars(content.chars().collect());
    decoded.into_iter().collect()
}
/// Incremental parser for a single entity. Characters are fed in one at a time
/// with `add`; the result is read with `decode` or `get_chars`.
#[derive(Default)]
pub struct Entity {
/// Current parsing state; `None` until the leading `&` has been seen.
pub entity_in: Option<EntityIn>,
/// Accepted characters between the leading `&` and the closing `;`
/// (neither of the two delimiters is stored).
pub characters: Vec<char>,
/// Set once the closing `;` has been consumed.
pub is_end: bool,
}
impl Entity {
    /// Creates a parser that has not consumed any character yet.
    pub fn new() -> Self {
        Entity::default()
    }

    /// Feeds one character to the parser.
    ///
    /// Returns `true` when the character was consumed as part of the entity
    /// (the leading `&`, a body character, or the closing `;`) and `false`
    /// when it cannot belong to it. After the closing `;` every further
    /// character is refused.
    pub fn add(&mut self, ch: char) -> bool {
        use EntityIn::*;
        if self.is_end {
            return false;
        }
        let state = match &self.entity_in {
            Some(state) => state,
            None => {
                // Nothing seen yet: only `&` can open an entity.
                if ch == '&' {
                    self.entity_in = Some(Unkown);
                    return true;
                }
                return false;
            }
        };
        if ch == ';' {
            self.is_end = true;
            return true;
        }
        // `accepted` tells whether `ch` is valid in the current state;
        // `next_state` is set when the character also settles the entity kind.
        let (accepted, next_state) = match state {
            Unkown if ch.is_ascii_alphabetic() => (true, Some(Named)),
            Unkown if ch == '#' => (true, Some(HexOrDecimal)),
            HexOrDecimal if ch.is_ascii_digit() => (true, Some(Decimal)),
            HexOrDecimal if ch == 'x' || ch == 'X' => (true, Some(Hex)),
            Unkown | HexOrDecimal => (false, None),
            // A name starts with a letter but may continue with digits
            // (`frac12`, `sup2`, `there4`), so digits must not end it.
            Named => (ch.is_ascii_alphanumeric(), None),
            Decimal => (ch.is_ascii_digit(), None),
            Hex => (ch.is_ascii_hexdigit(), None),
        };
        if let Some(next) = next_state {
            self.entity_in = Some(next);
        }
        if accepted {
            self.characters.push(ch);
        }
        accepted
    }

    /// Returns the character the entity stands for, or `None` when the entity
    /// is not terminated by `;`, its name is unknown, or its number is not a
    /// valid Unicode scalar value.
    ///
    /// The first named lookup builds the name-sorted tables via
    /// `sort_entities`. Panics if one of the table mutexes is poisoned.
    pub fn decode(&self) -> Option<char> {
        use EntityIn::*;
        if !self.is_end {
            return None;
        }
        let body = &self.characters;
        match self.entity_in.as_ref()? {
            Named => {
                let first = *body.first()? as u32 as u8;
                if !IS_SORTED.load(Ordering::SeqCst) {
                    sort_entities();
                    IS_SORTED.store(true, Ordering::SeqCst);
                }
                // Lock order: `DECODE_ENTITIES` before `FIRST_POSITION`.
                let sorted = DECODE_ENTITIES
                    .lock()
                    .expect("DECODE_ENTITIES mutex poisoned");
                let firsts = FIRST_POSITION
                    .lock()
                    .expect("FIRST_POSITION mutex poisoned");
                // Only the slice of names sharing the first byte is searched.
                let &(start, end) = firsts.get(&first)?;
                let wanted: String = body.iter().collect();
                let hit = sorted
                    .get(start..end)?
                    .binary_search_by(|&(name, _)| name.cmp(wanted.as_str()))
                    .ok()?;
                let (_, code) = sorted[start + hit];
                std::char::from_u32(code)
            }
            // Skip the `#x` / `#X` prefix.
            Hex => Self::parse_code_point(body.get(2..)?, 16),
            // Skip the `#` prefix.
            Decimal => Self::parse_code_point(body.get(1..)?, 10),
            // `&;` and `&#;` carry nothing to decode.
            Unkown | HexOrDecimal => None,
        }
    }

    /// Parses `digits` in the given radix and converts the value to a `char`.
    /// Empty input, overflow, and values that are not Unicode scalar values
    /// (above U+10FFFF or surrogates) all yield `None`.
    fn parse_code_point(digits: &[char], radix: u32) -> Option<char> {
        if digits.is_empty() {
            return None;
        }
        let text: String = digits.iter().collect();
        u32::from_str_radix(&text, radix)
            .ok()
            .and_then(std::char::from_u32)
    }

    /// Returns the output for this entity: the single decoded character when
    /// `decode` succeeds, otherwise the original text (`&`, the collected
    /// characters, and `;` if it was seen).
    pub fn get_chars(&self) -> Vec<char> {
        if let Some(ch) = self.decode() {
            return vec![ch];
        }
        let is_end = self.is_end;
        let mut result = Vec::with_capacity(self.characters.len() + 1 + is_end as usize);
        result.push('&');
        result.extend(&self.characters);
        if is_end {
            result.push(';');
        }
        result
    }
}