htmlentity/
entity.rs

1use crate::{
2  data::{ENTITIES, FIRST_LETTER_POSITION, LETTER_ORDERED_ENTITIES},
3  types::{
4    AnyhowResult, Byte, ByteList, BytesCharEntity, CharListResult, CodeRange, CodeRangeTuple,
5    EncodeFilterReturnData, EntityCharBytes, IterDataItem, StringResult,
6  },
7};
8
9use lazy_static::lazy_static;
10use std::{borrow::Cow, char, cmp::Ordering, collections::HashMap, fmt::Display};
11use thiserror::Error;
12
13lazy_static! {
14  // html bytes
15  static ref HTML_BYTES: EntityCharBytes  = {
16    let mut map: EntityCharBytes  = HashMap::with_capacity(3);
17    map.insert('>', b"gt");
18    map.insert('<', b"lt");
19    map.insert('&', b"amp");
20    map
21  };
22  // special bytes
23  static ref SPECIAL_BYTES: EntityCharBytes  = {
24    let mut map: EntityCharBytes  = HashMap::with_capacity(5);
25    map.insert('"', b"quot");
26    map.insert('\'', b"apos");
27    for (k, v) in HTML_BYTES.iter(){
28        map.insert(*k, *v);
29    }
30    map
31  };
32  // normal name entity
33  static ref NORMAL_NAME_ENTITY_BYTE: BytesCharEntity = {
34    let mut map: BytesCharEntity = HashMap::with_capacity(10);
35    map.insert(b"lt", '<');
36    map.insert(b"LT", '<');
37    map.insert(b"gt", '>');
38    map.insert(b"GT", '>');
39    map.insert(b"amp", '&');
40    map.insert(b"AMP", '&');
41    map.insert(b"quot", '"');
42    map.insert(b"QUOT", '"');
43    map.insert(b"apos", '\'');
44    map.insert(b"nbsp", 0xa0 as char);
45    map
46  };
47}
48
/// Errors produced by this module, tagged by the operation that failed.
#[derive(Error, Debug)]
pub enum HtmlEntityError {
  /// A decode operation failed; the payload is the detail message.
  #[error("Decode error: {0}")]
  Decode(String),
  /// An encode operation failed; the payload is the detail message.
  #[error("Encode error: {0}")]
  Encode(String),
}
56
57#[inline]
58fn char_to_utf8_bytes(ch: char) -> ByteList {
59  let len = ch.len_utf8();
60  let mut bytes: ByteList = vec![0; len];
61  ch.encode_utf8(&mut bytes);
62  bytes
63}
64
65#[inline]
66fn tr_chars_to_utf8_bytes(chars: &[char]) -> Option<ByteList> {
67  let mut bytes: ByteList = vec![];
68  for ch in chars {
69    if ch.len_utf8() == 1 {
70      bytes.push(*ch as Byte);
71      continue;
72    }
73    return None;
74  }
75  Some(bytes)
76}
77
78#[inline]
79fn numbers_to_char(bytes: &[Byte], radix: u32) -> AnyhowResult<char> {
80  if !bytes.is_empty() {
81    // '&#;' '&#x;'
82    let num = std::str::from_utf8(bytes)?;
83    let char_code = i64::from_str_radix(num, radix)?;
84    return std::char::from_u32(char_code as u32).ok_or(
85      HtmlEntityError::Decode(format!(
86        "The html entity number '&{}{};' is not a valid encoded character.",
87        if radix == 16 { "#" } else { "" },
88        num
89      ))
90      .into(),
91    );
92  }
93  Err(HtmlEntityError::Decode(String::from("Html entity number cannot be empty.")).into())
94}
95
/// Outcome reported by `loop_utf8_bytes` for each decoding step.
enum Utf8ParsedData {
  /// A character was decoded successfully.
  Correct(char),
  /// The bytes could not be decoded; the payload describes the problem.
  Wrong(&'static str),
}
100
101#[inline]
102fn loop_utf8_bytes(
103  bytes: &[Byte],
104  mut handle: impl FnMut(Utf8ParsedData, CodeRangeTuple) -> AnyhowResult<()>,
105) -> AnyhowResult<()> {
106  let mut next_count = 0;
107  let mut ch: u32 = 0;
108  let mut start_index: usize = 0;
109  for (index, byte) in bytes.iter().enumerate() {
110    match next_count {
111      0 => {
112        start_index = index;
113        if (byte >> 7) == 0 {
114          let _ = handle(Utf8ParsedData::Correct(*byte as char), (start_index, index));
115        } else {
116          let mut head = byte >> 3;
117          if head == 0b11110 {
118            next_count = 3;
119            ch = ((byte & 0b111) as u32) << (next_count * 6);
120          } else {
121            head >>= 1;
122            if head == 0b1110 {
123              next_count = 2;
124              ch = ((byte & 0b1111) as u32) << (next_count * 6);
125            } else {
126              head >>= 1;
127              if head == 0b110 {
128                next_count = 1;
129                ch = ((byte & 0b11111) as u32) << (next_count * 6);
130              } else {
131                // wrong utf8 byte
132                next_count = 0;
133                handle(
134                  Utf8ParsedData::Wrong("Illegal utf8 encoded bytes"),
135                  (start_index, index),
136                )?;
137              }
138            }
139          }
140        }
141      }
142      1..=3 => {
143        if (byte >> 6) == 0b10 {
144          next_count -= 1;
145          ch += ((byte & 0b111111) as u32) << (next_count * 6);
146          if next_count == 0 {
147            if let Some(ch) = char::from_u32(ch) {
148              let _ = handle(Utf8ParsedData::Correct(ch), (start_index, index));
149            } else {
150              handle(
151                Utf8ParsedData::Wrong("Illegal encoding utf8 character."),
152                (start_index, index),
153              )?;
154            }
155          }
156        } else {
157          next_count = 0;
158          // wrong utf8
159          handle(
160            Utf8ParsedData::Wrong("Illegal utf8 encoded bytes."),
161            (start_index, index),
162          )?;
163        }
164      }
165      // unreachable feature
166      _ => unreachable!(),
167    }
168  }
169  Ok(())
170}
171
172#[inline]
173fn bytes_to_chars(bytes: &[Byte], data: &mut Vec<char>) -> AnyhowResult<()> {
174  loop_utf8_bytes(bytes, |result, _| match result {
175    Utf8ParsedData::Correct(ch) => {
176      data.push(ch);
177      Ok(())
178    }
179    Utf8ParsedData::Wrong(message) => Err(HtmlEntityError::Decode(String::from(message)).into()),
180  })
181}
182
183#[inline]
184fn call_into_char_list_trait<T>(
185  bytes: &[Byte],
186  entities: &[(CodeRange, T)],
187  handle: impl Fn(&T, &mut Vec<char>),
188) -> CharListResult {
189  let total = bytes.len();
190  let mut result: Vec<char> = Vec::with_capacity(total / 2);
191  if entities.is_empty() {
192    bytes_to_chars(bytes, &mut result)?;
193    return Ok(result);
194  }
195  let mut index = 0;
196  for (range, item) in entities {
197    let start_index = *range.start();
198    let end_index = *range.end();
199    if index < start_index {
200      bytes_to_chars(&bytes[index..start_index], &mut result)?;
201    }
202    handle(item, &mut result);
203    index = end_index + 1;
204  }
205  if index < total {
206    bytes_to_chars(&bytes[index..], &mut result)?;
207  }
208  Ok(result)
209}
210
211#[inline]
212fn call_into_string_trait<T>(
213  bytes: &[Byte],
214  entities: &[(CodeRange, T)],
215  handle: impl Fn(&T, &mut String),
216) -> StringResult {
217  if entities.is_empty() {
218    let code = std::str::from_utf8(bytes)?;
219    return Ok(String::from(code));
220  }
221  let total = bytes.len();
222  let mut result = String::with_capacity(total);
223  let mut index = 0;
224  for (range, item) in entities {
225    let start_index = *range.start();
226    let end_index = *range.end();
227    if index < start_index {
228      let code = std::str::from_utf8(&bytes[index..start_index])?;
229      result.push_str(code);
230    }
231    handle(item, &mut result);
232    index = end_index + 1;
233  }
234  if index < total {
235    let code = std::str::from_utf8(&bytes[index..])?;
236    result.push_str(code);
237  }
238  Ok(result)
239}
240
241#[inline]
242fn gen_into_iter<'a, T: IBytesTrait>(
243  bytes: &'a [Byte],
244  entities: &'a [(CodeRange, T)],
245) -> DataIter<'a, T> {
246  let total_entities = entities.len();
247  let only_bytes = total_entities == 0;
248  let byte_index_of_next_entity = if only_bytes {
249    None
250  } else {
251    Some(*entities[0].0.start())
252  };
253  DataIter {
254    byte_index: 0,
255    total_bytes: bytes.len(),
256    entity_index: 0,
257    total_entities,
258    only_bytes,
259    byte_index_of_next_entity,
260    byte_index_entity_looped: 0,
261    bytes,
262    entities,
263  }
264}
265
266#[inline]
267fn call_trait_method_bytes_len<T: IBytesTrait>(
268  bytes: &[Byte],
269  entities: &[(CodeRange, T)],
270) -> usize {
271  if entities.is_empty() {
272    return bytes.len();
273  }
274  let mut start_index = 0;
275  let mut len: usize = 0;
276  for (range, entity) in entities {
277    len += range.start() - start_index;
278    len += entity.bytes_len();
279    start_index = *range.end() + 1;
280  }
281  len += bytes.len() - start_index;
282  len
283}
284
285#[inline]
286fn call_trait_method_byte<'a, T: IBytesTrait>(
287  bytes: &'a [Byte],
288  entities: &'a [(CodeRange, T)],
289  mut index: usize,
290) -> Option<&'a Byte> {
291  if entities.is_empty() {
292    return bytes.get(index);
293  }
294  let mut prev_start_byte_index: usize = 0;
295  for (range, entity) in entities {
296    let start_byte_index = *range.start();
297    let cur_index = prev_start_byte_index + index;
298    if cur_index < start_byte_index {
299      // in the bytes between 'start_index' to 'start_byte_index'
300      return bytes.get(cur_index);
301    }
302    let entity_len = entity.bytes_len();
303    let cur_entity_index = cur_index - start_byte_index;
304    if cur_entity_index < entity_len {
305      // in entity
306      return entity.byte(cur_entity_index);
307    }
308    index = cur_entity_index - entity_len;
309    prev_start_byte_index = range.end() + 1;
310  }
311  bytes.get(prev_start_byte_index + index)
312}
313
/// DecodedData, impl the ICodedDataTrait and IBytesTrait and IntoIterator.
#[derive(Debug)]
pub struct DecodedData<'b> {
  // The source bytes, borrowed or owned.
  inner_bytes: Cow<'b, [Byte]>,
  // For each entity: the byte range it covers in `inner_bytes`, paired with
  // a character and a byte list; the conversions in this file emit the
  // character (string / char list) or the byte list (bytes) in its place.
  entities: Vec<(CodeRange, (char, ByteList))>,
  // Errors paired with a byte range; exposed through `get_errors`.
  errors: Vec<(CodeRange, anyhow::Error)>,
}
321
// Uses the default `to_string` / `to_bytes` / `to_chars` of ICodedDataTrait.
impl<'b> ICodedDataTrait for DecodedData<'b> {}
323
324impl<'b> IBytesTrait for DecodedData<'b> {
325  // bytes len
326  fn bytes_len(&self) -> usize {
327    call_trait_method_bytes_len(&self.inner_bytes, &self.entities)
328  }
329  // byte
330  fn byte(&self, index: usize) -> Option<&Byte> {
331    call_trait_method_byte(&self.inner_bytes, &self.entities, index)
332  }
333}
334
335impl<'b> DecodedData<'b> {
336  // detect if has errors
337  pub fn is_ok(&self) -> bool {
338    self.errors.is_empty()
339  }
340  // get errors
341  pub fn get_errors(&self) -> &[(CodeRange, anyhow::Error)] {
342    &self.errors
343  }
344  // entity count
345  pub fn entity_count(&self) -> usize {
346    self.entities.len()
347  }
348  // to owned
349  pub fn to_owned(&mut self) {
350    if !self.entities.is_empty() {
351      let bytes = self.to_bytes();
352      self.inner_bytes = Cow::Owned(bytes);
353      self.entities.clear();
354    }
355  }
356  // into bytes
357  pub fn into_bytes(self) -> ByteList {
358    if self.entities.is_empty() {
359      return self.inner_bytes.into_owned();
360    }
361    self.to_bytes()
362  }
363  // get bytes with cow
364  pub fn bytes(&self) -> Cow<'b, [Byte]> {
365    if self.entities.is_empty() {
366      return self.inner_bytes.clone();
367    }
368    return Cow::Owned(self.to_bytes());
369  }
370}
371
/// Indexed, read-only access to a sequence of bytes.
pub trait IBytesTrait {
  /// Returns the byte at `index`, or `None` when there is no such byte.
  fn byte(&self, index: usize) -> Option<&Byte>;
  /// Returns the total number of bytes.
  fn bytes_len(&self) -> usize;
}
376
377impl IBytesTrait for (char, ByteList) {
378  fn byte(&self, index: usize) -> Option<&Byte> {
379    self.1.get(index)
380  }
381  fn bytes_len(&self) -> usize {
382    self.1.len()
383  }
384}
385
386impl IBytesTrait for CharEntity {
387  fn byte(&self, index: usize) -> Option<&Byte> {
388    let prefix_len = self.prefix_len();
389    if index > prefix_len {
390      // from entity data or
391      let cur_index = index - prefix_len - 1;
392      return match cur_index.cmp(&self.entity_data.len()) {
393        Ordering::Less => self.entity_data.get(cur_index),
394        Ordering::Equal => Some(&b';'),
395        Ordering::Greater => None,
396      };
397    } else if index == 0 {
398      // the first byte
399      return Some(&b'&');
400    } else {
401      // the next prefix bytes
402      match prefix_len {
403        1 => Some(&b'#'),
404        2 => {
405          if index == 1 {
406            return Some(&b'#');
407          }
408          Some(&b'x')
409        }
410        _ => unreachable!(),
411      }
412    }
413  }
414  fn bytes_len(&self) -> usize {
415    let prefix_len = self.prefix_len();
416    // '&;' => 2 '#'|'#x' => prefix_len
417    2 + prefix_len + self.entity_data.len()
418  }
419}
420
/// ICodedDataTrait: conversion shortcuts for coded data.
///
/// Every default method forwards to the matching `Into` conversion that the
/// bound requires on `&Self`.
pub trait ICodedDataTrait
where
  for<'a> &'a Self: Into<StringResult> + Into<ByteList> + Into<CharListResult>,
{
  // to string
  fn to_string(&self) -> StringResult {
    self.into()
  }
  // to bytes
  fn to_bytes(&self) -> ByteList {
    self.into()
  }
  // to char list
  fn to_chars(&self) -> CharListResult {
    self.into()
  }
}
439
/// Iterator over the bytes of coded data: plain bytes are taken from
/// `bytes`, and each entity range is replaced by the bytes of its entity.
pub struct DataIter<'a, T: IBytesTrait> {
  // true when no entity is left to loop (or none exists)
  only_bytes: bool,
  // current position in `bytes`
  byte_index: usize,
  // length of `bytes`
  total_bytes: usize,
  // index of the entity that is looped next
  entity_index: usize,
  // length of `entities`
  total_entities: usize,
  // position inside the bytes of the current entity
  byte_index_entity_looped: usize,
  // start index (in `bytes`) of the entity that is looped next
  byte_index_of_next_entity: Option<usize>,
  bytes: &'a [Byte],
  entities: &'a [(CodeRange, T)],
}
451
452impl<'a, T: IBytesTrait> Iterator for DataIter<'a, T> {
453  type Item = IterDataItem<'a>;
454  fn next(&mut self) -> Option<Self::Item> {
455    let cur_byte_index = self.byte_index;
456    if cur_byte_index < self.total_bytes {
457      if self.only_bytes {
458        // all the entities have been looped
459        // or no entities exist
460        self.byte_index += 1;
461        return Some((&self.bytes[cur_byte_index], None));
462      }
463      let looped_index = self.byte_index_entity_looped;
464      if looped_index == 0 {
465        // if only_bytes = false
466        // the next entity byte index must always has a value
467        let next_index = self.byte_index_of_next_entity.unwrap();
468        // when the byte index equal to next entity start index
469        // should loop the entity instead of the bytes
470        if cur_byte_index != next_index {
471          self.byte_index += 1;
472          return Some((&self.bytes[cur_byte_index], None));
473        }
474        // otherwise should loop the entity bytes
475      }
476      let cur_entity = &self.entities[self.entity_index];
477      let cur_byte = &cur_entity
478        .1
479        .byte(looped_index)
480        .expect("The 'byte' method must use a correct 'index' parameter.");
481      let entity_position = Some((self.entity_index, looped_index));
482      if looped_index == cur_entity.1.bytes_len() - 1 {
483        // end the cur_entity_bytes
484        self.byte_index_entity_looped = 0;
485        self.entity_index += 1;
486        // reset the byte index to the next of entity end index
487        self.byte_index = cur_entity.0.end() + 1;
488        // judge if entities have all looped
489        if self.entity_index < self.total_entities {
490          self.byte_index_of_next_entity = Some(*self.entities[self.entity_index].0.start());
491        } else {
492          // now only bytes left
493          self.only_bytes = true;
494        }
495      } else {
496        self.byte_index_entity_looped += 1;
497      }
498      return Some((cur_byte, entity_position));
499    }
500    None
501  }
502}
503
504impl<'a> IntoIterator for &'a DecodedData<'a> {
505  type Item = IterDataItem<'a>;
506  type IntoIter = DataIter<'a, (char, ByteList)>;
507  fn into_iter(self) -> Self::IntoIter {
508    gen_into_iter(&self.inner_bytes, &self.entities)
509  }
510}
511
512impl<'a> IntoIterator for &'a EncodedData<'a> {
513  type Item = IterDataItem<'a>;
514  type IntoIter = DataIter<'a, CharEntity>;
515  fn into_iter(self) -> Self::IntoIter {
516    gen_into_iter(&self.inner_bytes, &self.entities)
517  }
518}
519/**
520 * impl decode data to string
521 *  
522 *
523 */
524impl<'b> From<&DecodedData<'b>> for StringResult {
525  fn from(value: &DecodedData<'b>) -> Self {
526    call_into_string_trait(&value.inner_bytes, &value.entities, |&(ch, _), result| {
527      result.push(ch)
528    })
529  }
530}
531
532impl<'b> From<DecodedData<'b>> for StringResult {
533  fn from(value: DecodedData<'b>) -> Self {
534    (&value).into()
535  }
536}
537
538/**
539 * impl decode data into vec bytes
540 */
541impl<'b> From<&DecodedData<'b>> for ByteList {
542  fn from(value: &DecodedData<'b>) -> Self {
543    value
544      .into_iter()
545      .map(|(byte, _)| *byte)
546      .collect::<ByteList>()
547  }
548}
549// easy to call `decode(data).into()`
550impl<'b> From<DecodedData<'b>> for ByteList {
551  fn from(value: DecodedData<'b>) -> Self {
552    if value.entity_count() == 0 {
553      return value.inner_bytes.into_owned();
554    }
555    (&value).into()
556  }
557}
558
559/**
560 * impl decoded data into char list
561 */
562impl<'b> From<&DecodedData<'b>> for CharListResult {
563  fn from(value: &DecodedData<'b>) -> Self {
564    call_into_char_list_trait(&value.inner_bytes, &value.entities, |&(ch, _), result| {
565      result.push(ch)
566    })
567  }
568}
569
570impl<'b> From<DecodedData<'b>> for CharListResult {
571  fn from(value: DecodedData<'b>) -> Self {
572    (&value).into()
573  }
574}
/// EncodedData, impl the ICodedDataTrait and IBytesTrait and IntoIterator.
#[derive(Debug)]
pub struct EncodedData<'b> {
  // The source bytes, borrowed or owned.
  inner_bytes: Cow<'b, [Byte]>,
  // For each entity: the byte range it covers in `inner_bytes`, paired with
  // the CharEntity that the conversions in this file emit in its place.
  entities: Vec<(CodeRange, CharEntity)>,
}
581
// Uses the default `to_string` / `to_bytes` / `to_chars` of ICodedDataTrait.
impl<'b> ICodedDataTrait for EncodedData<'b> {}
583
584impl<'b> IBytesTrait for EncodedData<'b> {
585  fn byte(&self, index: usize) -> Option<&Byte> {
586    call_trait_method_byte(&self.inner_bytes, &self.entities, index)
587  }
588  fn bytes_len(&self) -> usize {
589    call_trait_method_bytes_len(&self.inner_bytes, &self.entities)
590  }
591}
592
593impl<'b> EncodedData<'b> {
594  // detect
595  pub fn entity_count(&self) -> usize {
596    self.entities.len()
597  }
598  // to owned
599  pub fn to_owned(&mut self) {
600    if !self.entities.is_empty() {
601      let bytes = self.to_bytes();
602      self.inner_bytes = Cow::Owned(bytes);
603      self.entities.clear();
604    }
605  }
606  // into bytes
607  pub fn into_bytes(self) -> ByteList {
608    if self.entities.is_empty() {
609      return self.inner_bytes.into_owned();
610    }
611    self.to_bytes()
612  }
613  // get bytes with cow
614  pub fn bytes(&self) -> Cow<'b, [Byte]> {
615    if self.entities.is_empty() {
616      return self.inner_bytes.clone();
617    }
618    return Cow::Owned(self.to_bytes());
619  }
620}
621
622impl<'b> From<&EncodedData<'b>> for StringResult {
623  fn from(value: &EncodedData<'b>) -> Self {
624    call_into_string_trait(
625      &value.inner_bytes,
626      &value.entities,
627      |char_entity, result| {
628        char_entity.write_string(result);
629      },
630    )
631  }
632}
633
634impl<'b> From<EncodedData<'b>> for StringResult {
635  fn from(value: EncodedData<'b>) -> Self {
636    (&value).into()
637  }
638}
639
640impl<'b> From<&EncodedData<'b>> for CharListResult {
641  fn from(value: &EncodedData<'b>) -> Self {
642    call_into_char_list_trait(
643      &value.inner_bytes,
644      &value.entities,
645      |char_entity, result| {
646        char_entity.write_chars(result);
647      },
648    )
649  }
650}
651
652impl<'b> From<EncodedData<'b>> for CharListResult {
653  fn from(value: EncodedData<'b>) -> Self {
654    (&value).into()
655  }
656}
657
658impl<'b> From<&EncodedData<'b>> for ByteList {
659  fn from(value: &EncodedData<'b>) -> Self {
660    value
661      .into_iter()
662      .map(|(byte, _)| *byte)
663      .collect::<ByteList>()
664  }
665}
666
667impl<'b> From<EncodedData<'b>> for ByteList {
668  fn from(value: EncodedData<'b>) -> Self {
669    if value.entity_count() == 0 {
670      return value.inner_bytes.into_owned();
671    }
672    (&value).into()
673  }
674}
675
/// EncodeType: html entity encoding format
///
/// The discriminants are bit flags (named = bit 0, hex = bit 1,
/// decimal = bit 2); the `...Or...` variants combine two of them.
#[derive(Copy, Clone, Default)]
#[repr(u8)]
pub enum EncodeType {
  /// Named entity only.
  #[default]
  Named = 0b00001,
  /// Hexadecimal entity.
  Hex = 0b00010,
  /// Decimal entity.
  Decimal = 0b00100,
  /// Both the named and the hex bit are set.
  NamedOrHex = 0b00011,
  /// Both the named and the decimal bit are set.
  NamedOrDecimal = 0b00101,
}
687
688#[inline]
689fn filter_entity_set(
690  charset: &EntityCharBytes,
691  encode_type: &EncodeType,
692  ch: &char,
693) -> EncodeFilterReturnData {
694  let encode_type = *encode_type as u8;
695  if let Some(&v) = charset.get(ch) {
696    if (encode_type & EncodeType::Named as u8) > 0 {
697      return (true, Some((EntityType::Named, Cow::from(v))));
698    }
699    return (true, None);
700  }
701  (false, None)
702}
703
/// The character set that needs to be encoded to html entity.
#[derive(Default)]
pub enum CharacterSet {
  /// all characters
  All = 1,
  /// non ASCII, code point > 0xff (note: despite the name, the code
  /// points 0x80..=0xff are not part of this set)
  NonASCII = 2,
  /// html: '<','>','&'
  #[default]
  Html = 3,
  /// special characters: '<','>','&', '\'', '"'
  SpecialChars = 4,
  /// html and non ascii
  HtmlAndNonASCII = 5,
  /// special characters and non ascii
  SpecialCharsAndNonASCII = 6,
}
721
722impl CharacterSet {
723  /// check if a character need encode by the encode type, and encode it if nessessary.
724  pub fn filter(&self, ch: &char, encode_type: &EncodeType) -> EncodeFilterReturnData {
725    use CharacterSet::*;
726    match self {
727      SpecialChars => filter_entity_set(&SPECIAL_BYTES, encode_type, ch),
728      Html => filter_entity_set(&HTML_BYTES, encode_type, ch),
729      NonASCII => (*ch as u32 > 0xff, None),
730      HtmlAndNonASCII => {
731        let result = CharacterSet::NonASCII.filter(ch, encode_type);
732        if result.0 {
733          return result;
734        }
735        CharacterSet::Html.filter(ch, encode_type)
736      }
737      SpecialCharsAndNonASCII => {
738        let result = CharacterSet::NonASCII.filter(ch, encode_type);
739        if result.0 {
740          return result;
741        }
742        CharacterSet::SpecialChars.filter(ch, encode_type)
743      }
744      All => (true, None),
745    }
746  }
747  /// Check if the character is in the charcter set
748  pub fn contains(&self, ch: &char) -> bool {
749    use CharacterSet::*;
750    match self {
751      SpecialChars => SPECIAL_BYTES.get(ch).is_some(),
752      Html => HTML_BYTES.get(ch).is_some(),
753      NonASCII => *ch as u32 > 0xff,
754      HtmlAndNonASCII => CharacterSet::NonASCII.contains(ch) || CharacterSet::Html.contains(ch),
755      SpecialCharsAndNonASCII => {
756        CharacterSet::NonASCII.contains(ch) || CharacterSet::SpecialChars.contains(ch)
757      }
758      All => true,
759    }
760  }
761}
762
/// The notation of an html entity.
#[derive(PartialEq, Eq, Debug)]
pub enum EntityType {
  /// `&name;`
  Named,
  /// `&#x...;`
  Hex,
  /// `&#...;`
  Decimal,
}
769
/// CharEntity struct: one html entity, stored without the leading '&',
/// the '#' / '#x' prefix and the closing ';'.
#[derive(Debug)]
pub struct CharEntity {
  // the notation, which decides the prefix that gets written
  entity_type: EntityType,
  // the entity name, or the digits of a numeric entity
  entity_data: Cow<'static, [Byte]>,
}
776
777impl CharEntity {
778  // prefix len
779  pub fn prefix_len(&self) -> usize {
780    match &self.entity_type {
781      EntityType::Named => 0,
782      EntityType::Hex => 2,
783      EntityType::Decimal => 1,
784    }
785  }
786  // write bytes
787  pub fn write_bytes(&self, bytes: &mut ByteList) {
788    bytes.push(b'&');
789    match &self.entity_type {
790      EntityType::Named => {
791        // nothing to do
792      }
793      EntityType::Hex => {
794        bytes.push(b'#');
795        bytes.push(b'x');
796      }
797      EntityType::Decimal => {
798        bytes.push(b'#');
799      }
800    }
801    bytes.extend_from_slice(&self.entity_data);
802    bytes.push(b';');
803  }
804  // write chars
805  pub fn write_chars(&self, chars: &mut Vec<char>) {
806    chars.push('&');
807    match &self.entity_type {
808      EntityType::Named => {
809        // nothing to do
810      }
811      EntityType::Hex => {
812        chars.push('#');
813        chars.push('x');
814      }
815      EntityType::Decimal => {
816        chars.push('#');
817      }
818    }
819    for byte in self.entity_data.iter() {
820      chars.push(*byte as char);
821    }
822    chars.push(';');
823  }
824  // write string
825  pub fn write_string(&self, code: &mut String) {
826    code.push('&');
827    match &self.entity_type {
828      EntityType::Named => {
829        // nothing to do
830      }
831      EntityType::Hex => {
832        code.push('#');
833        code.push('x');
834      }
835      EntityType::Decimal => {
836        code.push('#');
837      }
838    }
839    for byte in self.entity_data.iter() {
840      code.push(*byte as char);
841    }
842    code.push(';');
843  }
844  // to bytes
845  pub fn to_bytes(&self) -> ByteList {
846    let mut bytes: ByteList = Vec::with_capacity(self.entity_data.len() + 2);
847    self.write_bytes(&mut bytes);
848    bytes
849  }
850  // get out of entity_data
851  pub fn data(self) -> ByteList {
852    self.entity_data.into_owned()
853  }
854}
855
856impl ToString for CharEntity {
857  fn to_string(&self) -> String {
858    let mut code = String::with_capacity(self.entity_data.len() + 2);
859    self.write_string(&mut code);
860    code
861  }
862}
/// Entity struct: a unit type whose associated functions `decode` and
/// `decode_chars` turn the content of a single html entity into a character.
#[derive(Default)]
pub struct Entity;
866
867impl Entity {
868  /// Decode html entity utf-8 bytes(does't contain the beginning '&' and the end ';') into the character.
869  pub fn decode(bytes: &[Byte]) -> AnyhowResult<char> {
870    let total = bytes.len();
871    if total == 0 {
872      return Err(
873        HtmlEntityError::Decode(String::from(
874          "Can't decode with an empty bytelist argument.",
875        ))
876        .into(),
877      );
878    }
879    // check type
880    let first = bytes[0];
881    let mut entity_type: EntityType = EntityType::Named;
882    if first.is_ascii_alphabetic() {
883      for ch in &bytes[1..] {
884        if !ch.is_ascii_alphanumeric() {
885          let code = std::str::from_utf8(bytes)?;
886          return Err(
887            HtmlEntityError::Decode(format!(
888							"Html entity name can't contain characters other than English letters or numbers, here is '{}'",
889							code
890						))
891            .into(),
892          );
893        }
894      }
895    } else if first == b'#' && total > 1 {
896      let second = bytes[1];
897      match second {
898        b'0'..=b'9' => {
899          // decimal
900          for byte in &bytes[2..] {
901            if !byte.is_ascii_digit() {
902              let code = std::str::from_utf8(bytes)?;
903              return Err(
904                HtmlEntityError::Decode(format!(
905                  "Html entity number can't contain characters other than numbers, here is '{}'.",
906                  code
907                ))
908                .into(),
909              );
910            }
911          }
912          entity_type = EntityType::Decimal;
913        }
914        b'x' | b'X' => {
915          // hex
916          if total > 2 {
917            for byte in &bytes[2..] {
918              if !byte.is_ascii_hexdigit() {
919                let code = std::str::from_utf8(bytes)?;
920                return Err(
921                  HtmlEntityError::Decode(format!(
922										"Hexadecimal html entity can't contain characters other than hexadecimal, here is '&{};'.",
923										code
924									))
925                  .into(),
926                );
927              }
928            }
929            entity_type = EntityType::Hex;
930          } else {
931            return Err(
932              HtmlEntityError::Decode(String::from(
933                "Hexadecimal html entity must contain one or more hexadecimal characters.",
934              ))
935              .into(),
936            );
937          }
938        }
939        _ => {
940          return Err(
941            HtmlEntityError::Decode(String::from("Illegal html entity number character format"))
942              .into(),
943          );
944        }
945      }
946    } else {
947      return Err(
948        HtmlEntityError::Decode(String::from("Illegal html entity character format.")).into(),
949      );
950    }
951    // go on the steps
952    match entity_type {
953      // named entity
954      EntityType::Named => {
955        // normal entity characters
956        if let Some(&ch) = NORMAL_NAME_ENTITY_BYTE.get(bytes) {
957          return Ok(ch);
958        }
959        // try to find the entity
960        if let Some(&(start_index, end_index)) = FIRST_LETTER_POSITION.get(&bytes[0]) {
961          if let Some(find_index) = LETTER_ORDERED_ENTITIES[start_index..end_index]
962            .iter()
963            .position(|&(name, _)| name == bytes)
964          {
965            let last_index = start_index + find_index;
966            let (_, code) = LETTER_ORDERED_ENTITIES[last_index];
967            return Ok(code);
968          }
969        }
970        let code = std::str::from_utf8(bytes)?;
971        Err(
972          HtmlEntityError::Decode(format!(
973            "Unable to find corresponding the html entity name '&{};'",
974            code
975          ))
976          .into(),
977        )
978      }
979      // hex entity
980      EntityType::Hex => {
981        // remove the prefix '#x'
982        numbers_to_char(&bytes[2..], 16)
983      }
984      // decimal entity
985      EntityType::Decimal => {
986        // remove the prefix '#'
987        numbers_to_char(&bytes[1..], 10)
988      }
989    }
990  }
991  /// Similar to the `decode` method, but takes a character type as an argument.
992  pub fn decode_chars(chars: &[char]) -> AnyhowResult<char> {
993    let total = chars.len();
994    if total == 0 {
995      return Err(
996        HtmlEntityError::Decode(String::from(
997          "Can't decode with an empty character list argument.",
998        ))
999        .into(),
1000      );
1001    }
1002    let mut bytes: ByteList = Vec::with_capacity(total);
1003    let max_u8 = u8::MAX as u32;
1004    let is_non_bytes = chars.iter().any(|ch| {
1005      let char_code = *ch as u32;
1006      if char_code > max_u8 {
1007        true
1008      } else {
1009        bytes.push(char_code as Byte);
1010        false
1011      }
1012    });
1013    if !is_non_bytes {
1014      return Entity::decode(&bytes);
1015    }
1016    Err(
1017      HtmlEntityError::Decode(format!(
1018        "Unable to find corresponding the html entity name '&{};'",
1019        chars.iter().collect::<String>()
1020      ))
1021      .into(),
1022    )
1023  }
1024}
1025
1026/// Encode character into html entity.
1027///
1028/// # Examples
1029///
1030/// ```
1031/// use htmlentity::entity::*;
1032///
1033/// let character = '<';
1034/// let char_entity = encode_char(&character, &EncodeType::Named);
1035/// assert!(char_entity.is_some());
1036/// assert_eq!(char_entity.unwrap().to_string(), "&lt;");
1037///
1038/// let character = '<';
1039/// let char_entity = encode_char(&character, &EncodeType::Decimal);
1040/// assert!(char_entity.is_some());
1041/// assert_eq!(char_entity.unwrap().to_string(), "&#60;");
1042///
1043/// let character = '<';
1044/// let char_entity = encode_char(&character, &EncodeType::Hex);
1045/// assert!(char_entity.is_some());
1046/// assert_eq!(char_entity.unwrap().to_string(), "&#x3c;");
1047/// ```
1048pub fn encode_char(ch: &char, encode_type: &EncodeType) -> Option<CharEntity> {
1049  let encode_type = *encode_type as u8;
1050  let char_code = *ch as u32;
1051  // encode to named
1052  if (encode_type & (EncodeType::Named as u8)) > 0 {
1053    // find the named entity from the ENTITIES
1054    if let Ok(mut index) = ENTITIES.binary_search_by_key(&char_code, |&(_, code)| code) {
1055      // make sure the entity is the first one, short and lowercase
1056      while index > 0 {
1057        let prev_index = index - 1;
1058        if ENTITIES[prev_index].1 != char_code {
1059          break;
1060        }
1061        index = prev_index;
1062      }
1063      let &(entity, _) = &ENTITIES[index];
1064      return Some(CharEntity {
1065        entity_type: EntityType::Named,
1066        entity_data: Cow::from(entity),
1067      });
1068    }
1069  }
1070  // encode to hex
1071  if (encode_type & (EncodeType::Hex as u8)) > 0 {
1072    return Some(CharEntity {
1073      entity_type: EntityType::Hex,
1074      entity_data: Cow::Owned(format!("{:x}", char_code).into_bytes()),
1075    });
1076  }
1077  // encode to decimal
1078  if (encode_type & (EncodeType::Decimal as u8)) > 0 {
1079    return Some(CharEntity {
1080      entity_type: EntityType::Decimal,
1081      entity_data: Cow::Owned(char_code.to_string().into_bytes()),
1082    });
1083  }
1084  // no need to encode or failure
1085  None
1086}
1087
1088/// Encode characters in the utf-8 bytes into html entities according to the specified encoding format and specified encoding character set.
1089///
1090/// # Examples
1091///
1092/// ```
1093/// use htmlentity::entity::*;
1094/// use htmlentity::types::AnyhowResult;
1095/// # fn main() -> AnyhowResult<()> {
1096/// let html = "<div class='header'></div>";
1097/// let encoded_data = encode(html.as_bytes(), &EncodeType::Named, &CharacterSet::SpecialChars);
1098/// // Convert encoded data to string.
1099/// let data_to_string = encoded_data.to_string();
1100/// assert!(data_to_string.is_ok());
1101/// assert_eq!(data_to_string?, "&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;");
1102/// // Convert encoded data to Vec<char>.
1103/// let data_to_chars = encoded_data.to_chars();
1104/// assert!(data_to_chars.is_ok());
1105/// assert_eq!(data_to_chars?, String::from("&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;").chars().collect::<Vec<char>>());
1106/// // Convert encoded data to bytes(Vec<u8>).
1107/// let data_to_bytes = encoded_data.to_bytes();
1108/// let bytes = b"&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;";
1109/// assert_eq!(data_to_bytes, bytes);
1110/// // Encoded data can be iterated by byte
1111/// for (idx, (byte, _)) in encoded_data.into_iter().enumerate(){
1112///    assert_eq!(*byte, bytes[idx]);
1113/// }
1114/// // Get the total bytes size through the 'bytes_len' method and visit the byte through the 'byte(idx)' method.
1115/// for idx in 0..encoded_data.bytes_len(){
1116///    assert_eq!(encoded_data.byte(idx), Some(&bytes[idx]));
1117/// }
1118/// # Ok(())
1119/// # }
1120/// ```
1121pub fn encode<'a>(
1122  content: &'a [Byte],
1123  encode_type: &EncodeType,
1124  charset: &CharacterSet,
1125) -> EncodedData<'a> {
1126  encode_with(content, encode_type, |ch, encode_type| {
1127    charset.filter(ch, encode_type)
1128  })
1129}
1130
1131/// Similar to the `encode` method, but directly writes the byte data into the last parameter passed in.
1132///
1133/// # Examples
1134///
1135/// ```
1136/// use htmlentity::entity::*;
1137/// use htmlentity::types::{ ByteList, AnyhowResult };
1138///
1139/// let html = "<div class='header'></div>";
1140/// let mut data: ByteList = vec![];
1141/// encode_to(html.as_bytes(), &EncodeType::Named, &CharacterSet::SpecialChars, &mut data);
1142/// assert_eq!(data, b"&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;");
1143/// ```
1144pub fn encode_to(
1145  content: &[Byte],
1146  encode_type: &EncodeType,
1147  charset: &CharacterSet,
1148  data: &mut ByteList,
1149) {
1150  encode_with_to(
1151    content,
1152    encode_type,
1153    |ch, encode_type| charset.filter(ch, encode_type),
1154    data,
1155  );
1156}
1157
1158/// Encode the html entities in utf-8 bytes into encoded data, and specify the characters to be encoded and the encoding format through the `filter_fn` method parameter.
1159///
1160/// # Examples
1161/// ```
1162/// use htmlentity::entity::*;
1163/// use htmlentity::types::AnyhowResult;
1164/// use std::borrow::Cow;
1165/// # fn main() -> AnyhowResult<()> {
1166/// let html = "<div class='header'></div>";
1167/// let charset = CharacterSet::SpecialChars;
1168/// let encoded_data = encode_with(&html.as_bytes(), &EncodeType::Named, |ch, encode_type|{
1169///    // Use html entity number encoding for single quotes (')
1170///    if ch == &'\''{
1171///       if let Some(char_entity) = encode_char(ch, &EncodeType::Decimal){
1172///         return (true, Some((EntityType::Decimal, Cow::from(char_entity.data()))));
1173///       }
1174///    }
1175///    return charset.filter(ch, encode_type);
1176/// });
1177/// let data_to_string = encoded_data.to_string();
1178/// assert!(data_to_string.is_ok());
1179/// assert_eq!(data_to_string?, String::from("&lt;div class=&#39;header&#39;&gt;&lt;/div&gt;"));
1180/// # Ok(())
1181/// # }
1182/// ```
1183pub fn encode_with<'a>(
1184  content: &'a [Byte],
1185  encode_type: &EncodeType,
1186  filter_fn: impl Fn(&char, &EncodeType) -> EncodeFilterReturnData,
1187) -> EncodedData<'a> {
1188  let mut entities: Vec<(CodeRange, CharEntity)> = vec![];
1189  let _ = loop_utf8_bytes(content, |result, (start_index, index)| match result {
1190    Utf8ParsedData::Correct(ch) => {
1191      let (need_encode, maybe_entity) = filter_fn(&ch, encode_type);
1192      if need_encode {
1193        if let Some((entity_type, entity_data)) = maybe_entity {
1194          entities.push((
1195            start_index..=index,
1196            CharEntity {
1197              entity_type,
1198              entity_data,
1199            },
1200          ));
1201        } else if let Some(entity) = encode_char(&ch, encode_type) {
1202          entities.push((start_index..=index, entity));
1203        }
1204      }
1205      Ok(())
1206    }
1207    _ => Ok(()),
1208  });
1209  EncodedData {
1210    inner_bytes: Cow::from(content),
1211    entities,
1212  }
1213}
1214
1215/// Similar to the `encode_with` method, but directly writes the byte data into the last parameter passed in.
1216///
1217/// # Examples
1218/// ```
1219/// use htmlentity::entity::*;
1220/// use htmlentity::types::{ ByteList, AnyhowResult };
1221/// use std::borrow::Cow;
1222/// # fn main() -> AnyhowResult<()> {
1223/// let html = "<div class='header'></div>";
1224/// let charset = CharacterSet::SpecialChars;
1225/// let mut data: ByteList = vec![];
1226/// let encoded_data = encode_with_to(&html.as_bytes(), &EncodeType::Named, |ch, encode_type|{
1227///    return charset.filter(ch, encode_type);
1228/// }, &mut data);
1229/// assert_eq!(data, b"&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;");
1230/// # Ok(())
1231/// # }
1232/// ```
1233pub fn encode_with_to(
1234  content: &[Byte],
1235  encode_type: &EncodeType,
1236  filter_fn: impl Fn(&char, &EncodeType) -> EncodeFilterReturnData,
1237  data: &mut ByteList,
1238) {
1239  let _ = loop_utf8_bytes(content, |result, (start_index, end_index)| match result {
1240    Utf8ParsedData::Correct(ch) => {
1241      let (need_encode, maybe_entity) = filter_fn(&ch, encode_type);
1242      if need_encode {
1243        if let Some((entity_type, entity_data)) = maybe_entity {
1244          let entity = CharEntity {
1245            entity_type,
1246            entity_data,
1247          };
1248          entity.write_bytes(data);
1249          return Ok(());
1250        } else if let Some(entity) = encode_char(&ch, encode_type) {
1251          entity.write_bytes(data);
1252          return Ok(());
1253        }
1254      }
1255      data.extend_from_slice(&content[start_index..=end_index]);
1256      Ok(())
1257    }
1258    Utf8ParsedData::Wrong(_) => {
1259      data.extend_from_slice(&content[start_index..=end_index]);
1260      Ok(())
1261    }
1262  });
1263}
1264
1265/// Encode a list of characters using a filter function.
1266///
1267/// # Examples
1268///
1269/// ```
1270/// use htmlentity::entity::*;
1271/// use std::borrow::Cow;
1272///
1273/// let chars = String::from("<div class='header'></div>").chars().collect::<Vec<char>>();
1274/// let character_set = CharacterSet::HtmlAndNonASCII;
1275/// let encoded_chars = encode_chars_with(&chars, |ch|{
1276///   if character_set.contains(ch) || *ch == '\''{
1277///      return Some(&EncodeType::Named);
1278///   }
1279///   return None;
1280/// });
1281/// assert_eq!(encoded_chars.iter().collect::<String>(), "&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;");
1282/// ```
1283pub fn encode_chars_with(
1284  chars: &[char],
1285  filter_fn: impl Fn(&char) -> Option<&EncodeType>,
1286) -> Cow<'_, [char]> {
1287  let mut result = vec![];
1288  let mut iter = chars.iter();
1289  for (index, ch) in iter.by_ref().enumerate() {
1290    if let Some(encode_type) = filter_fn(ch) {
1291      if let Some(entity) = encode_char(ch, encode_type) {
1292        if index > 0 {
1293          result.extend_from_slice(&chars[..index]);
1294        }
1295        entity.write_chars(&mut result);
1296        break;
1297      }
1298    }
1299  }
1300  for ch in iter {
1301    if let Some(encode_type) = filter_fn(ch) {
1302      if let Some(entity) = encode_char(ch, encode_type) {
1303        entity.write_chars(&mut result);
1304        continue;
1305      }
1306    }
1307    result.push(*ch);
1308  }
1309  if !result.is_empty() {
1310    return Cow::Owned(result);
1311  }
1312  Cow::Borrowed(chars)
1313}
1314
1315/// Decode the html entities in the character list.
1316///
1317/// # Examples
1318///
1319/// ```
1320/// use htmlentity::entity::*;
1321/// use std::borrow::Cow;
1322///
1323/// let char_list = Cow::from(vec!['a', '<', 'b']);
1324/// assert_eq!(decode_chars(&String::from("a&lt;b").chars().collect::<Vec<char>>()), char_list);
1325/// assert_eq!(decode_chars(&String::from("a&#60;b").chars().collect::<Vec<char>>()), char_list);
1326/// assert_eq!(decode_chars(&String::from("a&#x3c;b").chars().collect::<Vec<char>>()), char_list);
1327/// ```
1328pub fn decode_chars(chars: &[char]) -> Cow<'_, [char]> {
1329  let mut data: Vec<char> = vec![];
1330  let mut is_in_entity = false;
1331  let mut start_index: usize = 0;
1332  for (idx, ch) in chars.iter().enumerate() {
1333    if !is_in_entity {
1334      // not in entity
1335      if *ch == '&' {
1336        is_in_entity = true;
1337        start_index = idx + 1;
1338      }
1339    } else {
1340      // in entity
1341      match *ch {
1342        ';' => {
1343          // end of the entity, ignore '&;'
1344          if start_index != idx {
1345            let bytes = tr_chars_to_utf8_bytes(&chars[start_index..idx]);
1346            if let Some(bytes) = bytes {
1347              if let Ok(decode_char) = Entity::decode(&bytes) {
1348                // find at least one entity
1349                // append the entity's prev chars
1350                if start_index > 1 {
1351                  data.extend_from_slice(&chars[..start_index - 1]);
1352                }
1353                // append entity character
1354                data.push(decode_char);
1355                // append the left character
1356                let next_idx = idx + 1;
1357                if next_idx != chars.len() {
1358                  decode_chars_to(&chars[next_idx..], &mut data);
1359                }
1360                return Cow::Owned(data);
1361              }
1362            }
1363          }
1364          is_in_entity = false;
1365        }
1366        '&' => {
1367          // always reset entity start index
1368          start_index = idx + 1;
1369        }
1370        _ => {}
1371      }
1372    }
1373  }
1374  Cow::from(chars)
1375}
1376
1377/// Similar to the `decode_chars` method, but directly writes the character data into the last parameter passed in.
1378///
1379/// # Examples
1380///
1381/// ```
1382/// use htmlentity::entity::*;
1383/// use std::borrow::Cow;
1384///
1385/// let char_list = vec!['a','<', 'b'];
1386/// let mut data: Vec<char> = vec![];
1387/// decode_chars_to(&String::from("a&lt;b").chars().collect::<Vec<char>>(), &mut data);
1388/// assert_eq!(data, char_list);
1389///
1390/// data.clear();
1391/// decode_chars_to(&String::from("a&#60;b").chars().collect::<Vec<char>>(), &mut data);
1392/// assert_eq!(data, char_list);
1393///
1394/// data.clear();
1395/// decode_chars_to(&String::from("a&#x3c;b").chars().collect::<Vec<char>>(), &mut data);
1396/// assert_eq!(data, char_list);
1397/// ```
1398pub fn decode_chars_to(chars: &[char], data: &mut Vec<char>) {
1399  let mut is_in_entity = false;
1400  let mut start_index: usize = 0;
1401  for (idx, &ch) in chars.iter().enumerate() {
1402    if !is_in_entity {
1403      // not in entity
1404      if ch == '&' {
1405        is_in_entity = true;
1406        start_index = idx + 1;
1407      } else {
1408        data.push(ch);
1409      }
1410    } else {
1411      // in entity
1412      match ch {
1413        ';' => {
1414          // end of the entity, ignore '&;'
1415          if start_index != idx {
1416            let bytes = tr_chars_to_utf8_bytes(&chars[start_index..idx]);
1417            if let Some(bytes) = bytes {
1418              if let Ok(decode_char) = Entity::decode(&bytes) {
1419                // find the
1420                data.push(decode_char);
1421                is_in_entity = false;
1422                continue;
1423              }
1424            }
1425          }
1426          // not a regular entity
1427          data.extend_from_slice(&chars[start_index - 1..=idx]);
1428          is_in_entity = false;
1429        }
1430        '&' => {
1431          // always reset the entity start index, '&a&lt;'
1432          data.extend_from_slice(&chars[start_index - 1..idx]);
1433          start_index = idx + 1;
1434        }
1435        _ => {}
1436      }
1437    }
1438  }
1439  if is_in_entity {
1440    // add the end non regular entity
1441    data.extend_from_slice(&chars[start_index - 1..]);
1442  }
1443}
1444
1445/// Decode html entities in utf-8 bytes.
1446///
1447/// # Examples
1448///
1449/// ```
1450/// use htmlentity::entity::*;
1451/// use htmlentity::types::AnyhowResult;
1452/// # fn main() -> AnyhowResult<()> {
1453/// let html = "<div class='header'></div>";
1454/// let orig_bytes = html.as_bytes();
1455/// let encoded_data = encode(orig_bytes, &EncodeType::Named, &CharacterSet::SpecialChars);
1456/// let encode_bytes = encoded_data.to_bytes();
1457/// assert_eq!(encode_bytes, b"&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;");
1458/// // decode the bytes
1459/// let decoded_data = decode(&encode_bytes);
1460/// let data_to_string = decoded_data.to_string();
1461/// assert!(data_to_string.is_ok());
1462/// assert_eq!(data_to_string?, String::from(html));
1463/// // Convert encoded data to Vec<char>.
1464/// let data_to_chars = decoded_data.to_chars();
1465/// assert!(data_to_chars.is_ok());
1466/// assert_eq!(data_to_chars?, String::from(html).chars().collect::<Vec<char>>());
1467/// // Convert encoded data to bytes(Vec<u8>).
1468/// let data_to_bytes = decoded_data.to_bytes();
1469/// assert_eq!(data_to_bytes, html.as_bytes());
1470/// // Decoded data can be also iterated by byte
1471/// for (idx, (byte, _)) in decoded_data.into_iter().enumerate(){
1472///    assert_eq!(*byte, orig_bytes[idx]);
1473/// }
1474/// // Get the total bytes size through the 'bytes_len' method and visit the byte through the 'byte(idx)' method.
1475/// for idx in 0..decoded_data.bytes_len(){
1476///    assert_eq!(decoded_data.byte(idx), Some(&orig_bytes[idx]));
1477/// }
1478/// # Ok(())
1479/// # }
1480/// ```
1481/// ```
1482pub fn decode(content: &[Byte]) -> DecodedData<'_> {
1483  let mut entities: Vec<(CodeRange, (char, ByteList))> = vec![];
1484  let mut errors: Vec<(CodeRange, anyhow::Error)> = vec![];
1485  let mut is_in_entity = false;
1486  let mut start_index: usize = 0;
1487  for (idx, byte) in content.iter().enumerate() {
1488    if !is_in_entity {
1489      // not in entity
1490      if *byte == b'&' {
1491        is_in_entity = true;
1492        start_index = idx + 1;
1493      }
1494    } else {
1495      // in entity
1496      match *byte {
1497        b';' => {
1498          // end of the entity, ignore '&;'
1499          if start_index != idx {
1500            let decode_result = Entity::decode(&content[start_index..idx]);
1501            match decode_result {
1502              Ok(decode_char) => {
1503                entities.push((
1504                  start_index - 1..=idx,
1505                  (decode_char, char_to_utf8_bytes(decode_char)),
1506                ));
1507              }
1508              Err(err) => {
1509                errors.push((start_index - 1..=idx, err));
1510              }
1511            };
1512          }
1513          is_in_entity = false;
1514        }
1515        b'&' => {
1516          // always reset entity start index
1517          errors.push((
1518            start_index - 1..=start_index - 1,
1519            HtmlEntityError::Decode(String::from("Unencoded html entity characters '&'.")).into(),
1520          ));
1521          start_index = idx + 1;
1522        }
1523        _ => {
1524          // entity bytes
1525        }
1526      }
1527    }
1528  }
1529  // wrong entity at the end
1530  DecodedData {
1531    inner_bytes: Cow::from(content),
1532    entities,
1533    errors,
1534  }
1535}
1536
1537/// Similar to the `decode` method, but directly writes the byte data into the last parameter passed in.
1538///
1539/// # Examples
1540///
1541/// ```
1542/// use htmlentity::entity::*;
1543/// use htmlentity::types::ByteList;
1544/// use std::borrow::Cow;
1545///
1546/// let encoded_bytes = b"&lt;div class=&apos;header&apos;&gt;&lt;/div&gt;";
1547/// let mut data: ByteList = vec![];
1548/// decode_to(encoded_bytes, &mut data);
1549/// assert_eq!(data, b"<div class='header'></div>");
1550/// ```
1551pub fn decode_to(content: &[Byte], data: &mut Vec<Byte>) {
1552  let mut is_in_entity = false;
1553  let mut start_index: usize = 0;
1554  for (idx, byte) in content.iter().enumerate() {
1555    if !is_in_entity {
1556      // not in entity
1557      if *byte == b'&' {
1558        is_in_entity = true;
1559        start_index = idx + 1;
1560      } else {
1561        data.push(*byte);
1562      }
1563    } else {
1564      // in entity
1565      match *byte {
1566        b';' => {
1567          // end of the entity, ignore '&;'
1568          if start_index != idx {
1569            if let Ok(decode_char) = Entity::decode(&content[start_index..idx]) {
1570              data.extend(char_to_utf8_bytes(decode_char));
1571              is_in_entity = false;
1572              continue;
1573            }
1574          }
1575          data.extend_from_slice(&content[start_index - 1..=idx]);
1576          is_in_entity = false;
1577        }
1578        b'&' => {
1579          // always reset entity start index
1580          data.extend_from_slice(&content[start_index - 1..idx]);
1581          start_index = idx + 1;
1582        }
1583        _ => {
1584          // entity bytes
1585        }
1586      }
1587    }
1588  }
1589  if is_in_entity {
1590    // add the end non regular entity
1591    data.extend_from_slice(&content[start_index - 1..]);
1592  }
1593}