1use crate::{
2 data::{ENTITIES, FIRST_LETTER_POSITION, LETTER_ORDERED_ENTITIES},
3 types::{
4 AnyhowResult, Byte, ByteList, BytesCharEntity, CharListResult, CodeRange, CodeRangeTuple,
5 EncodeFilterReturnData, EntityCharBytes, IterDataItem, StringResult,
6 },
7};
8
9use lazy_static::lazy_static;
10use std::{borrow::Cow, char, cmp::Ordering, collections::HashMap, fmt::Display};
11use thiserror::Error;
12
13lazy_static! {
14 static ref HTML_BYTES: EntityCharBytes = {
16 let mut map: EntityCharBytes = HashMap::with_capacity(3);
17 map.insert('>', b"gt");
18 map.insert('<', b"lt");
19 map.insert('&', b"amp");
20 map
21 };
22 static ref SPECIAL_BYTES: EntityCharBytes = {
24 let mut map: EntityCharBytes = HashMap::with_capacity(5);
25 map.insert('"', b"quot");
26 map.insert('\'', b"apos");
27 for (k, v) in HTML_BYTES.iter(){
28 map.insert(*k, *v);
29 }
30 map
31 };
32 static ref NORMAL_NAME_ENTITY_BYTE: BytesCharEntity = {
34 let mut map: BytesCharEntity = HashMap::with_capacity(10);
35 map.insert(b"lt", '<');
36 map.insert(b"LT", '<');
37 map.insert(b"gt", '>');
38 map.insert(b"GT", '>');
39 map.insert(b"amp", '&');
40 map.insert(b"AMP", '&');
41 map.insert(b"quot", '"');
42 map.insert(b"QUOT", '"');
43 map.insert(b"apos", '\'');
44 map.insert(b"nbsp", 0xa0 as char);
45 map
46 };
47}
48
49#[derive(Error, Debug)]
50pub enum HtmlEntityError {
51 #[error("Decode error: {0}")]
52 Decode(String),
53 #[error("Encode error: {0}")]
54 Encode(String),
55}
56
57#[inline]
58fn char_to_utf8_bytes(ch: char) -> ByteList {
59 let len = ch.len_utf8();
60 let mut bytes: ByteList = vec![0; len];
61 ch.encode_utf8(&mut bytes);
62 bytes
63}
64
65#[inline]
66fn tr_chars_to_utf8_bytes(chars: &[char]) -> Option<ByteList> {
67 let mut bytes: ByteList = vec![];
68 for ch in chars {
69 if ch.len_utf8() == 1 {
70 bytes.push(*ch as Byte);
71 continue;
72 }
73 return None;
74 }
75 Some(bytes)
76}
77
78#[inline]
79fn numbers_to_char(bytes: &[Byte], radix: u32) -> AnyhowResult<char> {
80 if !bytes.is_empty() {
81 let num = std::str::from_utf8(bytes)?;
83 let char_code = i64::from_str_radix(num, radix)?;
84 return std::char::from_u32(char_code as u32).ok_or(
85 HtmlEntityError::Decode(format!(
86 "The html entity number '&{}{};' is not a valid encoded character.",
87 if radix == 16 { "#" } else { "" },
88 num
89 ))
90 .into(),
91 );
92 }
93 Err(HtmlEntityError::Decode(String::from("Html entity number cannot be empty.")).into())
94}
95
/// One item reported by `loop_utf8_bytes` to its callback.
enum Utf8ParsedData {
    /// A character that was decoded successfully.
    Correct(char),
    /// A byte run that could not be decoded, with a fixed description of the problem.
    Wrong(&'static str),
}
100
101#[inline]
102fn loop_utf8_bytes(
103 bytes: &[Byte],
104 mut handle: impl FnMut(Utf8ParsedData, CodeRangeTuple) -> AnyhowResult<()>,
105) -> AnyhowResult<()> {
106 let mut next_count = 0;
107 let mut ch: u32 = 0;
108 let mut start_index: usize = 0;
109 for (index, byte) in bytes.iter().enumerate() {
110 match next_count {
111 0 => {
112 start_index = index;
113 if (byte >> 7) == 0 {
114 let _ = handle(Utf8ParsedData::Correct(*byte as char), (start_index, index));
115 } else {
116 let mut head = byte >> 3;
117 if head == 0b11110 {
118 next_count = 3;
119 ch = ((byte & 0b111) as u32) << (next_count * 6);
120 } else {
121 head >>= 1;
122 if head == 0b1110 {
123 next_count = 2;
124 ch = ((byte & 0b1111) as u32) << (next_count * 6);
125 } else {
126 head >>= 1;
127 if head == 0b110 {
128 next_count = 1;
129 ch = ((byte & 0b11111) as u32) << (next_count * 6);
130 } else {
131 next_count = 0;
133 handle(
134 Utf8ParsedData::Wrong("Illegal utf8 encoded bytes"),
135 (start_index, index),
136 )?;
137 }
138 }
139 }
140 }
141 }
142 1..=3 => {
143 if (byte >> 6) == 0b10 {
144 next_count -= 1;
145 ch += ((byte & 0b111111) as u32) << (next_count * 6);
146 if next_count == 0 {
147 if let Some(ch) = char::from_u32(ch) {
148 let _ = handle(Utf8ParsedData::Correct(ch), (start_index, index));
149 } else {
150 handle(
151 Utf8ParsedData::Wrong("Illegal encoding utf8 character."),
152 (start_index, index),
153 )?;
154 }
155 }
156 } else {
157 next_count = 0;
158 handle(
160 Utf8ParsedData::Wrong("Illegal utf8 encoded bytes."),
161 (start_index, index),
162 )?;
163 }
164 }
165 _ => unreachable!(),
167 }
168 }
169 Ok(())
170}
171
172#[inline]
173fn bytes_to_chars(bytes: &[Byte], data: &mut Vec<char>) -> AnyhowResult<()> {
174 loop_utf8_bytes(bytes, |result, _| match result {
175 Utf8ParsedData::Correct(ch) => {
176 data.push(ch);
177 Ok(())
178 }
179 Utf8ParsedData::Wrong(message) => Err(HtmlEntityError::Decode(String::from(message)).into()),
180 })
181}
182
183#[inline]
184fn call_into_char_list_trait<T>(
185 bytes: &[Byte],
186 entities: &[(CodeRange, T)],
187 handle: impl Fn(&T, &mut Vec<char>),
188) -> CharListResult {
189 let total = bytes.len();
190 let mut result: Vec<char> = Vec::with_capacity(total / 2);
191 if entities.is_empty() {
192 bytes_to_chars(bytes, &mut result)?;
193 return Ok(result);
194 }
195 let mut index = 0;
196 for (range, item) in entities {
197 let start_index = *range.start();
198 let end_index = *range.end();
199 if index < start_index {
200 bytes_to_chars(&bytes[index..start_index], &mut result)?;
201 }
202 handle(item, &mut result);
203 index = end_index + 1;
204 }
205 if index < total {
206 bytes_to_chars(&bytes[index..], &mut result)?;
207 }
208 Ok(result)
209}
210
211#[inline]
212fn call_into_string_trait<T>(
213 bytes: &[Byte],
214 entities: &[(CodeRange, T)],
215 handle: impl Fn(&T, &mut String),
216) -> StringResult {
217 if entities.is_empty() {
218 let code = std::str::from_utf8(bytes)?;
219 return Ok(String::from(code));
220 }
221 let total = bytes.len();
222 let mut result = String::with_capacity(total);
223 let mut index = 0;
224 for (range, item) in entities {
225 let start_index = *range.start();
226 let end_index = *range.end();
227 if index < start_index {
228 let code = std::str::from_utf8(&bytes[index..start_index])?;
229 result.push_str(code);
230 }
231 handle(item, &mut result);
232 index = end_index + 1;
233 }
234 if index < total {
235 let code = std::str::from_utf8(&bytes[index..])?;
236 result.push_str(code);
237 }
238 Ok(result)
239}
240
241#[inline]
242fn gen_into_iter<'a, T: IBytesTrait>(
243 bytes: &'a [Byte],
244 entities: &'a [(CodeRange, T)],
245) -> DataIter<'a, T> {
246 let total_entities = entities.len();
247 let only_bytes = total_entities == 0;
248 let byte_index_of_next_entity = if only_bytes {
249 None
250 } else {
251 Some(*entities[0].0.start())
252 };
253 DataIter {
254 byte_index: 0,
255 total_bytes: bytes.len(),
256 entity_index: 0,
257 total_entities,
258 only_bytes,
259 byte_index_of_next_entity,
260 byte_index_entity_looped: 0,
261 bytes,
262 entities,
263 }
264}
265
266#[inline]
267fn call_trait_method_bytes_len<T: IBytesTrait>(
268 bytes: &[Byte],
269 entities: &[(CodeRange, T)],
270) -> usize {
271 if entities.is_empty() {
272 return bytes.len();
273 }
274 let mut start_index = 0;
275 let mut len: usize = 0;
276 for (range, entity) in entities {
277 len += range.start() - start_index;
278 len += entity.bytes_len();
279 start_index = *range.end() + 1;
280 }
281 len += bytes.len() - start_index;
282 len
283}
284
285#[inline]
286fn call_trait_method_byte<'a, T: IBytesTrait>(
287 bytes: &'a [Byte],
288 entities: &'a [(CodeRange, T)],
289 mut index: usize,
290) -> Option<&'a Byte> {
291 if entities.is_empty() {
292 return bytes.get(index);
293 }
294 let mut prev_start_byte_index: usize = 0;
295 for (range, entity) in entities {
296 let start_byte_index = *range.start();
297 let cur_index = prev_start_byte_index + index;
298 if cur_index < start_byte_index {
299 return bytes.get(cur_index);
301 }
302 let entity_len = entity.bytes_len();
303 let cur_entity_index = cur_index - start_byte_index;
304 if cur_entity_index < entity_len {
305 return entity.byte(cur_entity_index);
307 }
308 index = cur_entity_index - entity_len;
309 prev_start_byte_index = range.end() + 1;
310 }
311 bytes.get(prev_start_byte_index + index)
312}
313
314#[derive(Debug)]
316pub struct DecodedData<'b> {
317 inner_bytes: Cow<'b, [Byte]>,
318 entities: Vec<(CodeRange, (char, ByteList))>,
319 errors: Vec<(CodeRange, anyhow::Error)>,
320}
321
322impl<'b> ICodedDataTrait for DecodedData<'b> {}
323
324impl<'b> IBytesTrait for DecodedData<'b> {
325 fn bytes_len(&self) -> usize {
327 call_trait_method_bytes_len(&self.inner_bytes, &self.entities)
328 }
329 fn byte(&self, index: usize) -> Option<&Byte> {
331 call_trait_method_byte(&self.inner_bytes, &self.entities, index)
332 }
333}
334
335impl<'b> DecodedData<'b> {
336 pub fn is_ok(&self) -> bool {
338 self.errors.is_empty()
339 }
340 pub fn get_errors(&self) -> &[(CodeRange, anyhow::Error)] {
342 &self.errors
343 }
344 pub fn entity_count(&self) -> usize {
346 self.entities.len()
347 }
348 pub fn to_owned(&mut self) {
350 if !self.entities.is_empty() {
351 let bytes = self.to_bytes();
352 self.inner_bytes = Cow::Owned(bytes);
353 self.entities.clear();
354 }
355 }
356 pub fn into_bytes(self) -> ByteList {
358 if self.entities.is_empty() {
359 return self.inner_bytes.into_owned();
360 }
361 self.to_bytes()
362 }
363 pub fn bytes(&self) -> Cow<'b, [Byte]> {
365 if self.entities.is_empty() {
366 return self.inner_bytes.clone();
367 }
368 return Cow::Owned(self.to_bytes());
369 }
370}
371
372pub trait IBytesTrait {
373 fn byte(&self, index: usize) -> Option<&Byte>;
374 fn bytes_len(&self) -> usize;
375}
376
377impl IBytesTrait for (char, ByteList) {
378 fn byte(&self, index: usize) -> Option<&Byte> {
379 self.1.get(index)
380 }
381 fn bytes_len(&self) -> usize {
382 self.1.len()
383 }
384}
385
386impl IBytesTrait for CharEntity {
387 fn byte(&self, index: usize) -> Option<&Byte> {
388 let prefix_len = self.prefix_len();
389 if index > prefix_len {
390 let cur_index = index - prefix_len - 1;
392 return match cur_index.cmp(&self.entity_data.len()) {
393 Ordering::Less => self.entity_data.get(cur_index),
394 Ordering::Equal => Some(&b';'),
395 Ordering::Greater => None,
396 };
397 } else if index == 0 {
398 return Some(&b'&');
400 } else {
401 match prefix_len {
403 1 => Some(&b'#'),
404 2 => {
405 if index == 1 {
406 return Some(&b'#');
407 }
408 Some(&b'x')
409 }
410 _ => unreachable!(),
411 }
412 }
413 }
414 fn bytes_len(&self) -> usize {
415 let prefix_len = self.prefix_len();
416 2 + prefix_len + self.entity_data.len()
418 }
419}
420
421pub trait ICodedDataTrait
423where
424 for<'a> &'a Self: Into<StringResult> + Into<ByteList> + Into<CharListResult>,
425{
426 fn to_string(&self) -> StringResult {
428 self.into()
429 }
430 fn to_bytes(&self) -> ByteList {
432 self.into()
433 }
434 fn to_chars(&self) -> CharListResult {
436 self.into()
437 }
438}
439
440pub struct DataIter<'a, T: IBytesTrait> {
441 only_bytes: bool,
442 byte_index: usize,
443 total_bytes: usize,
444 entity_index: usize,
445 total_entities: usize,
446 byte_index_entity_looped: usize,
447 byte_index_of_next_entity: Option<usize>,
448 bytes: &'a [Byte],
449 entities: &'a [(CodeRange, T)],
450}
451
452impl<'a, T: IBytesTrait> Iterator for DataIter<'a, T> {
453 type Item = IterDataItem<'a>;
454 fn next(&mut self) -> Option<Self::Item> {
455 let cur_byte_index = self.byte_index;
456 if cur_byte_index < self.total_bytes {
457 if self.only_bytes {
458 self.byte_index += 1;
461 return Some((&self.bytes[cur_byte_index], None));
462 }
463 let looped_index = self.byte_index_entity_looped;
464 if looped_index == 0 {
465 let next_index = self.byte_index_of_next_entity.unwrap();
468 if cur_byte_index != next_index {
471 self.byte_index += 1;
472 return Some((&self.bytes[cur_byte_index], None));
473 }
474 }
476 let cur_entity = &self.entities[self.entity_index];
477 let cur_byte = &cur_entity
478 .1
479 .byte(looped_index)
480 .expect("The 'byte' method must use a correct 'index' parameter.");
481 let entity_position = Some((self.entity_index, looped_index));
482 if looped_index == cur_entity.1.bytes_len() - 1 {
483 self.byte_index_entity_looped = 0;
485 self.entity_index += 1;
486 self.byte_index = cur_entity.0.end() + 1;
488 if self.entity_index < self.total_entities {
490 self.byte_index_of_next_entity = Some(*self.entities[self.entity_index].0.start());
491 } else {
492 self.only_bytes = true;
494 }
495 } else {
496 self.byte_index_entity_looped += 1;
497 }
498 return Some((cur_byte, entity_position));
499 }
500 None
501 }
502}
503
504impl<'a> IntoIterator for &'a DecodedData<'a> {
505 type Item = IterDataItem<'a>;
506 type IntoIter = DataIter<'a, (char, ByteList)>;
507 fn into_iter(self) -> Self::IntoIter {
508 gen_into_iter(&self.inner_bytes, &self.entities)
509 }
510}
511
512impl<'a> IntoIterator for &'a EncodedData<'a> {
513 type Item = IterDataItem<'a>;
514 type IntoIter = DataIter<'a, CharEntity>;
515 fn into_iter(self) -> Self::IntoIter {
516 gen_into_iter(&self.inner_bytes, &self.entities)
517 }
518}
519impl<'b> From<&DecodedData<'b>> for StringResult {
525 fn from(value: &DecodedData<'b>) -> Self {
526 call_into_string_trait(&value.inner_bytes, &value.entities, |&(ch, _), result| {
527 result.push(ch)
528 })
529 }
530}
531
532impl<'b> From<DecodedData<'b>> for StringResult {
533 fn from(value: DecodedData<'b>) -> Self {
534 (&value).into()
535 }
536}
537
538impl<'b> From<&DecodedData<'b>> for ByteList {
542 fn from(value: &DecodedData<'b>) -> Self {
543 value
544 .into_iter()
545 .map(|(byte, _)| *byte)
546 .collect::<ByteList>()
547 }
548}
549impl<'b> From<DecodedData<'b>> for ByteList {
551 fn from(value: DecodedData<'b>) -> Self {
552 if value.entity_count() == 0 {
553 return value.inner_bytes.into_owned();
554 }
555 (&value).into()
556 }
557}
558
559impl<'b> From<&DecodedData<'b>> for CharListResult {
563 fn from(value: &DecodedData<'b>) -> Self {
564 call_into_char_list_trait(&value.inner_bytes, &value.entities, |&(ch, _), result| {
565 result.push(ch)
566 })
567 }
568}
569
570impl<'b> From<DecodedData<'b>> for CharListResult {
571 fn from(value: DecodedData<'b>) -> Self {
572 (&value).into()
573 }
574}
575#[derive(Debug)]
577pub struct EncodedData<'b> {
578 inner_bytes: Cow<'b, [Byte]>,
579 entities: Vec<(CodeRange, CharEntity)>,
580}
581
582impl<'b> ICodedDataTrait for EncodedData<'b> {}
583
584impl<'b> IBytesTrait for EncodedData<'b> {
585 fn byte(&self, index: usize) -> Option<&Byte> {
586 call_trait_method_byte(&self.inner_bytes, &self.entities, index)
587 }
588 fn bytes_len(&self) -> usize {
589 call_trait_method_bytes_len(&self.inner_bytes, &self.entities)
590 }
591}
592
593impl<'b> EncodedData<'b> {
594 pub fn entity_count(&self) -> usize {
596 self.entities.len()
597 }
598 pub fn to_owned(&mut self) {
600 if !self.entities.is_empty() {
601 let bytes = self.to_bytes();
602 self.inner_bytes = Cow::Owned(bytes);
603 self.entities.clear();
604 }
605 }
606 pub fn into_bytes(self) -> ByteList {
608 if self.entities.is_empty() {
609 return self.inner_bytes.into_owned();
610 }
611 self.to_bytes()
612 }
613 pub fn bytes(&self) -> Cow<'b, [Byte]> {
615 if self.entities.is_empty() {
616 return self.inner_bytes.clone();
617 }
618 return Cow::Owned(self.to_bytes());
619 }
620}
621
622impl<'b> From<&EncodedData<'b>> for StringResult {
623 fn from(value: &EncodedData<'b>) -> Self {
624 call_into_string_trait(
625 &value.inner_bytes,
626 &value.entities,
627 |char_entity, result| {
628 char_entity.write_string(result);
629 },
630 )
631 }
632}
633
634impl<'b> From<EncodedData<'b>> for StringResult {
635 fn from(value: EncodedData<'b>) -> Self {
636 (&value).into()
637 }
638}
639
640impl<'b> From<&EncodedData<'b>> for CharListResult {
641 fn from(value: &EncodedData<'b>) -> Self {
642 call_into_char_list_trait(
643 &value.inner_bytes,
644 &value.entities,
645 |char_entity, result| {
646 char_entity.write_chars(result);
647 },
648 )
649 }
650}
651
652impl<'b> From<EncodedData<'b>> for CharListResult {
653 fn from(value: EncodedData<'b>) -> Self {
654 (&value).into()
655 }
656}
657
658impl<'b> From<&EncodedData<'b>> for ByteList {
659 fn from(value: &EncodedData<'b>) -> Self {
660 value
661 .into_iter()
662 .map(|(byte, _)| *byte)
663 .collect::<ByteList>()
664 }
665}
666
667impl<'b> From<EncodedData<'b>> for ByteList {
668 fn from(value: EncodedData<'b>) -> Self {
669 if value.entity_count() == 0 {
670 return value.inner_bytes.into_owned();
671 }
672 (&value).into()
673 }
674}
675
/// Which entity forms may be produced for a character.
///
/// The discriminants are bit flags: bit 0 = named, bit 1 = hexadecimal,
/// bit 2 = decimal. The `...Or...` variants set two bits.
#[derive(Clone, Copy, Default)]
#[repr(u8)]
pub enum EncodeType {
    /// Named entity only (the default).
    #[default]
    Named = 0b001,
    /// Hexadecimal numeric entity.
    Hex = 0b010,
    /// Decimal numeric entity.
    Decimal = 0b100,
    /// Named and hexadecimal bits combined.
    NamedOrHex = 0b011,
    /// Named and decimal bits combined.
    NamedOrDecimal = 0b101,
}
687
688#[inline]
689fn filter_entity_set(
690 charset: &EntityCharBytes,
691 encode_type: &EncodeType,
692 ch: &char,
693) -> EncodeFilterReturnData {
694 let encode_type = *encode_type as u8;
695 if let Some(&v) = charset.get(ch) {
696 if (encode_type & EncodeType::Named as u8) > 0 {
697 return (true, Some((EntityType::Named, Cow::from(v))));
698 }
699 return (true, None);
700 }
701 (false, None)
702}
703
/// Selects which characters get encoded.
#[derive(Default)]
pub enum CharacterSet {
    /// Every character.
    All = 1,
    /// Characters whose code point is greater than `0xff` (the threshold used
    /// by `filter` and `contains`).
    NonASCII = 2,
    /// The characters `>`, `<` and `&` (the default).
    #[default]
    Html = 3,
    /// The `Html` characters plus `"` and `'`.
    SpecialChars = 4,
    /// Union of `Html` and `NonASCII`.
    HtmlAndNonASCII = 5,
    /// Union of `SpecialChars` and `NonASCII`.
    SpecialCharsAndNonASCII = 6,
}
721
722impl CharacterSet {
723 pub fn filter(&self, ch: &char, encode_type: &EncodeType) -> EncodeFilterReturnData {
725 use CharacterSet::*;
726 match self {
727 SpecialChars => filter_entity_set(&SPECIAL_BYTES, encode_type, ch),
728 Html => filter_entity_set(&HTML_BYTES, encode_type, ch),
729 NonASCII => (*ch as u32 > 0xff, None),
730 HtmlAndNonASCII => {
731 let result = CharacterSet::NonASCII.filter(ch, encode_type);
732 if result.0 {
733 return result;
734 }
735 CharacterSet::Html.filter(ch, encode_type)
736 }
737 SpecialCharsAndNonASCII => {
738 let result = CharacterSet::NonASCII.filter(ch, encode_type);
739 if result.0 {
740 return result;
741 }
742 CharacterSet::SpecialChars.filter(ch, encode_type)
743 }
744 All => (true, None),
745 }
746 }
747 pub fn contains(&self, ch: &char) -> bool {
749 use CharacterSet::*;
750 match self {
751 SpecialChars => SPECIAL_BYTES.get(ch).is_some(),
752 Html => HTML_BYTES.get(ch).is_some(),
753 NonASCII => *ch as u32 > 0xff,
754 HtmlAndNonASCII => CharacterSet::NonASCII.contains(ch) || CharacterSet::Html.contains(ch),
755 SpecialCharsAndNonASCII => {
756 CharacterSet::NonASCII.contains(ch) || CharacterSet::SpecialChars.contains(ch)
757 }
758 All => true,
759 }
760 }
761}
762
/// The textual form of an entity.
#[derive(Debug, PartialEq, Eq)]
pub enum EntityType {
    /// `&name;`
    Named,
    /// `&#x...;`
    Hex,
    /// `&#...;`
    Decimal,
}
769
770#[derive(Debug)]
772pub struct CharEntity {
773 entity_type: EntityType,
774 entity_data: Cow<'static, [Byte]>,
775}
776
777impl CharEntity {
778 pub fn prefix_len(&self) -> usize {
780 match &self.entity_type {
781 EntityType::Named => 0,
782 EntityType::Hex => 2,
783 EntityType::Decimal => 1,
784 }
785 }
786 pub fn write_bytes(&self, bytes: &mut ByteList) {
788 bytes.push(b'&');
789 match &self.entity_type {
790 EntityType::Named => {
791 }
793 EntityType::Hex => {
794 bytes.push(b'#');
795 bytes.push(b'x');
796 }
797 EntityType::Decimal => {
798 bytes.push(b'#');
799 }
800 }
801 bytes.extend_from_slice(&self.entity_data);
802 bytes.push(b';');
803 }
804 pub fn write_chars(&self, chars: &mut Vec<char>) {
806 chars.push('&');
807 match &self.entity_type {
808 EntityType::Named => {
809 }
811 EntityType::Hex => {
812 chars.push('#');
813 chars.push('x');
814 }
815 EntityType::Decimal => {
816 chars.push('#');
817 }
818 }
819 for byte in self.entity_data.iter() {
820 chars.push(*byte as char);
821 }
822 chars.push(';');
823 }
824 pub fn write_string(&self, code: &mut String) {
826 code.push('&');
827 match &self.entity_type {
828 EntityType::Named => {
829 }
831 EntityType::Hex => {
832 code.push('#');
833 code.push('x');
834 }
835 EntityType::Decimal => {
836 code.push('#');
837 }
838 }
839 for byte in self.entity_data.iter() {
840 code.push(*byte as char);
841 }
842 code.push(';');
843 }
844 pub fn to_bytes(&self) -> ByteList {
846 let mut bytes: ByteList = Vec::with_capacity(self.entity_data.len() + 2);
847 self.write_bytes(&mut bytes);
848 bytes
849 }
850 pub fn data(self) -> ByteList {
852 self.entity_data.into_owned()
853 }
854}
855
856impl ToString for CharEntity {
857 fn to_string(&self) -> String {
858 let mut code = String::with_capacity(self.entity_data.len() + 2);
859 self.write_string(&mut code);
860 code
861 }
862}
/// Namespace type for the single-entity decoding functions.
#[derive(Default)]
pub struct Entity;
866
867impl Entity {
868 pub fn decode(bytes: &[Byte]) -> AnyhowResult<char> {
870 let total = bytes.len();
871 if total == 0 {
872 return Err(
873 HtmlEntityError::Decode(String::from(
874 "Can't decode with an empty bytelist argument.",
875 ))
876 .into(),
877 );
878 }
879 let first = bytes[0];
881 let mut entity_type: EntityType = EntityType::Named;
882 if first.is_ascii_alphabetic() {
883 for ch in &bytes[1..] {
884 if !ch.is_ascii_alphanumeric() {
885 let code = std::str::from_utf8(bytes)?;
886 return Err(
887 HtmlEntityError::Decode(format!(
888 "Html entity name can't contain characters other than English letters or numbers, here is '{}'",
889 code
890 ))
891 .into(),
892 );
893 }
894 }
895 } else if first == b'#' && total > 1 {
896 let second = bytes[1];
897 match second {
898 b'0'..=b'9' => {
899 for byte in &bytes[2..] {
901 if !byte.is_ascii_digit() {
902 let code = std::str::from_utf8(bytes)?;
903 return Err(
904 HtmlEntityError::Decode(format!(
905 "Html entity number can't contain characters other than numbers, here is '{}'.",
906 code
907 ))
908 .into(),
909 );
910 }
911 }
912 entity_type = EntityType::Decimal;
913 }
914 b'x' | b'X' => {
915 if total > 2 {
917 for byte in &bytes[2..] {
918 if !byte.is_ascii_hexdigit() {
919 let code = std::str::from_utf8(bytes)?;
920 return Err(
921 HtmlEntityError::Decode(format!(
922 "Hexadecimal html entity can't contain characters other than hexadecimal, here is '&{};'.",
923 code
924 ))
925 .into(),
926 );
927 }
928 }
929 entity_type = EntityType::Hex;
930 } else {
931 return Err(
932 HtmlEntityError::Decode(String::from(
933 "Hexadecimal html entity must contain one or more hexadecimal characters.",
934 ))
935 .into(),
936 );
937 }
938 }
939 _ => {
940 return Err(
941 HtmlEntityError::Decode(String::from("Illegal html entity number character format"))
942 .into(),
943 );
944 }
945 }
946 } else {
947 return Err(
948 HtmlEntityError::Decode(String::from("Illegal html entity character format.")).into(),
949 );
950 }
951 match entity_type {
953 EntityType::Named => {
955 if let Some(&ch) = NORMAL_NAME_ENTITY_BYTE.get(bytes) {
957 return Ok(ch);
958 }
959 if let Some(&(start_index, end_index)) = FIRST_LETTER_POSITION.get(&bytes[0]) {
961 if let Some(find_index) = LETTER_ORDERED_ENTITIES[start_index..end_index]
962 .iter()
963 .position(|&(name, _)| name == bytes)
964 {
965 let last_index = start_index + find_index;
966 let (_, code) = LETTER_ORDERED_ENTITIES[last_index];
967 return Ok(code);
968 }
969 }
970 let code = std::str::from_utf8(bytes)?;
971 Err(
972 HtmlEntityError::Decode(format!(
973 "Unable to find corresponding the html entity name '&{};'",
974 code
975 ))
976 .into(),
977 )
978 }
979 EntityType::Hex => {
981 numbers_to_char(&bytes[2..], 16)
983 }
984 EntityType::Decimal => {
986 numbers_to_char(&bytes[1..], 10)
988 }
989 }
990 }
991 pub fn decode_chars(chars: &[char]) -> AnyhowResult<char> {
993 let total = chars.len();
994 if total == 0 {
995 return Err(
996 HtmlEntityError::Decode(String::from(
997 "Can't decode with an empty character list argument.",
998 ))
999 .into(),
1000 );
1001 }
1002 let mut bytes: ByteList = Vec::with_capacity(total);
1003 let max_u8 = u8::MAX as u32;
1004 let is_non_bytes = chars.iter().any(|ch| {
1005 let char_code = *ch as u32;
1006 if char_code > max_u8 {
1007 true
1008 } else {
1009 bytes.push(char_code as Byte);
1010 false
1011 }
1012 });
1013 if !is_non_bytes {
1014 return Entity::decode(&bytes);
1015 }
1016 Err(
1017 HtmlEntityError::Decode(format!(
1018 "Unable to find corresponding the html entity name '&{};'",
1019 chars.iter().collect::<String>()
1020 ))
1021 .into(),
1022 )
1023 }
1024}
1025
1026pub fn encode_char(ch: &char, encode_type: &EncodeType) -> Option<CharEntity> {
1049 let encode_type = *encode_type as u8;
1050 let char_code = *ch as u32;
1051 if (encode_type & (EncodeType::Named as u8)) > 0 {
1053 if let Ok(mut index) = ENTITIES.binary_search_by_key(&char_code, |&(_, code)| code) {
1055 while index > 0 {
1057 let prev_index = index - 1;
1058 if ENTITIES[prev_index].1 != char_code {
1059 break;
1060 }
1061 index = prev_index;
1062 }
1063 let &(entity, _) = &ENTITIES[index];
1064 return Some(CharEntity {
1065 entity_type: EntityType::Named,
1066 entity_data: Cow::from(entity),
1067 });
1068 }
1069 }
1070 if (encode_type & (EncodeType::Hex as u8)) > 0 {
1072 return Some(CharEntity {
1073 entity_type: EntityType::Hex,
1074 entity_data: Cow::Owned(format!("{:x}", char_code).into_bytes()),
1075 });
1076 }
1077 if (encode_type & (EncodeType::Decimal as u8)) > 0 {
1079 return Some(CharEntity {
1080 entity_type: EntityType::Decimal,
1081 entity_data: Cow::Owned(char_code.to_string().into_bytes()),
1082 });
1083 }
1084 None
1086}
1087
1088pub fn encode<'a>(
1122 content: &'a [Byte],
1123 encode_type: &EncodeType,
1124 charset: &CharacterSet,
1125) -> EncodedData<'a> {
1126 encode_with(content, encode_type, |ch, encode_type| {
1127 charset.filter(ch, encode_type)
1128 })
1129}
1130
1131pub fn encode_to(
1145 content: &[Byte],
1146 encode_type: &EncodeType,
1147 charset: &CharacterSet,
1148 data: &mut ByteList,
1149) {
1150 encode_with_to(
1151 content,
1152 encode_type,
1153 |ch, encode_type| charset.filter(ch, encode_type),
1154 data,
1155 );
1156}
1157
1158pub fn encode_with<'a>(
1184 content: &'a [Byte],
1185 encode_type: &EncodeType,
1186 filter_fn: impl Fn(&char, &EncodeType) -> EncodeFilterReturnData,
1187) -> EncodedData<'a> {
1188 let mut entities: Vec<(CodeRange, CharEntity)> = vec![];
1189 let _ = loop_utf8_bytes(content, |result, (start_index, index)| match result {
1190 Utf8ParsedData::Correct(ch) => {
1191 let (need_encode, maybe_entity) = filter_fn(&ch, encode_type);
1192 if need_encode {
1193 if let Some((entity_type, entity_data)) = maybe_entity {
1194 entities.push((
1195 start_index..=index,
1196 CharEntity {
1197 entity_type,
1198 entity_data,
1199 },
1200 ));
1201 } else if let Some(entity) = encode_char(&ch, encode_type) {
1202 entities.push((start_index..=index, entity));
1203 }
1204 }
1205 Ok(())
1206 }
1207 _ => Ok(()),
1208 });
1209 EncodedData {
1210 inner_bytes: Cow::from(content),
1211 entities,
1212 }
1213}
1214
1215pub fn encode_with_to(
1234 content: &[Byte],
1235 encode_type: &EncodeType,
1236 filter_fn: impl Fn(&char, &EncodeType) -> EncodeFilterReturnData,
1237 data: &mut ByteList,
1238) {
1239 let _ = loop_utf8_bytes(content, |result, (start_index, end_index)| match result {
1240 Utf8ParsedData::Correct(ch) => {
1241 let (need_encode, maybe_entity) = filter_fn(&ch, encode_type);
1242 if need_encode {
1243 if let Some((entity_type, entity_data)) = maybe_entity {
1244 let entity = CharEntity {
1245 entity_type,
1246 entity_data,
1247 };
1248 entity.write_bytes(data);
1249 return Ok(());
1250 } else if let Some(entity) = encode_char(&ch, encode_type) {
1251 entity.write_bytes(data);
1252 return Ok(());
1253 }
1254 }
1255 data.extend_from_slice(&content[start_index..=end_index]);
1256 Ok(())
1257 }
1258 Utf8ParsedData::Wrong(_) => {
1259 data.extend_from_slice(&content[start_index..=end_index]);
1260 Ok(())
1261 }
1262 });
1263}
1264
1265pub fn encode_chars_with(
1284 chars: &[char],
1285 filter_fn: impl Fn(&char) -> Option<&EncodeType>,
1286) -> Cow<'_, [char]> {
1287 let mut result = vec![];
1288 let mut iter = chars.iter();
1289 for (index, ch) in iter.by_ref().enumerate() {
1290 if let Some(encode_type) = filter_fn(ch) {
1291 if let Some(entity) = encode_char(ch, encode_type) {
1292 if index > 0 {
1293 result.extend_from_slice(&chars[..index]);
1294 }
1295 entity.write_chars(&mut result);
1296 break;
1297 }
1298 }
1299 }
1300 for ch in iter {
1301 if let Some(encode_type) = filter_fn(ch) {
1302 if let Some(entity) = encode_char(ch, encode_type) {
1303 entity.write_chars(&mut result);
1304 continue;
1305 }
1306 }
1307 result.push(*ch);
1308 }
1309 if !result.is_empty() {
1310 return Cow::Owned(result);
1311 }
1312 Cow::Borrowed(chars)
1313}
1314
1315pub fn decode_chars(chars: &[char]) -> Cow<'_, [char]> {
1329 let mut data: Vec<char> = vec![];
1330 let mut is_in_entity = false;
1331 let mut start_index: usize = 0;
1332 for (idx, ch) in chars.iter().enumerate() {
1333 if !is_in_entity {
1334 if *ch == '&' {
1336 is_in_entity = true;
1337 start_index = idx + 1;
1338 }
1339 } else {
1340 match *ch {
1342 ';' => {
1343 if start_index != idx {
1345 let bytes = tr_chars_to_utf8_bytes(&chars[start_index..idx]);
1346 if let Some(bytes) = bytes {
1347 if let Ok(decode_char) = Entity::decode(&bytes) {
1348 if start_index > 1 {
1351 data.extend_from_slice(&chars[..start_index - 1]);
1352 }
1353 data.push(decode_char);
1355 let next_idx = idx + 1;
1357 if next_idx != chars.len() {
1358 decode_chars_to(&chars[next_idx..], &mut data);
1359 }
1360 return Cow::Owned(data);
1361 }
1362 }
1363 }
1364 is_in_entity = false;
1365 }
1366 '&' => {
1367 start_index = idx + 1;
1369 }
1370 _ => {}
1371 }
1372 }
1373 }
1374 Cow::from(chars)
1375}
1376
1377pub fn decode_chars_to(chars: &[char], data: &mut Vec<char>) {
1399 let mut is_in_entity = false;
1400 let mut start_index: usize = 0;
1401 for (idx, &ch) in chars.iter().enumerate() {
1402 if !is_in_entity {
1403 if ch == '&' {
1405 is_in_entity = true;
1406 start_index = idx + 1;
1407 } else {
1408 data.push(ch);
1409 }
1410 } else {
1411 match ch {
1413 ';' => {
1414 if start_index != idx {
1416 let bytes = tr_chars_to_utf8_bytes(&chars[start_index..idx]);
1417 if let Some(bytes) = bytes {
1418 if let Ok(decode_char) = Entity::decode(&bytes) {
1419 data.push(decode_char);
1421 is_in_entity = false;
1422 continue;
1423 }
1424 }
1425 }
1426 data.extend_from_slice(&chars[start_index - 1..=idx]);
1428 is_in_entity = false;
1429 }
1430 '&' => {
1431 data.extend_from_slice(&chars[start_index - 1..idx]);
1433 start_index = idx + 1;
1434 }
1435 _ => {}
1436 }
1437 }
1438 }
1439 if is_in_entity {
1440 data.extend_from_slice(&chars[start_index - 1..]);
1442 }
1443}
1444
1445pub fn decode(content: &[Byte]) -> DecodedData<'_> {
1483 let mut entities: Vec<(CodeRange, (char, ByteList))> = vec![];
1484 let mut errors: Vec<(CodeRange, anyhow::Error)> = vec![];
1485 let mut is_in_entity = false;
1486 let mut start_index: usize = 0;
1487 for (idx, byte) in content.iter().enumerate() {
1488 if !is_in_entity {
1489 if *byte == b'&' {
1491 is_in_entity = true;
1492 start_index = idx + 1;
1493 }
1494 } else {
1495 match *byte {
1497 b';' => {
1498 if start_index != idx {
1500 let decode_result = Entity::decode(&content[start_index..idx]);
1501 match decode_result {
1502 Ok(decode_char) => {
1503 entities.push((
1504 start_index - 1..=idx,
1505 (decode_char, char_to_utf8_bytes(decode_char)),
1506 ));
1507 }
1508 Err(err) => {
1509 errors.push((start_index - 1..=idx, err));
1510 }
1511 };
1512 }
1513 is_in_entity = false;
1514 }
1515 b'&' => {
1516 errors.push((
1518 start_index - 1..=start_index - 1,
1519 HtmlEntityError::Decode(String::from("Unencoded html entity characters '&'.")).into(),
1520 ));
1521 start_index = idx + 1;
1522 }
1523 _ => {
1524 }
1526 }
1527 }
1528 }
1529 DecodedData {
1531 inner_bytes: Cow::from(content),
1532 entities,
1533 errors,
1534 }
1535}
1536
1537pub fn decode_to(content: &[Byte], data: &mut Vec<Byte>) {
1552 let mut is_in_entity = false;
1553 let mut start_index: usize = 0;
1554 for (idx, byte) in content.iter().enumerate() {
1555 if !is_in_entity {
1556 if *byte == b'&' {
1558 is_in_entity = true;
1559 start_index = idx + 1;
1560 } else {
1561 data.push(*byte);
1562 }
1563 } else {
1564 match *byte {
1566 b';' => {
1567 if start_index != idx {
1569 if let Ok(decode_char) = Entity::decode(&content[start_index..idx]) {
1570 data.extend(char_to_utf8_bytes(decode_char));
1571 is_in_entity = false;
1572 continue;
1573 }
1574 }
1575 data.extend_from_slice(&content[start_index - 1..=idx]);
1576 is_in_entity = false;
1577 }
1578 b'&' => {
1579 data.extend_from_slice(&content[start_index - 1..idx]);
1581 start_index = idx + 1;
1582 }
1583 _ => {
1584 }
1586 }
1587 }
1588 }
1589 if is_in_entity {
1590 data.extend_from_slice(&content[start_index - 1..]);
1592 }
1593}