1extern crate alloc;
8
9use alloc::format;
10use alloc::string::{String, ToString};
11use alloc::vec::Vec;
12use quick_xml::escape::unescape;
13use quick_xml::events::{BytesStart, Event};
14use quick_xml::reader::Reader;
15
/// A single piece of structured output produced by the HTML tokenizers.
///
/// Tokens form a flat stream: inline formatting is expressed with paired
/// open/close markers (e.g. `Emphasis(true)` … `Emphasis(false)`), and
/// block boundaries are expressed with `ParagraphBreak` separators.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum Token {
    /// A run of visible text, with whitespace already normalized.
    Text(String),
    /// Separator emitted between block-level elements (`<p>`, `<div>`, headings).
    ParagraphBreak,
    /// A heading marker; the payload is the level (1–6 for `<h1>`–`<h6>`).
    Heading(u8),
    /// Emphasis (`<em>`/`<i>`) toggle: `true` opens, `false` closes.
    Emphasis(bool),
    /// Strong (`<strong>`/`<b>`) toggle: `true` opens, `false` closes.
    Strong(bool),
    /// Explicit line break (`<br/>`).
    LineBreak,
    /// List start; `true` for ordered (`<ol>`), `false` for unordered (`<ul>`).
    ListStart(bool),
    /// End of the innermost open list.
    ListEnd,
    /// Start of a list item (`<li>`).
    ListItemStart,
    /// End of a list item.
    ListItemEnd,
    /// Start of a hyperlink; the payload is the `href` attribute value.
    LinkStart(String),
    /// End of the innermost open hyperlink.
    LinkEnd,
    /// An image (`<img>`), captured via its attributes.
    Image {
        /// Value of the `src` attribute.
        src: String,
        /// Value of the `alt` attribute (empty string if absent).
        alt: String,
    },
}
52
/// Errors produced while tokenizing HTML.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum TokenizeError {
    /// The underlying XML/HTML reader reported an error (malformed markup,
    /// unbalanced tags, bad encoding, …). The payload is a human-readable
    /// description.
    ParseError(String),
    /// The document violated a structural ceiling (e.g. `max_tokens` or
    /// `max_nesting` from `TokenizeLimits`).
    InvalidStructure(String),
}
62
63impl core::fmt::Display for TokenizeError {
64 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
65 match self {
66 TokenizeError::ParseError(msg) => write!(f, "Parse error: {}", msg),
67 TokenizeError::InvalidStructure(msg) => write!(f, "Invalid structure: {}", msg),
68 }
69 }
70}
71
// `std::error::Error` only exists with the standard library, so this impl is
// gated on the `std` feature; the rest of the file relies only on `alloc`.
#[cfg(feature = "std")]
impl std::error::Error for TokenizeError {}
74
75pub fn tokenize_html(html: &str) -> Result<Vec<Token>, TokenizeError> {
89 let estimated_tokens = html.len() / 10;
91 let mut tokens = Vec::with_capacity(estimated_tokens.min(10000));
92 tokenize_html_into(html, &mut tokens)?;
93 Ok(tokens)
94}
95
/// Resource ceilings for `tokenize_html_limited`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct TokenizeLimits {
    /// Maximum number of tokens that may be emitted; exceeding it fails with
    /// `TokenizeError::InvalidStructure`.
    pub max_tokens: usize,
    /// Maximum element nesting depth; exceeding it fails with
    /// `TokenizeError::InvalidStructure`.
    pub max_nesting: usize,
    /// Maximum size in bytes of a single `Token::Text` payload; longer text
    /// is truncated rather than rejected.
    pub max_text_bytes: usize,
}
106
107impl Default for TokenizeLimits {
108 fn default() -> Self {
109 Self {
110 max_tokens: 100_000,
111 max_nesting: 256,
112 max_text_bytes: 64 * 1024,
113 }
114 }
115}
116
117impl TokenizeLimits {
118 pub fn embedded() -> Self {
120 Self {
121 max_tokens: 10_000,
122 max_nesting: 64,
123 max_text_bytes: 8 * 1024,
124 }
125 }
126}
127
128pub fn tokenize_html_with<F>(html: &str, mut on_token: F) -> Result<(), TokenizeError>
133where
134 F: FnMut(Token),
135{
136 for token in tokenize_html(html)? {
137 on_token(token);
138 }
139 Ok(())
140}
141
142pub fn tokenize_html_limited(
152 html: &str,
153 limits: TokenizeLimits,
154) -> Result<Vec<Token>, TokenizeError> {
155 let mut reader = Reader::from_str(html);
156 reader.config_mut().trim_text(false);
157 reader.config_mut().expand_empty_elements = false;
158
159 let mut buf = Vec::with_capacity(0);
160 let mut tokens = Vec::with_capacity(limits.max_tokens.min(1024));
161
162 let mut element_stack: Vec<ElementType> = Vec::with_capacity(limits.max_nesting.min(64));
164 let mut skip_depth: usize = 0;
166 let mut pending_paragraph_break: bool = false;
168 let mut pending_heading_close: Option<u8> = None;
170
171 let mut token_count: usize = 0;
172
173 loop {
174 match reader.read_event_into(&mut buf) {
175 Ok(Event::Start(e)) => {
176 let name = decode_name(e.name().as_ref(), &reader)?;
177
178 if should_skip_element(&name) {
180 skip_depth += 1;
181 continue;
182 }
183
184 if skip_depth > 0 {
186 continue;
187 }
188
189 if element_stack.len() >= limits.max_nesting {
191 return Err(TokenizeError::InvalidStructure(format!(
192 "Nesting depth exceeds max_nesting ({})",
193 limits.max_nesting
194 )));
195 }
196
197 if pending_paragraph_break && !tokens.is_empty() {
199 if token_count >= limits.max_tokens {
200 return Err(TokenizeError::InvalidStructure(format!(
201 "Token count exceeds max_tokens ({}",
202 limits.max_tokens
203 )));
204 }
205 tokens.push(Token::ParagraphBreak);
206 token_count += 1;
207 pending_paragraph_break = false;
208 }
209
210 if let Some(level) = pending_heading_close.take() {
212 if token_count >= limits.max_tokens {
213 return Err(TokenizeError::InvalidStructure(format!(
214 "Token count exceeds max_tokens ({}",
215 limits.max_tokens
216 )));
217 }
218 tokens.push(Token::Heading(level));
219 token_count += 1;
220 pending_paragraph_break = true;
221 }
222
223 match name.as_str() {
224 "p" | "div" => {
225 element_stack.push(ElementType::Paragraph);
226 }
227 "span" => {
228 element_stack.push(ElementType::Span);
229 }
230 h if h.starts_with('h') && h.len() == 2 => {
231 if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
232 if (1..=6).contains(&level) {
233 element_stack.push(ElementType::Heading(level as u8));
234 pending_heading_close = Some(level as u8);
235 }
236 }
237 }
238 "em" | "i" => {
239 element_stack.push(ElementType::Emphasis);
240 if token_count >= limits.max_tokens {
241 return Err(TokenizeError::InvalidStructure(format!(
242 "Token count exceeds max_tokens ({}",
243 limits.max_tokens
244 )));
245 }
246 tokens.push(Token::Emphasis(true));
247 token_count += 1;
248 }
249 "strong" | "b" => {
250 element_stack.push(ElementType::Strong);
251 if token_count >= limits.max_tokens {
252 return Err(TokenizeError::InvalidStructure(format!(
253 "Token count exceeds max_tokens ({}",
254 limits.max_tokens
255 )));
256 }
257 tokens.push(Token::Strong(true));
258 token_count += 1;
259 }
260 "ul" => {
261 element_stack.push(ElementType::UnorderedList);
262 if token_count >= limits.max_tokens {
263 return Err(TokenizeError::InvalidStructure(format!(
264 "Token count exceeds max_tokens ({}",
265 limits.max_tokens
266 )));
267 }
268 tokens.push(Token::ListStart(false));
269 token_count += 1;
270 }
271 "ol" => {
272 element_stack.push(ElementType::OrderedList);
273 if token_count >= limits.max_tokens {
274 return Err(TokenizeError::InvalidStructure(format!(
275 "Token count exceeds max_tokens ({}",
276 limits.max_tokens
277 )));
278 }
279 tokens.push(Token::ListStart(true));
280 token_count += 1;
281 }
282 "li" => {
283 element_stack.push(ElementType::ListItem);
284 if token_count >= limits.max_tokens {
285 return Err(TokenizeError::InvalidStructure(format!(
286 "Token count exceeds max_tokens ({}",
287 limits.max_tokens
288 )));
289 }
290 tokens.push(Token::ListItemStart);
291 token_count += 1;
292 }
293 "a" => {
294 if let Some(href) = get_attribute(&e, &reader, "href") {
295 element_stack.push(ElementType::Link);
296 if token_count >= limits.max_tokens {
297 return Err(TokenizeError::InvalidStructure(format!(
298 "Token count exceeds max_tokens ({}",
299 limits.max_tokens
300 )));
301 }
302 tokens.push(Token::LinkStart(href));
303 token_count += 1;
304 } else {
305 element_stack.push(ElementType::Generic);
307 }
308 }
309 "img" => {
310 if let Some(src) = get_attribute(&e, &reader, "src") {
312 let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
313 if token_count >= limits.max_tokens {
314 return Err(TokenizeError::InvalidStructure(format!(
315 "Token count exceeds max_tokens ({}",
316 limits.max_tokens
317 )));
318 }
319 tokens.push(Token::Image { src, alt });
320 token_count += 1;
321 }
322 element_stack.push(ElementType::Generic);
323 }
324 _ => {
325 element_stack.push(ElementType::Generic);
327 }
328 }
329 }
330 Ok(Event::Text(e)) => {
331 if skip_depth > 0 {
333 continue;
334 }
335
336 let text = e
337 .decode()
338 .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
339 .to_string();
340
341 let normalized = normalize_whitespace_limited(&text, limits.max_text_bytes);
343
344 if !normalized.is_empty() {
345 if let Some(level) = pending_heading_close.take() {
347 if token_count >= limits.max_tokens {
348 return Err(TokenizeError::InvalidStructure(format!(
349 "Token count exceeds max_tokens ({}",
350 limits.max_tokens
351 )));
352 }
353 tokens.push(Token::Heading(level));
354 token_count += 1;
355 }
356 if token_count >= limits.max_tokens {
357 return Err(TokenizeError::InvalidStructure(format!(
358 "Token count exceeds max_tokens ({}",
359 limits.max_tokens
360 )));
361 }
362 tokens.push(Token::Text(normalized));
363 token_count += 1;
364 }
365 }
366 Ok(Event::End(e)) => {
367 let name = decode_name(e.name().as_ref(), &reader)?;
368
369 if should_skip_element(&name) {
371 skip_depth = skip_depth.saturating_sub(1);
372 continue;
373 }
374
375 if skip_depth > 0 {
377 continue;
378 }
379
380 if let Some(element) = element_stack.pop() {
382 match element {
383 ElementType::Paragraph => {
384 pending_paragraph_break = true;
385 }
386 ElementType::Heading(_level) => {
387 pending_paragraph_break = true;
389 pending_heading_close = None;
391 }
392 ElementType::Emphasis => {
393 if token_count >= limits.max_tokens {
394 return Err(TokenizeError::InvalidStructure(format!(
395 "Token count exceeds max_tokens ({}",
396 limits.max_tokens
397 )));
398 }
399 tokens.push(Token::Emphasis(false));
400 token_count += 1;
401 }
402 ElementType::Strong => {
403 if token_count >= limits.max_tokens {
404 return Err(TokenizeError::InvalidStructure(format!(
405 "Token count exceeds max_tokens ({}",
406 limits.max_tokens
407 )));
408 }
409 tokens.push(Token::Strong(false));
410 token_count += 1;
411 }
412 ElementType::UnorderedList | ElementType::OrderedList => {
413 if token_count >= limits.max_tokens {
414 return Err(TokenizeError::InvalidStructure(format!(
415 "Token count exceeds max_tokens ({}",
416 limits.max_tokens
417 )));
418 }
419 tokens.push(Token::ListEnd);
420 token_count += 1;
421 }
422 ElementType::ListItem => {
423 if token_count >= limits.max_tokens {
424 return Err(TokenizeError::InvalidStructure(format!(
425 "Token count exceeds max_tokens ({}",
426 limits.max_tokens
427 )));
428 }
429 tokens.push(Token::ListItemEnd);
430 token_count += 1;
431 }
432 ElementType::Link => {
433 if token_count >= limits.max_tokens {
434 return Err(TokenizeError::InvalidStructure(format!(
435 "Token count exceeds max_tokens ({}",
436 limits.max_tokens
437 )));
438 }
439 tokens.push(Token::LinkEnd);
440 token_count += 1;
441 }
442 ElementType::Span | ElementType::Generic => {
443 }
445 }
446 }
447 }
448 Ok(Event::Empty(e)) => {
449 let name = decode_name(e.name().as_ref(), &reader)?;
450
451 if skip_depth > 0 {
453 continue;
454 }
455
456 if pending_paragraph_break && !tokens.is_empty() {
458 if token_count >= limits.max_tokens {
459 return Err(TokenizeError::InvalidStructure(format!(
460 "Token count exceeds max_tokens ({}",
461 limits.max_tokens
462 )));
463 }
464 tokens.push(Token::ParagraphBreak);
465 token_count += 1;
466 pending_paragraph_break = false;
467 }
468
469 if let Some(level) = pending_heading_close.take() {
471 if token_count >= limits.max_tokens {
472 return Err(TokenizeError::InvalidStructure(format!(
473 "Token count exceeds max_tokens ({}",
474 limits.max_tokens
475 )));
476 }
477 tokens.push(Token::Heading(level));
478 token_count += 1;
479 pending_paragraph_break = true;
480 }
481
482 match name.as_str() {
483 "br" => {
484 if token_count >= limits.max_tokens {
485 return Err(TokenizeError::InvalidStructure(format!(
486 "Token count exceeds max_tokens ({}",
487 limits.max_tokens
488 )));
489 }
490 tokens.push(Token::LineBreak);
491 token_count += 1;
492 }
493 "p" | "div" => {
494 pending_paragraph_break = true;
496 }
497 h if h.starts_with('h') && h.len() == 2 => {
498 if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
499 if (1..=6).contains(&level) {
500 if token_count >= limits.max_tokens {
502 return Err(TokenizeError::InvalidStructure(format!(
503 "Token count exceeds max_tokens ({}",
504 limits.max_tokens
505 )));
506 }
507 tokens.push(Token::Heading(level as u8));
508 token_count += 1;
509 pending_paragraph_break = true;
510 }
511 }
512 }
513 "img" => {
514 if let Some(src) = get_attribute(&e, &reader, "src") {
515 let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
516 if token_count >= limits.max_tokens {
517 return Err(TokenizeError::InvalidStructure(format!(
518 "Token count exceeds max_tokens ({}",
519 limits.max_tokens
520 )));
521 }
522 tokens.push(Token::Image { src, alt });
523 token_count += 1;
524 }
525 }
527 _ => {
528 }
530 }
531 }
532 Ok(Event::CData(e)) => {
533 if skip_depth == 0 {
535 let text = reader
536 .decoder()
537 .decode(&e)
538 .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
539 .to_string();
540
541 let normalized = normalize_whitespace_limited(&text, limits.max_text_bytes);
542 if !normalized.is_empty() {
543 if let Some(level) = pending_heading_close.take() {
544 if token_count >= limits.max_tokens {
545 return Err(TokenizeError::InvalidStructure(format!(
546 "Token count exceeds max_tokens ({}",
547 limits.max_tokens
548 )));
549 }
550 tokens.push(Token::Heading(level));
551 token_count += 1;
552 }
553 if token_count >= limits.max_tokens {
554 return Err(TokenizeError::InvalidStructure(format!(
555 "Token count exceeds max_tokens ({}",
556 limits.max_tokens
557 )));
558 }
559 tokens.push(Token::Text(normalized));
560 token_count += 1;
561 }
562 }
563 }
564 Ok(Event::GeneralRef(e)) => {
565 if skip_depth > 0 {
567 continue;
568 }
569
570 let entity_name = e
571 .decode()
572 .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?;
573 let entity_str = format!("&{};", entity_name);
575 let resolved = unescape(&entity_str)
576 .map_err(|e| TokenizeError::ParseError(format!("Unescape error: {:?}", e)))?
577 .to_string();
578
579 if !resolved.is_empty() {
580 if let Some(level) = pending_heading_close.take() {
582 if token_count >= limits.max_tokens {
583 return Err(TokenizeError::InvalidStructure(format!(
584 "Token count exceeds max_tokens ({}",
585 limits.max_tokens
586 )));
587 }
588 tokens.push(Token::Heading(level));
589 token_count += 1;
590 }
591 if let Some(Token::Text(ref mut last_text)) = tokens.last_mut() {
593 if last_text.len() + resolved.len() <= limits.max_text_bytes {
594 last_text.push_str(&resolved);
595 }
596 } else {
597 if token_count >= limits.max_tokens {
598 return Err(TokenizeError::InvalidStructure(format!(
599 "Token count exceeds max_tokens ({}",
600 limits.max_tokens
601 )));
602 }
603 tokens.push(Token::Text(resolved));
604 token_count += 1;
605 }
606 }
607 }
608 Ok(Event::Comment(_)) => {
609 }
611 Ok(Event::Decl(_)) => {
612 }
614 Ok(Event::PI(_)) => {
615 }
617 Ok(Event::DocType(_)) => {
618 }
620 Ok(Event::Eof) => break,
621 Err(e) => {
622 return Err(TokenizeError::ParseError(format!("XML error: {:?}", e)));
623 }
624 }
625 buf.clear();
626 }
627
628 while let Some(element) = element_stack.pop() {
630 match element {
631 ElementType::Emphasis => {
632 if token_count >= limits.max_tokens {
633 return Err(TokenizeError::InvalidStructure(format!(
634 "Token count exceeds max_tokens ({}",
635 limits.max_tokens
636 )));
637 }
638 tokens.push(Token::Emphasis(false));
639 token_count += 1;
640 }
641 ElementType::Strong => {
642 if token_count >= limits.max_tokens {
643 return Err(TokenizeError::InvalidStructure(format!(
644 "Token count exceeds max_tokens ({}",
645 limits.max_tokens
646 )));
647 }
648 tokens.push(Token::Strong(false));
649 token_count += 1;
650 }
651 ElementType::UnorderedList | ElementType::OrderedList => {
652 if token_count >= limits.max_tokens {
653 return Err(TokenizeError::InvalidStructure(format!(
654 "Token count exceeds max_tokens ({}",
655 limits.max_tokens
656 )));
657 }
658 tokens.push(Token::ListEnd);
659 token_count += 1;
660 }
661 ElementType::ListItem => {
662 if token_count >= limits.max_tokens {
663 return Err(TokenizeError::InvalidStructure(format!(
664 "Token count exceeds max_tokens ({}",
665 limits.max_tokens
666 )));
667 }
668 tokens.push(Token::ListItemEnd);
669 token_count += 1;
670 }
671 ElementType::Link => {
672 if token_count >= limits.max_tokens {
673 return Err(TokenizeError::InvalidStructure(format!(
674 "Token count exceeds max_tokens ({}",
675 limits.max_tokens
676 )));
677 }
678 tokens.push(Token::LinkEnd);
679 token_count += 1;
680 }
681 ElementType::Paragraph | ElementType::Heading(_) => {
682 }
684 _ => {}
685 }
686 }
687
688 if let Some(level) = pending_heading_close {
690 if token_count >= limits.max_tokens {
691 return Err(TokenizeError::InvalidStructure(format!(
692 "Token count exceeds max_tokens ({}",
693 limits.max_tokens
694 )));
695 }
696 tokens.push(Token::Heading(level));
697 }
698
699 Ok(tokens)
700}
701
/// Collapses whitespace runs into single spaces, trims leading/trailing
/// whitespace, and caps the output at `max_bytes` bytes.
///
/// A character is appended only when it fits *entirely* within `max_bytes`,
/// so the result never exceeds the cap and never ends mid-way through a
/// multi-byte UTF-8 character. (The previous check — break once `len()`
/// reached `max_bytes` — could overshoot the cap by up to 3 bytes.)
fn normalize_whitespace_limited(text: &str, max_bytes: usize) -> String {
    let mut result = String::with_capacity(text.len().min(max_bytes));
    // Start "in whitespace" so leading whitespace is dropped entirely.
    let mut prev_was_space = true;

    for ch in text.chars() {
        if ch.is_whitespace() {
            if !prev_was_space {
                if result.len() + 1 > max_bytes {
                    break;
                }
                result.push(' ');
                prev_was_space = true;
            }
        } else {
            if result.len() + ch.len_utf8() > max_bytes {
                break;
            }
            result.push(ch);
            prev_was_space = false;
        }
    }

    // Drop the separator left behind by input that ends in whitespace.
    if result.ends_with(' ') {
        result.pop();
    }

    result
}
729
/// Internal classification of an open HTML element, kept on a stack so the
/// tokenizers can emit the right closing token when the element ends.
///
/// All variants are plain data (`Heading` carries only a `u8`), so the enum
/// additionally derives `Copy` and `Eq` — it is pushed/popped by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ElementType {
    /// `<p>` or `<div>` — produces ParagraphBreak separators.
    Paragraph,
    /// `<h1>`–`<h6>` with the level.
    Heading(u8),
    /// `<em>` or `<i>`.
    Emphasis,
    /// `<strong>` or `<b>`.
    Strong,
    /// `<span>` — transparent wrapper, emits nothing.
    Span,
    /// `<ul>`.
    UnorderedList,
    /// `<ol>`.
    OrderedList,
    /// `<li>`.
    ListItem,
    /// `<a href=…>`.
    Link,
    /// Any other element (or `<a>` without href) — tracked for balance only.
    Generic,
}
744
/// Returns `true` for elements whose entire subtree is dropped from the
/// token stream (non-content chrome and executable/styling blocks).
fn should_skip_element(name: &str) -> bool {
    const SKIPPED: [&str; 8] = [
        "script", "style", "head", "nav", "header", "footer", "aside", "noscript",
    ];
    SKIPPED.contains(&name)
}
752
/// Collapses runs of whitespace into single spaces and trims leading and
/// trailing whitespace.
fn normalize_whitespace(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // `split_whitespace` drops empty segments, which collapses runs and
    // trims both ends in one pass.
    for word in text.split_whitespace() {
        if !result.is_empty() {
            result.push(' ');
        }
        result.push_str(word);
    }
    result
}
778
779fn get_attribute(e: &BytesStart, reader: &Reader<&[u8]>, name: &str) -> Option<String> {
781 for attr in e.attributes().flatten() {
782 let key = reader.decoder().decode(attr.key.as_ref()).ok()?;
783 if key.as_ref() == name {
784 let value = reader.decoder().decode(&attr.value).ok()?;
785 return Some(value.to_string());
786 }
787 }
788 None
789}
790
791fn decode_name(name: &[u8], reader: &Reader<&[u8]>) -> Result<String, TokenizeError> {
793 reader
794 .decoder()
795 .decode(name)
796 .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))
797 .map(|s| s.to_string())
798}
799
/// Reusable scratch buffers for `tokenize_html_with_scratch`.
///
/// Allocate once and reuse across documents to avoid per-call allocations.
#[derive(Debug)]
pub struct TokenizeScratch {
    /// Event buffer handed to the quick-xml reader on every `read_event_into`.
    pub xml_buf: Vec<u8>,
    /// Spare text buffer; cleared on reuse. NOTE(review): not read by the
    /// tokenizer in this file — presumably available for callers.
    pub text_buf: String,
    // Stack of currently-open elements during tokenization.
    element_buf: Vec<ElementType>,
}
814
815impl TokenizeScratch {
816 pub fn new(xml_capacity: usize, text_capacity: usize) -> Self {
829 Self {
830 xml_buf: Vec::with_capacity(xml_capacity),
831 text_buf: String::with_capacity(text_capacity),
832 element_buf: Vec::with_capacity(64),
833 }
834 }
835
836 pub fn embedded() -> Self {
843 Self::new(4096, 8192)
844 }
845
846 pub fn desktop() -> Self {
853 Self::new(32768, 65536)
854 }
855
856 pub fn clear(&mut self) {
862 self.xml_buf.clear();
863 self.text_buf.clear();
864 self.element_buf.clear();
865 }
866
867 pub fn ensure_text_capacity(&mut self, min_cap: usize) {
875 if self.text_buf.capacity() < min_cap {
876 self.text_buf.reserve(min_cap - self.text_buf.capacity());
877 }
878 }
879}
880
881pub fn tokenize_html_into(html: &str, tokens_out: &mut Vec<Token>) -> Result<(), TokenizeError> {
910 let mut scratch = TokenizeScratch::embedded();
911 tokenize_html_with_scratch(html, tokens_out, &mut scratch)
912}
913
914pub fn tokenize_html_with_scratch(
949 html: &str,
950 tokens_out: &mut Vec<Token>,
951 scratch: &mut TokenizeScratch,
952) -> Result<(), TokenizeError> {
953 tokens_out.clear();
954 scratch.clear();
955
956 let mut reader = Reader::from_str(html);
957 reader.config_mut().trim_text(false);
958 reader.config_mut().expand_empty_elements = false;
959
960 let mut skip_depth: usize = 0;
962 let mut pending_paragraph_break: bool = false;
964 let mut pending_heading_close: Option<u8> = None;
966
967 loop {
968 match reader.read_event_into(&mut scratch.xml_buf) {
969 Ok(Event::Start(e)) => {
970 let name = decode_name(e.name().as_ref(), &reader)?;
971
972 if should_skip_element(&name) {
974 skip_depth += 1;
975 continue;
976 }
977
978 if skip_depth > 0 {
980 continue;
981 }
982
983 if pending_paragraph_break && !tokens_out.is_empty() {
985 tokens_out.push(Token::ParagraphBreak);
986 pending_paragraph_break = false;
987 }
988
989 if let Some(level) = pending_heading_close.take() {
991 tokens_out.push(Token::Heading(level));
992 pending_paragraph_break = true;
993 }
994
995 match name.as_str() {
996 "p" | "div" => {
997 scratch.element_buf.push(ElementType::Paragraph);
998 }
999 "span" => {
1000 scratch.element_buf.push(ElementType::Span);
1001 }
1002 h if h.starts_with('h') && h.len() == 2 => {
1003 if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
1004 if (1..=6).contains(&level) {
1005 scratch.element_buf.push(ElementType::Heading(level as u8));
1006 pending_heading_close = Some(level as u8);
1007 }
1008 }
1009 }
1010 "em" | "i" => {
1011 scratch.element_buf.push(ElementType::Emphasis);
1012 tokens_out.push(Token::Emphasis(true));
1013 }
1014 "strong" | "b" => {
1015 scratch.element_buf.push(ElementType::Strong);
1016 tokens_out.push(Token::Strong(true));
1017 }
1018 "ul" => {
1019 scratch.element_buf.push(ElementType::UnorderedList);
1020 tokens_out.push(Token::ListStart(false));
1021 }
1022 "ol" => {
1023 scratch.element_buf.push(ElementType::OrderedList);
1024 tokens_out.push(Token::ListStart(true));
1025 }
1026 "li" => {
1027 scratch.element_buf.push(ElementType::ListItem);
1028 tokens_out.push(Token::ListItemStart);
1029 }
1030 "a" => {
1031 if let Some(href) = get_attribute(&e, &reader, "href") {
1032 scratch.element_buf.push(ElementType::Link);
1033 tokens_out.push(Token::LinkStart(href));
1034 } else {
1035 scratch.element_buf.push(ElementType::Generic);
1037 }
1038 }
1039 "img" => {
1040 if let Some(src) = get_attribute(&e, &reader, "src") {
1042 let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
1043 tokens_out.push(Token::Image { src, alt });
1044 }
1045 scratch.element_buf.push(ElementType::Generic);
1046 }
1047 _ => {
1048 scratch.element_buf.push(ElementType::Generic);
1050 }
1051 }
1052 }
1053 Ok(Event::Text(e)) => {
1054 if skip_depth > 0 {
1056 continue;
1057 }
1058
1059 let text = e
1060 .decode()
1061 .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
1062 .to_string();
1063
1064 let normalized = normalize_whitespace(&text);
1066
1067 if !normalized.is_empty() {
1068 if let Some(level) = pending_heading_close.take() {
1070 tokens_out.push(Token::Heading(level));
1071 }
1072 tokens_out.push(Token::Text(normalized));
1073 }
1074 }
1075 Ok(Event::End(e)) => {
1076 let name = decode_name(e.name().as_ref(), &reader)?;
1077
1078 if should_skip_element(&name) {
1080 skip_depth = skip_depth.saturating_sub(1);
1081 continue;
1082 }
1083
1084 if skip_depth > 0 {
1086 continue;
1087 }
1088
1089 if let Some(element) = scratch.element_buf.pop() {
1091 match element {
1092 ElementType::Paragraph => {
1093 pending_paragraph_break = true;
1094 }
1095 ElementType::Heading(_level) => {
1096 pending_paragraph_break = true;
1098 pending_heading_close = None;
1100 }
1101 ElementType::Emphasis => {
1102 tokens_out.push(Token::Emphasis(false));
1103 }
1104 ElementType::Strong => {
1105 tokens_out.push(Token::Strong(false));
1106 }
1107 ElementType::UnorderedList | ElementType::OrderedList => {
1108 tokens_out.push(Token::ListEnd);
1109 }
1110 ElementType::ListItem => {
1111 tokens_out.push(Token::ListItemEnd);
1112 }
1113 ElementType::Link => {
1114 tokens_out.push(Token::LinkEnd);
1115 }
1116 ElementType::Span | ElementType::Generic => {
1117 }
1119 }
1120 }
1121 }
1122 Ok(Event::Empty(e)) => {
1123 let name = decode_name(e.name().as_ref(), &reader)?;
1124
1125 if skip_depth > 0 {
1127 continue;
1128 }
1129
1130 if pending_paragraph_break && !tokens_out.is_empty() {
1132 tokens_out.push(Token::ParagraphBreak);
1133 pending_paragraph_break = false;
1134 }
1135
1136 if let Some(level) = pending_heading_close.take() {
1138 tokens_out.push(Token::Heading(level));
1139 pending_paragraph_break = true;
1140 }
1141
1142 match name.as_str() {
1143 "br" => {
1144 tokens_out.push(Token::LineBreak);
1145 }
1146 "p" | "div" => {
1147 pending_paragraph_break = true;
1149 }
1150 h if h.starts_with('h') && h.len() == 2 => {
1151 if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
1152 if (1..=6).contains(&level) {
1153 tokens_out.push(Token::Heading(level as u8));
1155 pending_paragraph_break = true;
1156 }
1157 }
1158 }
1159 "img" => {
1160 if let Some(src) = get_attribute(&e, &reader, "src") {
1161 let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
1162 tokens_out.push(Token::Image { src, alt });
1163 }
1164 }
1166 _ => {
1167 }
1169 }
1170 }
1171 Ok(Event::CData(e)) => {
1172 if skip_depth == 0 {
1174 let text = reader
1175 .decoder()
1176 .decode(&e)
1177 .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
1178 .to_string();
1179
1180 let normalized = normalize_whitespace(&text);
1181 if !normalized.is_empty() {
1182 if let Some(level) = pending_heading_close.take() {
1184 tokens_out.push(Token::Heading(level));
1185 }
1186 tokens_out.push(Token::Text(normalized));
1187 }
1188 }
1189 }
1190 Ok(Event::GeneralRef(e)) => {
1191 if skip_depth > 0 {
1193 continue;
1194 }
1195
1196 let entity_name = e
1197 .decode()
1198 .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?;
1199 let entity_str = format!("&{};", entity_name);
1201 let resolved = unescape(&entity_str)
1202 .map_err(|e| TokenizeError::ParseError(format!("Unescape error: {:?}", e)))?
1203 .to_string();
1204
1205 if !resolved.is_empty() {
1206 if let Some(level) = pending_heading_close.take() {
1208 tokens_out.push(Token::Heading(level));
1209 }
1210 if let Some(Token::Text(ref mut last_text)) = tokens_out.last_mut() {
1212 last_text.push_str(&resolved);
1213 } else {
1214 tokens_out.push(Token::Text(resolved));
1215 }
1216 }
1217 }
1218 Ok(Event::Comment(_)) => {
1219 }
1221 Ok(Event::Decl(_)) => {
1222 }
1224 Ok(Event::PI(_)) => {
1225 }
1227 Ok(Event::DocType(_)) => {
1228 }
1230 Ok(Event::Eof) => break,
1231 Err(e) => {
1232 return Err(TokenizeError::ParseError(format!("XML error: {:?}", e)));
1233 }
1234 }
1235 scratch.xml_buf.clear();
1236 }
1237
1238 if pending_paragraph_break && !tokens_out.is_empty() {
1240 }
1243
1244 while let Some(element) = scratch.element_buf.pop() {
1246 match element {
1247 ElementType::Emphasis => {
1248 tokens_out.push(Token::Emphasis(false));
1249 }
1250 ElementType::Strong => {
1251 tokens_out.push(Token::Strong(false));
1252 }
1253 ElementType::UnorderedList | ElementType::OrderedList => {
1254 tokens_out.push(Token::ListEnd);
1255 }
1256 ElementType::ListItem => {
1257 tokens_out.push(Token::ListItemEnd);
1258 }
1259 ElementType::Link => {
1260 tokens_out.push(Token::LinkEnd);
1261 }
1262 ElementType::Paragraph | ElementType::Heading(_) => {
1263 }
1265 _ => {}
1266 }
1267 }
1268
1269 if let Some(level) = pending_heading_close {
1271 tokens_out.push(Token::Heading(level));
1272 }
1273
1274 Ok(())
1275}
1276
1277#[cfg(test)]
1278mod tests {
1279 use super::*;
1280 use alloc::vec;
1281
1282 #[test]
1283 fn test_tokenize_simple_paragraph() {
1284 let html = "<p>Hello world</p>";
1285 let tokens = tokenize_html(html).unwrap();
1286 assert_eq!(tokens, vec![Token::Text("Hello world".to_string())]);
1288 }
1289
1290 #[test]
1291 fn test_tokenize_emphasis() {
1292 let html = "<p>This is <em>italic</em> and <strong>bold</strong> text.</p>";
1293 let tokens = tokenize_html(html).unwrap();
1294 assert_eq!(
1296 tokens,
1297 vec![
1298 Token::Text("This is".to_string()),
1299 Token::Emphasis(true),
1300 Token::Text("italic".to_string()),
1301 Token::Emphasis(false),
1302 Token::Text("and".to_string()),
1303 Token::Strong(true),
1304 Token::Text("bold".to_string()),
1305 Token::Strong(false),
1306 Token::Text("text.".to_string()),
1307 ]
1308 );
1309 }
1310
1311 #[test]
1312 fn test_tokenize_heading_and_paragraphs() {
1313 let html = "<h1>Chapter Title</h1><p>First paragraph.</p><p>Second paragraph.</p>";
1314 let tokens = tokenize_html(html).unwrap();
1315
1316 assert_eq!(
1317 tokens,
1318 vec![
1319 Token::Heading(1),
1320 Token::Text("Chapter Title".to_string()),
1321 Token::ParagraphBreak,
1322 Token::Text("First paragraph.".to_string()),
1323 Token::ParagraphBreak,
1324 Token::Text("Second paragraph.".to_string()),
1325 ]
1326 );
1327 }
1328
1329 #[test]
1330 fn test_tokenize_multiple_headings() {
1331 let html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3>";
1332 let tokens = tokenize_html(html).unwrap();
1333
1334 assert_eq!(
1335 tokens,
1336 vec![
1337 Token::Heading(1),
1338 Token::Text("Title".to_string()),
1339 Token::ParagraphBreak,
1340 Token::Heading(2),
1341 Token::Text("Subtitle".to_string()),
1342 Token::ParagraphBreak,
1343 Token::Heading(3),
1344 Token::Text("Section".to_string()),
1345 ]
1346 );
1347 }
1348
1349 #[test]
1350 fn test_tokenize_line_break() {
1351 let html = "<p>Line one<br/>Line two</p>";
1353 let tokens = tokenize_html(html).unwrap();
1354
1355 assert_eq!(
1356 tokens,
1357 vec![
1358 Token::Text("Line one".to_string()),
1359 Token::LineBreak,
1360 Token::Text("Line two".to_string()),
1361 ]
1362 );
1363 }
1364
1365 #[test]
1366 fn test_tokenize_nested_formatting() {
1367 let html = "<p>Text with <strong>bold and <em>italic nested</em></strong>.</p>";
1368 let tokens = tokenize_html(html).unwrap();
1369
1370 assert_eq!(
1371 tokens,
1372 vec![
1373 Token::Text("Text with".to_string()),
1374 Token::Strong(true),
1375 Token::Text("bold and".to_string()),
1376 Token::Emphasis(true),
1377 Token::Text("italic nested".to_string()),
1378 Token::Emphasis(false),
1379 Token::Strong(false),
1380 Token::Text(".".to_string()),
1381 ]
1382 );
1383 }
1384
1385 #[test]
1386 fn test_strip_script_and_style() {
1387 let html = r#"<p>Visible text</p><script>alert("hidden");</script><style>.hidden{}</style><p>More visible</p>"#;
1388 let tokens = tokenize_html(html).unwrap();
1389
1390 assert_eq!(
1391 tokens,
1392 vec![
1393 Token::Text("Visible text".to_string()),
1394 Token::ParagraphBreak,
1395 Token::Text("More visible".to_string()),
1396 ]
1397 );
1398 }
1399
1400 #[test]
1401 fn test_strip_head() {
1402 let html = "<head><title>Title</title></head><body><p>Content</p></body>";
1403 let tokens = tokenize_html(html).unwrap();
1404
1405 assert_eq!(tokens, vec![Token::Text("Content".to_string())]);
1406 }
1407
1408 #[test]
1409 fn test_whitespace_normalization() {
1410 let html = "<p> Multiple spaces and\n\nnewlines </p>";
1411 let tokens = tokenize_html(html).unwrap();
1412
1413 assert_eq!(
1414 tokens,
1415 vec![Token::Text("Multiple spaces and newlines".to_string())]
1416 );
1417 }
1418
1419 #[test]
1420 fn test_empty_paragraph() {
1421 let html = "<p></p>";
1422 let tokens = tokenize_html(html).unwrap();
1423 assert_eq!(tokens, vec![]);
1425 }
1426
1427 #[test]
1428 fn test_unclosed_tags_rejected() {
1429 let html = "<p>Text with <em>italic</p>";
1431 assert!(tokenize_html(html).is_err());
1432 }
1433
1434 #[test]
1435 fn test_b_and_i_tags() {
1436 let html = "<p><b>bold</b> and <i>italic</i></p>";
1437 let tokens = tokenize_html(html).unwrap();
1438
1439 assert_eq!(
1440 tokens,
1441 vec![
1442 Token::Strong(true),
1443 Token::Text("bold".to_string()),
1444 Token::Strong(false),
1445 Token::Text("and".to_string()),
1446 Token::Emphasis(true),
1447 Token::Text("italic".to_string()),
1448 Token::Emphasis(false),
1449 ]
1450 );
1451 }
1452
1453 #[test]
1454 fn test_div_handling() {
1455 let html = "<div>Block content</div><div>Another block</div>";
1456 let tokens = tokenize_html(html).unwrap();
1457
1458 assert_eq!(
1459 tokens,
1460 vec![
1461 Token::Text("Block content".to_string()),
1462 Token::ParagraphBreak,
1463 Token::Text("Another block".to_string()),
1464 ]
1465 );
1466 }
1467
1468 #[test]
1469 fn test_span_handling() {
1470 let html = "<p>Text with <span>spanned</span> content</p>";
1471 let tokens = tokenize_html(html).unwrap();
1472
1473 assert_eq!(
1474 tokens,
1475 vec![
1476 Token::Text("Text with".to_string()),
1477 Token::Text("spanned".to_string()),
1478 Token::Text("content".to_string()),
1479 ]
1480 );
1481 }
1482
1483 #[test]
1484 fn test_example_from_spec() {
1485 let html = r#"<p>This is <em>italic</em> and <strong>bold</strong> text.</p>
1486<h1>Chapter Title</h1>
1487<p>Another paragraph.</p>"#;
1488
1489 let tokens = tokenize_html(html).unwrap();
1490
1491 let expected = vec![
1492 Token::Text("This is".to_string()),
1493 Token::Emphasis(true),
1494 Token::Text("italic".to_string()),
1495 Token::Emphasis(false),
1496 Token::Text("and".to_string()),
1497 Token::Strong(true),
1498 Token::Text("bold".to_string()),
1499 Token::Strong(false),
1500 Token::Text("text.".to_string()),
1501 Token::ParagraphBreak,
1502 Token::Heading(1),
1503 Token::Text("Chapter Title".to_string()),
1504 Token::ParagraphBreak,
1505 Token::Text("Another paragraph.".to_string()),
1506 ];
1507
1508 assert_eq!(tokens, expected);
1509 }
1510
1511 #[test]
1512 fn test_all_heading_levels() {
1513 let html = "<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>";
1514 let tokens = tokenize_html(html).unwrap();
1515
1516 assert_eq!(
1517 tokens,
1518 vec![
1519 Token::Heading(1),
1520 Token::Text("H1".to_string()),
1521 Token::ParagraphBreak,
1522 Token::Heading(2),
1523 Token::Text("H2".to_string()),
1524 Token::ParagraphBreak,
1525 Token::Heading(3),
1526 Token::Text("H3".to_string()),
1527 Token::ParagraphBreak,
1528 Token::Heading(4),
1529 Token::Text("H4".to_string()),
1530 Token::ParagraphBreak,
1531 Token::Heading(5),
1532 Token::Text("H5".to_string()),
1533 Token::ParagraphBreak,
1534 Token::Heading(6),
1535 Token::Text("H6".to_string()),
1536 ]
1537 );
1538 }
1539
1540 #[test]
1543 fn test_simple_unordered_list() {
1544 let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
1545 let tokens = tokenize_html(html).unwrap();
1546
1547 assert_eq!(
1548 tokens,
1549 vec![
1550 Token::ListStart(false),
1551 Token::ListItemStart,
1552 Token::Text("Item 1".to_string()),
1553 Token::ListItemEnd,
1554 Token::ListItemStart,
1555 Token::Text("Item 2".to_string()),
1556 Token::ListItemEnd,
1557 Token::ListEnd,
1558 ]
1559 );
1560 }
1561
1562 #[test]
1563 fn test_simple_ordered_list() {
1564 let html = "<ol><li>First</li><li>Second</li></ol>";
1565 let tokens = tokenize_html(html).unwrap();
1566
1567 assert_eq!(
1568 tokens,
1569 vec![
1570 Token::ListStart(true),
1571 Token::ListItemStart,
1572 Token::Text("First".to_string()),
1573 Token::ListItemEnd,
1574 Token::ListItemStart,
1575 Token::Text("Second".to_string()),
1576 Token::ListItemEnd,
1577 Token::ListEnd,
1578 ]
1579 );
1580 }
1581
1582 #[test]
1583 fn test_nested_lists() {
1584 let html = "<ul><li>A<ul><li>B</li></ul></li></ul>";
1585 let tokens = tokenize_html(html).unwrap();
1586
1587 assert_eq!(
1588 tokens,
1589 vec![
1590 Token::ListStart(false),
1591 Token::ListItemStart,
1592 Token::Text("A".to_string()),
1593 Token::ListStart(false),
1594 Token::ListItemStart,
1595 Token::Text("B".to_string()),
1596 Token::ListItemEnd,
1597 Token::ListEnd,
1598 Token::ListItemEnd,
1599 Token::ListEnd,
1600 ]
1601 );
1602 }
1603
1604 #[test]
1605 fn test_list_with_formatted_text() {
1606 let html = "<ul><li><em>italic</em> item</li></ul>";
1607 let tokens = tokenize_html(html).unwrap();
1608
1609 assert_eq!(
1610 tokens,
1611 vec![
1612 Token::ListStart(false),
1613 Token::ListItemStart,
1614 Token::Emphasis(true),
1615 Token::Text("italic".to_string()),
1616 Token::Emphasis(false),
1617 Token::Text("item".to_string()),
1618 Token::ListItemEnd,
1619 Token::ListEnd,
1620 ]
1621 );
1622 }
1623
1624 #[test]
1625 fn test_empty_list() {
1626 let html = "<ul></ul>";
1627 let tokens = tokenize_html(html).unwrap();
1628
1629 assert_eq!(tokens, vec![Token::ListStart(false), Token::ListEnd]);
1630 }
1631
1632 #[test]
1635 fn test_link_with_href() {
1636 let html = r#"<a href="ch2.xhtml">Next Chapter</a>"#;
1637 let tokens = tokenize_html(html).unwrap();
1638
1639 assert_eq!(
1640 tokens,
1641 vec![
1642 Token::LinkStart("ch2.xhtml".to_string()),
1643 Token::Text("Next Chapter".to_string()),
1644 Token::LinkEnd,
1645 ]
1646 );
1647 }
1648
1649 #[test]
1650 fn test_link_without_href() {
1651 let html = "<a>No link</a>";
1652 let tokens = tokenize_html(html).unwrap();
1653
1654 assert_eq!(tokens, vec![Token::Text("No link".to_string())]);
1656 }
1657
1658 #[test]
1659 fn test_link_with_formatted_text() {
1660 let html = r#"<a href="x.html"><em>italic link</em></a>"#;
1661 let tokens = tokenize_html(html).unwrap();
1662
1663 assert_eq!(
1664 tokens,
1665 vec![
1666 Token::LinkStart("x.html".to_string()),
1667 Token::Emphasis(true),
1668 Token::Text("italic link".to_string()),
1669 Token::Emphasis(false),
1670 Token::LinkEnd,
1671 ]
1672 );
1673 }
1674
1675 #[test]
1678 fn test_image_self_closing() {
1679 let html = r#"<img src="cover.jpg" alt="Cover Image"/>"#;
1680 let tokens = tokenize_html(html).unwrap();
1681
1682 assert_eq!(
1683 tokens,
1684 vec![Token::Image {
1685 src: "cover.jpg".to_string(),
1686 alt: "Cover Image".to_string(),
1687 }]
1688 );
1689 }
1690
1691 #[test]
1692 fn test_image_without_alt() {
1693 let html = r#"<img src="photo.jpg"/>"#;
1694 let tokens = tokenize_html(html).unwrap();
1695
1696 assert_eq!(
1697 tokens,
1698 vec![Token::Image {
1699 src: "photo.jpg".to_string(),
1700 alt: String::with_capacity(0),
1701 }]
1702 );
1703 }
1704
1705 #[test]
1706 fn test_image_without_src() {
1707 let html = r#"<img alt="Missing"/>"#;
1708 let tokens = tokenize_html(html).unwrap();
1709
1710 assert_eq!(tokens, vec![]);
1712 }
1713
1714 #[test]
1715 fn test_image_as_start_tag() {
1716 let html = r#"<img src="pic.png" alt="Pic"></img>"#;
1718 let tokens = tokenize_html(html).unwrap();
1719
1720 assert_eq!(
1721 tokens,
1722 vec![Token::Image {
1723 src: "pic.png".to_string(),
1724 alt: "Pic".to_string(),
1725 }]
1726 );
1727 }
1728
1729 #[test]
1732 fn test_mixed_content() {
1733 let html = r#"<p>See <a href="ch2.xhtml">chapter 2</a> for details.</p><ul><li>Item with <img src="icon.png" alt="icon"/></li></ul>"#;
1734 let tokens = tokenize_html(html).unwrap();
1735
1736 assert_eq!(
1737 tokens,
1738 vec![
1739 Token::Text("See".to_string()),
1740 Token::LinkStart("ch2.xhtml".to_string()),
1741 Token::Text("chapter 2".to_string()),
1742 Token::LinkEnd,
1743 Token::Text("for details.".to_string()),
1744 Token::ParagraphBreak,
1745 Token::ListStart(false),
1746 Token::ListItemStart,
1747 Token::Text("Item with".to_string()),
1748 Token::Image {
1749 src: "icon.png".to_string(),
1750 alt: "icon".to_string(),
1751 },
1752 Token::ListItemEnd,
1753 Token::ListEnd,
1754 ]
1755 );
1756 }
1757
1758 #[test]
1761 fn test_deeply_nested_formatting() {
1762 let html = "<em><strong><em>triple</em></strong></em>";
1763 let tokens = tokenize_html(html).unwrap();
1764
1765 assert_eq!(
1766 tokens,
1767 vec![
1768 Token::Emphasis(true),
1769 Token::Strong(true),
1770 Token::Emphasis(true),
1771 Token::Text("triple".to_string()),
1772 Token::Emphasis(false),
1773 Token::Strong(false),
1774 Token::Emphasis(false),
1775 ]
1776 );
1777 }
1778
1779 #[test]
1780 fn test_consecutive_headings_same_level() {
1781 let html = "<h2>First</h2><h2>Second</h2>";
1782 let tokens = tokenize_html(html).unwrap();
1783
1784 assert_eq!(
1785 tokens,
1786 vec![
1787 Token::Heading(2),
1788 Token::Text("First".to_string()),
1789 Token::ParagraphBreak,
1790 Token::Heading(2),
1791 Token::Text("Second".to_string()),
1792 ]
1793 );
1794 }
1795
1796 #[test]
1797 fn test_multiple_consecutive_line_breaks() {
1798 let html = "<p>A<br/><br/><br/>B</p>";
1799 let tokens = tokenize_html(html).unwrap();
1800
1801 assert_eq!(
1802 tokens,
1803 vec![
1804 Token::Text("A".to_string()),
1805 Token::LineBreak,
1806 Token::LineBreak,
1807 Token::LineBreak,
1808 Token::Text("B".to_string()),
1809 ]
1810 );
1811 }
1812
1813 #[test]
1814 fn test_cdata_sections() {
1815 let html = "<p><![CDATA[Some raw content]]></p>";
1816 let tokens = tokenize_html(html).unwrap();
1817
1818 assert_eq!(tokens, vec![Token::Text("Some raw content".to_string())]);
1819 }
1820
1821 #[test]
1822 fn test_whitespace_only_text_nodes() {
1823 let html = "<p>First</p> \n <p>Second</p>";
1825 let tokens = tokenize_html(html).unwrap();
1826
1827 assert_eq!(
1828 tokens,
1829 vec![
1830 Token::Text("First".to_string()),
1831 Token::ParagraphBreak,
1832 Token::Text("Second".to_string()),
1833 ]
1834 );
1835 }
1836
1837 #[test]
1838 fn test_very_long_text() {
1839 let long_word = "word ".repeat(10_000);
1841 let html = format!("<p>{}</p>", long_word);
1842 let tokens = tokenize_html(&html).unwrap();
1843
1844 assert_eq!(tokens.len(), 1);
1845 if let Token::Text(ref text) = tokens[0] {
1846 assert!(text.len() > 40_000);
1847 } else {
1848 panic!("Expected Token::Text");
1849 }
1850 }
1851
1852 #[test]
1853 fn test_mixed_block_and_inline() {
1854 let html = "<div><p><em>text</em></p></div>";
1855 let tokens = tokenize_html(html).unwrap();
1856
1857 assert_eq!(
1858 tokens,
1859 vec![
1860 Token::Emphasis(true),
1861 Token::Text("text".to_string()),
1862 Token::Emphasis(false),
1863 ]
1864 );
1865 }
1866
1867 #[test]
1868 fn test_block_inside_inline_no_crash() {
1869 let html = "<em><p>text</p></em>";
1871 let result = tokenize_html(html);
1873 assert!(result.is_ok());
1874 let tokens = result.unwrap();
1875 assert!(tokens
1877 .iter()
1878 .any(|t| matches!(t, Token::Text(s) if s == "text")));
1879 }
1880
1881 #[test]
1882 fn test_link_in_paragraph() {
1883 let html = r#"<p>Click <a href="http://example.com">here</a> to continue.</p>"#;
1884 let tokens = tokenize_html(html).unwrap();
1885
1886 assert_eq!(
1887 tokens,
1888 vec![
1889 Token::Text("Click".to_string()),
1890 Token::LinkStart("http://example.com".to_string()),
1891 Token::Text("here".to_string()),
1892 Token::LinkEnd,
1893 Token::Text("to continue.".to_string()),
1894 ]
1895 );
1896 }
1897
1898 #[test]
1899 fn test_image_in_paragraph() {
1900 let html = r#"<p>An image: <img src="fig1.png" alt="Figure 1"/></p>"#;
1901 let tokens = tokenize_html(html).unwrap();
1902
1903 assert_eq!(
1904 tokens,
1905 vec![
1906 Token::Text("An image:".to_string()),
1907 Token::Image {
1908 src: "fig1.png".to_string(),
1909 alt: "Figure 1".to_string(),
1910 },
1911 ]
1912 );
1913 }
1914
1915 #[test]
1916 fn test_list_after_paragraph() {
1917 let html = "<p>Intro:</p><ul><li>One</li><li>Two</li></ul>";
1918 let tokens = tokenize_html(html).unwrap();
1919
1920 assert_eq!(
1921 tokens,
1922 vec![
1923 Token::Text("Intro:".to_string()),
1924 Token::ParagraphBreak,
1925 Token::ListStart(false),
1926 Token::ListItemStart,
1927 Token::Text("One".to_string()),
1928 Token::ListItemEnd,
1929 Token::ListItemStart,
1930 Token::Text("Two".to_string()),
1931 Token::ListItemEnd,
1932 Token::ListEnd,
1933 ]
1934 );
1935 }
1936
1937 #[test]
1938 fn test_ordered_list_with_links() {
1939 let html = r#"<ol><li><a href="ch1.html">Chapter 1</a></li><li><a href="ch2.html">Chapter 2</a></li></ol>"#;
1940 let tokens = tokenize_html(html).unwrap();
1941
1942 assert_eq!(
1943 tokens,
1944 vec![
1945 Token::ListStart(true),
1946 Token::ListItemStart,
1947 Token::LinkStart("ch1.html".to_string()),
1948 Token::Text("Chapter 1".to_string()),
1949 Token::LinkEnd,
1950 Token::ListItemEnd,
1951 Token::ListItemStart,
1952 Token::LinkStart("ch2.html".to_string()),
1953 Token::Text("Chapter 2".to_string()),
1954 Token::LinkEnd,
1955 Token::ListItemEnd,
1956 Token::ListEnd,
1957 ]
1958 );
1959 }
1960
1961 #[test]
1962 fn test_tokenize_html_with_matches_tokenize_html() {
1963 let html = "<h1>T</h1><p>Hello <em>world</em><br/>line 2</p>";
1964 let baseline = tokenize_html(html).unwrap();
1965 let mut streamed = Vec::with_capacity(0);
1966 tokenize_html_with(html, |token| streamed.push(token)).unwrap();
1967 assert_eq!(baseline, streamed);
1968 }
1969}