Skip to main content

epub_stream/
tokenizer.rs

1//! XHTML to token stream converter for EPUB content
2//!
3//! Converts XHTML chapters into a simplified token format that's easier
4//! to layout. Uses quick_xml for SAX-style parsing to handle large
5//! documents efficiently without loading the entire DOM.
6
7extern crate alloc;
8
9use alloc::format;
10use alloc::string::{String, ToString};
11use alloc::vec::Vec;
12use quick_xml::escape::unescape;
13use quick_xml::events::{BytesStart, Event};
14use quick_xml::reader::Reader;
15
/// Token types for simplified XHTML representation
///
/// Tokens are emitted in document order. Paired formatting tokens
/// ([`Token::Emphasis`], [`Token::Strong`]) carry `true` on open and
/// `false` on close; lists use explicit start/end variants instead.
/// The enum is `#[non_exhaustive]`, so downstream `match`es need a
/// wildcard arm and new variants can be added without a breaking change.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum Token {
    /// Plain text content (whitespace is collapsed by the tokenizer)
    Text(String),
    /// New paragraph break
    ParagraphBreak,
    /// Heading with level 1-6
    Heading(u8),
    /// Start (true) or end (false) of italic emphasis
    Emphasis(bool),
    /// Start (true) or end (false) of bold strong
    Strong(bool),
    /// Line break (<br>)
    LineBreak,
    /// Start of a list (true = ordered, false = unordered)
    ListStart(bool),
    /// End of a list
    ListEnd,
    /// Start of a list item
    ListItemStart,
    /// End of a list item
    ListItemEnd,
    /// Start of a link with href
    LinkStart(String),
    /// End of a link
    LinkEnd,
    /// Image reference with src and alt text
    Image {
        /// Image source path (relative to EPUB content)
        src: String,
        /// Alternative text for the image
        alt: String,
    },
}
52
/// Error type for tokenization failures
///
/// `#[non_exhaustive]` so new failure modes can be added without breaking
/// downstream `match`es. Each variant carries a human-readable detail string.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum TokenizeError {
    /// XML parsing error (malformed markup or a text-decoding failure)
    ParseError(String),
    /// Invalid HTML structure (e.g. a configured tokenizer limit was exceeded)
    InvalidStructure(String),
}
62
63impl core::fmt::Display for TokenizeError {
64    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
65        match self {
66            TokenizeError::ParseError(msg) => write!(f, "Parse error: {}", msg),
67            TokenizeError::InvalidStructure(msg) => write!(f, "Invalid structure: {}", msg),
68        }
69    }
70}
71
// `std::error::Error` lives in `std`, so this impl is gated on the `std`
// feature — presumably the crate is no_std + alloc by default (see the
// `extern crate alloc` at the top of the file); TODO confirm in Cargo.toml.
#[cfg(feature = "std")]
impl std::error::Error for TokenizeError {}
74
75/// Convert XHTML string into a token stream
76///
77/// Parses HTML tags: p, h1-h6, em, strong, br, span, div
78/// Strips out: script, style, head, attributes (except class for styling)
79/// Extracts text content and converts HTML entities
80///
81/// # Example
82/// ```
83/// use epub_stream::tokenizer::tokenize_html;
84///
85/// let html = "<p>Hello <em>world</em></p>";
86/// let tokens = tokenize_html(html).unwrap();
87/// ```
88pub fn tokenize_html(html: &str) -> Result<Vec<Token>, TokenizeError> {
89    // Estimate token count: roughly 1 token per 10 bytes of HTML
90    let estimated_tokens = html.len() / 10;
91    let mut tokens = Vec::with_capacity(estimated_tokens.min(10000));
92    tokenize_html_into(html, &mut tokens)?;
93    Ok(tokens)
94}
95
/// Limits for bounded tokenization to prevent unbounded Vec growth.
///
/// Passed to [`tokenize_html_limited`]; exceeding `max_tokens` or
/// `max_nesting` yields a [`TokenizeError::InvalidStructure`], while text
/// longer than `max_text_bytes` is truncated rather than rejected.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct TokenizeLimits {
    /// Maximum number of tokens to emit before returning an error.
    pub max_tokens: usize,
    /// Maximum nesting depth for element stack.
    pub max_nesting: usize,
    /// Maximum text node size in bytes before truncation.
    pub max_text_bytes: usize,
}
106
107impl Default for TokenizeLimits {
108    fn default() -> Self {
109        Self {
110            max_tokens: 100_000,
111            max_nesting: 256,
112            max_text_bytes: 64 * 1024,
113        }
114    }
115}
116
117impl TokenizeLimits {
118    /// Create limits suitable for embedded environments.
119    pub fn embedded() -> Self {
120        Self {
121            max_tokens: 10_000,
122            max_nesting: 64,
123            max_text_bytes: 8 * 1024,
124        }
125    }
126}
127
128/// Convert XHTML string into a streamed token sequence.
129///
130/// This callback-oriented API keeps ownership of each token with the caller,
131/// so downstream code can avoid storing a full token vector.
132pub fn tokenize_html_with<F>(html: &str, mut on_token: F) -> Result<(), TokenizeError>
133where
134    F: FnMut(Token),
135{
136    for token in tokenize_html(html)? {
137        on_token(token);
138    }
139    Ok(())
140}
141
142/// Convert XHTML string into a token stream with bounded limits.
143///
144/// Enforces `max_tokens` limit to prevent unbounded Vec growth on malicious
145/// or extremely large inputs. Returns an error if limits are exceeded.
146///
147/// # Allocation behavior
148/// - Allocates token Vec with capacity hints
149/// - Returns error instead of unbounded growth
150/// - Stack usage is bounded by `max_nesting`
151pub fn tokenize_html_limited(
152    html: &str,
153    limits: TokenizeLimits,
154) -> Result<Vec<Token>, TokenizeError> {
155    let mut reader = Reader::from_str(html);
156    reader.config_mut().trim_text(false);
157    reader.config_mut().expand_empty_elements = false;
158
159    let mut buf = Vec::with_capacity(0);
160    let mut tokens = Vec::with_capacity(limits.max_tokens.min(1024));
161
162    // Stack to track nested elements for proper closing
163    let mut element_stack: Vec<ElementType> = Vec::with_capacity(limits.max_nesting.min(64));
164    // Track if we're inside a tag that should be skipped (script, style, head)
165    let mut skip_depth: usize = 0;
166    // Track if we need a paragraph break after current block element
167    let mut pending_paragraph_break: bool = false;
168    // Track if we need a heading close after text content
169    let mut pending_heading_close: Option<u8> = None;
170
171    let mut token_count: usize = 0;
172
173    loop {
174        match reader.read_event_into(&mut buf) {
175            Ok(Event::Start(e)) => {
176                let name = decode_name(e.name().as_ref(), &reader)?;
177
178                // Check if we should skip this element and its children
179                if should_skip_element(&name) {
180                    skip_depth += 1;
181                    continue;
182                }
183
184                // If skipping, don't process anything
185                if skip_depth > 0 {
186                    continue;
187                }
188
189                // Check nesting limit
190                if element_stack.len() >= limits.max_nesting {
191                    return Err(TokenizeError::InvalidStructure(format!(
192                        "Nesting depth exceeds max_nesting ({})",
193                        limits.max_nesting
194                    )));
195                }
196
197                // Flush any pending paragraph break from previous block
198                if pending_paragraph_break && !tokens.is_empty() {
199                    if token_count >= limits.max_tokens {
200                        return Err(TokenizeError::InvalidStructure(format!(
201                            "Token count exceeds max_tokens ({}",
202                            limits.max_tokens
203                        )));
204                    }
205                    tokens.push(Token::ParagraphBreak);
206                    token_count += 1;
207                    pending_paragraph_break = false;
208                }
209
210                // Flush any pending heading close
211                if let Some(level) = pending_heading_close.take() {
212                    if token_count >= limits.max_tokens {
213                        return Err(TokenizeError::InvalidStructure(format!(
214                            "Token count exceeds max_tokens ({}",
215                            limits.max_tokens
216                        )));
217                    }
218                    tokens.push(Token::Heading(level));
219                    token_count += 1;
220                    pending_paragraph_break = true;
221                }
222
223                match name.as_str() {
224                    "p" | "div" => {
225                        element_stack.push(ElementType::Paragraph);
226                    }
227                    "span" => {
228                        element_stack.push(ElementType::Span);
229                    }
230                    h if h.starts_with('h') && h.len() == 2 => {
231                        if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
232                            if (1..=6).contains(&level) {
233                                element_stack.push(ElementType::Heading(level as u8));
234                                pending_heading_close = Some(level as u8);
235                            }
236                        }
237                    }
238                    "em" | "i" => {
239                        element_stack.push(ElementType::Emphasis);
240                        if token_count >= limits.max_tokens {
241                            return Err(TokenizeError::InvalidStructure(format!(
242                                "Token count exceeds max_tokens ({}",
243                                limits.max_tokens
244                            )));
245                        }
246                        tokens.push(Token::Emphasis(true));
247                        token_count += 1;
248                    }
249                    "strong" | "b" => {
250                        element_stack.push(ElementType::Strong);
251                        if token_count >= limits.max_tokens {
252                            return Err(TokenizeError::InvalidStructure(format!(
253                                "Token count exceeds max_tokens ({}",
254                                limits.max_tokens
255                            )));
256                        }
257                        tokens.push(Token::Strong(true));
258                        token_count += 1;
259                    }
260                    "ul" => {
261                        element_stack.push(ElementType::UnorderedList);
262                        if token_count >= limits.max_tokens {
263                            return Err(TokenizeError::InvalidStructure(format!(
264                                "Token count exceeds max_tokens ({}",
265                                limits.max_tokens
266                            )));
267                        }
268                        tokens.push(Token::ListStart(false));
269                        token_count += 1;
270                    }
271                    "ol" => {
272                        element_stack.push(ElementType::OrderedList);
273                        if token_count >= limits.max_tokens {
274                            return Err(TokenizeError::InvalidStructure(format!(
275                                "Token count exceeds max_tokens ({}",
276                                limits.max_tokens
277                            )));
278                        }
279                        tokens.push(Token::ListStart(true));
280                        token_count += 1;
281                    }
282                    "li" => {
283                        element_stack.push(ElementType::ListItem);
284                        if token_count >= limits.max_tokens {
285                            return Err(TokenizeError::InvalidStructure(format!(
286                                "Token count exceeds max_tokens ({}",
287                                limits.max_tokens
288                            )));
289                        }
290                        tokens.push(Token::ListItemStart);
291                        token_count += 1;
292                    }
293                    "a" => {
294                        if let Some(href) = get_attribute(&e, &reader, "href") {
295                            element_stack.push(ElementType::Link);
296                            if token_count >= limits.max_tokens {
297                                return Err(TokenizeError::InvalidStructure(format!(
298                                    "Token count exceeds max_tokens ({}",
299                                    limits.max_tokens
300                                )));
301                            }
302                            tokens.push(Token::LinkStart(href));
303                            token_count += 1;
304                        } else {
305                            // No href — treat as generic container
306                            element_stack.push(ElementType::Generic);
307                        }
308                    }
309                    "img" => {
310                        // <img> as a start tag (non-self-closing)
311                        if let Some(src) = get_attribute(&e, &reader, "src") {
312                            let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
313                            if token_count >= limits.max_tokens {
314                                return Err(TokenizeError::InvalidStructure(format!(
315                                    "Token count exceeds max_tokens ({}",
316                                    limits.max_tokens
317                                )));
318                            }
319                            tokens.push(Token::Image { src, alt });
320                            token_count += 1;
321                        }
322                        element_stack.push(ElementType::Generic);
323                    }
324                    _ => {
325                        // Unknown element, treat as generic container
326                        element_stack.push(ElementType::Generic);
327                    }
328                }
329            }
330            Ok(Event::Text(e)) => {
331                // Skip text if we're inside a script/style/head block
332                if skip_depth > 0 {
333                    continue;
334                }
335
336                let text = e
337                    .decode()
338                    .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
339                    .to_string();
340
341                // Normalize whitespace: collapse multiple spaces/newlines
342                let normalized = normalize_whitespace_limited(&text, limits.max_text_bytes);
343
344                if !normalized.is_empty() {
345                    // Flush any pending heading close
346                    if let Some(level) = pending_heading_close.take() {
347                        if token_count >= limits.max_tokens {
348                            return Err(TokenizeError::InvalidStructure(format!(
349                                "Token count exceeds max_tokens ({}",
350                                limits.max_tokens
351                            )));
352                        }
353                        tokens.push(Token::Heading(level));
354                        token_count += 1;
355                    }
356                    if token_count >= limits.max_tokens {
357                        return Err(TokenizeError::InvalidStructure(format!(
358                            "Token count exceeds max_tokens ({}",
359                            limits.max_tokens
360                        )));
361                    }
362                    tokens.push(Token::Text(normalized));
363                    token_count += 1;
364                }
365            }
366            Ok(Event::End(e)) => {
367                let name = decode_name(e.name().as_ref(), &reader)?;
368
369                // Check if we're ending a skip element
370                if should_skip_element(&name) {
371                    skip_depth = skip_depth.saturating_sub(1);
372                    continue;
373                }
374
375                // If skipping, don't process end tags
376                if skip_depth > 0 {
377                    continue;
378                }
379
380                // Pop the element from stack and emit appropriate close token
381                if let Some(element) = element_stack.pop() {
382                    match element {
383                        ElementType::Paragraph => {
384                            pending_paragraph_break = true;
385                        }
386                        ElementType::Heading(_level) => {
387                            // Heading already emitted on start, just mark for paragraph break
388                            pending_paragraph_break = true;
389                            // Clear any pending close since we already handled it
390                            pending_heading_close = None;
391                        }
392                        ElementType::Emphasis => {
393                            if token_count >= limits.max_tokens {
394                                return Err(TokenizeError::InvalidStructure(format!(
395                                    "Token count exceeds max_tokens ({}",
396                                    limits.max_tokens
397                                )));
398                            }
399                            tokens.push(Token::Emphasis(false));
400                            token_count += 1;
401                        }
402                        ElementType::Strong => {
403                            if token_count >= limits.max_tokens {
404                                return Err(TokenizeError::InvalidStructure(format!(
405                                    "Token count exceeds max_tokens ({}",
406                                    limits.max_tokens
407                                )));
408                            }
409                            tokens.push(Token::Strong(false));
410                            token_count += 1;
411                        }
412                        ElementType::UnorderedList | ElementType::OrderedList => {
413                            if token_count >= limits.max_tokens {
414                                return Err(TokenizeError::InvalidStructure(format!(
415                                    "Token count exceeds max_tokens ({}",
416                                    limits.max_tokens
417                                )));
418                            }
419                            tokens.push(Token::ListEnd);
420                            token_count += 1;
421                        }
422                        ElementType::ListItem => {
423                            if token_count >= limits.max_tokens {
424                                return Err(TokenizeError::InvalidStructure(format!(
425                                    "Token count exceeds max_tokens ({}",
426                                    limits.max_tokens
427                                )));
428                            }
429                            tokens.push(Token::ListItemEnd);
430                            token_count += 1;
431                        }
432                        ElementType::Link => {
433                            if token_count >= limits.max_tokens {
434                                return Err(TokenizeError::InvalidStructure(format!(
435                                    "Token count exceeds max_tokens ({}",
436                                    limits.max_tokens
437                                )));
438                            }
439                            tokens.push(Token::LinkEnd);
440                            token_count += 1;
441                        }
442                        ElementType::Span | ElementType::Generic => {
443                            // No tokens needed for these
444                        }
445                    }
446                }
447            }
448            Ok(Event::Empty(e)) => {
449                let name = decode_name(e.name().as_ref(), &reader)?;
450
451                // Skip empty elements inside script/style blocks
452                if skip_depth > 0 {
453                    continue;
454                }
455
456                // Flush any pending paragraph break
457                if pending_paragraph_break && !tokens.is_empty() {
458                    if token_count >= limits.max_tokens {
459                        return Err(TokenizeError::InvalidStructure(format!(
460                            "Token count exceeds max_tokens ({}",
461                            limits.max_tokens
462                        )));
463                    }
464                    tokens.push(Token::ParagraphBreak);
465                    token_count += 1;
466                    pending_paragraph_break = false;
467                }
468
469                // Flush any pending heading close
470                if let Some(level) = pending_heading_close.take() {
471                    if token_count >= limits.max_tokens {
472                        return Err(TokenizeError::InvalidStructure(format!(
473                            "Token count exceeds max_tokens ({}",
474                            limits.max_tokens
475                        )));
476                    }
477                    tokens.push(Token::Heading(level));
478                    token_count += 1;
479                    pending_paragraph_break = true;
480                }
481
482                match name.as_str() {
483                    "br" => {
484                        if token_count >= limits.max_tokens {
485                            return Err(TokenizeError::InvalidStructure(format!(
486                                "Token count exceeds max_tokens ({}",
487                                limits.max_tokens
488                            )));
489                        }
490                        tokens.push(Token::LineBreak);
491                        token_count += 1;
492                    }
493                    "p" | "div" => {
494                        // Empty paragraph still creates a paragraph break
495                        pending_paragraph_break = true;
496                    }
497                    h if h.starts_with('h') && h.len() == 2 => {
498                        if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
499                            if (1..=6).contains(&level) {
500                                // Empty heading - just emit the heading token
501                                if token_count >= limits.max_tokens {
502                                    return Err(TokenizeError::InvalidStructure(format!(
503                                        "Token count exceeds max_tokens ({}",
504                                        limits.max_tokens
505                                    )));
506                                }
507                                tokens.push(Token::Heading(level as u8));
508                                token_count += 1;
509                                pending_paragraph_break = true;
510                            }
511                        }
512                    }
513                    "img" => {
514                        if let Some(src) = get_attribute(&e, &reader, "src") {
515                            let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
516                            if token_count >= limits.max_tokens {
517                                return Err(TokenizeError::InvalidStructure(format!(
518                                    "Token count exceeds max_tokens ({}",
519                                    limits.max_tokens
520                                )));
521                            }
522                            tokens.push(Token::Image { src, alt });
523                            token_count += 1;
524                        }
525                        // No src → skip
526                    }
527                    _ => {
528                        // Other empty elements are ignored
529                    }
530                }
531            }
532            Ok(Event::CData(e)) => {
533                // CDATA content is treated as raw text
534                if skip_depth == 0 {
535                    let text = reader
536                        .decoder()
537                        .decode(&e)
538                        .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
539                        .to_string();
540
541                    let normalized = normalize_whitespace_limited(&text, limits.max_text_bytes);
542                    if !normalized.is_empty() {
543                        if let Some(level) = pending_heading_close.take() {
544                            if token_count >= limits.max_tokens {
545                                return Err(TokenizeError::InvalidStructure(format!(
546                                    "Token count exceeds max_tokens ({}",
547                                    limits.max_tokens
548                                )));
549                            }
550                            tokens.push(Token::Heading(level));
551                            token_count += 1;
552                        }
553                        if token_count >= limits.max_tokens {
554                            return Err(TokenizeError::InvalidStructure(format!(
555                                "Token count exceeds max_tokens ({}",
556                                limits.max_tokens
557                            )));
558                        }
559                        tokens.push(Token::Text(normalized));
560                        token_count += 1;
561                    }
562                }
563            }
564            Ok(Event::GeneralRef(e)) => {
565                // Entity references: &amp; &lt; &gt; &quot; &apos; &#8220; etc.
566                if skip_depth > 0 {
567                    continue;
568                }
569
570                let entity_name = e
571                    .decode()
572                    .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?;
573                // Reconstruct the entity string and unescape it
574                let entity_str = format!("&{};", entity_name);
575                let resolved = unescape(&entity_str)
576                    .map_err(|e| TokenizeError::ParseError(format!("Unescape error: {:?}", e)))?
577                    .to_string();
578
579                if !resolved.is_empty() {
580                    // Flush any pending heading close
581                    if let Some(level) = pending_heading_close.take() {
582                        if token_count >= limits.max_tokens {
583                            return Err(TokenizeError::InvalidStructure(format!(
584                                "Token count exceeds max_tokens ({}",
585                                limits.max_tokens
586                            )));
587                        }
588                        tokens.push(Token::Heading(level));
589                        token_count += 1;
590                    }
591                    // Append to the last Text token if possible, otherwise create new one
592                    if let Some(Token::Text(ref mut last_text)) = tokens.last_mut() {
593                        if last_text.len() + resolved.len() <= limits.max_text_bytes {
594                            last_text.push_str(&resolved);
595                        }
596                    } else {
597                        if token_count >= limits.max_tokens {
598                            return Err(TokenizeError::InvalidStructure(format!(
599                                "Token count exceeds max_tokens ({}",
600                                limits.max_tokens
601                            )));
602                        }
603                        tokens.push(Token::Text(resolved));
604                        token_count += 1;
605                    }
606                }
607            }
608            Ok(Event::Comment(_)) => {
609                // Comments are ignored
610            }
611            Ok(Event::Decl(_)) => {
612                // XML declaration is ignored
613            }
614            Ok(Event::PI(_)) => {
615                // Processing instructions are ignored
616            }
617            Ok(Event::DocType(_)) => {
618                // DOCTYPE is ignored
619            }
620            Ok(Event::Eof) => break,
621            Err(e) => {
622                return Err(TokenizeError::ParseError(format!("XML error: {:?}", e)));
623            }
624        }
625        buf.clear();
626    }
627
628    // Close any unclosed formatting tags
629    while let Some(element) = element_stack.pop() {
630        match element {
631            ElementType::Emphasis => {
632                if token_count >= limits.max_tokens {
633                    return Err(TokenizeError::InvalidStructure(format!(
634                        "Token count exceeds max_tokens ({}",
635                        limits.max_tokens
636                    )));
637                }
638                tokens.push(Token::Emphasis(false));
639                token_count += 1;
640            }
641            ElementType::Strong => {
642                if token_count >= limits.max_tokens {
643                    return Err(TokenizeError::InvalidStructure(format!(
644                        "Token count exceeds max_tokens ({}",
645                        limits.max_tokens
646                    )));
647                }
648                tokens.push(Token::Strong(false));
649                token_count += 1;
650            }
651            ElementType::UnorderedList | ElementType::OrderedList => {
652                if token_count >= limits.max_tokens {
653                    return Err(TokenizeError::InvalidStructure(format!(
654                        "Token count exceeds max_tokens ({}",
655                        limits.max_tokens
656                    )));
657                }
658                tokens.push(Token::ListEnd);
659                token_count += 1;
660            }
661            ElementType::ListItem => {
662                if token_count >= limits.max_tokens {
663                    return Err(TokenizeError::InvalidStructure(format!(
664                        "Token count exceeds max_tokens ({}",
665                        limits.max_tokens
666                    )));
667                }
668                tokens.push(Token::ListItemEnd);
669                token_count += 1;
670            }
671            ElementType::Link => {
672                if token_count >= limits.max_tokens {
673                    return Err(TokenizeError::InvalidStructure(format!(
674                        "Token count exceeds max_tokens ({}",
675                        limits.max_tokens
676                    )));
677                }
678                tokens.push(Token::LinkEnd);
679                token_count += 1;
680            }
681            ElementType::Paragraph | ElementType::Heading(_) => {
682                // These already handled via pending_paragraph_break
683            }
684            _ => {}
685        }
686    }
687
688    // Flush any pending heading close
689    if let Some(level) = pending_heading_close {
690        if token_count >= limits.max_tokens {
691            return Err(TokenizeError::InvalidStructure(format!(
692                "Token count exceeds max_tokens ({}",
693                limits.max_tokens
694            )));
695        }
696        tokens.push(Token::Heading(level));
697    }
698
699    Ok(tokens)
700}
701
/// Normalize whitespace with a byte limit.
///
/// Collapses runs of Unicode whitespace into a single ASCII space, trims
/// leading and trailing whitespace, and truncates so the result never
/// exceeds `max_bytes` bytes. Truncation is UTF-8 safe: a multi-byte
/// character that would cross the limit is dropped entirely.
fn normalize_whitespace_limited(text: &str, max_bytes: usize) -> String {
    let mut result = String::with_capacity(text.len().min(max_bytes));
    let mut prev_was_space = true; // Start true to trim leading whitespace

    for ch in text.chars() {
        if ch.is_whitespace() {
            if !prev_was_space {
                // A separator space is exactly one byte.
                if result.len() + 1 > max_bytes {
                    break;
                }
                result.push(' ');
                prev_was_space = true;
            }
        } else {
            // FIX: the original checked `result.len() >= max_bytes` BEFORE the
            // push, so a multi-byte character could overshoot max_bytes by up
            // to 3 bytes. Account for the character's encoded width instead.
            if result.len() + ch.len_utf8() > max_bytes {
                break;
            }
            result.push(ch);
            prev_was_space = false;
        }
    }

    // Trim trailing space if present
    if result.ends_with(' ') {
        result.pop();
    }

    result
}
729
/// Types of elements we track in the stack
///
/// `Eq` is derived alongside `PartialEq` for consistency with the public
/// `Token` and `TokenizeError` types in this module.
#[derive(Clone, Debug, PartialEq, Eq)]
enum ElementType {
    /// `<p>` or `<div>` block container
    Paragraph,
    /// `<h1>`..`<h6>`, carrying the heading level
    Heading(u8),
    /// `<em>` or `<i>`
    Emphasis,
    /// `<strong>` or `<b>`
    Strong,
    /// `<span>` inline container (emits no tokens of its own)
    Span,
    /// `<ul>`
    UnorderedList,
    /// `<ol>`
    OrderedList,
    /// `<li>`
    ListItem,
    /// `<a>` with an `href` attribute
    Link,
    /// Any other element, tracked only to keep the stack balanced
    Generic,
}
744
/// Check if an element should be skipped entirely (with its children)
///
/// These are non-content containers (scripting, styling, page chrome)
/// whose text must never reach the token stream. Matching is exact and
/// case-sensitive, as XHTML element names are lowercase.
fn should_skip_element(name: &str) -> bool {
    const SKIPPED: &[&str] = &[
        "script", "style", "head", "nav", "header", "footer", "aside", "noscript",
    ];
    SKIPPED.contains(&name)
}
752
/// Normalize whitespace in text content
/// Collapses multiple spaces/newlines and trims ends
fn normalize_whitespace(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    // `split_whitespace` uses the same `char::is_whitespace` predicate as
    // a manual scan, skips empty runs, and implicitly trims both ends;
    // rejoining the words with single spaces yields the collapsed form.
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized
}
778
779/// Extract a named attribute value from a start/empty element
780fn get_attribute(e: &BytesStart, reader: &Reader<&[u8]>, name: &str) -> Option<String> {
781    for attr in e.attributes().flatten() {
782        let key = reader.decoder().decode(attr.key.as_ref()).ok()?;
783        if key.as_ref() == name {
784            let value = reader.decoder().decode(&attr.value).ok()?;
785            return Some(value.to_string());
786        }
787    }
788    None
789}
790
791/// Decode element name from bytes
792fn decode_name(name: &[u8], reader: &Reader<&[u8]>) -> Result<String, TokenizeError> {
793    reader
794        .decoder()
795        .decode(name)
796        .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))
797        .map(|s| s.to_string())
798}
799
/// Scratch buffer pool for tokenization to minimize allocations.
///
/// Pre-allocated buffers that can be reused across tokenization operations
/// to avoid repeated allocations in hot paths. This is critical for embedded
/// environments where allocation overhead must be minimized.
///
/// Construct with [`TokenizeScratch::new`], [`TokenizeScratch::embedded`],
/// or [`TokenizeScratch::desktop`]; [`TokenizeScratch::clear`] resets the
/// buffers between uses while keeping their capacity.
#[derive(Debug)]
pub struct TokenizeScratch {
    /// Buffer for XML parsing (event payloads from quick_xml's reader)
    pub xml_buf: Vec<u8>,
    /// Buffer for text accumulation and normalization
    pub text_buf: String,
    /// Buffer for element tracking (private, accessed via methods)
    element_buf: Vec<ElementType>,
}
814
815impl TokenizeScratch {
816    /// Create scratch buffers with specified capacities.
817    ///
818    /// # Arguments
819    /// * `xml_capacity` - Initial capacity for XML parsing buffer
820    /// * `text_capacity` - Initial capacity for text accumulation buffer
821    ///
822    /// # Example
823    /// ```
824    /// use epub_stream::tokenizer::TokenizeScratch;
825    ///
826    /// let scratch = TokenizeScratch::new(4096, 8192);
827    /// ```
828    pub fn new(xml_capacity: usize, text_capacity: usize) -> Self {
829        Self {
830            xml_buf: Vec::with_capacity(xml_capacity),
831            text_buf: String::with_capacity(text_capacity),
832            element_buf: Vec::with_capacity(64),
833        }
834    }
835
836    /// Create buffers suitable for embedded use (small, bounded).
837    ///
838    /// Uses conservative buffer sizes suitable for constrained environments:
839    /// - XML buffer: 4KB
840    /// - Text buffer: 8KB
841    /// - Element stack: 64 elements
842    pub fn embedded() -> Self {
843        Self::new(4096, 8192)
844    }
845
846    /// Create buffers for desktop use (larger, more performant).
847    ///
848    /// Uses larger buffer sizes for better performance on desktop:
849    /// - XML buffer: 32KB
850    /// - Text buffer: 64KB
851    /// - Element stack: 64 elements
852    pub fn desktop() -> Self {
853        Self::new(32768, 65536)
854    }
855
856    /// Clear all buffers without deallocating.
857    ///
858    /// This preserves the allocated capacity while resetting the length to zero,
859    /// allowing the buffers to be reused for subsequent tokenization operations
860    /// without requiring new allocations.
861    pub fn clear(&mut self) {
862        self.xml_buf.clear();
863        self.text_buf.clear();
864        self.element_buf.clear();
865    }
866
867    /// Ensure text buffer has at least the given capacity.
868    ///
869    /// # Arguments
870    /// * `min_cap` - Minimum capacity required
871    ///
872    /// If the current capacity is less than `min_cap`, the buffer will be
873    /// expanded to at least that capacity.
874    pub fn ensure_text_capacity(&mut self, min_cap: usize) {
875        if self.text_buf.capacity() < min_cap {
876            self.text_buf.reserve(min_cap - self.text_buf.capacity());
877        }
878    }
879}
880
881/// Tokenize XHTML into a caller-provided Vec to avoid allocation.
882///
883/// This variant allows the caller to provide their own `Vec<Token>` buffer,
884/// avoiding the need to allocate a new Vec for each tokenization call.
885/// The existing contents of the Vec are cleared before tokenization begins.
886///
887/// # Allocation behavior
888/// - **Zero token buffer allocations**: Reuses caller's Vec
889/// - Internal buffers: Allocates temporarily during parsing (use `tokenize_html_with_scratch` to avoid)
890/// - Caller buffer required: Yes (tokens_out)
891/// - **Preferred for embedded**: Avoids Vec allocation
892///
893/// # Arguments
894/// * `html` - The XHTML content to tokenize
895/// * `tokens_out` - Output buffer for tokens (will be cleared first)
896///
897/// # Returns
898/// * `Ok(())` on success
899/// * `Err(TokenizeError)` on parse failure
900///
901/// # Example
902/// ```
903/// use epub_stream::tokenizer::{tokenize_html_into, Token};
904///
905/// let html = "<p>Hello <em>world</em></p>";
906/// let mut tokens: Vec<Token> = Vec::with_capacity(0);
907/// tokenize_html_into(html, &mut tokens).unwrap();
908/// ```
909pub fn tokenize_html_into(html: &str, tokens_out: &mut Vec<Token>) -> Result<(), TokenizeError> {
910    let mut scratch = TokenizeScratch::embedded();
911    tokenize_html_with_scratch(html, tokens_out, &mut scratch)
912}
913
/// Tokenize XHTML with caller-provided scratch buffers for minimal allocations.
///
/// This is the most memory-efficient tokenization API. It uses pre-allocated
/// scratch buffers to minimize allocations during parsing. All internal buffers
/// are provided by the caller and can be reused across multiple tokenization calls.
///
/// # Allocation behavior
/// - **Minimal allocations**: Reuses all scratch buffers
/// - Still allocates for Token::Text content (owned String required by Token type)
/// - Caller buffer required: Yes (tokens_out, scratch)
/// - **Preferred for embedded**: Minimal allocation path
///
/// # Arguments
/// * `html` - The XHTML content to tokenize
/// * `tokens_out` - Output buffer for tokens (will be cleared first)
/// * `scratch` - Scratch buffers for internal parsing state
///
/// # Returns
/// * `Ok(())` on success
/// * `Err(TokenizeError)` on parse failure
///
/// # Example
/// ```
/// use epub_stream::tokenizer::{tokenize_html_with_scratch, TokenizeScratch, Token};
///
/// let html = "<p>Hello <em>world</em></p>";
/// let mut tokens: Vec<Token> = Vec::with_capacity(0);
/// let mut scratch = TokenizeScratch::embedded();
/// tokenize_html_with_scratch(html, &mut tokens, &mut scratch).unwrap();
///
/// // Reuse scratch for subsequent calls
/// let html2 = "<p>Second paragraph</p>";
/// tokenize_html_with_scratch(html2, &mut tokens, &mut scratch).unwrap();
/// ```
pub fn tokenize_html_with_scratch(
    html: &str,
    tokens_out: &mut Vec<Token>,
    scratch: &mut TokenizeScratch,
) -> Result<(), TokenizeError> {
    tokens_out.clear();
    scratch.clear();

    let mut reader = Reader::from_str(html);
    // Keep raw whitespace from the parser; normalization is done per text
    // node via `normalize_whitespace` below.
    reader.config_mut().trim_text(false);
    // Self-closing tags (e.g. <br/>) arrive as Event::Empty, handled below.
    reader.config_mut().expand_empty_elements = false;

    // Track if we're inside a tag that should be skipped (script, style, head)
    let mut skip_depth: usize = 0;
    // Track if we need a paragraph break after current block element
    let mut pending_paragraph_break: bool = false;
    // Track if we need a heading close after text content
    let mut pending_heading_close: Option<u8> = None;

    loop {
        match reader.read_event_into(&mut scratch.xml_buf) {
            Ok(Event::Start(e)) => {
                let name = decode_name(e.name().as_ref(), &reader)?;

                // Check if we should skip this element and its children
                if should_skip_element(&name) {
                    skip_depth += 1;
                    continue;
                }

                // If skipping, don't process anything
                if skip_depth > 0 {
                    continue;
                }

                // Flush any pending paragraph break from previous block
                // (the is_empty check suppresses a leading break)
                if pending_paragraph_break && !tokens_out.is_empty() {
                    tokens_out.push(Token::ParagraphBreak);
                    pending_paragraph_break = false;
                }

                // Flush any pending heading close
                if let Some(level) = pending_heading_close.take() {
                    tokens_out.push(Token::Heading(level));
                    pending_paragraph_break = true;
                }

                match name.as_str() {
                    "p" | "div" => {
                        scratch.element_buf.push(ElementType::Paragraph);
                    }
                    "span" => {
                        scratch.element_buf.push(ElementType::Span);
                    }
                    // NOTE(review): two-letter h-names with a non-digit or
                    // out-of-range second char (e.g. "hr") fall through
                    // without pushing onto element_buf, so a matching end
                    // tag would pop an unrelated element — verify intent.
                    h if h.starts_with('h') && h.len() == 2 => {
                        if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
                            if (1..=6).contains(&level) {
                                scratch.element_buf.push(ElementType::Heading(level as u8));
                                pending_heading_close = Some(level as u8);
                            }
                        }
                    }
                    "em" | "i" => {
                        scratch.element_buf.push(ElementType::Emphasis);
                        tokens_out.push(Token::Emphasis(true));
                    }
                    "strong" | "b" => {
                        scratch.element_buf.push(ElementType::Strong);
                        tokens_out.push(Token::Strong(true));
                    }
                    "ul" => {
                        scratch.element_buf.push(ElementType::UnorderedList);
                        tokens_out.push(Token::ListStart(false));
                    }
                    "ol" => {
                        scratch.element_buf.push(ElementType::OrderedList);
                        tokens_out.push(Token::ListStart(true));
                    }
                    "li" => {
                        scratch.element_buf.push(ElementType::ListItem);
                        tokens_out.push(Token::ListItemStart);
                    }
                    "a" => {
                        if let Some(href) = get_attribute(&e, &reader, "href") {
                            scratch.element_buf.push(ElementType::Link);
                            tokens_out.push(Token::LinkStart(href));
                        } else {
                            // No href — treat as generic container
                            scratch.element_buf.push(ElementType::Generic);
                        }
                    }
                    "img" => {
                        // <img> as a start tag (non-self-closing)
                        if let Some(src) = get_attribute(&e, &reader, "src") {
                            let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
                            tokens_out.push(Token::Image { src, alt });
                        }
                        scratch.element_buf.push(ElementType::Generic);
                    }
                    _ => {
                        // Unknown element, treat as generic container
                        scratch.element_buf.push(ElementType::Generic);
                    }
                }
            }
            Ok(Event::Text(e)) => {
                // Skip text if we're inside a script/style/head block
                if skip_depth > 0 {
                    continue;
                }

                let text = e
                    .decode()
                    .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
                    .to_string();

                // Normalize whitespace: collapse multiple spaces/newlines
                let normalized = normalize_whitespace(&text);

                // Whitespace-only text nodes produce no token at all.
                if !normalized.is_empty() {
                    // Flush any pending heading close
                    if let Some(level) = pending_heading_close.take() {
                        tokens_out.push(Token::Heading(level));
                    }
                    tokens_out.push(Token::Text(normalized));
                }
            }
            Ok(Event::End(e)) => {
                let name = decode_name(e.name().as_ref(), &reader)?;

                // Check if we're ending a skip element
                // (saturating: tolerate a stray closing tag without underflow)
                if should_skip_element(&name) {
                    skip_depth = skip_depth.saturating_sub(1);
                    continue;
                }

                // If skipping, don't process end tags
                if skip_depth > 0 {
                    continue;
                }

                // Pop the element from stack and emit appropriate close token
                if let Some(element) = scratch.element_buf.pop() {
                    match element {
                        ElementType::Paragraph => {
                            pending_paragraph_break = true;
                        }
                        ElementType::Heading(_level) => {
                            // Heading already emitted on start, just mark for paragraph break
                            pending_paragraph_break = true;
                            // Clear any pending close since we already handled it
                            // (an empty heading therefore emits no token at all)
                            pending_heading_close = None;
                        }
                        ElementType::Emphasis => {
                            tokens_out.push(Token::Emphasis(false));
                        }
                        ElementType::Strong => {
                            tokens_out.push(Token::Strong(false));
                        }
                        ElementType::UnorderedList | ElementType::OrderedList => {
                            tokens_out.push(Token::ListEnd);
                        }
                        ElementType::ListItem => {
                            tokens_out.push(Token::ListItemEnd);
                        }
                        ElementType::Link => {
                            tokens_out.push(Token::LinkEnd);
                        }
                        ElementType::Span | ElementType::Generic => {
                            // No tokens needed for these
                        }
                    }
                }
            }
            Ok(Event::Empty(e)) => {
                // Self-closing elements have no children, so skip elements
                // here need no depth tracking — only the current depth check.
                let name = decode_name(e.name().as_ref(), &reader)?;

                // Skip empty elements inside script/style blocks
                if skip_depth > 0 {
                    continue;
                }

                // Flush any pending paragraph break
                if pending_paragraph_break && !tokens_out.is_empty() {
                    tokens_out.push(Token::ParagraphBreak);
                    pending_paragraph_break = false;
                }

                // Flush any pending heading close
                if let Some(level) = pending_heading_close.take() {
                    tokens_out.push(Token::Heading(level));
                    pending_paragraph_break = true;
                }

                match name.as_str() {
                    "br" => {
                        tokens_out.push(Token::LineBreak);
                    }
                    "p" | "div" => {
                        // Empty paragraph still creates a paragraph break
                        pending_paragraph_break = true;
                    }
                    h if h.starts_with('h') && h.len() == 2 => {
                        if let Some(level) = h.chars().nth(1).and_then(|c| c.to_digit(10)) {
                            if (1..=6).contains(&level) {
                                // Empty heading - just emit the heading token
                                tokens_out.push(Token::Heading(level as u8));
                                pending_paragraph_break = true;
                            }
                        }
                    }
                    "img" => {
                        if let Some(src) = get_attribute(&e, &reader, "src") {
                            let alt = get_attribute(&e, &reader, "alt").unwrap_or_default();
                            tokens_out.push(Token::Image { src, alt });
                        }
                        // No src → skip
                    }
                    _ => {
                        // Other empty elements are ignored
                    }
                }
            }
            Ok(Event::CData(e)) => {
                // CDATA content is treated as raw text
                if skip_depth == 0 {
                    let text = reader
                        .decoder()
                        .decode(&e)
                        .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?
                        .to_string();

                    let normalized = normalize_whitespace(&text);
                    if !normalized.is_empty() {
                        // Flush any pending heading close
                        if let Some(level) = pending_heading_close.take() {
                            tokens_out.push(Token::Heading(level));
                        }
                        tokens_out.push(Token::Text(normalized));
                    }
                }
            }
            Ok(Event::GeneralRef(e)) => {
                // Entity references: &amp; &lt; &gt; &quot; &apos; &#8220; etc.
                if skip_depth > 0 {
                    continue;
                }

                let entity_name = e
                    .decode()
                    .map_err(|e| TokenizeError::ParseError(format!("Decode error: {:?}", e)))?;
                // Reconstruct the entity string and unescape it
                let entity_str = format!("&{};", entity_name);
                let resolved = unescape(&entity_str)
                    .map_err(|e| TokenizeError::ParseError(format!("Unescape error: {:?}", e)))?
                    .to_string();

                if !resolved.is_empty() {
                    // Flush any pending heading close
                    if let Some(level) = pending_heading_close.take() {
                        tokens_out.push(Token::Heading(level));
                    }
                    // Append to the last Text token if possible, otherwise create new one
                    // (entities arrive as separate events but belong to the
                    // surrounding text run)
                    if let Some(Token::Text(ref mut last_text)) = tokens_out.last_mut() {
                        last_text.push_str(&resolved);
                    } else {
                        tokens_out.push(Token::Text(resolved));
                    }
                }
            }
            Ok(Event::Comment(_)) => {
                // Comments are ignored
            }
            Ok(Event::Decl(_)) => {
                // XML declaration is ignored
            }
            Ok(Event::PI(_)) => {
                // Processing instructions are ignored
            }
            Ok(Event::DocType(_)) => {
                // DOCTYPE is ignored
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(TokenizeError::ParseError(format!("XML error: {:?}", e)));
            }
        }
        // Release this event's bytes; capacity is retained for the next one.
        scratch.xml_buf.clear();
    }

    // Flush any remaining pending paragraph break
    // (intentionally a no-op: a trailing ParagraphBreak is never emitted)
    if pending_paragraph_break && !tokens_out.is_empty() {
        // Don't add trailing paragraph break
        // tokens_out.push(Token::ParagraphBreak);
    }

    // Close any unclosed formatting tags
    while let Some(element) = scratch.element_buf.pop() {
        match element {
            ElementType::Emphasis => {
                tokens_out.push(Token::Emphasis(false));
            }
            ElementType::Strong => {
                tokens_out.push(Token::Strong(false));
            }
            ElementType::UnorderedList | ElementType::OrderedList => {
                tokens_out.push(Token::ListEnd);
            }
            ElementType::ListItem => {
                tokens_out.push(Token::ListItemEnd);
            }
            ElementType::Link => {
                tokens_out.push(Token::LinkEnd);
            }
            ElementType::Paragraph | ElementType::Heading(_) => {
                // These already handled via pending_paragraph_break
            }
            _ => {}
        }
    }

    // Flush any pending heading close
    if let Some(level) = pending_heading_close {
        tokens_out.push(Token::Heading(level));
    }

    Ok(())
}
1276
1277#[cfg(test)]
1278mod tests {
1279    use super::*;
1280    use alloc::vec;
1281
    #[test]
    fn test_tokenize_simple_paragraph() {
        let html = "<p>Hello world</p>";
        let tokens = tokenize_html(html).unwrap();
        // No trailing ParagraphBreak — only emitted between blocks
        assert_eq!(tokens, vec![Token::Text("Hello world".to_string())]);
    }

    // Inline emphasis/strong produce paired start/end tokens around the
    // emphasized text.
    #[test]
    fn test_tokenize_emphasis() {
        let html = "<p>This is <em>italic</em> and <strong>bold</strong> text.</p>";
        let tokens = tokenize_html(html).unwrap();
        // normalize_whitespace strips leading/trailing spaces from text nodes
        assert_eq!(
            tokens,
            vec![
                Token::Text("This is".to_string()),
                Token::Emphasis(true),
                Token::Text("italic".to_string()),
                Token::Emphasis(false),
                Token::Text("and".to_string()),
                Token::Strong(true),
                Token::Text("bold".to_string()),
                Token::Strong(false),
                Token::Text("text.".to_string()),
            ]
        );
    }

    // A Heading token precedes its text; consecutive blocks are separated
    // by a single ParagraphBreak.
    #[test]
    fn test_tokenize_heading_and_paragraphs() {
        let html = "<h1>Chapter Title</h1><p>First paragraph.</p><p>Second paragraph.</p>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Heading(1),
                Token::Text("Chapter Title".to_string()),
                Token::ParagraphBreak,
                Token::Text("First paragraph.".to_string()),
                Token::ParagraphBreak,
                Token::Text("Second paragraph.".to_string()),
            ]
        );
    }

    // Each heading level is preserved in its token.
    #[test]
    fn test_tokenize_multiple_headings() {
        let html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Heading(1),
                Token::Text("Title".to_string()),
                Token::ParagraphBreak,
                Token::Heading(2),
                Token::Text("Subtitle".to_string()),
                Token::ParagraphBreak,
                Token::Heading(3),
                Token::Text("Section".to_string()),
            ]
        );
    }

    #[test]
    fn test_tokenize_line_break() {
        // XHTML requires self-closing <br/>
        let html = "<p>Line one<br/>Line two</p>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Text("Line one".to_string()),
                Token::LineBreak,
                Token::Text("Line two".to_string()),
            ]
        );
    }
1364
    // Close tokens mirror nesting order: inner emphasis closes before the
    // outer strong.
    #[test]
    fn test_tokenize_nested_formatting() {
        let html = "<p>Text with <strong>bold and <em>italic nested</em></strong>.</p>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Text("Text with".to_string()),
                Token::Strong(true),
                Token::Text("bold and".to_string()),
                Token::Emphasis(true),
                Token::Text("italic nested".to_string()),
                Token::Emphasis(false),
                Token::Strong(false),
                Token::Text(".".to_string()),
            ]
        );
    }

    // script/style content (and the elements themselves) never reach the
    // token stream.
    #[test]
    fn test_strip_script_and_style() {
        let html = r#"<p>Visible text</p><script>alert("hidden");</script><style>.hidden{}</style><p>More visible</p>"#;
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Text("Visible text".to_string()),
                Token::ParagraphBreak,
                Token::Text("More visible".to_string()),
            ]
        );
    }

    // Everything inside <head> (including nested children) is dropped.
    #[test]
    fn test_strip_head() {
        let html = "<head><title>Title</title></head><body><p>Content</p></body>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(tokens, vec![Token::Text("Content".to_string())]);
    }

    // Runs of spaces/newlines collapse to single spaces; ends are trimmed.
    #[test]
    fn test_whitespace_normalization() {
        let html = "<p>  Multiple   spaces   and\n\nnewlines  </p>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![Token::Text("Multiple spaces and newlines".to_string())]
        );
    }

    #[test]
    fn test_empty_paragraph() {
        let html = "<p></p>";
        let tokens = tokenize_html(html).unwrap();
        // Empty paragraph with nothing following produces no tokens
        assert_eq!(tokens, vec![]);
    }

    #[test]
    fn test_unclosed_tags_rejected() {
        // quick-xml is a strict XML parser; mismatched tags are errors
        let html = "<p>Text with <em>italic</p>";
        assert!(tokenize_html(html).is_err());
    }
1433
    // <b>/<i> are aliases of <strong>/<em> and map to the same tokens.
    #[test]
    fn test_b_and_i_tags() {
        let html = "<p><b>bold</b> and <i>italic</i></p>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Strong(true),
                Token::Text("bold".to_string()),
                Token::Strong(false),
                Token::Text("and".to_string()),
                Token::Emphasis(true),
                Token::Text("italic".to_string()),
                Token::Emphasis(false),
            ]
        );
    }

    // <div> is treated as a paragraph-level block, like <p>.
    #[test]
    fn test_div_handling() {
        let html = "<div>Block content</div><div>Another block</div>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Text("Block content".to_string()),
                Token::ParagraphBreak,
                Token::Text("Another block".to_string()),
            ]
        );
    }

    // <span> is transparent: its text passes through with no tokens of
    // its own (each text node stays a separate Token::Text).
    #[test]
    fn test_span_handling() {
        let html = "<p>Text with <span>spanned</span> content</p>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Text("Text with".to_string()),
                Token::Text("spanned".to_string()),
                Token::Text("content".to_string()),
            ]
        );
    }

    // End-to-end example covering inline formatting, headings, and
    // paragraph separation across multiple blocks.
    #[test]
    fn test_example_from_spec() {
        let html = r#"<p>This is <em>italic</em> and <strong>bold</strong> text.</p>
<h1>Chapter Title</h1>
<p>Another paragraph.</p>"#;

        let tokens = tokenize_html(html).unwrap();

        let expected = vec![
            Token::Text("This is".to_string()),
            Token::Emphasis(true),
            Token::Text("italic".to_string()),
            Token::Emphasis(false),
            Token::Text("and".to_string()),
            Token::Strong(true),
            Token::Text("bold".to_string()),
            Token::Strong(false),
            Token::Text("text.".to_string()),
            Token::ParagraphBreak,
            Token::Heading(1),
            Token::Text("Chapter Title".to_string()),
            Token::ParagraphBreak,
            Token::Text("Another paragraph.".to_string()),
        ];

        assert_eq!(tokens, expected);
    }
1510
    // All six heading levels h1-h6 are recognized and carry their level.
    #[test]
    fn test_all_heading_levels() {
        let html = "<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>";
        let tokens = tokenize_html(html).unwrap();

        assert_eq!(
            tokens,
            vec![
                Token::Heading(1),
                Token::Text("H1".to_string()),
                Token::ParagraphBreak,
                Token::Heading(2),
                Token::Text("H2".to_string()),
                Token::ParagraphBreak,
                Token::Heading(3),
                Token::Text("H3".to_string()),
                Token::ParagraphBreak,
                Token::Heading(4),
                Token::Text("H4".to_string()),
                Token::ParagraphBreak,
                Token::Heading(5),
                Token::Text("H5".to_string()),
                Token::ParagraphBreak,
                Token::Heading(6),
                Token::Text("H6".to_string()),
            ]
        );
    }
1539
1540    // ---- List tests ----
1541
1542    #[test]
1543    fn test_simple_unordered_list() {
1544        let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
1545        let tokens = tokenize_html(html).unwrap();
1546
1547        assert_eq!(
1548            tokens,
1549            vec![
1550                Token::ListStart(false),
1551                Token::ListItemStart,
1552                Token::Text("Item 1".to_string()),
1553                Token::ListItemEnd,
1554                Token::ListItemStart,
1555                Token::Text("Item 2".to_string()),
1556                Token::ListItemEnd,
1557                Token::ListEnd,
1558            ]
1559        );
1560    }
1561
1562    #[test]
1563    fn test_simple_ordered_list() {
1564        let html = "<ol><li>First</li><li>Second</li></ol>";
1565        let tokens = tokenize_html(html).unwrap();
1566
1567        assert_eq!(
1568            tokens,
1569            vec![
1570                Token::ListStart(true),
1571                Token::ListItemStart,
1572                Token::Text("First".to_string()),
1573                Token::ListItemEnd,
1574                Token::ListItemStart,
1575                Token::Text("Second".to_string()),
1576                Token::ListItemEnd,
1577                Token::ListEnd,
1578            ]
1579        );
1580    }
1581
1582    #[test]
1583    fn test_nested_lists() {
1584        let html = "<ul><li>A<ul><li>B</li></ul></li></ul>";
1585        let tokens = tokenize_html(html).unwrap();
1586
1587        assert_eq!(
1588            tokens,
1589            vec![
1590                Token::ListStart(false),
1591                Token::ListItemStart,
1592                Token::Text("A".to_string()),
1593                Token::ListStart(false),
1594                Token::ListItemStart,
1595                Token::Text("B".to_string()),
1596                Token::ListItemEnd,
1597                Token::ListEnd,
1598                Token::ListItemEnd,
1599                Token::ListEnd,
1600            ]
1601        );
1602    }
1603
1604    #[test]
1605    fn test_list_with_formatted_text() {
1606        let html = "<ul><li><em>italic</em> item</li></ul>";
1607        let tokens = tokenize_html(html).unwrap();
1608
1609        assert_eq!(
1610            tokens,
1611            vec![
1612                Token::ListStart(false),
1613                Token::ListItemStart,
1614                Token::Emphasis(true),
1615                Token::Text("italic".to_string()),
1616                Token::Emphasis(false),
1617                Token::Text("item".to_string()),
1618                Token::ListItemEnd,
1619                Token::ListEnd,
1620            ]
1621        );
1622    }
1623
1624    #[test]
1625    fn test_empty_list() {
1626        let html = "<ul></ul>";
1627        let tokens = tokenize_html(html).unwrap();
1628
1629        assert_eq!(tokens, vec![Token::ListStart(false), Token::ListEnd]);
1630    }
1631
1632    // ---- Link tests ----
1633
1634    #[test]
1635    fn test_link_with_href() {
1636        let html = r#"<a href="ch2.xhtml">Next Chapter</a>"#;
1637        let tokens = tokenize_html(html).unwrap();
1638
1639        assert_eq!(
1640            tokens,
1641            vec![
1642                Token::LinkStart("ch2.xhtml".to_string()),
1643                Token::Text("Next Chapter".to_string()),
1644                Token::LinkEnd,
1645            ]
1646        );
1647    }
1648
1649    #[test]
1650    fn test_link_without_href() {
1651        let html = "<a>No link</a>";
1652        let tokens = tokenize_html(html).unwrap();
1653
1654        // No href → treated as generic container, no LinkStart/LinkEnd
1655        assert_eq!(tokens, vec![Token::Text("No link".to_string())]);
1656    }
1657
1658    #[test]
1659    fn test_link_with_formatted_text() {
1660        let html = r#"<a href="x.html"><em>italic link</em></a>"#;
1661        let tokens = tokenize_html(html).unwrap();
1662
1663        assert_eq!(
1664            tokens,
1665            vec![
1666                Token::LinkStart("x.html".to_string()),
1667                Token::Emphasis(true),
1668                Token::Text("italic link".to_string()),
1669                Token::Emphasis(false),
1670                Token::LinkEnd,
1671            ]
1672        );
1673    }
1674
1675    // ---- Image tests ----
1676
1677    #[test]
1678    fn test_image_self_closing() {
1679        let html = r#"<img src="cover.jpg" alt="Cover Image"/>"#;
1680        let tokens = tokenize_html(html).unwrap();
1681
1682        assert_eq!(
1683            tokens,
1684            vec![Token::Image {
1685                src: "cover.jpg".to_string(),
1686                alt: "Cover Image".to_string(),
1687            }]
1688        );
1689    }
1690
1691    #[test]
1692    fn test_image_without_alt() {
1693        let html = r#"<img src="photo.jpg"/>"#;
1694        let tokens = tokenize_html(html).unwrap();
1695
1696        assert_eq!(
1697            tokens,
1698            vec![Token::Image {
1699                src: "photo.jpg".to_string(),
1700                alt: String::with_capacity(0),
1701            }]
1702        );
1703    }
1704
1705    #[test]
1706    fn test_image_without_src() {
1707        let html = r#"<img alt="Missing"/>"#;
1708        let tokens = tokenize_html(html).unwrap();
1709
1710        // No src → image is skipped
1711        assert_eq!(tokens, vec![]);
1712    }
1713
1714    #[test]
1715    fn test_image_as_start_tag() {
1716        // Some XHTML may have <img></img> instead of self-closing
1717        let html = r#"<img src="pic.png" alt="Pic"></img>"#;
1718        let tokens = tokenize_html(html).unwrap();
1719
1720        assert_eq!(
1721            tokens,
1722            vec![Token::Image {
1723                src: "pic.png".to_string(),
1724                alt: "Pic".to_string(),
1725            }]
1726        );
1727    }
1728
1729    // ---- Mixed content tests ----
1730
1731    #[test]
1732    fn test_mixed_content() {
1733        let html = r#"<p>See <a href="ch2.xhtml">chapter 2</a> for details.</p><ul><li>Item with <img src="icon.png" alt="icon"/></li></ul>"#;
1734        let tokens = tokenize_html(html).unwrap();
1735
1736        assert_eq!(
1737            tokens,
1738            vec![
1739                Token::Text("See".to_string()),
1740                Token::LinkStart("ch2.xhtml".to_string()),
1741                Token::Text("chapter 2".to_string()),
1742                Token::LinkEnd,
1743                Token::Text("for details.".to_string()),
1744                Token::ParagraphBreak,
1745                Token::ListStart(false),
1746                Token::ListItemStart,
1747                Token::Text("Item with".to_string()),
1748                Token::Image {
1749                    src: "icon.png".to_string(),
1750                    alt: "icon".to_string(),
1751                },
1752                Token::ListItemEnd,
1753                Token::ListEnd,
1754            ]
1755        );
1756    }
1757
1758    // ---- Edge case tests for existing features ----
1759
1760    #[test]
1761    fn test_deeply_nested_formatting() {
1762        let html = "<em><strong><em>triple</em></strong></em>";
1763        let tokens = tokenize_html(html).unwrap();
1764
1765        assert_eq!(
1766            tokens,
1767            vec![
1768                Token::Emphasis(true),
1769                Token::Strong(true),
1770                Token::Emphasis(true),
1771                Token::Text("triple".to_string()),
1772                Token::Emphasis(false),
1773                Token::Strong(false),
1774                Token::Emphasis(false),
1775            ]
1776        );
1777    }
1778
1779    #[test]
1780    fn test_consecutive_headings_same_level() {
1781        let html = "<h2>First</h2><h2>Second</h2>";
1782        let tokens = tokenize_html(html).unwrap();
1783
1784        assert_eq!(
1785            tokens,
1786            vec![
1787                Token::Heading(2),
1788                Token::Text("First".to_string()),
1789                Token::ParagraphBreak,
1790                Token::Heading(2),
1791                Token::Text("Second".to_string()),
1792            ]
1793        );
1794    }
1795
1796    #[test]
1797    fn test_multiple_consecutive_line_breaks() {
1798        let html = "<p>A<br/><br/><br/>B</p>";
1799        let tokens = tokenize_html(html).unwrap();
1800
1801        assert_eq!(
1802            tokens,
1803            vec![
1804                Token::Text("A".to_string()),
1805                Token::LineBreak,
1806                Token::LineBreak,
1807                Token::LineBreak,
1808                Token::Text("B".to_string()),
1809            ]
1810        );
1811    }
1812
1813    #[test]
1814    fn test_cdata_sections() {
1815        let html = "<p><![CDATA[Some raw content]]></p>";
1816        let tokens = tokenize_html(html).unwrap();
1817
1818        assert_eq!(tokens, vec![Token::Text("Some raw content".to_string())]);
1819    }
1820
1821    #[test]
1822    fn test_whitespace_only_text_nodes() {
1823        // Whitespace between block elements should be normalized away
1824        let html = "<p>First</p>   \n   <p>Second</p>";
1825        let tokens = tokenize_html(html).unwrap();
1826
1827        assert_eq!(
1828            tokens,
1829            vec![
1830                Token::Text("First".to_string()),
1831                Token::ParagraphBreak,
1832                Token::Text("Second".to_string()),
1833            ]
1834        );
1835    }
1836
1837    #[test]
1838    fn test_very_long_text() {
1839        // Performance sanity check with long text
1840        let long_word = "word ".repeat(10_000);
1841        let html = format!("<p>{}</p>", long_word);
1842        let tokens = tokenize_html(&html).unwrap();
1843
1844        assert_eq!(tokens.len(), 1);
1845        if let Token::Text(ref text) = tokens[0] {
1846            assert!(text.len() > 40_000);
1847        } else {
1848            panic!("Expected Token::Text");
1849        }
1850    }
1851
1852    #[test]
1853    fn test_mixed_block_and_inline() {
1854        let html = "<div><p><em>text</em></p></div>";
1855        let tokens = tokenize_html(html).unwrap();
1856
1857        assert_eq!(
1858            tokens,
1859            vec![
1860                Token::Emphasis(true),
1861                Token::Text("text".to_string()),
1862                Token::Emphasis(false),
1863            ]
1864        );
1865    }
1866
1867    #[test]
1868    fn test_block_inside_inline_no_crash() {
1869        // Malformed HTML: block element inside inline — should not crash
1870        let html = "<em><p>text</p></em>";
1871        // We just verify it doesn't panic; token output may vary
1872        let result = tokenize_html(html);
1873        assert!(result.is_ok());
1874        let tokens = result.unwrap();
1875        // Should at least contain the text
1876        assert!(tokens
1877            .iter()
1878            .any(|t| matches!(t, Token::Text(s) if s == "text")));
1879    }
1880
1881    #[test]
1882    fn test_link_in_paragraph() {
1883        let html = r#"<p>Click <a href="http://example.com">here</a> to continue.</p>"#;
1884        let tokens = tokenize_html(html).unwrap();
1885
1886        assert_eq!(
1887            tokens,
1888            vec![
1889                Token::Text("Click".to_string()),
1890                Token::LinkStart("http://example.com".to_string()),
1891                Token::Text("here".to_string()),
1892                Token::LinkEnd,
1893                Token::Text("to continue.".to_string()),
1894            ]
1895        );
1896    }
1897
1898    #[test]
1899    fn test_image_in_paragraph() {
1900        let html = r#"<p>An image: <img src="fig1.png" alt="Figure 1"/></p>"#;
1901        let tokens = tokenize_html(html).unwrap();
1902
1903        assert_eq!(
1904            tokens,
1905            vec![
1906                Token::Text("An image:".to_string()),
1907                Token::Image {
1908                    src: "fig1.png".to_string(),
1909                    alt: "Figure 1".to_string(),
1910                },
1911            ]
1912        );
1913    }
1914
1915    #[test]
1916    fn test_list_after_paragraph() {
1917        let html = "<p>Intro:</p><ul><li>One</li><li>Two</li></ul>";
1918        let tokens = tokenize_html(html).unwrap();
1919
1920        assert_eq!(
1921            tokens,
1922            vec![
1923                Token::Text("Intro:".to_string()),
1924                Token::ParagraphBreak,
1925                Token::ListStart(false),
1926                Token::ListItemStart,
1927                Token::Text("One".to_string()),
1928                Token::ListItemEnd,
1929                Token::ListItemStart,
1930                Token::Text("Two".to_string()),
1931                Token::ListItemEnd,
1932                Token::ListEnd,
1933            ]
1934        );
1935    }
1936
1937    #[test]
1938    fn test_ordered_list_with_links() {
1939        let html = r#"<ol><li><a href="ch1.html">Chapter 1</a></li><li><a href="ch2.html">Chapter 2</a></li></ol>"#;
1940        let tokens = tokenize_html(html).unwrap();
1941
1942        assert_eq!(
1943            tokens,
1944            vec![
1945                Token::ListStart(true),
1946                Token::ListItemStart,
1947                Token::LinkStart("ch1.html".to_string()),
1948                Token::Text("Chapter 1".to_string()),
1949                Token::LinkEnd,
1950                Token::ListItemEnd,
1951                Token::ListItemStart,
1952                Token::LinkStart("ch2.html".to_string()),
1953                Token::Text("Chapter 2".to_string()),
1954                Token::LinkEnd,
1955                Token::ListItemEnd,
1956                Token::ListEnd,
1957            ]
1958        );
1959    }
1960
1961    #[test]
1962    fn test_tokenize_html_with_matches_tokenize_html() {
1963        let html = "<h1>T</h1><p>Hello <em>world</em><br/>line 2</p>";
1964        let baseline = tokenize_html(html).unwrap();
1965        let mut streamed = Vec::with_capacity(0);
1966        tokenize_html_with(html, |token| streamed.push(token)).unwrap();
1967        assert_eq!(baseline, streamed);
1968    }
1969}