Skip to main content

bookforge_epub/
reader.rs

1use std::{
2    collections::HashMap,
3    fs::File,
4    io::Read,
5    path::{Path, PathBuf},
6};
7
8use bookforge_core::{
9    BookforgeError, Result,
10    ir::{
11        Block, BlockId, BlockKind, Book, BookFormat, BookId, DomPath, InlineMark, Metadata,
12        ProtectedSpan, ProtectedSpanKind, Resource, Section, SectionId, SpineItem, TextRun,
13    },
14    marker::{is_marker_token, strip_marker_tokens},
15};
16use quick_xml::{
17    Reader,
18    events::{BytesStart, Event},
19};
20use zip::ZipArchive;
21
/// Structural summary of an EPUB package, as produced by `inspect_epub`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EpubInspection {
    /// Title from the package metadata, when one was found.
    pub title: Option<String>,
    /// Number of entries in the package spine.
    pub spine_count: usize,
    /// Number of entries in the package manifest.
    pub manifest_count: usize,
    /// Manifest entries whose media type passes `is_xhtml_media_type`.
    pub xhtml_count: usize,
    /// Whether any manifest entry satisfies `is_nav_item`.
    pub has_nav: bool,
    /// Whether the manifest contains an `application/x-dtbncx+xml` entry.
    pub has_toc: bool,
    /// Manifest entries whose media type is not XHTML.
    pub resource_count: usize,
    /// Archive path of the package document named in
    /// `META-INF/container.xml`.
    pub package_path: String,
    /// Spine entries that resolve to an XHTML manifest entry and could be
    /// read from the archive.
    pub xhtml_spine_count: usize,
}
34
/// How much of the document's visible text the reader actually captures
/// into translatable blocks. Text that lives outside the recognized block
/// elements (for example directly inside `<div>`) is parsed over but never
/// extracted, and ships untranslated; this metric makes that visible
/// before any tokens are spent.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct TextCoverage {
    /// Sum of `total_chars` over every entry in `files`.
    pub total_chars: usize,
    /// Sum of `captured_chars` over every entry in `files`.
    pub captured_chars: usize,
    /// One entry per XHTML spine document, in spine order.
    pub files: Vec<FileTextCoverage>,
}
46
/// Text coverage figures for a single XHTML spine document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileTextCoverage {
    /// Archive path of the document these figures describe.
    pub href: String,
    /// Non-whitespace characters of reader-visible text in the document.
    pub total_chars: usize,
    /// Non-whitespace characters that ended up inside extracted blocks.
    pub captured_chars: usize,
}
53
54impl TextCoverage {
55    pub fn percent(&self) -> f64 {
56        coverage_percent(self.captured_chars, self.total_chars)
57    }
58}
59
60impl FileTextCoverage {
61    pub fn percent(&self) -> f64 {
62        coverage_percent(self.captured_chars, self.total_chars)
63    }
64
65    pub fn uncaptured_chars(&self) -> usize {
66        self.total_chars.saturating_sub(self.captured_chars)
67    }
68}
69
/// Express `captured` as a percentage of `total`.
///
/// A zero `total` counts as fully covered, and `captured` is clamped to
/// `total` so the result never exceeds `100.0`.
fn coverage_percent(captured: usize, total: usize) -> f64 {
    match total {
        0 => 100.0,
        _ => {
            let bounded = captured.min(total);
            (bounded as f64 / total as f64) * 100.0
        }
    }
}
77
/// Parsed contents of the OPF package document, as built by
/// `parse_package`.
#[derive(Debug, Clone)]
struct PackageDocument {
    /// Title, creators and language collected from the package metadata.
    metadata: Metadata,
    /// Resources declared by manifest `item` elements, in document order.
    manifest: Vec<Resource>,
    /// Reading order declared by spine `itemref` elements.
    spine: Vec<SpineItem>,
    /// Value of the `toc` attribute on `<spine>`, when present.
    toc_id: Option<String>,
}
85
86pub fn read_epub(path: &Path) -> Result<Book> {
87    let mut archive = open_archive(path)?;
88    validate_mimetype(&mut archive)?;
89    let package_path = locate_package(&mut archive)?;
90    let package_xml = read_archive_text(&mut archive, &package_path)?;
91    let mut package = parse_package(&package_xml)?;
92    let package_dir = package_base_dir(&package_path);
93    let manifest_by_id = package
94        .manifest
95        .iter()
96        .map(|item| (item.id.as_str(), item))
97        .collect::<HashMap<_, _>>();
98    let mut sections = Vec::new();
99    let mut blocks = Vec::new();
100
101    let package_section_id = SectionId("sec_metadata_opf".to_string());
102    let mut package_blocks =
103        extract_package_title_blocks(&package_xml, &package_section_id, blocks.len())?;
104    if !package_blocks.is_empty() {
105        let block_ids = package_blocks
106            .iter()
107            .map(|block| block.id.clone())
108            .collect::<Vec<_>>();
109        sections.push(Section {
110            id: package_section_id,
111            href: package_path.clone(),
112            spine_index: 0,
113            title: Some("OPF metadata".to_string()),
114            heading_level: None,
115            block_ids,
116            prev: None,
117            next: None,
118        });
119        blocks.append(&mut package_blocks);
120    }
121
122    for (toc_index, resource) in package
123        .manifest
124        .iter()
125        .filter(|item| item.media_type == "application/x-dtbncx+xml")
126        .enumerate()
127    {
128        let href = join_epub_path(&package_dir, &resource.href);
129        let ncx = read_archive_text(&mut archive, &href)?;
130        let section_id = SectionId(format!("sec_toc_{toc_index:06}"));
131        let mut toc_blocks = extract_ncx_text_blocks(&ncx, &section_id, blocks.len())?;
132        if toc_blocks.is_empty() {
133            continue;
134        }
135        let block_ids = toc_blocks
136            .iter()
137            .map(|block| block.id.clone())
138            .collect::<Vec<_>>();
139        sections.push(Section {
140            id: section_id,
141            href,
142            spine_index: 0,
143            title: Some("NCX table of contents".to_string()),
144            heading_level: None,
145            block_ids,
146            prev: None,
147            next: None,
148        });
149        blocks.append(&mut toc_blocks);
150    }
151
152    for (spine_index, spine_item) in package.spine.iter_mut().enumerate() {
153        let Some(resource) = manifest_by_id.get(spine_item.idref.as_str()) else {
154            return Err(BookforgeError::InvalidInput(format!(
155                "spine item references missing manifest id '{}'",
156                spine_item.idref
157            )));
158        };
159
160        let href = join_epub_path(&package_dir, &resource.href);
161        spine_item.href = Some(href.clone());
162
163        if !is_xhtml_media_type(&resource.media_type) {
164            continue;
165        }
166
167        let xhtml = read_archive_text(&mut archive, &href)?;
168        let section_id = SectionId(format!("sec_{spine_index:06}"));
169        let mut section_blocks = extract_blocks(&xhtml, &href, &section_id, blocks.len())?;
170        if section_blocks.is_empty() {
171            continue;
172        }
173        let block_ids = section_blocks
174            .iter()
175            .map(|block| block.id.clone())
176            .collect::<Vec<_>>();
177        let (title, heading_level) = first_heading(&section_blocks);
178
179        sections.push(Section {
180            id: section_id,
181            href,
182            spine_index,
183            title,
184            heading_level,
185            block_ids,
186            prev: None,
187            next: None,
188        });
189        blocks.append(&mut section_blocks);
190    }
191
192    link_sections(&mut sections);
193
194    if blocks.is_empty() {
195        return Err(BookforgeError::InvalidInput(
196            "EPUB contains no translatable blocks".to_string(),
197        ));
198    }
199
200    Ok(Book {
201        source_path: Some(path.to_path_buf()),
202        id: BookId(package_path),
203        format: BookFormat::Epub,
204        metadata: package.metadata,
205        manifest: package.manifest,
206        spine: package.spine,
207        sections,
208        blocks,
209    })
210}
211
212pub fn inspect_epub(path: &Path) -> Result<EpubInspection> {
213    let mut archive = open_archive(path)?;
214    validate_mimetype(&mut archive)?;
215
216    let package_path = locate_package(&mut archive)?;
217    let package_xml = read_archive_text(&mut archive, &package_path)?;
218    let package = parse_package(&package_xml)?;
219    let manifest_by_id = package
220        .manifest
221        .iter()
222        .map(|item| (item.id.as_str(), item))
223        .collect::<HashMap<_, _>>();
224
225    let package_dir = package_base_dir(&package_path);
226    let xhtml_count = package
227        .manifest
228        .iter()
229        .filter(|item| is_xhtml_media_type(&item.media_type))
230        .count();
231    let has_nav = package.manifest.iter().any(is_nav_item);
232    let has_toc = package
233        .toc_id
234        .as_deref()
235        .and_then(|toc_id| manifest_by_id.get(toc_id))
236        .is_some_and(|item| item.media_type == "application/x-dtbncx+xml")
237        || package
238            .manifest
239            .iter()
240            .any(|item| item.media_type == "application/x-dtbncx+xml");
241
242    let mut xhtml_spine_count = 0;
243    for item in &package.spine {
244        let Some(resource) = manifest_by_id.get(item.idref.as_str()) else {
245            return Err(BookforgeError::InvalidInput(format!(
246                "spine item references missing manifest id '{}'",
247                item.idref
248            )));
249        };
250
251        if is_xhtml_media_type(&resource.media_type) {
252            let href = join_epub_path(&package_dir, &resource.href);
253            read_archive_text(&mut archive, &href)?;
254            xhtml_spine_count += 1;
255        }
256    }
257
258    Ok(EpubInspection {
259        title: package.metadata.title,
260        spine_count: package.spine.len(),
261        manifest_count: package.manifest.len(),
262        xhtml_count,
263        has_nav,
264        has_toc,
265        resource_count: package
266            .manifest
267            .iter()
268            .filter(|item| !is_xhtml_media_type(&item.media_type))
269            .count(),
270        package_path,
271        xhtml_spine_count,
272    })
273}
274
275/// Measure how much visible body text each XHTML spine document contributes
276/// versus how much the block extractor captures. Counts non-whitespace
277/// characters so block boundaries and indentation do not skew the ratio.
278pub fn text_coverage(path: &Path) -> Result<TextCoverage> {
279    let mut archive = open_archive(path)?;
280    validate_mimetype(&mut archive)?;
281    let package_path = locate_package(&mut archive)?;
282    let package_xml = read_archive_text(&mut archive, &package_path)?;
283    let package = parse_package(&package_xml)?;
284    let package_dir = package_base_dir(&package_path);
285    let manifest_by_id = package
286        .manifest
287        .iter()
288        .map(|item| (item.id.as_str(), item))
289        .collect::<HashMap<_, _>>();
290
291    let mut coverage = TextCoverage::default();
292    for (spine_index, spine_item) in package.spine.iter().enumerate() {
293        let Some(resource) = manifest_by_id.get(spine_item.idref.as_str()) else {
294            return Err(BookforgeError::InvalidInput(format!(
295                "spine item references missing manifest id '{}'",
296                spine_item.idref
297            )));
298        };
299        if !is_xhtml_media_type(&resource.media_type) {
300            continue;
301        }
302
303        let href = join_epub_path(&package_dir, &resource.href);
304        let xhtml = read_archive_text(&mut archive, &href)?;
305        let section_id = SectionId(format!("sec_{spine_index:06}"));
306        let blocks = extract_blocks(&xhtml, &href, &section_id, 0)?;
307        let captured_chars = blocks
308            .iter()
309            .map(|block| non_whitespace_chars(&block_visible_text(block)))
310            .sum::<usize>();
311        let total_chars = visible_body_chars(&xhtml)?;
312
313        coverage.total_chars += total_chars;
314        coverage.captured_chars += captured_chars;
315        coverage.files.push(FileTextCoverage {
316            href,
317            total_chars,
318            captured_chars,
319        });
320    }
321
322    Ok(coverage)
323}
324
325/// Non-whitespace character count of all reader-visible text: everything
326/// inside `<body>` (minus `<script>`/`<style>` content) plus the document
327/// `<title>`, which the extractor also captures and translates. Keeping
328/// the numerator and denominator in sync stops per-file coverage from
329/// exceeding 100% on title-bearing chapters.
330fn visible_body_chars(xhtml: &str) -> Result<usize> {
331    let mut reader = Reader::from_str(xhtml);
332    reader.config_mut().trim_text(false);
333    let mut in_body = false;
334    let mut in_title = false;
335    let mut skip_depth = 0usize;
336    let mut count = 0usize;
337
338    loop {
339        let counting = (in_body || in_title) && skip_depth == 0;
340        match reader.read_event()? {
341            Event::Start(element) => match local_name(element.name().as_ref()) {
342                b"body" => in_body = true,
343                b"title" if !in_body => in_title = true,
344                b"script" | b"style" if in_body => skip_depth += 1,
345                _ => {}
346            },
347            Event::End(element) => match local_name(element.name().as_ref()) {
348                b"body" => in_body = false,
349                b"title" => in_title = false,
350                b"script" | b"style" if skip_depth > 0 => skip_depth -= 1,
351                _ => {}
352            },
353            Event::Text(text) if counting => {
354                let value = text
355                    .html_content()
356                    .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
357                count += non_whitespace_chars(&value);
358            }
359            Event::CData(text) if counting => {
360                let value = text
361                    .decode()
362                    .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
363                count += non_whitespace_chars(&value);
364            }
365            Event::GeneralRef(reference) if counting => {
366                if let Some(value) = resolve_general_ref(&reference)? {
367                    count += non_whitespace_chars(&value);
368                }
369            }
370            Event::Eof => break,
371            _ => {}
372        }
373    }
374
375    Ok(count)
376}
377
/// Number of characters in `text` that are not Unicode whitespace.
fn non_whitespace_chars(text: &str) -> usize {
    text.split_whitespace()
        .map(|word| word.chars().count())
        .sum()
}
381
382fn open_archive(path: &Path) -> Result<ZipArchive<File>> {
383    let file = File::open(path)?;
384    Ok(ZipArchive::new(file)?)
385}
386
387fn validate_mimetype(archive: &mut ZipArchive<File>) -> Result<()> {
388    let mut mimetype = String::new();
389    archive.by_name("mimetype")?.read_to_string(&mut mimetype)?;
390
391    if mimetype.trim() != "application/epub+zip" {
392        return Err(BookforgeError::InvalidInput(
393            "EPUB mimetype must be application/epub+zip".to_string(),
394        ));
395    }
396
397    Ok(())
398}
399
400fn locate_package(archive: &mut ZipArchive<File>) -> Result<String> {
401    let container = read_archive_text(archive, "META-INF/container.xml")?;
402    let mut reader = Reader::from_str(&container);
403    reader.config_mut().trim_text(true);
404
405    loop {
406        match reader.read_event()? {
407            Event::Empty(element) | Event::Start(element)
408                if local_name(element.name().as_ref()) == b"rootfile" =>
409            {
410                if let Some(path) = attr_value(&reader, &element, b"full-path")? {
411                    return Ok(path);
412                }
413            }
414            Event::Eof => break,
415            _ => {}
416        }
417    }
418
419    Err(BookforgeError::InvalidInput(
420        "META-INF/container.xml does not contain a rootfile full-path".to_string(),
421    ))
422}
423
424fn parse_package(xml: &str) -> Result<PackageDocument> {
425    let mut reader = Reader::from_str(xml);
426    reader.config_mut().trim_text(true);
427
428    let mut metadata = Metadata::default();
429    let mut manifest = Vec::new();
430    let mut spine = Vec::new();
431    let mut toc_id = None;
432    let mut current_text_element: Option<Vec<u8>> = None;
433
434    loop {
435        match reader.read_event()? {
436            Event::Start(element) => match local_name(element.name().as_ref()) {
437                b"title" | b"creator" | b"language" => {
438                    current_text_element = Some(local_name(element.name().as_ref()).to_vec());
439                }
440                b"spine" => {
441                    toc_id = attr_value(&reader, &element, b"toc")?;
442                }
443                b"itemref" => {
444                    spine.push(parse_spine_item(&reader, &element)?);
445                }
446                _ => {}
447            },
448            Event::Empty(element) => match local_name(element.name().as_ref()) {
449                b"item" => manifest.push(parse_manifest_item(&reader, &element)?),
450                b"itemref" => spine.push(parse_spine_item(&reader, &element)?),
451                _ => {}
452            },
453            Event::Text(text) => {
454                if let Some(name) = current_text_element.as_deref() {
455                    let value = text
456                        .html_content()
457                        .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?
458                        .trim()
459                        .to_string();
460                    if !value.is_empty() {
461                        match name {
462                            b"title" if metadata.title.is_none() => metadata.title = Some(value),
463                            b"creator" => metadata.creators.push(value),
464                            b"language" if metadata.language.is_none() => {
465                                metadata.language = Some(value)
466                            }
467                            _ => {}
468                        }
469                    }
470                }
471            }
472            Event::End(element)
473                if current_text_element
474                    .as_deref()
475                    .is_some_and(|name| name == local_name(element.name().as_ref())) =>
476            {
477                current_text_element = None;
478            }
479            Event::Eof => break,
480            _ => {}
481        }
482    }
483
484    if manifest.is_empty() {
485        return Err(BookforgeError::InvalidInput(
486            "OPF manifest is empty".to_string(),
487        ));
488    }
489
490    if spine.is_empty() {
491        return Err(BookforgeError::InvalidInput(
492            "OPF spine is empty".to_string(),
493        ));
494    }
495
496    Ok(PackageDocument {
497        metadata,
498        manifest,
499        spine,
500        toc_id,
501    })
502}
503
504fn parse_manifest_item(reader: &Reader<&[u8]>, element: &BytesStart<'_>) -> Result<Resource> {
505    let id = required_attr(reader, element, b"id", "manifest item id")?;
506    let href = required_attr(reader, element, b"href", "manifest item href")?;
507    let media_type = required_attr(reader, element, b"media-type", "manifest item media-type")?;
508
509    Ok(Resource {
510        id,
511        href,
512        media_type,
513        properties: attr_value(reader, element, b"properties")?
514            .map(|value| {
515                value
516                    .split_ascii_whitespace()
517                    .map(ToOwned::to_owned)
518                    .collect()
519            })
520            .unwrap_or_default(),
521    })
522}
523
524fn parse_spine_item(reader: &Reader<&[u8]>, element: &BytesStart<'_>) -> Result<SpineItem> {
525    let idref = required_attr(reader, element, b"idref", "spine item idref")?;
526    let linear = attr_value(reader, element, b"linear")?.is_none_or(|value| value != "no");
527
528    Ok(SpineItem {
529        idref,
530        href: None,
531        linear,
532    })
533}
534
535fn required_attr(
536    reader: &Reader<&[u8]>,
537    element: &BytesStart<'_>,
538    attr_name: &[u8],
539    label: &str,
540) -> Result<String> {
541    attr_value(reader, element, attr_name)?.ok_or_else(|| {
542        BookforgeError::InvalidInput(format!(
543            "missing required {label} attribute '{}'",
544            String::from_utf8_lossy(attr_name)
545        ))
546    })
547}
548
549fn attr_value(
550    reader: &Reader<&[u8]>,
551    element: &BytesStart<'_>,
552    attr_name: &[u8],
553) -> Result<Option<String>> {
554    for attr in element.attributes() {
555        let attr = attr.map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
556        if local_name(attr.key.as_ref()) == attr_name {
557            return Ok(Some(
558                attr.decode_and_unescape_value(reader.decoder())?
559                    .into_owned(),
560            ));
561        }
562    }
563
564    Ok(None)
565}
566
/// One open element on the extractor's element stack.
#[derive(Debug)]
struct ElementFrame {
    /// Local name of the element, as passed to `enter_element`.
    name: Vec<u8>,
    /// NOTE(review): presumably the child-index path from the document
    /// root to this element; it is maintained by `enter_element`, which
    /// is outside this excerpt, so confirm there.
    path: Vec<usize>,
    /// NOTE(review): presumably the number of child elements seen so far,
    /// advanced by `enter_element` / `next_child_path`; confirm there.
    child_count: usize,
    /// NOTE(review): presumably the number of text nodes seen so far
    /// directly under this element; confirm against `handle_text`.
    text_count: usize,
}
574
/// Text being collected for one element by
/// `extract_xml_text_element_blocks`.
struct TextCapture {
    /// Element-stack depth at which the capture started; the capture ends
    /// when an end tag arrives while the stack is this deep again.
    depth: usize,
    /// Path returned by `enter_element` for the captured element; becomes
    /// the block's `DomPath`.
    path: Vec<usize>,
    /// Raw text accumulated so far, before whitespace normalization.
    text: String,
}
580
581fn extract_package_title_blocks(
582    xml: &str,
583    section_id: &SectionId,
584    initial_block_count: usize,
585) -> Result<Vec<Block>> {
586    extract_xml_text_element_blocks(xml, section_id, initial_block_count, |name| {
587        name == b"title"
588    })
589}
590
591fn extract_ncx_text_blocks(
592    xml: &str,
593    section_id: &SectionId,
594    initial_block_count: usize,
595) -> Result<Vec<Block>> {
596    extract_xml_text_element_blocks(xml, section_id, initial_block_count, |name| name == b"text")
597}
598
599fn extract_xml_text_element_blocks(
600    xml: &str,
601    section_id: &SectionId,
602    initial_block_count: usize,
603    should_capture: impl Fn(&[u8]) -> bool,
604) -> Result<Vec<Block>> {
605    let mut reader = Reader::from_str(xml);
606    reader.config_mut().trim_text(false);
607
608    let mut element_stack = Vec::<ElementFrame>::new();
609    let mut active_capture: Option<TextCapture> = None;
610    let mut blocks = Vec::new();
611
612    loop {
613        match reader.read_event()? {
614            Event::Start(element) => {
615                let name = local_name(element.name().as_ref()).to_vec();
616                let path = enter_element(&mut element_stack, &name);
617                if active_capture.is_none() && should_capture(&name) {
618                    active_capture = Some(TextCapture {
619                        depth: element_stack.len(),
620                        path,
621                        text: String::new(),
622                    });
623                }
624            }
625            Event::Empty(_) => {
626                next_child_path(&mut element_stack);
627            }
628            Event::Text(text) => {
629                if let Some(capture) = active_capture.as_mut() {
630                    let value = text
631                        .html_content()
632                        .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
633                    capture.text.push_str(&value);
634                }
635            }
636            Event::CData(text) => {
637                if let Some(capture) = active_capture.as_mut() {
638                    let value = text
639                        .decode()
640                        .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
641                    capture.text.push_str(&value);
642                }
643            }
644            Event::GeneralRef(reference) => {
645                if let Some(capture) = active_capture.as_mut()
646                    && let Some(value) = resolve_general_ref(&reference)?
647                {
648                    capture.text.push_str(&value);
649                }
650            }
651            Event::End(_) => {
652                if active_capture
653                    .as_ref()
654                    .is_some_and(|capture| element_stack.len() == capture.depth)
655                {
656                    let capture = active_capture.take().expect("checked above");
657                    let visible = normalize_space(&capture.text);
658                    if !visible.is_empty() {
659                        blocks.push(build_block(
660                            section_id,
661                            initial_block_count + blocks.len(),
662                            BlockKind::Paragraph,
663                            DomPath(capture.path),
664                            Vec::new(),
665                            Vec::new(),
666                            visible,
667                        ));
668                    }
669                }
670                element_stack.pop();
671            }
672            Event::Eof => break,
673            _ => {}
674        }
675    }
676
677    Ok(blocks)
678}
679
/// Accumulates the text runs and inline markers of one block while its
/// anchor element is open.
#[derive(Debug)]
struct BlockBuilder {
    /// Stack depth of the element this block is anchored to. The block
    /// closes when an End event arrives while the stack is exactly this
    /// deep — name-independent, so nested same-name elements (li > ul >
    /// li, nested blockquotes) stay inside the block as inline markers
    /// instead of ending it early.
    anchor_depth: usize,
    /// Kind handed to `build_block` when the block is finished.
    kind: BlockKind,
    /// Location of the anchor element, handed to `build_block`.
    dom_path: DomPath,
    /// Block ordinal; also embedded in every run id of this block.
    ordinal: usize,
    /// Text and marker-token runs, in document order.
    text_runs: Vec<TextRun>,
    /// One entry per inline element seen inside the block.
    inline_marks: Vec<InlineMark>,
    /// Marker ids of inline elements that are currently open.
    inline_stack: Vec<String>,
    /// Visible text gathered so far, without marker tokens.
    visible_text: String,
    /// Sequence number for the next run id.
    next_run: usize,
    /// Sequence number for the next marker id.
    next_marker: usize,
}
698
699impl BlockBuilder {
700    fn new(anchor_depth: usize, kind: BlockKind, dom_path: DomPath, ordinal: usize) -> Self {
701        Self {
702            anchor_depth,
703            kind,
704            dom_path,
705            ordinal,
706            text_runs: Vec::new(),
707            inline_marks: Vec::new(),
708            inline_stack: Vec::new(),
709            visible_text: String::new(),
710            next_run: 0,
711            next_marker: 0,
712        }
713    }
714
715    fn push_text(&mut self, text: &str) {
716        let Some(mut text) = normalize_text_fragment(text) else {
717            // Whitespace-only fragment (e.g. a resolved &nbsp; entity
718            // reference): it still separates words, so keep one space
719            // between non-empty neighbors instead of dropping the
720            // boundary.
721            if !text.is_empty()
722                && !self.visible_text.is_empty()
723                && !self.visible_text.ends_with(' ')
724            {
725                self.visible_text.push(' ');
726                if let Some(run) = self
727                    .text_runs
728                    .iter_mut()
729                    .rev()
730                    .find(|run| !is_marker_token(&run.text))
731                {
732                    run.text.push(' ');
733                }
734            }
735            return;
736        };
737
738        if self.visible_text.is_empty() {
739            text = text.trim_start().to_string();
740        }
741
742        if text.is_empty() {
743            return;
744        }
745
746        self.visible_text.push_str(&text);
747        self.push_run(text);
748    }
749
750    fn push_inline_start(&mut self, name: &[u8]) {
751        let id = marker_id("m", self.next_marker);
752        self.next_marker += 1;
753        self.inline_marks.push(InlineMark {
754            id: id.clone(),
755            kind: String::from_utf8_lossy(name).into_owned(),
756        });
757        self.inline_stack.push(id.clone());
758        self.push_run(format!("<{id}>"));
759    }
760
761    fn push_inline_empty(&mut self, name: &[u8]) {
762        let id = marker_id("r", self.next_marker);
763        self.next_marker += 1;
764        self.inline_marks.push(InlineMark {
765            id: id.clone(),
766            kind: String::from_utf8_lossy(name).into_owned(),
767        });
768        self.push_run(format!("<{id}/>"));
769    }
770
771    fn push_inline_end(&mut self) {
772        if let Some(id) = self.inline_stack.pop() {
773            self.push_run(format!("</{id}>"));
774        }
775    }
776
777    fn finish(mut self, section_id: &SectionId) -> Option<Block> {
778        self.trim_trailing_text();
779        let visible_text = normalize_space(&self.visible_text);
780        if visible_text.is_empty() {
781            return None;
782        }
783
784        Some(build_block(
785            section_id,
786            self.ordinal,
787            self.kind,
788            self.dom_path,
789            self.text_runs,
790            self.inline_marks,
791            visible_text,
792        ))
793    }
794
795    fn push_run(&mut self, text: String) {
796        self.text_runs.push(TextRun {
797            id: format!("r{:06}_{:03}", self.ordinal, self.next_run),
798            text,
799        });
800        self.next_run += 1;
801    }
802
803    fn trim_trailing_text(&mut self) {
804        if let Some(run) = self
805            .text_runs
806            .iter_mut()
807            .rev()
808            .find(|run| !is_marker_token(&run.text))
809        {
810            run.text = run.text.trim_end().to_string();
811        }
812
813        self.text_runs.retain(|run| !run.text.is_empty());
814    }
815}
816
/// Walk an XHTML document and collect its translatable blocks.
///
/// At most one block is open at a time. A start tag opens a block when no
/// block is active and `block_kind` recognizes the element; any element
/// that starts while a block is active is recorded as an inline marker
/// instead. The block is finished when an end tag arrives while the
/// element stack is back at the block's anchor depth, and it is kept only
/// if `BlockBuilder::finish` yields a block.
///
/// Text, CDATA and resolved entity references are all routed through
/// `handle_text`. NOTE(review): `handle_text` is defined outside this
/// excerpt; from the comments here it appears to append to the active
/// block or start a "lazy" block for loose text, with the final flag
/// saying whether the fragment consumes a text-node index — confirm there.
///
/// Block ordinals continue from `initial_block_count`. `_href` is
/// currently unused.
fn extract_blocks(
    xhtml: &str,
    _href: &str,
    section_id: &SectionId,
    initial_block_count: usize,
) -> Result<Vec<Block>> {
    let mut reader = Reader::from_str(xhtml);
    reader.config_mut().trim_text(false);

    let mut element_stack = Vec::<ElementFrame>::new();
    let mut active_block: Option<BlockBuilder> = None;
    let mut blocks = Vec::new();
    // Depth of never-translate ancestors (script/style/head/svg/...).
    // While positive, loose text is not captured into lazy blocks.
    let mut suppress_depth = 0usize;

    loop {
        match reader.read_event()? {
            Event::Start(element) => {
                let name = local_name(element.name().as_ref()).to_vec();
                // Push the frame first so `element_stack.len()` below is
                // the depth of this element itself.
                let path = enter_element(&mut element_stack, &name);
                if never_translate_element(&name) {
                    suppress_depth += 1;
                }

                if active_block.is_none()
                    && let Some(kind) = block_kind(&name, &element)?
                {
                    active_block = Some(BlockBuilder::new(
                        element_stack.len(),
                        kind,
                        DomPath(path),
                        initial_block_count + blocks.len(),
                    ));
                } else if let Some(block) = active_block.as_mut() {
                    block.push_inline_start(&name);
                }
            }
            Event::Empty(element) => {
                let name = local_name(element.name().as_ref()).to_vec();
                // Sibling bookkeeping must advance even though self-closing
                // block elements (<p/>, <td/>) carry no text and therefore
                // produce no block — emitting one would send an empty
                // source to the model and invite hallucinated output.
                next_child_path(&mut element_stack);

                if let Some(block) = active_block.as_mut() {
                    block.push_inline_empty(&name);
                }
            }
            Event::Text(text) => {
                let value = text
                    .html_content()
                    .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
                handle_text(
                    &value,
                    &mut active_block,
                    &mut element_stack,
                    &mut blocks,
                    section_id,
                    initial_block_count,
                    suppress_depth > 0,
                    true,
                );
            }
            Event::CData(text) => {
                let value = text
                    .decode()
                    .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
                handle_text(
                    &value,
                    &mut active_block,
                    &mut element_stack,
                    &mut blocks,
                    section_id,
                    initial_block_count,
                    suppress_depth > 0,
                    true,
                );
            }
            // quick-xml surfaces entity references (&nbsp; &mdash; ...) as
            // separate events rather than resolving them inside Text.
            // Resolve numeric and HTML5 named references; the resolved
            // text joins the active block or may anchor a lazy one, but
            // never consumes a stray text-node index — the writer counts
            // Text events only, and indices must stay aligned.
            Event::GeneralRef(reference) => {
                if let Some(value) = resolve_general_ref(&reference)? {
                    handle_text(
                        &value,
                        &mut active_block,
                        &mut element_stack,
                        &mut blocks,
                        section_id,
                        initial_block_count,
                        suppress_depth > 0,
                        false,
                    );
                }
            }
            Event::End(_) => {
                // Compare depths before popping: the closing element's own
                // frame is still on the stack at this point.
                let should_finish = active_block
                    .as_ref()
                    .is_some_and(|block| element_stack.len() == block.anchor_depth);

                if should_finish {
                    let block = active_block.take().expect("checked above");
                    if let Some(block) = block.finish(section_id) {
                        blocks.push(block);
                    }
                } else if let Some(block) = active_block.as_mut() {
                    block.push_inline_end();
                }

                // Leaving a never-translate element lifts one level of
                // suppression.
                if element_stack
                    .pop()
                    .is_some_and(|frame| never_translate_element(&frame.name))
                {
                    suppress_depth = suppress_depth.saturating_sub(1);
                }
            }
            Event::Eof => break,
            _ => {}
        }
    }

    Ok(blocks)
}
945
946/// Resolve a general entity reference to its replacement text: numeric
947/// character references and the HTML5 named set. Unknown entities are
948/// dropped with a warning rather than failing the whole book.
949fn resolve_general_ref(reference: &quick_xml::events::BytesRef<'_>) -> Result<Option<String>> {
950    if let Some(ch) = reference
951        .resolve_char_ref()
952        .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?
953    {
954        return Ok(Some(ch.to_string()));
955    }
956    let name = reference
957        .decode()
958        .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
959    let resolved = quick_xml::escape::resolve_html5_entity(&name).map(ToString::to_string);
960    if resolved.is_none() {
961        tracing::warn!(entity = %name, "dropping unresolvable entity reference");
962    }
963    Ok(resolved)
964}
965
966/// Route a decoded text fragment: into the active block if there is one,
967/// otherwise — for non-whitespace text the block whitelist missed — start
968/// a block anchored on the enclosing element (text-bearing `<div>`,
969/// `<dt>`, `<dd>`, ...) or, when earlier element children make whole-
970/// element patching unsafe, record a standalone text-node block the
971/// writer can address directly. Without this fallback such text silently
972/// shipped untranslated.
973#[allow(clippy::too_many_arguments)]
974fn handle_text(
975    value: &str,
976    active_block: &mut Option<BlockBuilder>,
977    element_stack: &mut [ElementFrame],
978    blocks: &mut Vec<Block>,
979    section_id: &SectionId,
980    initial_block_count: usize,
981    suppressed: bool,
982    allow_stray: bool,
983) {
984    if let Some(block) = active_block.as_mut() {
985        block.push_text(value);
986        return;
987    }
988    if suppressed || value.trim().is_empty() {
989        return;
990    }
991    let depth = element_stack.len();
992    let Some(frame) = element_stack.last_mut() else {
993        return;
994    };
995    if frame.child_count == 0 && anchors_text_block(&frame.name) {
996        let mut block = BlockBuilder::new(
997            depth,
998            BlockKind::Paragraph,
999            DomPath(frame.path.clone()),
1000            initial_block_count + blocks.len(),
1001        );
1002        block.push_text(value);
1003        *active_block = Some(block);
1004        return;
1005    }
1006    if !allow_stray {
1007        return;
1008    }
1009    // Stray text node: prior element siblings (or a wrapper element)
1010    // make whole-element patching unsafe, so the text node itself
1011    // becomes the patch target. The writer counts non-whitespace text
1012    // nodes per frame with the same rule.
1013    let mut path = frame.path.clone();
1014    path.push(bookforge_core::ir::TEXT_NODE_PATH_BASE + frame.text_count);
1015    frame.text_count += 1;
1016    let visible = normalize_space(value);
1017    if visible.is_empty() {
1018        return;
1019    }
1020    blocks.push(build_block(
1021        section_id,
1022        initial_block_count + blocks.len(),
1023        BlockKind::Paragraph,
1024        DomPath(path),
1025        Vec::new(),
1026        Vec::new(),
1027        visible,
1028    ));
1029}
1030
/// Reports whether `name` is an element whose text content must never be
/// translated: `script`, `style`, `svg` or `math` (exact, case-sensitive).
fn never_translate_element(name: &[u8]) -> bool {
    const UNTRANSLATABLE: &[&[u8]] = &[b"script", b"style", b"svg", b"math"];
    UNTRANSLATABLE.contains(&name)
}
1035
/// Reports whether a lazily-started text block may be anchored on the
/// element called `name`.
///
/// Every element qualifies except the structural wrappers listed below.
/// Anchoring on a wrapper would swallow all of its nested block elements
/// into one giant marker-laden block, so text sitting directly inside a
/// wrapper is handled as stray text nodes instead.
fn anchors_text_block(name: &[u8]) -> bool {
    const STRUCTURAL_WRAPPERS: &[&[u8]] = &[
        b"body",
        b"html",
        b"section",
        b"article",
        b"main",
        b"nav",
        b"head",
        b"header",
        b"footer",
        b"aside",
        b"figure",
        b"ul",
        b"ol",
        b"dl",
        b"table",
        b"thead",
        b"tbody",
        b"tfoot",
        b"colgroup",
    ];
    !STRUCTURAL_WRAPPERS.contains(&name)
}
1064
1065fn enter_element(stack: &mut Vec<ElementFrame>, name: &[u8]) -> Vec<usize> {
1066    let path = next_child_path(stack);
1067    stack.push(ElementFrame {
1068        name: name.to_vec(),
1069        path: path.clone(),
1070        child_count: 0,
1071        text_count: 0,
1072    });
1073    path
1074}
1075
1076fn next_child_path(stack: &mut [ElementFrame]) -> Vec<usize> {
1077    let Some(parent) = stack.last_mut() else {
1078        return vec![0];
1079    };
1080    let child_index = parent.child_count;
1081    parent.child_count += 1;
1082    let mut path = parent.path.clone();
1083    path.push(child_index);
1084    path
1085}
1086
1087fn block_kind(name: &[u8], element: &BytesStart<'_>) -> Result<Option<BlockKind>> {
1088    Ok(match name {
1089        b"h1" => Some(BlockKind::Heading(1)),
1090        b"h2" => Some(BlockKind::Heading(2)),
1091        b"h3" => Some(BlockKind::Heading(3)),
1092        b"h4" => Some(BlockKind::Heading(4)),
1093        b"h5" => Some(BlockKind::Heading(5)),
1094        b"h6" => Some(BlockKind::Heading(6)),
1095        b"p" => Some(BlockKind::Paragraph),
1096        b"li" => Some(BlockKind::ListItem),
1097        b"blockquote" => Some(BlockKind::Quote),
1098        b"td" | b"th" => Some(BlockKind::TableCell),
1099        b"tr" => Some(BlockKind::TableRow),
1100        b"figcaption" | b"caption" => Some(BlockKind::Caption),
1101        b"pre" | b"code" => Some(BlockKind::Code),
1102        b"aside" if has_epub_type(element, b"footnote")? => Some(BlockKind::Footnote),
1103        _ => None,
1104    })
1105}
1106
1107fn has_epub_type(element: &BytesStart<'_>, expected: &[u8]) -> Result<bool> {
1108    for attr in element.attributes() {
1109        let attr = attr.map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
1110        if local_name(attr.key.as_ref()) == b"type" {
1111            let value = attr.unescape_value()?.into_owned();
1112            return Ok(value
1113                .split_ascii_whitespace()
1114                .any(|item| item.as_bytes() == expected));
1115        }
1116    }
1117    Ok(false)
1118}
1119
1120fn build_block(
1121    section_id: &SectionId,
1122    ordinal: usize,
1123    kind: BlockKind,
1124    dom_path: DomPath,
1125    text_runs: Vec<TextRun>,
1126    inline_marks: Vec<InlineMark>,
1127    visible_text: String,
1128) -> Block {
1129    let text_runs = if text_runs.is_empty() {
1130        vec![TextRun {
1131            id: format!("r{ordinal:06}_000"),
1132            text: visible_text.clone(),
1133        }]
1134    } else {
1135        text_runs
1136    };
1137    let protected_spans = detect_protected_spans(&visible_text);
1138
1139    Block {
1140        id: BlockId(format!("b_{ordinal:06}")),
1141        section_id: section_id.clone(),
1142        kind,
1143        dom_path,
1144        text_runs,
1145        inline_marks,
1146        protected_spans,
1147        token_estimate: estimate_tokens(&visible_text),
1148    }
1149}
1150
1151fn first_heading(blocks: &[Block]) -> (Option<String>, Option<u8>) {
1152    blocks
1153        .iter()
1154        .find_map(|block| match block.kind {
1155            BlockKind::Heading(level) => Some((Some(block_visible_text(block)), Some(level))),
1156            _ => None,
1157        })
1158        .unwrap_or((None, None))
1159}
1160
1161fn link_sections(sections: &mut [Section]) {
1162    let ids = sections
1163        .iter()
1164        .map(|section| section.id.clone())
1165        .collect::<Vec<_>>();
1166
1167    for (index, section) in sections.iter_mut().enumerate() {
1168        section.prev = index.checked_sub(1).and_then(|prev| ids.get(prev).cloned());
1169        section.next = ids.get(index + 1).cloned();
1170    }
1171}
1172
/// Collapses every run of whitespace in `text` to a single ASCII space and
/// drops leading and trailing whitespace.
fn normalize_space(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
1176
1177fn normalize_text_fragment(text: &str) -> Option<String> {
1178    if text.trim().is_empty() {
1179        return None;
1180    }
1181
1182    let mut normalized = normalize_space(text);
1183    if text.chars().next().is_some_and(char::is_whitespace) {
1184        normalized.insert(0, ' ');
1185    }
1186    if text.chars().last().is_some_and(char::is_whitespace) {
1187        normalized.push(' ');
1188    }
1189    Some(normalized)
1190}
1191
1192fn block_visible_text(block: &Block) -> String {
1193    let marked = block
1194        .text_runs
1195        .iter()
1196        .map(|run| run.text.as_str())
1197        .collect::<Vec<_>>()
1198        .join("");
1199    normalize_space(&strip_marker_tokens(&marked))
1200}
1201
/// Builds a marker id from `prefix` and a zero-based ordinal; the number in
/// the id is one-based (`marker_id("m", 0)` is `"m1"`).
fn marker_id(prefix: &str, marker_ordinal: usize) -> String {
    let one_based = marker_ordinal + 1;
    let mut id = String::from(prefix);
    id.push_str(&one_based.to_string());
    id
}
1205
/// Rough token estimate for `text`: four tokens per three
/// whitespace-separated words, rounded up, and never less than one.
fn estimate_tokens(text: &str) -> usize {
    let scaled = text.split_whitespace().count().saturating_mul(4);
    // Ceiling division written so it cannot overflow for a saturated value.
    let rounded_up = scaled / 3 + usize::from(scaled % 3 != 0);
    std::cmp::max(rounded_up, 1)
}
1210
1211fn detect_protected_spans(text: &str) -> Vec<ProtectedSpan> {
1212    let mut spans = text
1213        .split_whitespace()
1214        .filter_map(|raw| {
1215            let value = trim_token(raw);
1216            protected_span_kind(value).map(|kind| ProtectedSpan {
1217                kind,
1218                text: value.to_string(),
1219            })
1220        })
1221        .collect::<Vec<_>>();
1222    spans.sort_by(|left, right| left.text.cmp(&right.text));
1223    spans.dedup_by(|left, right| left.kind == right.kind && left.text == right.text);
1224    spans
1225}
1226
1227fn protected_span_kind(value: &str) -> Option<ProtectedSpanKind> {
1228    if value.is_empty() {
1229        None
1230    } else if value.starts_with("http://") || value.starts_with("https://") {
1231        Some(ProtectedSpanKind::Url)
1232    } else if value.starts_with('#') && value.len() > 1 {
1233        Some(ProtectedSpanKind::InternalAnchor)
1234    } else if looks_like_email(value) {
1235        Some(ProtectedSpanKind::Email)
1236    } else if looks_like_citation(value) {
1237        Some(ProtectedSpanKind::Citation)
1238    } else if looks_like_protected_number(value) {
1239        Some(ProtectedSpanKind::Number)
1240    } else if looks_like_filename(value) {
1241        Some(ProtectedSpanKind::Filename)
1242    } else {
1243        None
1244    }
1245}
1246
/// Strips surrounding punctuation from a raw whitespace-delimited token.
///
/// Sentence punctuation, parentheses and straight quotes are removed from
/// both ends first. Square brackets are then removed as well, unless the
/// token is a bracketed citation of the form `[@...]`, which is returned
/// with its brackets intact.
fn trim_token(raw: &str) -> &str {
    const EDGE_PUNCTUATION: &[char] = &[',', ';', ':', '.', '!', '?', '(', ')', '"', '\''];
    const BRACKETS: &[char] = &['[', ']'];

    let core = raw.trim_matches(EDGE_PUNCTUATION);
    let is_bracketed_citation = core.starts_with("[@") && core.ends_with(']');
    if is_bracketed_citation {
        return core;
    }
    core.trim_matches(BRACKETS)
}
1260
/// Heuristic email check: a non-empty part before the first `@`, and after
/// it a part that contains a dot but neither starts nor ends with one.
fn looks_like_email(value: &str) -> bool {
    match value.split_once('@') {
        None => false,
        Some((local, domain)) => {
            let domain_is_dotted =
                domain.contains('.') && !(domain.starts_with('.') || domain.ends_with('.'));
            !local.is_empty() && domain_is_dotted
        }
    }
}
1267
/// Heuristic citation-key check: either `@key` or `[@key]`, where `key` is
/// at least one byte long.
fn looks_like_citation(value: &str) -> bool {
    let bare_key = value.strip_prefix('@');
    let bracketed_key = value
        .strip_prefix("[@")
        .and_then(|rest| rest.strip_suffix(']'));

    bare_key.is_some_and(|key| !key.is_empty()) || bracketed_key.is_some_and(|key| !key.is_empty())
}
1272
/// Heuristic filename check: the part after the last dot is one of a fixed
/// set of common extensions (ASCII case-insensitive), and the non-empty part
/// before it consists only of ASCII letters, digits, `_`, `-`, `/` and `.`.
fn looks_like_filename(value: &str) -> bool {
    const COMMON_EXTENSIONS: &[&str] = &[
        "azw", "css", "csv", "epub", "gif", "htm", "html", "jpeg", "jpg", "js", "json", "md",
        "mobi", "ncx", "opf", "pdf", "png", "svg", "txt", "xhtml", "xml", "zip",
    ];

    let (stem, ext) = match value.rsplit_once('.') {
        Some(parts) => parts,
        None => return false,
    };
    if stem.is_empty() {
        return false;
    }

    let known_extension = COMMON_EXTENSIONS
        .iter()
        .any(|candidate| ext.eq_ignore_ascii_case(candidate));
    // Every permitted character is ASCII, so a byte-wise scan is enough:
    // any byte of a multi-byte character fails the test.
    let plain_stem = stem
        .bytes()
        .all(|byte| byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b'/' | b'.'));

    known_extension && plain_stem
}
1288
/// Heuristic check for numeric tokens that must be preserved verbatim.
///
/// A token qualifies when it is either:
///
/// * an ordinal: one or more ASCII digits followed by `st`, `nd`, `rd` or
///   `th` (`1st`, `21st`, `100th`); or
/// * a numeric expression with at least two ASCII digits made up solely of
///   digits and `. , : ; / - + % $ € £` (`12-14`, `1,000.50`, `10:30`).
///
/// A lone digit such as `1` is deliberately not protected. The ordinal rule
/// requires everything before the suffix to be digits, so it applies to
/// ordinals of any length and does not fire on ordinary words that merely
/// contain a digit and happen to end in one of the suffixes.
fn looks_like_protected_number(value: &str) -> bool {
    const ORDINAL_SUFFIXES: [&str; 4] = ["st", "nd", "rd", "th"];

    let digit_count = value.chars().filter(char::is_ascii_digit).count();
    if digit_count == 0 {
        return false;
    }

    let is_ordinal = ORDINAL_SUFFIXES.iter().any(|suffix| {
        value
            .strip_suffix(suffix)
            .is_some_and(|digits| !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()))
    });
    if is_ordinal {
        return true;
    }

    digit_count >= 2
        && value.chars().all(|ch| {
            ch.is_ascii_digit()
                || matches!(
                    ch,
                    '.' | ',' | ':' | ';' | '/' | '-' | '+' | '%' | '$' | '\u{20ac}' | '\u{00a3}'
                )
        })
}
1305
1306fn read_archive_text(archive: &mut ZipArchive<File>, path: &str) -> Result<String> {
1307    let mut file = archive.by_name(path)?;
1308    let mut text = String::new();
1309    file.read_to_string(&mut text)?;
1310    Ok(text)
1311}
1312
/// Reports whether `media_type` is one of the media types treated as XHTML
/// content: `application/xhtml+xml`, `text/html` or `application/xml`
/// (exact, case-sensitive match).
fn is_xhtml_media_type(media_type: &str) -> bool {
    const XHTML_LIKE: [&str; 3] = ["application/xhtml+xml", "text/html", "application/xml"];
    XHTML_LIKE.contains(&media_type)
}
1319
1320fn is_nav_item(item: &Resource) -> bool {
1321    item.media_type == "application/xhtml+xml"
1322        && (item.properties.iter().any(|property| property == "nav")
1323            || item.href.ends_with("nav.xhtml"))
1324}
1325
/// Returns the directory part of `package_path`, or an empty string when
/// the path has no parent directory.
fn package_base_dir(package_path: &str) -> String {
    match Path::new(package_path).parent().and_then(Path::to_str) {
        Some(dir) => dir.to_owned(),
        None => String::new(),
    }
}
1333
1334fn join_epub_path(base: &str, href: &str) -> String {
1335    if base.is_empty() {
1336        normalize_epub_path(href)
1337    } else {
1338        normalize_epub_path(&format!("{base}/{href}"))
1339    }
1340}
1341
/// Normalizes an archive-relative path so it can be matched against entry
/// names inside the EPUB container.
///
/// Repeated separators are collapsed, `.` segments are dropped, and `..`
/// segments are resolved lexically against the preceding segment. Archive
/// entries are stored under their resolved names, so a path such as
/// `OEBPS/text/../images/cover.png` has to become `OEBPS/images/cover.png`
/// to be found. A `..` with nothing left to remove is discarded, because a
/// path cannot climb above the container root. Backslashes in the result
/// are converted to forward slashes.
fn normalize_epub_path(path: &str) -> String {
    let mut normalized = PathBuf::new();
    for component in Path::new(path).components() {
        match component {
            std::path::Component::CurDir => {}
            std::path::Component::ParentDir => {
                // `pop` reports `false` when nothing is left to remove;
                // that is the clamp-at-root case and needs no handling.
                normalized.pop();
            }
            other => normalized.push(other.as_os_str()),
        }
    }
    normalized.to_string_lossy().replace('\\', "/")
}
1349
/// Strips any namespace prefix from a qualified name: returns the bytes
/// after the last `:`, or the whole name when it contains no colon.
fn local_name(name: &[u8]) -> &[u8] {
    match name.iter().rposition(|&byte| byte == b':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
1353
#[cfg(test)]
mod tests {
    use super::*;

    /// Extracts blocks from `xhtml` as `chapter.xhtml` in section
    /// `sec_000000`, panicking with `failure` if extraction returns an error.
    fn extract(xhtml: &str, initial_block_count: usize, failure: &str) -> Vec<Block> {
        let section_id = SectionId("sec_000000".to_string());
        extract_blocks(xhtml, "chapter.xhtml", &section_id, initial_block_count).expect(failure)
    }

    /// Raw text of a block: its runs concatenated, marker tokens included.
    fn block_text(block: &Block) -> String {
        block.text_runs.iter().map(|run| run.text.as_str()).collect()
    }

    /// Texts of the detected spans, in the order they were returned.
    fn span_texts(spans: &[ProtectedSpan]) -> Vec<&str> {
        spans.iter().map(|span| span.text.as_str()).collect()
    }

    /// An inline element inside a paragraph becomes a paired marker.
    #[test]
    fn extracts_inline_marks_and_marker_text_runs() {
        let blocks = extract(
            "<html><body><p>Hello <em>world</em>!</p></body></html>",
            0,
            "block extraction should succeed",
        );

        assert_eq!(blocks.len(), 1);
        let block = &blocks[0];
        assert_eq!(block_text(block), "Hello <m1>world</m1>!");
        assert_eq!(block.inline_marks.len(), 1);
        let mark = &block.inline_marks[0];
        assert_eq!(mark.id, "m1");
        assert_eq!(mark.kind, "em");
        assert_eq!(block.token_estimate, estimate_tokens("Hello world!"));
    }

    /// A self-closing inline element becomes an empty marker, and block ids
    /// continue from the initial block count.
    #[test]
    fn extracts_empty_inline_marker() {
        let blocks = extract(
            "<html><body><p>Line<br/>break</p></body></html>",
            4,
            "block extraction should succeed",
        );

        assert_eq!(blocks.len(), 1);
        let block = &blocks[0];
        assert_eq!(block.id.0, "b_000004");
        assert_eq!(block_text(block), "Line<r1/>break");
        let mark = &block.inline_marks[0];
        assert_eq!(mark.id, "r1");
        assert_eq!(mark.kind, "br");
    }

    /// Text sitting directly in a `<div>` anchors a paragraph block.
    #[test]
    fn extracts_text_anchored_block_from_div() {
        let blocks = extract(
            "<html><body><div class=\"x\">Bare div text with <em>emphasis</em>.</div></body></html>",
            0,
            "block extraction should succeed",
        );

        assert_eq!(blocks.len(), 1);
        let block = &blocks[0];
        assert_eq!(block_text(block), "Bare div text with <m1>emphasis</m1>.");
        assert_eq!(block.kind, BlockKind::Paragraph);
    }

    /// Definition-list terms and definitions are each extracted.
    #[test]
    fn extracts_dt_and_dd_text() {
        let blocks = extract(
            "<html><body><dl><dt>Term</dt><dd>Definition</dd></dl></body></html>",
            0,
            "block extraction should succeed",
        );

        let texts: Vec<String> = blocks.iter().map(block_text).collect();
        assert_eq!(texts, vec!["Term", "Definition"]);
    }

    /// Text following an element sibling gets its own text-node-addressed
    /// block.
    #[test]
    fn stray_text_after_children_becomes_addressable_block() {
        let blocks = extract(
            "<html><body><p>Captured</p>Naked tail text</body></html>",
            0,
            "block extraction should succeed",
        );

        assert_eq!(blocks.len(), 2);
        let stray = &blocks[1];
        assert_eq!(block_text(stray), "Naked tail text");
        let last = *stray.dom_path.0.last().expect("path should not be empty");
        assert!(
            last >= bookforge_core::ir::TEXT_NODE_PATH_BASE,
            "stray text block must use a text-node path component, got {last}"
        );
    }

    /// A list item containing a nested list stays a single block.
    #[test]
    fn nested_same_name_blocks_stay_in_one_block() {
        let blocks = extract(
            "<html><body><ul><li>Outer <ul><li>Inner</li></ul> tail</li><li>Sibling</li></ul></body></html>",
            0,
            "block extraction should succeed",
        );

        assert_eq!(blocks.len(), 2, "outer li (with nested list) + sibling li");
        let outer = block_text(&blocks[0]);
        assert!(outer.contains("Outer"), "got: {outer}");
        assert!(
            outer.contains("Inner"),
            "nested li text stays inside the outer block: {outer}"
        );
        assert!(
            outer.contains("tail"),
            "text after the nested list must not be lost: {outer}"
        );
        assert_eq!(block_text(&blocks[1]), "Sibling");
    }

    /// Named HTML entities are resolved instead of failing extraction.
    #[test]
    fn named_html_entities_decode_in_text() {
        let blocks = extract(
            "<html><body><p>one&nbsp;two&mdash;three</p></body></html>",
            0,
            "named entities must not fail extraction",
        );

        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(&blocks[0]), "one two\u{2014}three");
    }

    /// Stylesheet and script bodies never reach the extracted blocks.
    #[test]
    fn script_and_style_text_is_never_extracted() {
        let blocks = extract(
            "<html><head><title>Meta</title><style>p { color: red; }</style></head><body><script>var x = 1;</script><div>Real</div></body></html>",
            0,
            "block extraction should succeed",
        );

        let texts: Vec<String> = blocks.iter().map(block_text).collect();
        assert_eq!(texts, vec!["Meta", "Real"]);
        assert!(!texts.iter().any(|text| text.contains("color")));
        assert!(!texts.iter().any(|text| text.contains("var x")));
    }

    /// The visible-character total includes the head title and body text but
    /// not stylesheet content.
    #[test]
    fn visible_body_chars_counts_body_and_title_but_not_style() {
        let xhtml = r#"<html><head><title>Heading</title><style>p { color: red; }</style></head>
<body><p>captured</p><div>div text</div></body></html>"#;
        let total = visible_body_chars(xhtml).expect("count should succeed");
        // "Heading" (7) + "captured" (8) + "divtext" (7); style is excluded.
        // The head title counts because the extractor translates it too.
        assert_eq!(total, 22);
    }

    /// Paragraph and div prose are both captured; SVG labels remain as the
    /// coverage gap.
    #[test]
    fn coverage_captures_div_text_and_reports_svg_labels_uncaptured() {
        let xhtml =
            "<html><body><p>in a block</p><div>also in a div</div><svg><text>diagram label</text></svg></body></html>";
        let blocks = extract(xhtml, 0, "blocks should parse");
        let captured: usize = blocks
            .iter()
            .map(|block| non_whitespace_chars(&block_visible_text(block)))
            .sum();
        let total = visible_body_chars(xhtml).expect("count should succeed");

        assert_eq!(
            captured,
            non_whitespace_chars("in a block") + non_whitespace_chars("also in a div"),
            "prose in p and div must both be captured"
        );
        assert_eq!(
            total - captured,
            non_whitespace_chars("diagram label"),
            "svg text stays uncaptured and visible in the coverage gap"
        );
    }

    /// A lone digit is not protected, while URLs, filenames, anchors and
    /// number ranges are.
    #[test]
    fn protected_spans_do_not_overflag_single_digits() {
        let spans = detect_protected_spans(
            "Chapter 1 cites https://example.com, file.txt, #anchor, and pages 12-14.",
        );
        let texts = span_texts(&spans);

        assert!(!texts.contains(&"1"));
        assert!(texts.contains(&"https://example.com"));
        assert!(texts.contains(&"file.txt"));
        assert!(texts.contains(&"#anchor"));
        assert!(texts.contains(&"12-14"));
    }

    /// Words glued together by a missing space after a period are not
    /// mistaken for filenames; real filenames and citations still are
    /// detected.
    #[test]
    fn protected_spans_do_not_treat_sentence_fragments_as_filenames() {
        let spans = detect_protected_spans(
            "case.Fedor bow.At said:“The file.txt chapter.xhtml [@tolstoy1886] @note1",
        );
        let texts = span_texts(&spans);

        assert!(!texts.contains(&"case.Fedor"));
        assert!(!texts.contains(&"bow.At"));
        assert!(!texts.contains(&"said:“The"));
        assert!(texts.contains(&"file.txt"));
        assert!(texts.contains(&"chapter.xhtml"));
        assert!(texts.contains(&"[@tolstoy1886]"));
        assert!(texts.contains(&"@note1"));
    }
}