1use std::{
2 collections::HashMap,
3 fs::File,
4 io::Read,
5 path::{Path, PathBuf},
6};
7
8use bookforge_core::{
9 BookforgeError, Result,
10 ir::{
11 Block, BlockId, BlockKind, Book, BookFormat, BookId, DomPath, InlineMark, Metadata,
12 ProtectedSpan, ProtectedSpanKind, Resource, Section, SectionId, SpineItem, TextRun,
13 },
14 marker::{is_marker_token, strip_marker_tokens},
15};
16use quick_xml::{
17 Reader,
18 events::{BytesStart, Event},
19};
20use zip::ZipArchive;
21
/// Summary of an EPUB package, as assembled by `inspect_epub`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EpubInspection {
    /// Title taken from the parsed package metadata, if one was found.
    pub title: Option<String>,
    /// Number of entries in the package spine.
    pub spine_count: usize,
    /// Number of entries in the package manifest.
    pub manifest_count: usize,
    /// Number of manifest entries whose media type is XHTML.
    pub xhtml_count: usize,
    /// Whether any manifest entry satisfies `is_nav_item`.
    pub has_nav: bool,
    /// Whether the manifest contains an `application/x-dtbncx+xml` entry.
    pub has_toc: bool,
    /// Number of manifest entries whose media type is not XHTML.
    pub resource_count: usize,
    /// Package document path read from `META-INF/container.xml`.
    pub package_path: String,
    /// Number of spine entries that resolve to an XHTML manifest entry.
    pub xhtml_spine_count: usize,
}
34
/// Aggregate text-coverage figures for a whole EPUB, as filled in by `text_coverage`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct TextCoverage {
    /// Sum of `total_chars` over all entries in `files`.
    pub total_chars: usize,
    /// Sum of `captured_chars` over all entries in `files`.
    pub captured_chars: usize,
    /// One entry per XHTML spine item, in spine order.
    pub files: Vec<FileTextCoverage>,
}
46
/// Text-coverage figures for a single XHTML spine document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileTextCoverage {
    /// Archive path of the document (manifest href joined onto the package directory).
    pub href: String,
    /// Non-whitespace characters counted by `visible_body_chars` for the document.
    pub total_chars: usize,
    /// Non-whitespace characters found in the visible text of the extracted blocks.
    pub captured_chars: usize,
}
53
54impl TextCoverage {
55 pub fn percent(&self) -> f64 {
56 coverage_percent(self.captured_chars, self.total_chars)
57 }
58}
59
60impl FileTextCoverage {
61 pub fn percent(&self) -> f64 {
62 coverage_percent(self.captured_chars, self.total_chars)
63 }
64
65 pub fn uncaptured_chars(&self) -> usize {
66 self.total_chars.saturating_sub(self.captured_chars)
67 }
68}
69
/// Converts a captured/total pair into a percentage in `0.0..=100.0`.
///
/// An empty total counts as fully covered, and `captured` is clamped to `total` so the
/// result never exceeds 100.
fn coverage_percent(captured: usize, total: usize) -> f64 {
    match total {
        0 => 100.0,
        _ => {
            let clamped = captured.min(total);
            (clamped as f64 / total as f64) * 100.0
        }
    }
}
77
/// In-memory form of the OPF package document, as produced by `parse_package`.
#[derive(Debug, Clone)]
struct PackageDocument {
    /// Title, creators, and language gathered from the package.
    metadata: Metadata,
    /// One entry per manifest `item` element, in document order.
    manifest: Vec<Resource>,
    /// One entry per spine `itemref` element, in document order.
    spine: Vec<SpineItem>,
    /// Value of the `toc` attribute on the `spine` element, if present.
    toc_id: Option<String>,
}
85
86pub fn read_epub(path: &Path) -> Result<Book> {
87 let mut archive = open_archive(path)?;
88 validate_mimetype(&mut archive)?;
89 let package_path = locate_package(&mut archive)?;
90 let package_xml = read_archive_text(&mut archive, &package_path)?;
91 let mut package = parse_package(&package_xml)?;
92 let package_dir = package_base_dir(&package_path);
93 let manifest_by_id = package
94 .manifest
95 .iter()
96 .map(|item| (item.id.as_str(), item))
97 .collect::<HashMap<_, _>>();
98 let mut sections = Vec::new();
99 let mut blocks = Vec::new();
100
101 let package_section_id = SectionId("sec_metadata_opf".to_string());
102 let mut package_blocks =
103 extract_package_title_blocks(&package_xml, &package_section_id, blocks.len())?;
104 if !package_blocks.is_empty() {
105 let block_ids = package_blocks
106 .iter()
107 .map(|block| block.id.clone())
108 .collect::<Vec<_>>();
109 sections.push(Section {
110 id: package_section_id,
111 href: package_path.clone(),
112 spine_index: 0,
113 title: Some("OPF metadata".to_string()),
114 heading_level: None,
115 block_ids,
116 prev: None,
117 next: None,
118 });
119 blocks.append(&mut package_blocks);
120 }
121
122 for (toc_index, resource) in package
123 .manifest
124 .iter()
125 .filter(|item| item.media_type == "application/x-dtbncx+xml")
126 .enumerate()
127 {
128 let href = join_epub_path(&package_dir, &resource.href);
129 let ncx = read_archive_text(&mut archive, &href)?;
130 let section_id = SectionId(format!("sec_toc_{toc_index:06}"));
131 let mut toc_blocks = extract_ncx_text_blocks(&ncx, §ion_id, blocks.len())?;
132 if toc_blocks.is_empty() {
133 continue;
134 }
135 let block_ids = toc_blocks
136 .iter()
137 .map(|block| block.id.clone())
138 .collect::<Vec<_>>();
139 sections.push(Section {
140 id: section_id,
141 href,
142 spine_index: 0,
143 title: Some("NCX table of contents".to_string()),
144 heading_level: None,
145 block_ids,
146 prev: None,
147 next: None,
148 });
149 blocks.append(&mut toc_blocks);
150 }
151
152 for (spine_index, spine_item) in package.spine.iter_mut().enumerate() {
153 let Some(resource) = manifest_by_id.get(spine_item.idref.as_str()) else {
154 return Err(BookforgeError::InvalidInput(format!(
155 "spine item references missing manifest id '{}'",
156 spine_item.idref
157 )));
158 };
159
160 let href = join_epub_path(&package_dir, &resource.href);
161 spine_item.href = Some(href.clone());
162
163 if !is_xhtml_media_type(&resource.media_type) {
164 continue;
165 }
166
167 let xhtml = read_archive_text(&mut archive, &href)?;
168 let section_id = SectionId(format!("sec_{spine_index:06}"));
169 let mut section_blocks = extract_blocks(&xhtml, &href, §ion_id, blocks.len())?;
170 if section_blocks.is_empty() {
171 continue;
172 }
173 let block_ids = section_blocks
174 .iter()
175 .map(|block| block.id.clone())
176 .collect::<Vec<_>>();
177 let (title, heading_level) = first_heading(§ion_blocks);
178
179 sections.push(Section {
180 id: section_id,
181 href,
182 spine_index,
183 title,
184 heading_level,
185 block_ids,
186 prev: None,
187 next: None,
188 });
189 blocks.append(&mut section_blocks);
190 }
191
192 link_sections(&mut sections);
193
194 if blocks.is_empty() {
195 return Err(BookforgeError::InvalidInput(
196 "EPUB contains no translatable blocks".to_string(),
197 ));
198 }
199
200 Ok(Book {
201 source_path: Some(path.to_path_buf()),
202 id: BookId(package_path),
203 format: BookFormat::Epub,
204 metadata: package.metadata,
205 manifest: package.manifest,
206 spine: package.spine,
207 sections,
208 blocks,
209 })
210}
211
212pub fn inspect_epub(path: &Path) -> Result<EpubInspection> {
213 let mut archive = open_archive(path)?;
214 validate_mimetype(&mut archive)?;
215
216 let package_path = locate_package(&mut archive)?;
217 let package_xml = read_archive_text(&mut archive, &package_path)?;
218 let package = parse_package(&package_xml)?;
219 let manifest_by_id = package
220 .manifest
221 .iter()
222 .map(|item| (item.id.as_str(), item))
223 .collect::<HashMap<_, _>>();
224
225 let package_dir = package_base_dir(&package_path);
226 let xhtml_count = package
227 .manifest
228 .iter()
229 .filter(|item| is_xhtml_media_type(&item.media_type))
230 .count();
231 let has_nav = package.manifest.iter().any(is_nav_item);
232 let has_toc = package
233 .toc_id
234 .as_deref()
235 .and_then(|toc_id| manifest_by_id.get(toc_id))
236 .is_some_and(|item| item.media_type == "application/x-dtbncx+xml")
237 || package
238 .manifest
239 .iter()
240 .any(|item| item.media_type == "application/x-dtbncx+xml");
241
242 let mut xhtml_spine_count = 0;
243 for item in &package.spine {
244 let Some(resource) = manifest_by_id.get(item.idref.as_str()) else {
245 return Err(BookforgeError::InvalidInput(format!(
246 "spine item references missing manifest id '{}'",
247 item.idref
248 )));
249 };
250
251 if is_xhtml_media_type(&resource.media_type) {
252 let href = join_epub_path(&package_dir, &resource.href);
253 read_archive_text(&mut archive, &href)?;
254 xhtml_spine_count += 1;
255 }
256 }
257
258 Ok(EpubInspection {
259 title: package.metadata.title,
260 spine_count: package.spine.len(),
261 manifest_count: package.manifest.len(),
262 xhtml_count,
263 has_nav,
264 has_toc,
265 resource_count: package
266 .manifest
267 .iter()
268 .filter(|item| !is_xhtml_media_type(&item.media_type))
269 .count(),
270 package_path,
271 xhtml_spine_count,
272 })
273}
274
275pub fn text_coverage(path: &Path) -> Result<TextCoverage> {
279 let mut archive = open_archive(path)?;
280 validate_mimetype(&mut archive)?;
281 let package_path = locate_package(&mut archive)?;
282 let package_xml = read_archive_text(&mut archive, &package_path)?;
283 let package = parse_package(&package_xml)?;
284 let package_dir = package_base_dir(&package_path);
285 let manifest_by_id = package
286 .manifest
287 .iter()
288 .map(|item| (item.id.as_str(), item))
289 .collect::<HashMap<_, _>>();
290
291 let mut coverage = TextCoverage::default();
292 for (spine_index, spine_item) in package.spine.iter().enumerate() {
293 let Some(resource) = manifest_by_id.get(spine_item.idref.as_str()) else {
294 return Err(BookforgeError::InvalidInput(format!(
295 "spine item references missing manifest id '{}'",
296 spine_item.idref
297 )));
298 };
299 if !is_xhtml_media_type(&resource.media_type) {
300 continue;
301 }
302
303 let href = join_epub_path(&package_dir, &resource.href);
304 let xhtml = read_archive_text(&mut archive, &href)?;
305 let section_id = SectionId(format!("sec_{spine_index:06}"));
306 let blocks = extract_blocks(&xhtml, &href, §ion_id, 0)?;
307 let captured_chars = blocks
308 .iter()
309 .map(|block| non_whitespace_chars(&block_visible_text(block)))
310 .sum::<usize>();
311 let total_chars = visible_body_chars(&xhtml)?;
312
313 coverage.total_chars += total_chars;
314 coverage.captured_chars += captured_chars;
315 coverage.files.push(FileTextCoverage {
316 href,
317 total_chars,
318 captured_chars,
319 });
320 }
321
322 Ok(coverage)
323}
324
325fn visible_body_chars(xhtml: &str) -> Result<usize> {
331 let mut reader = Reader::from_str(xhtml);
332 reader.config_mut().trim_text(false);
333 let mut in_body = false;
334 let mut in_title = false;
335 let mut skip_depth = 0usize;
336 let mut count = 0usize;
337
338 loop {
339 let counting = (in_body || in_title) && skip_depth == 0;
340 match reader.read_event()? {
341 Event::Start(element) => match local_name(element.name().as_ref()) {
342 b"body" => in_body = true,
343 b"title" if !in_body => in_title = true,
344 b"script" | b"style" if in_body => skip_depth += 1,
345 _ => {}
346 },
347 Event::End(element) => match local_name(element.name().as_ref()) {
348 b"body" => in_body = false,
349 b"title" => in_title = false,
350 b"script" | b"style" if skip_depth > 0 => skip_depth -= 1,
351 _ => {}
352 },
353 Event::Text(text) if counting => {
354 let value = text
355 .html_content()
356 .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
357 count += non_whitespace_chars(&value);
358 }
359 Event::CData(text) if counting => {
360 let value = text
361 .decode()
362 .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
363 count += non_whitespace_chars(&value);
364 }
365 Event::GeneralRef(reference) if counting => {
366 if let Some(value) = resolve_general_ref(&reference)? {
367 count += non_whitespace_chars(&value);
368 }
369 }
370 Event::Eof => break,
371 _ => {}
372 }
373 }
374
375 Ok(count)
376}
377
/// Counts the characters of `text` that are not Unicode whitespace.
fn non_whitespace_chars(text: &str) -> usize {
    text.split_whitespace()
        .map(|word| word.chars().count())
        .sum()
}
381
382fn open_archive(path: &Path) -> Result<ZipArchive<File>> {
383 let file = File::open(path)?;
384 Ok(ZipArchive::new(file)?)
385}
386
387fn validate_mimetype(archive: &mut ZipArchive<File>) -> Result<()> {
388 let mut mimetype = String::new();
389 archive.by_name("mimetype")?.read_to_string(&mut mimetype)?;
390
391 if mimetype.trim() != "application/epub+zip" {
392 return Err(BookforgeError::InvalidInput(
393 "EPUB mimetype must be application/epub+zip".to_string(),
394 ));
395 }
396
397 Ok(())
398}
399
400fn locate_package(archive: &mut ZipArchive<File>) -> Result<String> {
401 let container = read_archive_text(archive, "META-INF/container.xml")?;
402 let mut reader = Reader::from_str(&container);
403 reader.config_mut().trim_text(true);
404
405 loop {
406 match reader.read_event()? {
407 Event::Empty(element) | Event::Start(element)
408 if local_name(element.name().as_ref()) == b"rootfile" =>
409 {
410 if let Some(path) = attr_value(&reader, &element, b"full-path")? {
411 return Ok(path);
412 }
413 }
414 Event::Eof => break,
415 _ => {}
416 }
417 }
418
419 Err(BookforgeError::InvalidInput(
420 "META-INF/container.xml does not contain a rootfile full-path".to_string(),
421 ))
422}
423
424fn parse_package(xml: &str) -> Result<PackageDocument> {
425 let mut reader = Reader::from_str(xml);
426 reader.config_mut().trim_text(true);
427
428 let mut metadata = Metadata::default();
429 let mut manifest = Vec::new();
430 let mut spine = Vec::new();
431 let mut toc_id = None;
432 let mut current_text_element: Option<Vec<u8>> = None;
433
434 loop {
435 match reader.read_event()? {
436 Event::Start(element) => match local_name(element.name().as_ref()) {
437 b"title" | b"creator" | b"language" => {
438 current_text_element = Some(local_name(element.name().as_ref()).to_vec());
439 }
440 b"spine" => {
441 toc_id = attr_value(&reader, &element, b"toc")?;
442 }
443 b"itemref" => {
444 spine.push(parse_spine_item(&reader, &element)?);
445 }
446 _ => {}
447 },
448 Event::Empty(element) => match local_name(element.name().as_ref()) {
449 b"item" => manifest.push(parse_manifest_item(&reader, &element)?),
450 b"itemref" => spine.push(parse_spine_item(&reader, &element)?),
451 _ => {}
452 },
453 Event::Text(text) => {
454 if let Some(name) = current_text_element.as_deref() {
455 let value = text
456 .html_content()
457 .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?
458 .trim()
459 .to_string();
460 if !value.is_empty() {
461 match name {
462 b"title" if metadata.title.is_none() => metadata.title = Some(value),
463 b"creator" => metadata.creators.push(value),
464 b"language" if metadata.language.is_none() => {
465 metadata.language = Some(value)
466 }
467 _ => {}
468 }
469 }
470 }
471 }
472 Event::End(element)
473 if current_text_element
474 .as_deref()
475 .is_some_and(|name| name == local_name(element.name().as_ref())) =>
476 {
477 current_text_element = None;
478 }
479 Event::Eof => break,
480 _ => {}
481 }
482 }
483
484 if manifest.is_empty() {
485 return Err(BookforgeError::InvalidInput(
486 "OPF manifest is empty".to_string(),
487 ));
488 }
489
490 if spine.is_empty() {
491 return Err(BookforgeError::InvalidInput(
492 "OPF spine is empty".to_string(),
493 ));
494 }
495
496 Ok(PackageDocument {
497 metadata,
498 manifest,
499 spine,
500 toc_id,
501 })
502}
503
504fn parse_manifest_item(reader: &Reader<&[u8]>, element: &BytesStart<'_>) -> Result<Resource> {
505 let id = required_attr(reader, element, b"id", "manifest item id")?;
506 let href = required_attr(reader, element, b"href", "manifest item href")?;
507 let media_type = required_attr(reader, element, b"media-type", "manifest item media-type")?;
508
509 Ok(Resource {
510 id,
511 href,
512 media_type,
513 properties: attr_value(reader, element, b"properties")?
514 .map(|value| {
515 value
516 .split_ascii_whitespace()
517 .map(ToOwned::to_owned)
518 .collect()
519 })
520 .unwrap_or_default(),
521 })
522}
523
524fn parse_spine_item(reader: &Reader<&[u8]>, element: &BytesStart<'_>) -> Result<SpineItem> {
525 let idref = required_attr(reader, element, b"idref", "spine item idref")?;
526 let linear = attr_value(reader, element, b"linear")?.is_none_or(|value| value != "no");
527
528 Ok(SpineItem {
529 idref,
530 href: None,
531 linear,
532 })
533}
534
535fn required_attr(
536 reader: &Reader<&[u8]>,
537 element: &BytesStart<'_>,
538 attr_name: &[u8],
539 label: &str,
540) -> Result<String> {
541 attr_value(reader, element, attr_name)?.ok_or_else(|| {
542 BookforgeError::InvalidInput(format!(
543 "missing required {label} attribute '{}'",
544 String::from_utf8_lossy(attr_name)
545 ))
546 })
547}
548
549fn attr_value(
550 reader: &Reader<&[u8]>,
551 element: &BytesStart<'_>,
552 attr_name: &[u8],
553) -> Result<Option<String>> {
554 for attr in element.attributes() {
555 let attr = attr.map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
556 if local_name(attr.key.as_ref()) == attr_name {
557 return Ok(Some(
558 attr.decode_and_unescape_value(reader.decoder())?
559 .into_owned(),
560 ));
561 }
562 }
563
564 Ok(None)
565}
566
/// Bookkeeping for one open element while walking a document.
#[derive(Debug)]
struct ElementFrame {
    /// Local name of the element, as raw bytes.
    name: Vec<u8>,
    /// Child-index path of this element, as produced by `next_child_path`.
    path: Vec<usize>,
    /// Number of child elements seen so far; advanced by `next_child_path`.
    child_count: usize,
    /// Number of stand-alone text nodes emitted under this element by `handle_text`.
    text_count: usize,
}
574
/// Text being accumulated for one captured element in
/// `extract_xml_text_element_blocks`.
struct TextCapture {
    /// Element stack depth at which the captured element was opened; the capture ends
    /// when an end tag arrives at this depth.
    depth: usize,
    /// Child-index path of the captured element.
    path: Vec<usize>,
    /// Raw text gathered so far (normalized when the capture ends).
    text: String,
}
580
581fn extract_package_title_blocks(
582 xml: &str,
583 section_id: &SectionId,
584 initial_block_count: usize,
585) -> Result<Vec<Block>> {
586 extract_xml_text_element_blocks(xml, section_id, initial_block_count, |name| {
587 name == b"title"
588 })
589}
590
591fn extract_ncx_text_blocks(
592 xml: &str,
593 section_id: &SectionId,
594 initial_block_count: usize,
595) -> Result<Vec<Block>> {
596 extract_xml_text_element_blocks(xml, section_id, initial_block_count, |name| name == b"text")
597}
598
599fn extract_xml_text_element_blocks(
600 xml: &str,
601 section_id: &SectionId,
602 initial_block_count: usize,
603 should_capture: impl Fn(&[u8]) -> bool,
604) -> Result<Vec<Block>> {
605 let mut reader = Reader::from_str(xml);
606 reader.config_mut().trim_text(false);
607
608 let mut element_stack = Vec::<ElementFrame>::new();
609 let mut active_capture: Option<TextCapture> = None;
610 let mut blocks = Vec::new();
611
612 loop {
613 match reader.read_event()? {
614 Event::Start(element) => {
615 let name = local_name(element.name().as_ref()).to_vec();
616 let path = enter_element(&mut element_stack, &name);
617 if active_capture.is_none() && should_capture(&name) {
618 active_capture = Some(TextCapture {
619 depth: element_stack.len(),
620 path,
621 text: String::new(),
622 });
623 }
624 }
625 Event::Empty(_) => {
626 next_child_path(&mut element_stack);
627 }
628 Event::Text(text) => {
629 if let Some(capture) = active_capture.as_mut() {
630 let value = text
631 .html_content()
632 .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
633 capture.text.push_str(&value);
634 }
635 }
636 Event::CData(text) => {
637 if let Some(capture) = active_capture.as_mut() {
638 let value = text
639 .decode()
640 .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
641 capture.text.push_str(&value);
642 }
643 }
644 Event::GeneralRef(reference) => {
645 if let Some(capture) = active_capture.as_mut()
646 && let Some(value) = resolve_general_ref(&reference)?
647 {
648 capture.text.push_str(&value);
649 }
650 }
651 Event::End(_) => {
652 if active_capture
653 .as_ref()
654 .is_some_and(|capture| element_stack.len() == capture.depth)
655 {
656 let capture = active_capture.take().expect("checked above");
657 let visible = normalize_space(&capture.text);
658 if !visible.is_empty() {
659 blocks.push(build_block(
660 section_id,
661 initial_block_count + blocks.len(),
662 BlockKind::Paragraph,
663 DomPath(capture.path),
664 Vec::new(),
665 Vec::new(),
666 visible,
667 ));
668 }
669 }
670 element_stack.pop();
671 }
672 Event::Eof => break,
673 _ => {}
674 }
675 }
676
677 Ok(blocks)
678}
679
/// Accumulates the text runs and inline markers of one block while its anchoring
/// element is open.
#[derive(Debug)]
struct BlockBuilder {
    /// Element stack depth of the anchoring element; the block is finished when an
    /// end tag arrives at this depth.
    anchor_depth: usize,
    /// Kind assigned to the finished block.
    kind: BlockKind,
    /// Path of the anchoring element.
    dom_path: DomPath,
    /// Block ordinal, used for the block id and as a prefix of run ids.
    ordinal: usize,
    /// Text and marker-token runs, in document order.
    text_runs: Vec<TextRun>,
    /// One entry per inline element encountered inside the block.
    inline_marks: Vec<InlineMark>,
    /// Ids of the inline elements that are currently open, innermost last.
    inline_stack: Vec<String>,
    /// Text pushed so far, without marker tokens.
    visible_text: String,
    /// Index used for the next run id.
    next_run: usize,
    /// Zero-based counter used for the next marker id (shared by paired and empty
    /// markers).
    next_marker: usize,
}
698
impl BlockBuilder {
    /// Creates an empty builder for a block anchored at `anchor_depth`.
    fn new(anchor_depth: usize, kind: BlockKind, dom_path: DomPath, ordinal: usize) -> Self {
        Self {
            anchor_depth,
            kind,
            dom_path,
            ordinal,
            text_runs: Vec::new(),
            inline_marks: Vec::new(),
            inline_stack: Vec::new(),
            visible_text: String::new(),
            next_run: 0,
            next_marker: 0,
        }
    }

    /// Appends a text fragment to the block.
    ///
    /// A whitespace-only fragment adds at most one separating space: it is appended to
    /// the visible text and to the most recent non-marker run, and only when visible
    /// text already exists and does not already end in a space. Any other fragment is
    /// whitespace-normalized, loses its leading space if it is the first visible text,
    /// and is stored as a new run.
    fn push_text(&mut self, text: &str) {
        let Some(mut text) = normalize_text_fragment(text) else {
            // Here `text` is still the original parameter: the fragment held no
            // visible characters.
            if !text.is_empty()
                && !self.visible_text.is_empty()
                && !self.visible_text.ends_with(' ')
            {
                self.visible_text.push(' ');
                if let Some(run) = self
                    .text_runs
                    .iter_mut()
                    .rev()
                    .find(|run| !is_marker_token(&run.text))
                {
                    run.text.push(' ');
                }
            }
            return;
        };

        // Nothing visible yet, so a leading space would be dangling.
        if self.visible_text.is_empty() {
            text = text.trim_start().to_string();
        }

        if text.is_empty() {
            return;
        }

        self.visible_text.push_str(&text);
        self.push_run(text);
    }

    /// Records the opening of an inline element named `name`: registers an inline
    /// mark with an `m`-prefixed id, remembers it as open, and emits an `<id>` run.
    fn push_inline_start(&mut self, name: &[u8]) {
        let id = marker_id("m", self.next_marker);
        self.next_marker += 1;
        self.inline_marks.push(InlineMark {
            id: id.clone(),
            kind: String::from_utf8_lossy(name).into_owned(),
        });
        self.inline_stack.push(id.clone());
        self.push_run(format!("<{id}>"));
    }

    /// Records a self-closing inline element named `name`: registers an inline mark
    /// with an `r`-prefixed id and emits an `<id/>` run.
    fn push_inline_empty(&mut self, name: &[u8]) {
        let id = marker_id("r", self.next_marker);
        self.next_marker += 1;
        self.inline_marks.push(InlineMark {
            id: id.clone(),
            kind: String::from_utf8_lossy(name).into_owned(),
        });
        self.push_run(format!("<{id}/>"));
    }

    /// Closes the innermost open inline element by emitting a `</id>` run; does
    /// nothing when no inline element is open.
    fn push_inline_end(&mut self) {
        if let Some(id) = self.inline_stack.pop() {
            self.push_run(format!("</{id}>"));
        }
    }

    /// Consumes the builder and returns the finished block, or `None` when the block
    /// has no visible text after whitespace normalization.
    fn finish(mut self, section_id: &SectionId) -> Option<Block> {
        self.trim_trailing_text();
        let visible_text = normalize_space(&self.visible_text);
        if visible_text.is_empty() {
            return None;
        }

        Some(build_block(
            section_id,
            self.ordinal,
            self.kind,
            self.dom_path,
            self.text_runs,
            self.inline_marks,
            visible_text,
        ))
    }

    /// Appends a run with an id of the form `r{ordinal:06}_{index:03}`.
    fn push_run(&mut self, text: String) {
        self.text_runs.push(TextRun {
            id: format!("r{:06}_{:03}", self.ordinal, self.next_run),
            text,
        });
        self.next_run += 1;
    }

    /// Strips trailing whitespace from the last non-marker run, then drops every run
    /// whose text is empty.
    fn trim_trailing_text(&mut self) {
        if let Some(run) = self
            .text_runs
            .iter_mut()
            .rev()
            .find(|run| !is_marker_token(&run.text))
        {
            run.text = run.text.trim_end().to_string();
        }

        self.text_runs.retain(|run| !run.text.is_empty());
    }
}
816
/// Extracts blocks from one XHTML document.
///
/// The document is walked event by event while an element stack tracks paths. At most
/// one block is being built at a time:
///
/// * a start tag that `block_kind` recognises opens a block when none is active;
/// * while a block is active, nested start, empty, and end tags become inline marker
///   runs, and all text is appended to the block;
/// * text arriving with no active block is handed to `handle_text`, which may open a
///   block or emit a stand-alone paragraph.
///
/// `suppress_depth` counts the open `never_translate_element` ancestors and is passed
/// to `handle_text` as a flag. Block ordinals start at `initial_block_count`. The
/// `_href` parameter is currently unused.
fn extract_blocks(
    xhtml: &str,
    _href: &str,
    section_id: &SectionId,
    initial_block_count: usize,
) -> Result<Vec<Block>> {
    let mut reader = Reader::from_str(xhtml);
    reader.config_mut().trim_text(false);

    let mut element_stack = Vec::<ElementFrame>::new();
    let mut active_block: Option<BlockBuilder> = None;
    let mut blocks = Vec::new();
    let mut suppress_depth = 0usize;

    loop {
        match reader.read_event()? {
            Event::Start(element) => {
                let name = local_name(element.name().as_ref()).to_vec();
                let path = enter_element(&mut element_stack, &name);
                if never_translate_element(&name) {
                    suppress_depth += 1;
                }

                // Either this element anchors a new block, or it is inline content
                // of the block that is already open.
                if active_block.is_none()
                    && let Some(kind) = block_kind(&name, &element)?
                {
                    active_block = Some(BlockBuilder::new(
                        element_stack.len(),
                        kind,
                        DomPath(path),
                        initial_block_count + blocks.len(),
                    ));
                } else if let Some(block) = active_block.as_mut() {
                    block.push_inline_start(&name);
                }
            }
            Event::Empty(element) => {
                let name = local_name(element.name().as_ref()).to_vec();
                // Self-closing elements still occupy a child slot in the path.
                next_child_path(&mut element_stack);

                if let Some(block) = active_block.as_mut() {
                    block.push_inline_empty(&name);
                }
            }
            Event::Text(text) => {
                let value = text
                    .html_content()
                    .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
                handle_text(
                    &value,
                    &mut active_block,
                    &mut element_stack,
                    &mut blocks,
                    section_id,
                    initial_block_count,
                    suppress_depth > 0,
                    true,
                );
            }
            Event::CData(text) => {
                let value = text
                    .decode()
                    .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
                handle_text(
                    &value,
                    &mut active_block,
                    &mut element_stack,
                    &mut blocks,
                    section_id,
                    initial_block_count,
                    suppress_depth > 0,
                    true,
                );
            }
            Event::GeneralRef(reference) => {
                // Resolved entity text may join or open a block, but is never emitted
                // as a stand-alone paragraph (`allow_stray` is false).
                if let Some(value) = resolve_general_ref(&reference)? {
                    handle_text(
                        &value,
                        &mut active_block,
                        &mut element_stack,
                        &mut blocks,
                        section_id,
                        initial_block_count,
                        suppress_depth > 0,
                        false,
                    );
                }
            }
            Event::End(_) => {
                // The stack still contains the closing element at this point, so its
                // length equals the depth at which that element was opened.
                let should_finish = active_block
                    .as_ref()
                    .is_some_and(|block| element_stack.len() == block.anchor_depth);

                if should_finish {
                    let block = active_block.take().expect("checked above");
                    if let Some(block) = block.finish(section_id) {
                        blocks.push(block);
                    }
                } else if let Some(block) = active_block.as_mut() {
                    block.push_inline_end();
                }

                if element_stack
                    .pop()
                    .is_some_and(|frame| never_translate_element(&frame.name))
                {
                    suppress_depth = suppress_depth.saturating_sub(1);
                }
            }
            Event::Eof => break,
            _ => {}
        }
    }

    Ok(blocks)
}
945
946fn resolve_general_ref(reference: &quick_xml::events::BytesRef<'_>) -> Result<Option<String>> {
950 if let Some(ch) = reference
951 .resolve_char_ref()
952 .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?
953 {
954 return Ok(Some(ch.to_string()));
955 }
956 let name = reference
957 .decode()
958 .map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
959 let resolved = quick_xml::escape::resolve_html5_entity(&name).map(ToString::to_string);
960 if resolved.is_none() {
961 tracing::warn!(entity = %name, "dropping unresolvable entity reference");
962 }
963 Ok(resolved)
964}
965
/// Routes one piece of text encountered while walking an XHTML document.
///
/// In order of precedence:
///
/// 1. If a block is being built, the text is appended to it (even when `suppressed`).
/// 2. Otherwise suppressed or whitespace-only text is dropped, as is text with no
///    enclosing element.
/// 3. If the enclosing element has had no child elements yet and
///    `anchors_text_block` accepts its name, a new paragraph block anchored at that
///    element is opened and becomes the active block.
/// 4. Otherwise, when `allow_stray` is true, the text becomes a stand-alone paragraph
///    block whose path is the enclosing element's path extended by
///    `TEXT_NODE_PATH_BASE + text_count`; when `allow_stray` is false it is dropped.
// The walker's state is passed piecewise so the caller keeps ownership of each part.
#[allow(clippy::too_many_arguments)]
fn handle_text(
    value: &str,
    active_block: &mut Option<BlockBuilder>,
    element_stack: &mut [ElementFrame],
    blocks: &mut Vec<Block>,
    section_id: &SectionId,
    initial_block_count: usize,
    suppressed: bool,
    allow_stray: bool,
) {
    if let Some(block) = active_block.as_mut() {
        block.push_text(value);
        return;
    }
    if suppressed || value.trim().is_empty() {
        return;
    }
    let depth = element_stack.len();
    let Some(frame) = element_stack.last_mut() else {
        return;
    };
    // Leading text of an eligible element: let that element anchor a block so that
    // following inline content is collected with it.
    if frame.child_count == 0 && anchors_text_block(&frame.name) {
        let mut block = BlockBuilder::new(
            depth,
            BlockKind::Paragraph,
            DomPath(frame.path.clone()),
            initial_block_count + blocks.len(),
        );
        block.push_text(value);
        *active_block = Some(block);
        return;
    }
    if !allow_stray {
        return;
    }
    // Stand-alone text node: give it a synthetic path segment under its parent.
    let mut path = frame.path.clone();
    path.push(bookforge_core::ir::TEXT_NODE_PATH_BASE + frame.text_count);
    frame.text_count += 1;
    let visible = normalize_space(value);
    if visible.is_empty() {
        return;
    }
    blocks.push(build_block(
        section_id,
        initial_block_count + blocks.len(),
        BlockKind::Paragraph,
        DomPath(path),
        Vec::new(),
        Vec::new(),
        visible,
    ));
}
1030
/// Reports whether `name` is one of `script`, `style`, `svg`, or `math`.
fn never_translate_element(name: &[u8]) -> bool {
    const OPAQUE_ELEMENTS: [&[u8]; 4] = [b"script", b"style", b"svg", b"math"];
    OPAQUE_ELEMENTS.contains(&name)
}
1035
/// Reports whether an element named `name` may anchor a paragraph block for text
/// found directly inside it.
///
/// Every name qualifies except the structural containers listed below.
fn anchors_text_block(name: &[u8]) -> bool {
    const CONTAINERS: [&[u8]; 19] = [
        b"body",
        b"html",
        b"section",
        b"article",
        b"main",
        b"nav",
        b"head",
        b"header",
        b"footer",
        b"aside",
        b"figure",
        b"ul",
        b"ol",
        b"dl",
        b"table",
        b"thead",
        b"tbody",
        b"tfoot",
        b"colgroup",
    ];
    !CONTAINERS.contains(&name)
}
1064
1065fn enter_element(stack: &mut Vec<ElementFrame>, name: &[u8]) -> Vec<usize> {
1066 let path = next_child_path(stack);
1067 stack.push(ElementFrame {
1068 name: name.to_vec(),
1069 path: path.clone(),
1070 child_count: 0,
1071 text_count: 0,
1072 });
1073 path
1074}
1075
1076fn next_child_path(stack: &mut [ElementFrame]) -> Vec<usize> {
1077 let Some(parent) = stack.last_mut() else {
1078 return vec![0];
1079 };
1080 let child_index = parent.child_count;
1081 parent.child_count += 1;
1082 let mut path = parent.path.clone();
1083 path.push(child_index);
1084 path
1085}
1086
1087fn block_kind(name: &[u8], element: &BytesStart<'_>) -> Result<Option<BlockKind>> {
1088 Ok(match name {
1089 b"h1" => Some(BlockKind::Heading(1)),
1090 b"h2" => Some(BlockKind::Heading(2)),
1091 b"h3" => Some(BlockKind::Heading(3)),
1092 b"h4" => Some(BlockKind::Heading(4)),
1093 b"h5" => Some(BlockKind::Heading(5)),
1094 b"h6" => Some(BlockKind::Heading(6)),
1095 b"p" => Some(BlockKind::Paragraph),
1096 b"li" => Some(BlockKind::ListItem),
1097 b"blockquote" => Some(BlockKind::Quote),
1098 b"td" | b"th" => Some(BlockKind::TableCell),
1099 b"tr" => Some(BlockKind::TableRow),
1100 b"figcaption" | b"caption" => Some(BlockKind::Caption),
1101 b"pre" | b"code" => Some(BlockKind::Code),
1102 b"aside" if has_epub_type(element, b"footnote")? => Some(BlockKind::Footnote),
1103 _ => None,
1104 })
1105}
1106
1107fn has_epub_type(element: &BytesStart<'_>, expected: &[u8]) -> Result<bool> {
1108 for attr in element.attributes() {
1109 let attr = attr.map_err(|err| BookforgeError::InvalidInput(err.to_string()))?;
1110 if local_name(attr.key.as_ref()) == b"type" {
1111 let value = attr.unescape_value()?.into_owned();
1112 return Ok(value
1113 .split_ascii_whitespace()
1114 .any(|item| item.as_bytes() == expected));
1115 }
1116 }
1117 Ok(false)
1118}
1119
1120fn build_block(
1121 section_id: &SectionId,
1122 ordinal: usize,
1123 kind: BlockKind,
1124 dom_path: DomPath,
1125 text_runs: Vec<TextRun>,
1126 inline_marks: Vec<InlineMark>,
1127 visible_text: String,
1128) -> Block {
1129 let text_runs = if text_runs.is_empty() {
1130 vec![TextRun {
1131 id: format!("r{ordinal:06}_000"),
1132 text: visible_text.clone(),
1133 }]
1134 } else {
1135 text_runs
1136 };
1137 let protected_spans = detect_protected_spans(&visible_text);
1138
1139 Block {
1140 id: BlockId(format!("b_{ordinal:06}")),
1141 section_id: section_id.clone(),
1142 kind,
1143 dom_path,
1144 text_runs,
1145 inline_marks,
1146 protected_spans,
1147 token_estimate: estimate_tokens(&visible_text),
1148 }
1149}
1150
1151fn first_heading(blocks: &[Block]) -> (Option<String>, Option<u8>) {
1152 blocks
1153 .iter()
1154 .find_map(|block| match block.kind {
1155 BlockKind::Heading(level) => Some((Some(block_visible_text(block)), Some(level))),
1156 _ => None,
1157 })
1158 .unwrap_or((None, None))
1159}
1160
1161fn link_sections(sections: &mut [Section]) {
1162 let ids = sections
1163 .iter()
1164 .map(|section| section.id.clone())
1165 .collect::<Vec<_>>();
1166
1167 for (index, section) in sections.iter_mut().enumerate() {
1168 section.prev = index.checked_sub(1).and_then(|prev| ids.get(prev).cloned());
1169 section.next = ids.get(index + 1).cloned();
1170 }
1171}
1172
/// Collapses every run of whitespace in `text` to a single space and removes leading
/// and trailing whitespace.
fn normalize_space(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
1176
1177fn normalize_text_fragment(text: &str) -> Option<String> {
1178 if text.trim().is_empty() {
1179 return None;
1180 }
1181
1182 let mut normalized = normalize_space(text);
1183 if text.chars().next().is_some_and(char::is_whitespace) {
1184 normalized.insert(0, ' ');
1185 }
1186 if text.chars().last().is_some_and(char::is_whitespace) {
1187 normalized.push(' ');
1188 }
1189 Some(normalized)
1190}
1191
1192fn block_visible_text(block: &Block) -> String {
1193 let marked = block
1194 .text_runs
1195 .iter()
1196 .map(|run| run.text.as_str())
1197 .collect::<Vec<_>>()
1198 .join("");
1199 normalize_space(&strip_marker_tokens(&marked))
1200}
1201
/// Builds a marker id from `prefix` and the one-based form of `marker_ordinal`
/// (ordinal `0` yields `{prefix}1`).
fn marker_id(prefix: &str, marker_ordinal: usize) -> String {
    let one_based = marker_ordinal + 1;
    let mut id = String::from(prefix);
    id.push_str(&one_based.to_string());
    id
}
1205
/// Estimates a token count as four thirds of the whitespace-separated word count,
/// rounded up, and never less than one.
fn estimate_tokens(text: &str) -> usize {
    let words = text.split_whitespace().count();
    let scaled = words.saturating_mul(4);
    // Ceiling division by three without risking overflow.
    let estimate = scaled / 3 + usize::from(scaled % 3 != 0);
    estimate.max(1)
}
1210
1211fn detect_protected_spans(text: &str) -> Vec<ProtectedSpan> {
1212 let mut spans = text
1213 .split_whitespace()
1214 .filter_map(|raw| {
1215 let value = trim_token(raw);
1216 protected_span_kind(value).map(|kind| ProtectedSpan {
1217 kind,
1218 text: value.to_string(),
1219 })
1220 })
1221 .collect::<Vec<_>>();
1222 spans.sort_by(|left, right| left.text.cmp(&right.text));
1223 spans.dedup_by(|left, right| left.kind == right.kind && left.text == right.text);
1224 spans
1225}
1226
1227fn protected_span_kind(value: &str) -> Option<ProtectedSpanKind> {
1228 if value.is_empty() {
1229 None
1230 } else if value.starts_with("http://") || value.starts_with("https://") {
1231 Some(ProtectedSpanKind::Url)
1232 } else if value.starts_with('#') && value.len() > 1 {
1233 Some(ProtectedSpanKind::InternalAnchor)
1234 } else if looks_like_email(value) {
1235 Some(ProtectedSpanKind::Email)
1236 } else if looks_like_citation(value) {
1237 Some(ProtectedSpanKind::Citation)
1238 } else if looks_like_protected_number(value) {
1239 Some(ProtectedSpanKind::Number)
1240 } else if looks_like_filename(value) {
1241 Some(ProtectedSpanKind::Filename)
1242 } else {
1243 None
1244 }
1245}
1246
/// Strips surrounding punctuation from a whitespace-delimited token.
///
/// Sentence punctuation, parentheses and quotes are removed from both ends
/// first. Square brackets are then removed as well, unless the token is a
/// bracketed citation of the form `[@...]`, which is returned intact.
fn trim_token(raw: &str) -> &str {
    const EDGE_PUNCTUATION: &[char] = &[',', ';', ':', '.', '!', '?', '(', ')', '"', '\''];
    const BRACKETS: &[char] = &['[', ']'];

    let core = raw.trim_matches(EDGE_PUNCTUATION);
    let is_bracketed_citation = core.starts_with("[@") && core.ends_with(']');
    if is_bracketed_citation {
        core
    } else {
        core.trim_matches(BRACKETS)
    }
}
1260
/// Heuristic email check: splits at the first `@` and requires a non-empty
/// local part plus a domain that contains a dot but neither starts nor ends
/// with one.
fn looks_like_email(value: &str) -> bool {
    match value.split_once('@') {
        None => false,
        Some((local, domain)) => {
            let dot_at_edge = domain.starts_with('.') || domain.ends_with('.');
            !local.is_empty() && domain.contains('.') && !dot_at_edge
        }
    }
}
1267
/// Recognises citation keys: either a bare `@key` (longer than the `@`
/// alone) or a bracketed `[@key]` (longer than the empty `[@]`).
fn looks_like_citation(value: &str) -> bool {
    let bare_key = value.len() > 1 && value.starts_with('@');
    let bracketed_key = value.len() > 3 && value.starts_with("[@") && value.ends_with(']');
    bare_key || bracketed_key
}
1272
/// Heuristic filename check.
///
/// The token is split at its last `.`; the part after it must be one of a
/// fixed list of common extensions (compared ASCII case-insensitively), and
/// the part before it must be non-empty and consist only of ASCII
/// alphanumerics, `_`, `-`, `/` or `.`.
fn looks_like_filename(value: &str) -> bool {
    const COMMON_EXTENSIONS: &[&str] = &[
        "azw", "css", "csv", "epub", "gif", "htm", "html", "jpeg", "jpg", "js", "json", "md",
        "mobi", "ncx", "opf", "pdf", "png", "svg", "txt", "xhtml", "xml", "zip",
    ];

    match value.rsplit_once('.') {
        Some((stem, ext)) if !stem.is_empty() => {
            let known_extension = COMMON_EXTENSIONS
                .iter()
                .any(|known| known.eq_ignore_ascii_case(ext));
            let path_like_stem = stem
                .chars()
                .all(|ch| matches!(ch, '_' | '-' | '/' | '.') || ch.is_ascii_alphanumeric());
            known_extension && path_like_stem
        }
        _ => false,
    }
}
1288
/// Heuristic check for numeric tokens that should be protected.
///
/// A token qualifies when it contains at least one ASCII digit and either:
///
/// * it is an ordinal, i.e. ASCII digits followed by `st`, `nd`, `rd` or `th`
///   (`1st`, `21st`, `100th`), regardless of how many digits it has;
/// * it has two or more digits and consists only of digits and numeric
///   punctuation (`.`, `,`, `:`, `;`, `/`, `-`, `+`, `%`, `$`, `€`, `£`), such
///   as `12-14` or `$1,200.50`; or
/// * it has exactly one digit and ends with an ordinal suffix.
///
/// A bare single digit such as `1` is deliberately not protected.
fn looks_like_protected_number(value: &str) -> bool {
    const ORDINAL_SUFFIXES: [&str; 4] = ["st", "nd", "rd", "th"];

    let digit_count = value.chars().filter(|ch| ch.is_ascii_digit()).count();
    if digit_count == 0 {
        return false;
    }

    // Ordinals must be recognised before the digit-count split: previously
    // only single-digit tokens reached the suffix check, so `10th` or `21st`
    // were rejected by the numeric-charset test while `1st` was accepted.
    let is_ordinal = ORDINAL_SUFFIXES.iter().any(|suffix| {
        value
            .strip_suffix(suffix)
            .is_some_and(|stem| !stem.is_empty() && stem.chars().all(|ch| ch.is_ascii_digit()))
    });
    if is_ordinal {
        return true;
    }

    if digit_count >= 2 {
        return value.chars().all(|ch| {
            ch.is_ascii_digit()
                || matches!(
                    ch,
                    '.' | ',' | ':' | ';' | '/' | '-' | '+' | '%' | '$' | '\u{20ac}' | '\u{00a3}'
                )
        });
    }

    // NOTE(review): this single-digit fallback is kept for backward
    // compatibility; it accepts any one-digit token that merely ends with an
    // ordinal suffix (e.g. `3-month`). Confirm whether that is intended.
    ORDINAL_SUFFIXES.iter().any(|suffix| value.ends_with(suffix))
}
1305
1306fn read_archive_text(archive: &mut ZipArchive<File>, path: &str) -> Result<String> {
1307 let mut file = archive.by_name(path)?;
1308 let mut text = String::new();
1309 file.read_to_string(&mut text)?;
1310 Ok(text)
1311}
1312
/// Returns `true` when `media_type` is exactly one of the media types this
/// reader treats as XHTML content: `application/xhtml+xml`, `text/html` or
/// `application/xml`. The comparison is case-sensitive.
fn is_xhtml_media_type(media_type: &str) -> bool {
    const XHTML_MEDIA_TYPES: [&str; 3] = ["application/xhtml+xml", "text/html", "application/xml"];
    XHTML_MEDIA_TYPES.iter().any(|known| *known == media_type)
}
1319
1320fn is_nav_item(item: &Resource) -> bool {
1321 item.media_type == "application/xhtml+xml"
1322 && (item.properties.iter().any(|property| property == "nav")
1323 || item.href.ends_with("nav.xhtml"))
1324}
1325
/// Returns the directory portion of `package_path`, or an empty string when
/// the path has no parent (or the parent is not valid UTF-8).
fn package_base_dir(package_path: &str) -> String {
    match Path::new(package_path).parent().and_then(|dir| dir.to_str()) {
        Some(dir) => dir.to_owned(),
        None => String::new(),
    }
}
1333
1334fn join_epub_path(base: &str, href: &str) -> String {
1335 if base.is_empty() {
1336 normalize_epub_path(href)
1337 } else {
1338 normalize_epub_path(&format!("{base}/{href}"))
1339 }
1340}
1341
/// Normalizes a container-relative path into the `/`-separated form used for
/// archive entry names.
///
/// Repeated separators and `.` segments are dropped, and every `..` segment
/// removes the segment before it, so `OEBPS/../images/cover.png` becomes
/// `images/cover.png`. A `..` with nothing left to remove is discarded
/// rather than kept, because an entry name cannot point above the container
/// root. Backslashes in the result are replaced with `/`.
fn normalize_epub_path(path: &str) -> String {
    let mut normalized = PathBuf::new();
    for component in Path::new(path).components() {
        match component {
            // `components()` only yields `CurDir` for a leading `./`; keeping
            // it would produce names like `./ch1.xhtml` that never match.
            std::path::Component::CurDir => {}
            // Resolve the parent reference instead of pushing a literal `..`.
            // `pop` is a no-op when there is nothing left to remove.
            std::path::Component::ParentDir => {
                normalized.pop();
            }
            other => normalized.push(other.as_os_str()),
        }
    }
    normalized.to_string_lossy().replace('\\', "/")
}
1349
/// Returns the part of a qualified XML name after its last `:`, or the whole
/// name when it contains no colon.
fn local_name(name: &[u8]) -> &[u8] {
    match name.iter().rposition(|&byte| byte == b':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
1353
#[cfg(test)]
mod tests {
    use super::*;

    /// An inline `<em>` is replaced by a paired `<m1>…</m1>` marker and recorded
    /// as an inline mark; the token estimate is based on the marker-free text.
    #[test]
    fn extracts_inline_marks_and_marker_text_runs() {
        let section_id = SectionId("sec_000000".to_string());
        let blocks = extract_blocks(
            "<html><body><p>Hello <em>world</em>!</p></body></html>",
            "chapter.xhtml",
            &section_id,
            0,
        )
        .expect("block extraction should succeed");

        assert_eq!(blocks.len(), 1);
        let text = block_text(&blocks[0]);
        assert_eq!(text, "Hello <m1>world</m1>!");
        assert_eq!(blocks[0].inline_marks.len(), 1);
        assert_eq!(blocks[0].inline_marks[0].id, "m1");
        assert_eq!(blocks[0].inline_marks[0].kind, "em");
        assert_eq!(blocks[0].token_estimate, estimate_tokens("Hello world!"));
    }

    /// A `<br/>` becomes a self-closing `<r1/>` marker, and the starting block
    /// index passed to `extract_blocks` is reflected in the block id.
    #[test]
    fn extracts_empty_inline_marker() {
        let section_id = SectionId("sec_000000".to_string());
        let blocks = extract_blocks(
            "<html><body><p>Line<br/>break</p></body></html>",
            "chapter.xhtml",
            &section_id,
            4,
        )
        .expect("block extraction should succeed");

        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].id.0, "b_000004");
        assert_eq!(block_text(&blocks[0]), "Line<r1/>break");
        assert_eq!(blocks[0].inline_marks[0].id, "r1");
        assert_eq!(blocks[0].inline_marks[0].kind, "br");
    }

    /// Text placed directly inside a `<div>` is extracted as a paragraph block.
    #[test]
    fn extracts_text_anchored_block_from_div() {
        let section_id = SectionId("sec_000000".to_string());
        let blocks = extract_blocks(
            "<html><body><div class=\"x\">Bare div text with <em>emphasis</em>.</div></body></html>",
            "chapter.xhtml",
            &section_id,
            0,
        )
        .expect("block extraction should succeed");

        assert_eq!(blocks.len(), 1);
        assert_eq!(
            block_text(&blocks[0]),
            "Bare div text with <m1>emphasis</m1>."
        );
        assert_eq!(blocks[0].kind, BlockKind::Paragraph);
    }

    /// `<dt>` and `<dd>` each produce their own block.
    #[test]
    fn extracts_dt_and_dd_text() {
        let section_id = SectionId("sec_000000".to_string());
        let blocks = extract_blocks(
            "<html><body><dl><dt>Term</dt><dd>Definition</dd></dl></body></html>",
            "chapter.xhtml",
            &section_id,
            0,
        )
        .expect("block extraction should succeed");

        let texts: Vec<String> = blocks.iter().map(block_text).collect();
        assert_eq!(texts, vec!["Term".to_string(), "Definition".to_string()]);
    }

    /// Text that follows a block element inside `<body>` gets its own block,
    /// addressed with a text-node path component.
    #[test]
    fn stray_text_after_children_becomes_addressable_block() {
        let section_id = SectionId("sec_000000".to_string());
        let blocks = extract_blocks(
            "<html><body><p>Captured</p>Naked tail text</body></html>",
            "chapter.xhtml",
            &section_id,
            0,
        )
        .expect("block extraction should succeed");

        assert_eq!(blocks.len(), 2);
        assert_eq!(block_text(&blocks[1]), "Naked tail text");
        let last = *blocks[1]
            .dom_path
            .0
            .last()
            .expect("path should not be empty");
        assert!(
            last >= bookforge_core::ir::TEXT_NODE_PATH_BASE,
            "stray text block must use a text-node path component, got {last}"
        );
    }

    /// A list item containing a nested list stays one block, including the
    /// text that follows the nested list.
    #[test]
    fn nested_same_name_blocks_stay_in_one_block() {
        let section_id = SectionId("sec_000000".to_string());
        let blocks = extract_blocks(
            "<html><body><ul><li>Outer <ul><li>Inner</li></ul> tail</li><li>Sibling</li></ul></body></html>",
            "chapter.xhtml",
            &section_id,
            0,
        )
        .expect("block extraction should succeed");

        assert_eq!(blocks.len(), 2, "outer li (with nested list) + sibling li");
        let outer = block_text(&blocks[0]);
        assert!(outer.contains("Outer"), "got: {outer}");
        assert!(
            outer.contains("Inner"),
            "nested li text stays inside the outer block: {outer}"
        );
        assert!(
            outer.contains("tail"),
            "text after the nested list must not be lost: {outer}"
        );
        assert_eq!(block_text(&blocks[1]), "Sibling");
    }

    /// Named HTML entities must be decoded rather than failing the parse.
    #[test]
    fn named_html_entities_decode_in_text() {
        let section_id = SectionId("sec_000000".to_string());
        // The input must contain the entity references themselves; literal
        // characters would not exercise entity decoding at all.
        let blocks = extract_blocks(
            "<html><body><p>one&nbsp;two&mdash;three</p></body></html>",
            "chapter.xhtml",
            &section_id,
            0,
        )
        .expect("named entities must not fail extraction");

        assert_eq!(blocks.len(), 1);
        assert_eq!(block_text(&blocks[0]), "one two\u{2014}three");
    }

    /// `<style>` and `<script>` contents never appear in extracted blocks,
    /// while the `<title>` and body text do.
    #[test]
    fn script_and_style_text_is_never_extracted() {
        let section_id = SectionId("sec_000000".to_string());
        let blocks = extract_blocks(
            "<html><head><title>Meta</title><style>p { color: red; }</style></head><body><script>var x = 1;</script><div>Real</div></body></html>",
            "chapter.xhtml",
            &section_id,
            0,
        )
        .expect("block extraction should succeed");

        let texts = blocks.iter().map(block_text).collect::<Vec<_>>();
        assert_eq!(texts, vec!["Meta", "Real"]);
        assert!(!texts.iter().any(|text| text.contains("color")));
        assert!(!texts.iter().any(|text| text.contains("var x")));
    }

    /// The visible-character total counts title and body text but not styles.
    #[test]
    fn visible_body_chars_counts_body_and_title_but_not_style() {
        let xhtml = r#"<html><head><title>Heading</title><style>p { color: red; }</style></head>
<body><p>captured</p><div>div text</div></body></html>"#;
        let total = visible_body_chars(xhtml).expect("count should succeed");
        // "Heading" (7) + "captured" (8) + "div text" without its space (7).
        assert_eq!(total, 22);
    }

    /// Prose in `<p>` and `<div>` is captured; SVG `<text>` labels are counted
    /// as visible but stay uncaptured, so they show up as a coverage gap.
    #[test]
    fn coverage_captures_div_text_and_reports_svg_labels_uncaptured() {
        let section_id = SectionId("sec_000000".to_string());
        let xhtml =
            "<html><body><p>in a block</p><div>also in a div</div><svg><text>diagram label</text></svg></body></html>"
                .to_string();
        let blocks =
            extract_blocks(&xhtml, "chapter.xhtml", &section_id, 0).expect("blocks should parse");
        let captured = blocks
            .iter()
            .map(|block| non_whitespace_chars(&block_visible_text(block)))
            .sum::<usize>();
        let total = visible_body_chars(&xhtml).expect("count should succeed");

        assert_eq!(
            captured,
            non_whitespace_chars("in a block") + non_whitespace_chars("also in a div"),
            "prose in p and div must both be captured"
        );
        assert_eq!(
            total - captured,
            non_whitespace_chars("diagram label"),
            "svg text stays uncaptured and visible in the coverage gap"
        );
    }

    /// A bare single digit is not protected, while URLs, filenames, anchors
    /// and page ranges are.
    #[test]
    fn protected_spans_do_not_overflag_single_digits() {
        let spans = detect_protected_spans(
            "Chapter 1 cites https://example.com, file.txt, #anchor, and pages 12-14.",
        );
        let texts = spans
            .iter()
            .map(|span| span.text.as_str())
            .collect::<Vec<_>>();

        assert!(!texts.contains(&"1"));
        assert!(texts.contains(&"https://example.com"));
        assert!(texts.contains(&"file.txt"));
        assert!(texts.contains(&"#anchor"));
        assert!(texts.contains(&"12-14"));
    }

    /// Words glued together by a missing space after punctuation must not be
    /// mistaken for filenames; real filenames and citation keys still match.
    #[test]
    fn protected_spans_do_not_treat_sentence_fragments_as_filenames() {
        let spans = detect_protected_spans(
            "case.Fedor bow.At said:“The file.txt chapter.xhtml [@tolstoy1886] @note1",
        );
        let texts = spans
            .iter()
            .map(|span| span.text.as_str())
            .collect::<Vec<_>>();

        assert!(!texts.contains(&"case.Fedor"));
        assert!(!texts.contains(&"bow.At"));
        assert!(!texts.contains(&"said:“The"));
        assert!(texts.contains(&"file.txt"));
        assert!(texts.contains(&"chapter.xhtml"));
        assert!(texts.contains(&"[@tolstoy1886]"));
        assert!(texts.contains(&"@note1"));
    }

    /// Concatenates the raw text of every run in `block`, markers included.
    fn block_text(block: &Block) -> String {
        block
            .text_runs
            .iter()
            .map(|run| run.text.as_str())
            .collect::<Vec<_>>()
            .join("")
    }
}