1use anyhow::{Context, Result};
2use once_cell::sync::Lazy;
3use rbook::Epub;
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6use std::path::{Path, PathBuf};
7use walkdir::WalkDir;
8
9use kuchiki::NodeRef;
10
11mod collect;
12mod export;
13mod heading;
14mod postprocess;
15mod render;
16
17use collect::{
18 collect_image_hrefs, collect_media_hrefs, collect_readable_spine_docs, collect_toc_entries,
19 load_content,
20};
21use export::{write_manifest_export, write_markdown_outputs, write_quality_report};
22use heading::{detect_heading_candidates, prettify_section_name};
23use postprocess::{cleanup_toc_entries, postprocess_sections};
24use render::{
25 build_style_header, collect_css, extract_image, extract_media_file,
26 render_partial_with_anchors, resolve_and_extract_image,
27};
28
/// Output text flavor: `Plain` markdown, or `Rich` output that also
/// collects and emits CSS (see `CssMode`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum FormatMode {
    Plain,
    Rich,
}

/// CSS placement for `FormatMode::Rich` output: inlined into the style
/// header or written as external stylesheet files.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum CssMode {
    Inline,
    External,
}

/// When to use heading detection instead of the TOC for chapter splits:
/// `Off` never, `Auto` only when the TOC looks degenerate, `Force` always.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum ChapterFallbackMode {
    Off,
    Auto,
    Force,
}

/// Footnote/endnote placement strategy, consumed by
/// `postprocess::postprocess_sections`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum NotesMode {
    Inline,
    ChapterEnd,
    Global,
}

/// Toggle for optional export artifacts (manifest / quality report):
/// `Off` disables the export, `V1` selects the version-1 format.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum ExportMode {
    Off,
    V1,
}

/// Aggressiveness of OCR-noise cleanup applied in post-processing.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum OcrCleanupMode {
    Off,
    Basic,
    Aggressive,
}

/// Whether navigation-style TOC entries are filtered out
/// (see `postprocess::cleanup_toc_entries`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum NavCleanupMode {
    Off,
    Auto,
}

/// Naming scheme for per-section output files, consumed by
/// `postprocess_sections` when assigning `SectionRecord::output_path`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum FilenameScheme {
    Index,
    Hash,
}

/// Which embedded assets to extract: `None` leaves image sources as-is,
/// `Image` extracts images referenced from content lazily, `All`
/// additionally pre-extracts every image and media file up front.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum MediaMode {
    None,
    Image,
    All,
}
86
/// All user-configurable settings for a conversion run.
///
/// Built with [`ConvertOptions::new`] and then adjusted field-by-field;
/// every field is public.
#[derive(Clone, Debug)]
pub struct ConvertOptions {
    /// EPUB file, or directory searched recursively for EPUBs.
    pub input: PathBuf,
    /// Root directory that converted output is written under.
    pub output: PathBuf,
    /// Which embedded assets (images/media) to extract.
    pub media: MediaMode,
    /// Plain markdown vs. rich output with CSS.
    pub format: FormatMode,
    /// CSS placement; only consulted when `format` is `Rich`.
    pub css: CssMode,
    /// Split output per chapter (also changes asset link prefixes).
    pub split_chapters: bool,
    /// When to fall back to heading detection for chapter boundaries.
    pub chapter_fallback: ChapterFallbackMode,
    /// Note placement strategy applied during post-processing.
    pub notes_mode: NotesMode,
    /// Whether to write the machine-readable manifest export.
    pub export_manifest: ExportMode,
    /// Whether to write the conversion quality report.
    pub quality_report: ExportMode,
    /// OCR-noise cleanup level applied during post-processing.
    pub ocr_cleanup: OcrCleanupMode,
    /// Whether navigation-like TOC entries are cleaned up.
    pub nav_cleanup: NavCleanupMode,
    /// Naming scheme for per-section output files.
    pub filename_scheme: FilenameScheme,
}
103
104impl ConvertOptions {
105 pub fn new(input: PathBuf, output: PathBuf) -> Self {
106 Self {
107 input,
108 output,
109 media: MediaMode::Image,
110 format: FormatMode::Plain,
111 css: CssMode::Inline,
112 split_chapters: false,
113 chapter_fallback: ChapterFallbackMode::Auto,
114 notes_mode: NotesMode::Inline,
115 export_manifest: ExportMode::Off,
116 quality_report: ExportMode::Off,
117 ocr_cleanup: OcrCleanupMode::Off,
118 nav_cleanup: NavCleanupMode::Auto,
119 filename_scheme: FilenameScheme::Index,
120 }
121 }
122}
123
/// Severity of a conversion diagnostic.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DiagnosticLevel {
    Info,
    Warning,
    Error,
}

/// A single human-readable message produced while converting a book.
#[derive(Clone, Debug)]
pub struct Diagnostic {
    pub level: DiagnosticLevel,
    pub message: String,
}

/// Outcome of converting one EPUB.
///
/// `output_path` is `None` when the conversion failed; diagnostics carry
/// the details either way.
#[derive(Clone, Debug)]
pub struct BookConversionResult {
    pub input_path: PathBuf,
    pub title: String,
    pub output_path: Option<PathBuf>,
    pub diagnostics: Vec<Diagnostic>,
}

/// Aggregate of all per-book results from a batch run.
#[derive(Clone, Debug, Default)]
pub struct ConversionSummary {
    pub books: Vec<BookConversionResult>,
}
149
150impl ConversionSummary {
151 pub fn failure_count(&self) -> usize {
152 self.books
153 .iter()
154 .filter(|book| book.output_path.is_none())
155 .count()
156 }
157
158 pub fn success_count(&self) -> usize {
159 self.books.len().saturating_sub(self.failure_count())
160 }
161}
162
/// A flattened table-of-contents entry.
#[derive(Clone, Debug)]
struct TocEntryInfo {
    // Display label of the entry.
    label: String,
    // Href (without fragment) the entry points at.
    href_path: String,
    // Optional `#fragment` target inside the document.
    fragment: Option<String>,
}

/// A content document cached by href in `content_cache`.
#[derive(Clone, Debug)]
struct ContentDoc {
    // Normalized href of the document within the EPUB.
    href_path: String,
    // Parsed DOM of the document.
    document: NodeRef,
}

/// A spine entry considered readable content, with a display label.
#[derive(Clone, Debug)]
struct ReadableSpineDoc {
    href_path: String,
    label: String,
}

/// A spine document flagged by heading detection as a likely chapter start.
#[derive(Clone, Debug)]
struct HeadingCandidate {
    // Index into the readable spine ordering.
    spine_idx: usize,
    // Detection confidence produced by `heading::detect_heading_candidates`;
    // not consulted in this file.
    score: f32,
    // Heading text to use as the section title (may be empty).
    label: String,
}

/// One logical output section (typically a chapter) assembled from one or
/// more consecutive spine documents.
#[derive(Clone, Debug)]
struct SectionRecord {
    // Section title, from the TOC entry or the detected heading.
    title: String,
    // Rendered section body.
    text: String,
    // Spine href where the section starts.
    start_href: String,
    // Optional fragment id marking the start within `start_href`.
    start_fragment: Option<String>,
    // End-boundary href. NOTE(review): semantics differ by path — the TOC
    // path stores the *next* entry's href, the heading fallback stores the
    // last *included* href; confirm consumers handle both.
    end_href: Option<String>,
    // Optional fragment id for the end boundary.
    end_fragment: Option<String>,
    // Index of the first spine document included.
    spine_start: usize,
    // Index of the last spine document included.
    spine_end: usize,
    // Sorted anchor ids collected while rendering (presumably used for
    // internal link rewriting in postprocess — confirm there).
    anchors: Vec<String>,
    // Stable identifier; left empty here, filled during post-processing.
    section_id: String,
    // Relative output path; left empty here, filled during post-processing.
    output_path: String,
}

/// Counters and collected note lines returned by `postprocess_sections`.
#[derive(Clone, Debug, Default)]
struct PostprocessStats {
    // Internal links rewritten to new targets (maintained in postprocess).
    link_rewritten: usize,
    // Internal links whose target could not be resolved; surfaced as a
    // warning in `convert_epub_result`.
    link_unresolved: usize,
    // Count of cleanup edits applied to section text.
    cleanup_changes: usize,
    // Notes emitted during post-processing.
    notes_written: usize,
    // Note lines deferred for a single global notes block; forwarded to
    // `write_markdown_outputs`.
    global_note_lines: Vec<String>,
}
212
// HTML tags whose structure does not flatten cleanly to plain markdown
// (tables, figures, SVG, MathML). Not referenced in this file; private
// items are reachable from the child modules via `super::` — presumably
// used by `render`/`postprocess` (confirm there).
const COMPLEX_HTML_TAGS: &[&str] = &[
    "table",
    "thead",
    "tbody",
    "tr",
    "td",
    "th",
    "figure",
    "figcaption",
    "svg",
    "math",
];

// Case-insensitive match for major structural headings: "chapter/book/part"
// followed by a roman or arabic numeral, or standalone front/back-matter
// words (preface, prologue, epilogue, ...).
static MAJOR_HEADING_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)\b(?:chapter|book|part)\s+(?:[ivxlcdm]+|\d+)\b|\b(?:preface|prologue|epilogue|introduction|foreword|afterword)\b",
    )
    .expect("valid heading regex")
});
// Like MAJOR_HEADING_RE, but also captures an optional short title after a
// ":", "." or "-" separator so the whole label can be extracted.
static MAJOR_HEADING_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)\b(?:chapter|book|part)\s+(?:[ivxlcdm]+|\d+)(?:\s*[:.-]?\s*[a-z0-9][a-z0-9' -]{0,70})?|\b(?:preface|prologue|epilogue|introduction|foreword|afterword)\b",
    )
    .expect("valid heading label regex")
});
// Matches the "estimated to be only N% accurate" OCR boilerplate line.
static OCR_NOISE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)estimated\s+to\s+be\s+only\s+\d+(?:\.\d+)?%\s+accurate")
        .expect("valid ocr regex")
});
// Markdown links and images: group 1 is "!" for images, group 2 the text,
// group 3 the target.
static MARKDOWN_LINK_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(!?)\[([^\]]+)\]\(([^)]+)\)").expect("valid markdown link regex"));
// Double-quoted href inside an <a> tag, split into three groups so the
// middle (URL) group can be rewritten in place.
static HTML_HREF_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(<a\b[^>]*?\bhref=")([^"]+)(")"#).expect("valid html href regex")
});
// A markdown footnote definition line: `[^id]: text`.
static FOOTNOTE_DEF_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^\[\^([^\]]+)\]:\s*(.*)$").expect("valid footnote regex"));
249
250pub fn convert_all(options: &ConvertOptions) -> Result<ConversionSummary> {
251 let epub_paths = collect_input_epubs(&options.input)?;
252
253 let mut summary = ConversionSummary::default();
254 for epub_path in epub_paths {
255 match convert_epub_result(&epub_path, options) {
256 Ok(result) => summary.books.push(result),
257 Err(err) => {
258 summary.books.push(BookConversionResult {
259 input_path: epub_path.clone(),
260 title: epub_path
261 .file_stem()
262 .and_then(|s| s.to_str())
263 .unwrap_or("book")
264 .to_string(),
265 output_path: None,
266 diagnostics: vec![Diagnostic {
267 level: DiagnosticLevel::Error,
268 message: format!("Failed to parse {}: {err}", epub_path.display()),
269 }],
270 });
271 }
272 }
273 }
274
275 Ok(summary)
276}
277
278pub fn convert_epub(epub_path: &Path, options: &ConvertOptions) -> Result<PathBuf> {
279 let result = convert_epub_result(epub_path, options)?;
280 result
281 .output_path
282 .ok_or_else(|| anyhow::anyhow!("No output path generated for {}", epub_path.display()))
283}
284
/// Convert one EPUB into markdown output plus per-book diagnostics.
///
/// Pipeline stages, in order:
/// 1. Open the EPUB and read title/author metadata.
/// 2. Derive output directories and asset link prefixes from the slug.
/// 3. Optionally pre-extract all images/media (`MediaMode::All`).
/// 4. Build sections from the TOC, from a heading-detection fallback, or
///    one section per readable spine document.
/// 5. Post-process sections, then write markdown outputs and the optional
///    manifest / quality-report exports.
///
/// # Errors
/// Fails when the EPUB cannot be opened, when no readable sections are
/// found, or when writing any output fails.
pub fn convert_epub_result(
    epub_path: &Path,
    options: &ConvertOptions,
) -> Result<BookConversionResult> {
    let epub = Epub::open(epub_path)
        .with_context(|| format!("Failed to open epub {}", epub_path.display()))?;

    // Title falls back to the file stem when the metadata has none.
    let title = epub
        .metadata()
        .title()
        .map(|t| t.value().to_string())
        .unwrap_or_else(|| {
            epub_path
                .file_stem()
                .and_then(|s| s.to_str())
                .unwrap_or("book")
                .to_string()
        });

    // Only the first creator (if any) is used as the author.
    let author = epub
        .metadata()
        .creators()
        .next()
        .map(|c| c.value().to_string());

    let book_slug = slugify(&title);
    let book_dir = options.output.join(&book_slug);
    let image_root = book_dir.join("images");
    let media_root = book_dir.join("media");
    let style_root = book_dir.join("styles");
    // Link prefixes depend on layout: split-chapter files live inside
    // `book_dir`, a single combined file sits next to it.
    let image_link_prefix = if options.split_chapters {
        "./images".to_string()
    } else {
        format!("./{book_slug}/images")
    };
    let media_link_prefix = if options.split_chapters {
        "./media".to_string()
    } else {
        format!("./{book_slug}/media")
    };
    let style_link_prefix = if options.split_chapters {
        "./styles".to_string()
    } else {
        format!("./{book_slug}/styles")
    };

    // Maps of epub href -> extracted link target, plus extraction counters.
    let mut extracted_images: HashMap<String, String> = HashMap::new();
    let mut extracted_media: HashMap<String, String> = HashMap::new();
    let mut extracted_count = 0usize;
    let mut extracted_media_count = 0usize;

    let mut css_hrefs: HashSet<String> = HashSet::new();
    let mut inline_styles: Vec<String> = Vec::new();
    let mut warnings: Vec<String> = Vec::new();
    let mut errors: Vec<String> = Vec::new();

    // Small helper closure so warning call sites stay terse.
    let mut warn = |message: String| {
        warnings.push(message);
    };

    // MediaMode::All pre-extracts everything up front; extraction failures
    // are deliberately ignored here (best effort).
    if options.media == MediaMode::All {
        for href in collect_image_hrefs(&epub) {
            let _ = extract_image(
                &epub,
                &href,
                &image_root,
                &image_link_prefix,
                &mut extracted_images,
                &mut extracted_count,
            );
        }
        for href in collect_media_hrefs(&epub) {
            let _ = extract_media_file(
                &epub,
                &href,
                &media_root,
                &media_link_prefix,
                &mut extracted_media,
                &mut extracted_media_count,
            );
        }
    }

    // Parsed documents are cached by href so each is loaded at most once.
    let mut content_cache: HashMap<String, ContentDoc> = HashMap::new();

    // Resolves an image src seen during rendering; `None` mode passes the
    // src through untouched, otherwise the image is extracted on demand.
    let mut image_resolver = |src: &str, base_href: &str| -> Option<String> {
        match options.media {
            MediaMode::None => Some(src.to_string()),
            MediaMode::Image | MediaMode::All => resolve_and_extract_image(
                &epub,
                src,
                base_href,
                &image_root,
                &image_link_prefix,
                &mut extracted_images,
                &mut extracted_count,
            ),
        }
    };

    // Gather TOC + spine structure and index spine hrefs by position.
    let toc_entries_raw = collect_toc_entries(&epub);
    let (toc_entries, nav_removed) = cleanup_toc_entries(toc_entries_raw, options.nav_cleanup);
    let spine_docs = collect_readable_spine_docs(&epub);
    let spine_hrefs: Vec<String> = spine_docs.iter().map(|doc| doc.href_path.clone()).collect();
    let spine_index_by_href: HashMap<String, usize> = spine_hrefs
        .iter()
        .enumerate()
        .map(|(idx, href)| (href.clone(), idx))
        .collect();
    let (toc_is_degenerate, toc_entry_count, toc_unique_count, toc_coverage_ratio) =
        toc_degeneracy_stats(&toc_entries, spine_hrefs.len());
    let mut sections: Vec<SectionRecord> = Vec::new();

    // Decide whether to even attempt heading-based chapter detection.
    let mut use_heading_fallback = false;
    let attempt_heading_fallback = match options.chapter_fallback {
        ChapterFallbackMode::Off => false,
        ChapterFallbackMode::Auto => {
            if toc_is_degenerate {
                true
            } else {
                warn(format!(
                    "heading fallback skipped for {}: TOC not degenerate (entries={}, unique_hrefs={}, coverage={:.2}).",
                    title, toc_entry_count, toc_unique_count, toc_coverage_ratio
                ));
                false
            }
        }
        ChapterFallbackMode::Force => true,
    };

    if attempt_heading_fallback {
        let heading_candidates = detect_heading_candidates(&spine_hrefs, &mut content_cache, &epub);
        // A candidate at spine index 0 is useless as a split point, so it
        // is dropped here.
        let confident_candidates: Vec<HeadingCandidate> = heading_candidates
            .into_iter()
            .filter(|candidate| candidate.spine_idx > 0)
            .collect();
        if !confident_candidates.is_empty() {
            // The opening section (before the first detected heading) is
            // labelled from the first TOC entry, or from the first spine
            // href, or with a generic "Section 1".
            let first_label = toc_entries
                .first()
                .map(|entry| entry.label.clone())
                .filter(|label| !label.trim().is_empty())
                .unwrap_or_else(|| {
                    spine_hrefs
                        .first()
                        .map(|href| prettify_section_name(href))
                        .unwrap_or_else(|| "Section 1".to_string())
                });
            // `starts` = (spine index, label) for every section boundary.
            let mut starts: Vec<(usize, String)> = vec![(0, first_label)];
            for candidate in &confident_candidates {
                let label = if candidate.label.trim().is_empty() {
                    format!("Section {}", starts.len() + 1)
                } else {
                    candidate.label.clone()
                };
                starts.push((candidate.spine_idx, label));
            }

            warn(format!(
                "using heading fallback for {} (mode={:?}, toc_entries={}, spine_docs={}, detected_starts={}).",
                title,
                options.chapter_fallback,
                toc_entry_count,
                spine_hrefs.len(),
                confident_candidates.len()
            ));
            use_heading_fallback = true;

            // Render each [start, next_start) span of spine docs into one
            // section.
            for (start_pos, (start_idx, section_label)) in starts.iter().enumerate() {
                let next_start = starts
                    .get(start_pos + 1)
                    .map(|(idx, _)| *idx)
                    .unwrap_or(spine_hrefs.len());
                if next_start == 0 || next_start <= *start_idx {
                    continue;
                }
                let end_idx = next_start - 1;
                let mut chunks: Vec<String> = Vec::new();
                let mut anchors: HashSet<String> = HashSet::new();
                for spine_idx in *start_idx..=end_idx {
                    let Some(href) = spine_hrefs.get(spine_idx) else {
                        continue;
                    };
                    let content = match load_content(&epub, href, &mut content_cache) {
                        Ok(content) => content,
                        Err(err) => {
                            errors.push(err.to_string());
                            continue;
                        }
                    };
                    if options.format == FormatMode::Rich {
                        collect_css(content, href, &mut css_hrefs, &mut inline_styles);
                    }
                    let (part, part_anchors) = render_partial_with_anchors(
                        content,
                        options.format,
                        None,
                        None,
                        &mut image_resolver,
                    );
                    for anchor in part_anchors {
                        anchors.insert(anchor);
                    }
                    if let Some(part) = part {
                        if !part.trim().is_empty() {
                            chunks.push(part);
                        }
                    }
                }
                let text = chunks.join("\n\n").trim().to_string();
                if !text.is_empty() {
                    sections.push(SectionRecord {
                        title: section_label.clone(),
                        text,
                        start_href: spine_hrefs[*start_idx].clone(),
                        start_fragment: None,
                        end_href: Some(spine_hrefs[end_idx].clone()),
                        end_fragment: None,
                        spine_start: *start_idx,
                        spine_end: end_idx,
                        // Sorted for deterministic output.
                        anchors: {
                            let mut values: Vec<String> = anchors.into_iter().collect();
                            values.sort();
                            values
                        },
                        section_id: String::new(),
                        output_path: String::new(),
                    });
                }
            }
        } else {
            warn(format!(
                "heading fallback skipped for {}: insufficient heading confidence.",
                title
            ));
        }
    }

    // Primary path: one section per TOC entry, spanning up to the next
    // entry's spine position.
    if !use_heading_fallback && !toc_entries.is_empty() {
        for (idx, entry) in toc_entries.iter().enumerate() {
            let Some(start_idx) = spine_index_by_href.get(&entry.href_path).copied() else {
                continue;
            };
            let next_entry = toc_entries.get(idx + 1);
            let end_idx = if let Some(next) = next_entry {
                spine_index_by_href
                    .get(&next.href_path)
                    .copied()
                    .unwrap_or(spine_hrefs.len().saturating_sub(1))
            } else {
                spine_hrefs.len().saturating_sub(1)
            };
            if end_idx < start_idx {
                continue;
            }

            let mut chunks: Vec<String> = Vec::new();
            let mut section_anchors: HashSet<String> = HashSet::new();
            for spine_idx in start_idx..=end_idx {
                let Some(href) = spine_hrefs.get(spine_idx) else {
                    continue;
                };
                let content = match load_content(&epub, href, &mut content_cache) {
                    Ok(content) => content,
                    Err(err) => {
                        errors.push(err.to_string());
                        continue;
                    }
                };
                if options.format == FormatMode::Rich {
                    collect_css(content, href, &mut css_hrefs, &mut inline_styles);
                }

                // When the next entry starts a whole new document (no
                // fragment), the shared end document belongs entirely to
                // the next section — skip it here.
                if let Some(next) = next_entry {
                    if spine_idx == end_idx && next.fragment.is_none() {
                        continue;
                    }
                }

                // Fragment boundaries apply only on the first/last doc of
                // the span.
                let start_fragment = if spine_idx == start_idx {
                    entry.fragment.as_deref()
                } else {
                    None
                };
                let end_fragment = if let Some(next) = next_entry {
                    if spine_idx == end_idx {
                        next.fragment.as_deref()
                    } else {
                        None
                    }
                } else {
                    None
                };

                let (part, part_anchors) = render_partial_with_anchors(
                    content,
                    options.format,
                    start_fragment,
                    end_fragment,
                    &mut image_resolver,
                );
                for anchor in part_anchors {
                    section_anchors.insert(anchor);
                }
                if let Some(part) = part {
                    if !part.trim().is_empty() {
                        chunks.push(part);
                    }
                }
            }

            let text = chunks.join("\n\n").trim().to_string();
            if !text.is_empty() {
                sections.push(SectionRecord {
                    title: entry.label.clone(),
                    text,
                    start_href: entry.href_path.clone(),
                    start_fragment: entry.fragment.clone(),
                    end_href: next_entry.map(|n| n.href_path.clone()),
                    end_fragment: next_entry.and_then(|n| n.fragment.clone()),
                    spine_start: start_idx,
                    spine_end: end_idx,
                    // Sorted for deterministic output.
                    anchors: {
                        let mut values: Vec<String> = section_anchors.into_iter().collect();
                        values.sort();
                        values
                    },
                    section_id: String::new(),
                    output_path: String::new(),
                });
            }
        }
    } else if !use_heading_fallback {
        // Last resort: no usable TOC and no heading fallback — emit one
        // section per readable spine document.
        for spine_doc in &spine_docs {
            let href_path = spine_doc.href_path.clone();
            let label = spine_doc.label.clone();
            let content = match load_content(&epub, &href_path, &mut content_cache) {
                Ok(content) => content,
                Err(err) => {
                    errors.push(err.to_string());
                    continue;
                }
            };
            if options.format == FormatMode::Rich {
                collect_css(content, &href_path, &mut css_hrefs, &mut inline_styles);
            }
            let (text_opt, anchors) = render_partial_with_anchors(
                content,
                options.format,
                None,
                None,
                &mut image_resolver,
            );
            if let Some(text) = text_opt {
                if !text.trim().is_empty() {
                    sections.push(SectionRecord {
                        title: label,
                        text,
                        start_href: href_path,
                        start_fragment: None,
                        end_href: None,
                        end_fragment: None,
                        spine_start: spine_index_by_href
                            .get(&content.href_path)
                            .copied()
                            .unwrap_or(0),
                        spine_end: spine_index_by_href
                            .get(&content.href_path)
                            .copied()
                            .unwrap_or(0),
                        anchors,
                        section_id: String::new(),
                        output_path: String::new(),
                    });
                }
            }
        }
    }

    if sections.is_empty() {
        anyhow::bail!("No readable sections found in {}", epub_path.display());
    }

    // Post-process: link rewriting, OCR/nav cleanup, notes placement,
    // section ids and output paths.
    let stats = postprocess_sections(
        &mut sections,
        options.split_chapters,
        options.filename_scheme,
        &book_slug,
        options.ocr_cleanup,
        options.notes_mode,
    );
    if stats.link_unresolved > 0 {
        warn(format!(
            "{}: unresolved internal links detected ({}).",
            title, stats.link_unresolved
        ));
    }

    // Rich output gets a style header built from the collected CSS.
    let style_header_lines = if options.format == FormatMode::Rich {
        build_style_header(
            &epub,
            &css_hrefs,
            &inline_styles,
            &style_root,
            &style_link_prefix,
            options.css,
        )?
    } else {
        Vec::new()
    };

    let return_path = write_markdown_outputs(
        &sections,
        options,
        &options.output,
        &book_dir,
        &book_slug,
        &title,
        author.as_ref(),
        &style_header_lines,
        &stats.global_note_lines,
    )?;

    // Optional exports; each is a no-op when its mode is `Off`.
    write_manifest_export(
        options.export_manifest,
        &book_dir,
        &title,
        author.as_ref(),
        &book_slug,
        &spine_hrefs,
        &toc_entries,
        &sections,
        &extracted_images,
        &extracted_media,
        options,
    )?;
    write_quality_report(
        options.quality_report,
        &book_dir,
        toc_entry_count,
        toc_unique_count,
        toc_coverage_ratio,
        toc_is_degenerate,
        use_heading_fallback,
        options,
        &stats,
        extracted_count,
        extracted_media_count,
        nav_removed,
        &warnings,
        &errors,
    )?;

    // Fold extraction info, warnings and errors into the diagnostics list.
    let mut diagnostics = Vec::new();
    if extracted_count > 0 {
        diagnostics.push(Diagnostic {
            level: DiagnosticLevel::Info,
            message: format!("Extracted {extracted_count} images for {title}"),
        });
    }
    if extracted_media_count > 0 {
        diagnostics.push(Diagnostic {
            level: DiagnosticLevel::Info,
            message: format!("Extracted {extracted_media_count} media files for {title}"),
        });
    }
    diagnostics.extend(warnings.into_iter().map(|message| Diagnostic {
        level: DiagnosticLevel::Warning,
        message,
    }));
    diagnostics.extend(errors.into_iter().map(|message| Diagnostic {
        level: DiagnosticLevel::Error,
        message,
    }));

    Ok(BookConversionResult {
        input_path: epub_path.to_path_buf(),
        title,
        output_path: Some(return_path),
        diagnostics,
    })
}
767
768fn toc_degeneracy_stats(
769 toc_entries: &[TocEntryInfo],
770 spine_doc_count: usize,
771) -> (bool, usize, usize, f32) {
772 let toc_entry_count = toc_entries.len();
773 let unique_toc_hrefs: HashSet<&str> = toc_entries
774 .iter()
775 .map(|entry| entry.href_path.as_str())
776 .collect();
777 let unique_count = unique_toc_hrefs.len();
778 let coverage_ratio = if spine_doc_count > 0 {
779 unique_count as f32 / spine_doc_count as f32
780 } else {
781 0.0
782 };
783 let is_degenerate = toc_entry_count <= 1 || unique_count < 3 || coverage_ratio < 0.15;
784 (is_degenerate, toc_entry_count, unique_count, coverage_ratio)
785}
786
787fn collect_input_epubs(input: &Path) -> Result<Vec<PathBuf>> {
788 let metadata = std::fs::metadata(input)
789 .with_context(|| format!("Failed to access {}", input.display()))?;
790
791 if metadata.is_file() {
792 if input.extension().and_then(|ext| ext.to_str()) == Some("epub") {
793 return Ok(vec![input.to_path_buf()]);
794 }
795 anyhow::bail!(
796 "Input path {} is a file, but not an .epub file",
797 input.display()
798 );
799 }
800
801 if !metadata.is_dir() {
802 anyhow::bail!(
803 "Input path {} is neither a regular file nor a directory",
804 input.display()
805 );
806 }
807
808 let mut epub_paths = Vec::new();
809 for entry in WalkDir::new(input)
810 .follow_links(false)
811 .into_iter()
812 .filter_map(|entry| entry.ok())
813 {
814 if entry.file_type().is_file() {
815 let path = entry.path();
816 if path.extension().and_then(|ext| ext.to_str()) == Some("epub") {
817 epub_paths.push(path.to_path_buf());
818 }
819 }
820 }
821
822 if epub_paths.is_empty() {
823 anyhow::bail!("No EPUB files found under {}", input.display());
824 }
825
826 Ok(epub_paths)
827}
828
/// Collapse all runs of whitespace to single spaces and trim the ends.
fn normalize_space(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
832
833fn clean_heading_label(text: &str) -> String {
834 let normalized = normalize_space(text);
835 normalized
836 .trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '-')
837 .to_string()
838}
839
840fn extract_major_heading_label(text: &str) -> Option<String> {
841 MAJOR_HEADING_LABEL_RE
842 .find(text)
843 .map(|m| clean_heading_label(m.as_str()))
844 .filter(|label| !label.is_empty())
845}
846
847fn is_heading_like_line(line: &str) -> bool {
848 let normalized = normalize_space(line);
849 if normalized.is_empty() || normalized.chars().count() > 80 {
850 return false;
851 }
852 let words: Vec<&str> = normalized
853 .split_whitespace()
854 .filter(|word| word.chars().any(|c| c.is_alphabetic()))
855 .collect();
856 if words.is_empty() {
857 return false;
858 }
859 let letters: Vec<char> = normalized.chars().filter(|c| c.is_alphabetic()).collect();
860 if letters.is_empty() {
861 return false;
862 }
863 let all_caps = letters.iter().all(|c| !c.is_lowercase());
864 let title_like = words
865 .iter()
866 .filter(|word| {
867 word.chars()
868 .next()
869 .map(|c| c.is_uppercase())
870 .unwrap_or(false)
871 })
872 .count()
873 >= std::cmp::max(1, (words.len() * 8) / 10);
874 all_caps || title_like
875}
876
877fn resolve_href(base_href: &str, rel: &str) -> String {
878 if rel.starts_with('/') {
879 normalize_path(rel)
880 } else {
881 let base_dir = base_href.rsplit_once('/').map(|(dir, _)| dir).unwrap_or("");
882 let combined = format!("{base_dir}/{rel}");
883 normalize_path(&combined)
884 }
885}
886
/// Normalize a `/`-separated path: drop empty and `.` segments, resolve
/// `..` against preceding segments (excess `..` at the root is dropped),
/// and preserve a leading `/` when present.
fn normalize_path(path: &str) -> String {
    let absolute = path.starts_with('/');
    let mut segments: Vec<&str> = Vec::new();
    for segment in path.split('/') {
        match segment {
            "" | "." => continue,
            ".." => {
                // Popping an empty stack silently drops the `..`.
                segments.pop();
            }
            other => segments.push(other),
        }
    }
    let joined = segments.join("/");
    if absolute {
        format!("/{joined}")
    } else {
        joined
    }
}
906
907fn decode_path(path: &str) -> String {
908 let trimmed = path.trim_start_matches('/');
909 urlencoding::decode(trimmed)
910 .map(|s| s.into_owned())
911 .unwrap_or_else(|_| trimmed.to_string())
912}
913
/// Is this link target external to the book (http/https URL or data URI)?
/// The scheme check is case-insensitive.
fn is_external(value: &str) -> bool {
    const EXTERNAL_PREFIXES: [&str; 3] = ["http://", "https://", "data:"];
    let lowered = value.to_lowercase();
    EXTERNAL_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(prefix))
}
918
/// Turn an arbitrary title into a filesystem-friendly slug.
///
/// ASCII alphanumerics, `.` and `-` are kept; every other run of
/// characters collapses to a single `_`. Leading/trailing `_`, `.` and
/// `-` are trimmed, and an empty result falls back to `"book"`.
fn slugify(value: &str) -> String {
    let mut slug = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-') {
            slug.push(ch);
        } else if !slug.ends_with('_') {
            // `_` can only enter the slug via this branch, so checking the
            // tail collapses consecutive replacements exactly once.
            slug.push('_');
        }
    }
    match slug.trim_matches(&['_', '.', '-'][..]) {
        "" => "book".to_string(),
        trimmed => trimmed.to_string(),
    }
}