// rbook_utils — src/lib.rs
1use anyhow::{Context, Result};
2use once_cell::sync::Lazy;
3use rbook::Epub;
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6use std::path::{Path, PathBuf};
7use walkdir::WalkDir;
8
9use kuchiki::NodeRef;
10
11mod collect;
12mod export;
13mod heading;
14mod postprocess;
15mod render;
16
17use collect::{
18    collect_image_hrefs, collect_media_hrefs, collect_readable_spine_docs, collect_toc_entries,
19    load_content,
20};
21use export::{write_manifest_export, write_markdown_outputs, write_quality_report};
22use heading::{detect_heading_candidates, prettify_section_name};
23use postprocess::{cleanup_toc_entries, postprocess_sections};
24use render::{
25    build_style_header, collect_css, extract_image, extract_media_file,
26    render_partial_with_anchors, resolve_and_extract_image,
27};
28
/// Output text style for rendered sections.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum FormatMode {
    /// Plain Markdown text.
    Plain,
    /// Richer output; in this mode document CSS is collected and a style
    /// header is emitted (see `build_style_header` usage below).
    Rich,
}
34
/// How collected CSS is emitted (forwarded to `build_style_header`;
/// exact behavior lives in the `render` module).
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum CssMode {
    /// Presumably styles embedded in the output — confirm in `render`.
    Inline,
    /// Presumably styles written as separate files — confirm in `render`.
    External,
}
40
/// When to split chapters by detected headings instead of the TOC.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum ChapterFallbackMode {
    /// Never use heading detection.
    Off,
    /// Use heading detection only when the TOC looks degenerate
    /// (see `toc_degeneracy_stats`).
    Auto,
    /// Always attempt heading detection.
    Force,
}
47
/// Footnote/endnote placement; forwarded to `postprocess_sections`
/// (semantics implemented in the `postprocess` module).
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum NotesMode {
    /// Notes kept inline where they occur.
    Inline,
    /// Notes gathered at the end of each chapter — confirm in `postprocess`.
    ChapterEnd,
    /// Notes gathered globally; see `PostprocessStats::global_note_lines`.
    Global,
}
54
/// Toggle for the optional manifest and quality-report exports.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum ExportMode {
    /// Export disabled.
    Off,
    /// Version-1 export format.
    V1,
}
60
/// Aggressiveness of OCR-noise cleanup; forwarded to
/// `postprocess_sections` (see also `OCR_NOISE_RE`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum OcrCleanupMode {
    Off,
    Basic,
    Aggressive,
}
67
/// Whether navigation-only TOC entries are stripped; forwarded to
/// `cleanup_toc_entries` in the `postprocess` module.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum NavCleanupMode {
    Off,
    Auto,
}
73
/// Naming scheme for per-section output files; forwarded to
/// `postprocess_sections` (which fills `SectionRecord::output_path`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum FilenameScheme {
    /// Sequential index-based names — confirm exact format in `postprocess`.
    Index,
    /// Hash-based names — confirm exact format in `postprocess`.
    Hash,
}
79
/// Which media assets are extracted alongside the text.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum MediaMode {
    /// Extract nothing; image `src` attributes are passed through as-is.
    None,
    /// Extract images that are referenced from rendered content.
    Image,
    /// Pre-extract every image and media file in the package.
    All,
}
86
/// All knobs controlling a conversion run.
#[derive(Clone, Debug)]
pub struct ConvertOptions {
    /// Input path: a single `.epub` file or a directory searched recursively.
    pub input: PathBuf,
    /// Root directory for generated output.
    pub output: PathBuf,
    /// Which media assets to extract.
    pub media: MediaMode,
    /// Plain vs rich text output.
    pub format: FormatMode,
    /// CSS emission mode (only relevant for `FormatMode::Rich`).
    pub css: CssMode,
    /// One file per chapter instead of a single document.
    pub split_chapters: bool,
    /// When to fall back to heading-based chapter detection.
    pub chapter_fallback: ChapterFallbackMode,
    /// Footnote placement.
    pub notes_mode: NotesMode,
    /// Manifest export toggle.
    pub export_manifest: ExportMode,
    /// Quality-report export toggle.
    pub quality_report: ExportMode,
    /// OCR-noise cleanup level.
    pub ocr_cleanup: OcrCleanupMode,
    /// TOC navigation-entry cleanup.
    pub nav_cleanup: NavCleanupMode,
    /// Output-file naming scheme.
    pub filename_scheme: FilenameScheme,
}
103
impl ConvertOptions {
    /// Build options for converting `input` into `output` with the defaults
    /// shown below: images extracted, plain format, inline CSS, single-file
    /// output, auto heading fallback, inline notes, exports off, OCR cleanup
    /// off, auto nav cleanup, index filenames.
    pub fn new(input: PathBuf, output: PathBuf) -> Self {
        Self {
            input,
            output,
            media: MediaMode::Image,
            format: FormatMode::Plain,
            css: CssMode::Inline,
            split_chapters: false,
            chapter_fallback: ChapterFallbackMode::Auto,
            notes_mode: NotesMode::Inline,
            export_manifest: ExportMode::Off,
            quality_report: ExportMode::Off,
            ocr_cleanup: OcrCleanupMode::Off,
            nav_cleanup: NavCleanupMode::Auto,
            filename_scheme: FilenameScheme::Index,
        }
    }
}
123
/// Severity of a per-book diagnostic message.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DiagnosticLevel {
    Info,
    Warning,
    Error,
}
130
/// A single human-readable diagnostic attached to a conversion result.
#[derive(Clone, Debug)]
pub struct Diagnostic {
    pub level: DiagnosticLevel,
    pub message: String,
}
136
/// Outcome of converting one EPUB.
#[derive(Clone, Debug)]
pub struct BookConversionResult {
    pub input_path: PathBuf,
    pub title: String,
    /// `None` means the conversion failed (see `ConversionSummary::failure_count`).
    pub output_path: Option<PathBuf>,
    pub diagnostics: Vec<Diagnostic>,
}
144
/// Aggregate result of a `convert_all` run, one entry per input EPUB.
#[derive(Clone, Debug, Default)]
pub struct ConversionSummary {
    pub books: Vec<BookConversionResult>,
}
149
150impl ConversionSummary {
151    pub fn failure_count(&self) -> usize {
152        self.books
153            .iter()
154            .filter(|book| book.output_path.is_none())
155            .count()
156    }
157
158    pub fn success_count(&self) -> usize {
159        self.books.len().saturating_sub(self.failure_count())
160    }
161}
162
/// One table-of-contents entry, split into its target parts.
#[derive(Clone, Debug)]
struct TocEntryInfo {
    /// Display label from the TOC.
    label: String,
    /// Path portion of the entry's href (no fragment).
    href_path: String,
    /// Anchor portion of the href — presumably the part after `#`;
    /// confirm in the `collect` module.
    fragment: Option<String>,
}
169
/// A parsed content document, cached by href (see `load_content`).
#[derive(Clone, Debug)]
struct ContentDoc {
    href_path: String,
    /// Parsed DOM of the document (kuchiki).
    document: NodeRef,
}
175
/// A spine document considered readable content, with a display label.
#[derive(Clone, Debug)]
struct ReadableSpineDoc {
    href_path: String,
    label: String,
}
181
/// A detected chapter-start candidate from heading analysis.
#[derive(Clone, Debug)]
struct HeadingCandidate {
    /// Index into the readable-spine list where this section starts.
    spine_idx: usize,
    /// Detection confidence — scoring lives in the `heading` module;
    /// not read in this file.
    score: f32,
    label: String,
}
188
/// One output section (chapter) assembled from one or more spine docs.
#[derive(Clone, Debug)]
struct SectionRecord {
    title: String,
    /// Rendered section body.
    text: String,
    /// Href of the first contributing document.
    start_href: String,
    start_fragment: Option<String>,
    /// Href/fragment where the *next* section begins, when known.
    end_href: Option<String>,
    end_fragment: Option<String>,
    /// Inclusive spine-index range covered by this section.
    spine_start: usize,
    spine_end: usize,
    /// Sorted anchor ids found in the section (used for link rewriting).
    anchors: Vec<String>,
    /// Filled in later by `postprocess_sections`.
    section_id: String,
    /// Filled in later by `postprocess_sections`.
    output_path: String,
}
203
/// Counters produced by `postprocess_sections`.
#[derive(Clone, Debug, Default)]
struct PostprocessStats {
    link_rewritten: usize,
    /// Internal links whose target section could not be found.
    link_unresolved: usize,
    cleanup_changes: usize,
    notes_written: usize,
    /// Notes collected for `NotesMode::Global` output.
    global_note_lines: Vec<String>,
}
212
/// HTML tags whose structure is too complex to flatten to plain text.
/// Not referenced in this file — presumably consumed by a submodule
/// (e.g. `render`); confirm before removing.
const COMPLEX_HTML_TAGS: &[&str] = &[
    "table",
    "thead",
    "tbody",
    "tr",
    "td",
    "th",
    "figure",
    "figcaption",
    "svg",
    "math",
];
225
/// Matches major structural headings: "chapter/book/part" followed by a
/// roman or arabic numeral, or standalone words like "preface"/"prologue".
static MAJOR_HEADING_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)\b(?:chapter|book|part)\s+(?:[ivxlcdm]+|\d+)\b|\b(?:preface|prologue|epilogue|introduction|foreword|afterword)\b",
    )
    .expect("valid heading regex")
});
/// Like `MAJOR_HEADING_RE`, but also captures an optional short trailing
/// title (up to ~70 chars after a separator) so the whole label can be kept.
static MAJOR_HEADING_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)\b(?:chapter|book|part)\s+(?:[ivxlcdm]+|\d+)(?:\s*[:.-]?\s*[a-z0-9][a-z0-9' -]{0,70})?|\b(?:preface|prologue|epilogue|introduction|foreword|afterword)\b",
    )
    .expect("valid heading label regex")
});
/// Matches boilerplate OCR-accuracy disclaimers
/// ("estimated to be only NN% accurate").
static OCR_NOISE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)estimated\s+to\s+be\s+only\s+\d+(?:\.\d+)?%\s+accurate")
        .expect("valid ocr regex")
});
/// Matches Markdown links and images: optional `!`, `[label](target)`.
static MARKDOWN_LINK_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(!?)\[([^\]]+)\]\(([^)]+)\)").expect("valid markdown link regex"));
/// Matches the href attribute of an HTML `<a>` tag (three capture groups:
/// prefix, href value, closing quote) for in-place rewriting.
static HTML_HREF_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(<a\b[^>]*?\bhref=")([^"]+)(")"#).expect("valid html href regex")
});
/// Matches a Markdown footnote definition line: `[^id]: text`.
static FOOTNOTE_DEF_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^\[\^([^\]]+)\]:\s*(.*)$").expect("valid footnote regex"));
249
250pub fn convert_all(options: &ConvertOptions) -> Result<ConversionSummary> {
251    let epub_paths = collect_input_epubs(&options.input)?;
252
253    let mut summary = ConversionSummary::default();
254    for epub_path in epub_paths {
255        match convert_epub_result(&epub_path, options) {
256            Ok(result) => summary.books.push(result),
257            Err(err) => {
258                summary.books.push(BookConversionResult {
259                    input_path: epub_path.clone(),
260                    title: epub_path
261                        .file_stem()
262                        .and_then(|s| s.to_str())
263                        .unwrap_or("book")
264                        .to_string(),
265                    output_path: None,
266                    diagnostics: vec![Diagnostic {
267                        level: DiagnosticLevel::Error,
268                        message: format!("Failed to parse {}: {err}", epub_path.display()),
269                    }],
270                });
271            }
272        }
273    }
274
275    Ok(summary)
276}
277
278pub fn convert_epub(epub_path: &Path, options: &ConvertOptions) -> Result<PathBuf> {
279    let result = convert_epub_result(epub_path, options)?;
280    result
281        .output_path
282        .ok_or_else(|| anyhow::anyhow!("No output path generated for {}", epub_path.display()))
283}
284
/// Convert one EPUB according to `options`, returning the result record.
///
/// Pipeline, in order:
/// 1. open the EPUB and read title/author metadata (title falls back to
///    the file stem);
/// 2. compute the output layout (`images/`, `media/`, `styles/` under a
///    slugified book directory) and the relative link prefixes;
/// 3. for `MediaMode::All`, pre-extract every image and media file;
/// 4. build `SectionRecord`s by one of three strategies: heading-detection
///    fallback, TOC-driven splitting, or one section per spine document;
/// 5. postprocess sections, write Markdown outputs, and optionally emit
///    a manifest and a quality report.
///
/// On success `output_path` is always `Some`; hard failures (unreadable
/// EPUB, no readable sections) are returned as `Err`.
pub fn convert_epub_result(
    epub_path: &Path,
    options: &ConvertOptions,
) -> Result<BookConversionResult> {
    let epub = Epub::open(epub_path)
        .with_context(|| format!("Failed to open epub {}", epub_path.display()))?;

    // Title metadata, falling back to the input file stem.
    let title = epub
        .metadata()
        .title()
        .map(|t| t.value().to_string())
        .unwrap_or_else(|| {
            epub_path
                .file_stem()
                .and_then(|s| s.to_str())
                .unwrap_or("book")
                .to_string()
        });

    // First listed creator only, if any.
    let author = epub
        .metadata()
        .creators()
        .next()
        .map(|c| c.value().to_string());

    // Output layout. When chapters are split, outputs live inside the book
    // directory so asset links are relative to it; otherwise the single
    // output file sits next to the book directory and links go through it.
    let book_slug = slugify(&title);
    let book_dir = options.output.join(&book_slug);
    let image_root = book_dir.join("images");
    let media_root = book_dir.join("media");
    let style_root = book_dir.join("styles");
    let image_link_prefix = if options.split_chapters {
        "./images".to_string()
    } else {
        format!("./{book_slug}/images")
    };
    let media_link_prefix = if options.split_chapters {
        "./media".to_string()
    } else {
        format!("./{book_slug}/media")
    };
    let style_link_prefix = if options.split_chapters {
        "./styles".to_string()
    } else {
        format!("./{book_slug}/styles")
    };

    // Maps source href -> extracted relative link, plus extraction counters.
    let mut extracted_images: HashMap<String, String> = HashMap::new();
    let mut extracted_media: HashMap<String, String> = HashMap::new();
    let mut extracted_count = 0usize;
    let mut extracted_media_count = 0usize;

    // Collected CSS (rich mode only) and accumulated diagnostics.
    let mut css_hrefs: HashSet<String> = HashSet::new();
    let mut inline_styles: Vec<String> = Vec::new();
    let mut warnings: Vec<String> = Vec::new();
    let mut errors: Vec<String> = Vec::new();

    // Small closure so warning call sites stay terse; `warnings` is moved
    // into diagnostics at the end, after the last `warn` call.
    let mut warn = |message: String| {
        warnings.push(message);
    };

    // MediaMode::All: extract everything up front; per-file failures are
    // deliberately ignored (best effort).
    if options.media == MediaMode::All {
        for href in collect_image_hrefs(&epub) {
            let _ = extract_image(
                &epub,
                &href,
                &image_root,
                &image_link_prefix,
                &mut extracted_images,
                &mut extracted_count,
            );
        }
        for href in collect_media_hrefs(&epub) {
            let _ = extract_media_file(
                &epub,
                &href,
                &media_root,
                &media_link_prefix,
                &mut extracted_media,
                &mut extracted_media_count,
            );
        }
    }

    // Parsed-document cache shared by heading detection and rendering.
    let mut content_cache: HashMap<String, ContentDoc> = HashMap::new();

    // Resolves an <img src> to an output link. MediaMode::None passes the
    // source through unchanged; otherwise images are extracted on demand.
    let mut image_resolver = |src: &str, base_href: &str| -> Option<String> {
        match options.media {
            MediaMode::None => Some(src.to_string()),
            MediaMode::Image | MediaMode::All => resolve_and_extract_image(
                &epub,
                src,
                base_href,
                &image_root,
                &image_link_prefix,
                &mut extracted_images,
                &mut extracted_count,
            ),
        }
    };

    // TOC and spine analysis used to pick the sectioning strategy.
    let toc_entries_raw = collect_toc_entries(&epub);
    let (toc_entries, nav_removed) = cleanup_toc_entries(toc_entries_raw, options.nav_cleanup);
    let spine_docs = collect_readable_spine_docs(&epub);
    let spine_hrefs: Vec<String> = spine_docs.iter().map(|doc| doc.href_path.clone()).collect();
    let spine_index_by_href: HashMap<String, usize> = spine_hrefs
        .iter()
        .enumerate()
        .map(|(idx, href)| (href.clone(), idx))
        .collect();
    let (toc_is_degenerate, toc_entry_count, toc_unique_count, toc_coverage_ratio) =
        toc_degeneracy_stats(&toc_entries, spine_hrefs.len());
    let mut sections: Vec<SectionRecord> = Vec::new();

    // Decide whether heading-based chapter detection should be attempted.
    let mut use_heading_fallback = false;
    let attempt_heading_fallback = match options.chapter_fallback {
        ChapterFallbackMode::Off => false,
        ChapterFallbackMode::Auto => {
            if toc_is_degenerate {
                true
            } else {
                warn(format!(
                    "heading fallback skipped for {}: TOC not degenerate (entries={}, unique_hrefs={}, coverage={:.2}).",
                    title, toc_entry_count, toc_unique_count, toc_coverage_ratio
                ));
                false
            }
        }
        ChapterFallbackMode::Force => true,
    };

    // --- Strategy 1: heading-detection fallback -------------------------
    if attempt_heading_fallback {
        let heading_candidates = detect_heading_candidates(&spine_hrefs, &mut content_cache, &epub);
        // Candidates at spine index 0 are dropped: the first section always
        // starts there anyway.
        let confident_candidates: Vec<HeadingCandidate> = heading_candidates
            .into_iter()
            .filter(|candidate| candidate.spine_idx > 0)
            .collect();
        if !confident_candidates.is_empty() {
            // Label for the implicit first section: first TOC label, else a
            // prettified name of the first spine href, else "Section 1".
            let first_label = toc_entries
                .first()
                .map(|entry| entry.label.clone())
                .filter(|label| !label.trim().is_empty())
                .unwrap_or_else(|| {
                    spine_hrefs
                        .first()
                        .map(|href| prettify_section_name(href))
                        .unwrap_or_else(|| "Section 1".to_string())
                });
            // (start spine index, label) for every detected section start.
            let mut starts: Vec<(usize, String)> = vec![(0, first_label)];
            for candidate in &confident_candidates {
                let label = if candidate.label.trim().is_empty() {
                    format!("Section {}", starts.len() + 1)
                } else {
                    candidate.label.clone()
                };
                starts.push((candidate.spine_idx, label));
            }

            warn(format!(
                "using heading fallback for {} (mode={:?}, toc_entries={}, spine_docs={}, detected_starts={}).",
                title,
                options.chapter_fallback,
                toc_entry_count,
                spine_hrefs.len(),
                confident_candidates.len()
            ));
            use_heading_fallback = true;

            // Render each [start, next_start) spine range as one section.
            for (start_pos, (start_idx, section_label)) in starts.iter().enumerate() {
                let next_start = starts
                    .get(start_pos + 1)
                    .map(|(idx, _)| *idx)
                    .unwrap_or(spine_hrefs.len());
                // Skip empty or out-of-order ranges.
                if next_start == 0 || next_start <= *start_idx {
                    continue;
                }
                let end_idx = next_start - 1;
                let mut chunks: Vec<String> = Vec::new();
                let mut anchors: HashSet<String> = HashSet::new();
                for spine_idx in *start_idx..=end_idx {
                    let Some(href) = spine_hrefs.get(spine_idx) else {
                        continue;
                    };
                    let content = match load_content(&epub, href, &mut content_cache) {
                        Ok(content) => content,
                        Err(err) => {
                            // Unreadable docs are skipped but reported.
                            errors.push(err.to_string());
                            continue;
                        }
                    };
                    if options.format == FormatMode::Rich {
                        collect_css(content, href, &mut css_hrefs, &mut inline_styles);
                    }
                    // Whole-document render: no start/end fragment bounds.
                    let (part, part_anchors) = render_partial_with_anchors(
                        content,
                        options.format,
                        None,
                        None,
                        &mut image_resolver,
                    );
                    for anchor in part_anchors {
                        anchors.insert(anchor);
                    }
                    if let Some(part) = part {
                        if !part.trim().is_empty() {
                            chunks.push(part);
                        }
                    }
                }
                let text = chunks.join("\n\n").trim().to_string();
                if !text.is_empty() {
                    sections.push(SectionRecord {
                        title: section_label.clone(),
                        text,
                        start_href: spine_hrefs[*start_idx].clone(),
                        start_fragment: None,
                        end_href: Some(spine_hrefs[end_idx].clone()),
                        end_fragment: None,
                        spine_start: *start_idx,
                        spine_end: end_idx,
                        anchors: {
                            // Sorted for deterministic output.
                            let mut values: Vec<String> = anchors.into_iter().collect();
                            values.sort();
                            values
                        },
                        section_id: String::new(),
                        output_path: String::new(),
                    });
                }
            }
        } else {
            warn(format!(
                "heading fallback skipped for {}: insufficient heading confidence.",
                title
            ));
        }
    }

    // --- Strategy 2: TOC-driven sectioning ------------------------------
    if !use_heading_fallback && !toc_entries.is_empty() {
        for (idx, entry) in toc_entries.iter().enumerate() {
            // TOC entries pointing outside the readable spine are skipped.
            let Some(start_idx) = spine_index_by_href.get(&entry.href_path).copied() else {
                continue;
            };
            let next_entry = toc_entries.get(idx + 1);
            // This section runs up to where the next TOC entry starts (its
            // document inclusive, bounded by a fragment), or to spine end.
            let end_idx = if let Some(next) = next_entry {
                spine_index_by_href
                    .get(&next.href_path)
                    .copied()
                    .unwrap_or(spine_hrefs.len().saturating_sub(1))
            } else {
                spine_hrefs.len().saturating_sub(1)
            };
            if end_idx < start_idx {
                continue;
            }

            let mut chunks: Vec<String> = Vec::new();
            let mut section_anchors: HashSet<String> = HashSet::new();
            for spine_idx in start_idx..=end_idx {
                let Some(href) = spine_hrefs.get(spine_idx) else {
                    continue;
                };
                let content = match load_content(&epub, href, &mut content_cache) {
                    Ok(content) => content,
                    Err(err) => {
                        errors.push(err.to_string());
                        continue;
                    }
                };
                if options.format == FormatMode::Rich {
                    collect_css(content, href, &mut css_hrefs, &mut inline_styles);
                }

                if let Some(next) = next_entry {
                    if spine_idx == end_idx && next.fragment.is_none() {
                        // Next section starts at the beginning of this file.
                        continue;
                    }
                }

                // Fragment bounds apply only at the edges of the range.
                let start_fragment = if spine_idx == start_idx {
                    entry.fragment.as_deref()
                } else {
                    None
                };
                let end_fragment = if let Some(next) = next_entry {
                    if spine_idx == end_idx {
                        next.fragment.as_deref()
                    } else {
                        None
                    }
                } else {
                    None
                };

                let (part, part_anchors) = render_partial_with_anchors(
                    content,
                    options.format,
                    start_fragment,
                    end_fragment,
                    &mut image_resolver,
                );
                for anchor in part_anchors {
                    section_anchors.insert(anchor);
                }
                if let Some(part) = part {
                    if !part.trim().is_empty() {
                        chunks.push(part);
                    }
                }
            }

            let text = chunks.join("\n\n").trim().to_string();
            if !text.is_empty() {
                sections.push(SectionRecord {
                    title: entry.label.clone(),
                    text,
                    start_href: entry.href_path.clone(),
                    start_fragment: entry.fragment.clone(),
                    end_href: next_entry.map(|n| n.href_path.clone()),
                    end_fragment: next_entry.and_then(|n| n.fragment.clone()),
                    spine_start: start_idx,
                    spine_end: end_idx,
                    anchors: {
                        // Sorted for deterministic output.
                        let mut values: Vec<String> = section_anchors.into_iter().collect();
                        values.sort();
                        values
                    },
                    section_id: String::new(),
                    output_path: String::new(),
                });
            }
        }
    // --- Strategy 3: one section per spine document ---------------------
    } else if !use_heading_fallback {
        for spine_doc in &spine_docs {
            let href_path = spine_doc.href_path.clone();
            let label = spine_doc.label.clone();
            let content = match load_content(&epub, &href_path, &mut content_cache) {
                Ok(content) => content,
                Err(err) => {
                    errors.push(err.to_string());
                    continue;
                }
            };
            if options.format == FormatMode::Rich {
                collect_css(content, &href_path, &mut css_hrefs, &mut inline_styles);
            }
            let (text_opt, anchors) = render_partial_with_anchors(
                content,
                options.format,
                None,
                None,
                &mut image_resolver,
            );
            if let Some(text) = text_opt {
                if !text.trim().is_empty() {
                    sections.push(SectionRecord {
                        title: label,
                        text,
                        start_href: href_path,
                        start_fragment: None,
                        end_href: None,
                        end_fragment: None,
                        spine_start: spine_index_by_href
                            .get(&content.href_path)
                            .copied()
                            .unwrap_or(0),
                        spine_end: spine_index_by_href
                            .get(&content.href_path)
                            .copied()
                            .unwrap_or(0),
                        anchors,
                        section_id: String::new(),
                        output_path: String::new(),
                    });
                }
            }
        }
    }

    if sections.is_empty() {
        anyhow::bail!("No readable sections found in {}", epub_path.display());
    }

    // Link rewriting, OCR cleanup, note handling, id/path assignment.
    let stats = postprocess_sections(
        &mut sections,
        options.split_chapters,
        options.filename_scheme,
        &book_slug,
        options.ocr_cleanup,
        options.notes_mode,
    );
    if stats.link_unresolved > 0 {
        warn(format!(
            "{}: unresolved internal links detected ({}).",
            title, stats.link_unresolved
        ));
    }

    // Rich output gets a style header built from the collected CSS.
    let style_header_lines = if options.format == FormatMode::Rich {
        build_style_header(
            &epub,
            &css_hrefs,
            &inline_styles,
            &style_root,
            &style_link_prefix,
            options.css,
        )?
    } else {
        Vec::new()
    };

    let return_path = write_markdown_outputs(
        &sections,
        options,
        &options.output,
        &book_dir,
        &book_slug,
        &title,
        author.as_ref(),
        &style_header_lines,
        &stats.global_note_lines,
    )?;

    // Optional exports (no-ops when the respective mode is Off).
    write_manifest_export(
        options.export_manifest,
        &book_dir,
        &title,
        author.as_ref(),
        &book_slug,
        &spine_hrefs,
        &toc_entries,
        &sections,
        &extracted_images,
        &extracted_media,
        options,
    )?;
    write_quality_report(
        options.quality_report,
        &book_dir,
        toc_entry_count,
        toc_unique_count,
        toc_coverage_ratio,
        toc_is_degenerate,
        use_heading_fallback,
        options,
        &stats,
        extracted_count,
        extracted_media_count,
        nav_removed,
        &warnings,
        &errors,
    )?;

    // Assemble diagnostics: info for extraction counts, then accumulated
    // warnings and errors.
    let mut diagnostics = Vec::new();
    if extracted_count > 0 {
        diagnostics.push(Diagnostic {
            level: DiagnosticLevel::Info,
            message: format!("Extracted {extracted_count} images for {title}"),
        });
    }
    if extracted_media_count > 0 {
        diagnostics.push(Diagnostic {
            level: DiagnosticLevel::Info,
            message: format!("Extracted {extracted_media_count} media files for {title}"),
        });
    }
    diagnostics.extend(warnings.into_iter().map(|message| Diagnostic {
        level: DiagnosticLevel::Warning,
        message,
    }));
    diagnostics.extend(errors.into_iter().map(|message| Diagnostic {
        level: DiagnosticLevel::Error,
        message,
    }));

    Ok(BookConversionResult {
        input_path: epub_path.to_path_buf(),
        title,
        output_path: Some(return_path),
        diagnostics,
    })
}
767
768fn toc_degeneracy_stats(
769    toc_entries: &[TocEntryInfo],
770    spine_doc_count: usize,
771) -> (bool, usize, usize, f32) {
772    let toc_entry_count = toc_entries.len();
773    let unique_toc_hrefs: HashSet<&str> = toc_entries
774        .iter()
775        .map(|entry| entry.href_path.as_str())
776        .collect();
777    let unique_count = unique_toc_hrefs.len();
778    let coverage_ratio = if spine_doc_count > 0 {
779        unique_count as f32 / spine_doc_count as f32
780    } else {
781        0.0
782    };
783    let is_degenerate = toc_entry_count <= 1 || unique_count < 3 || coverage_ratio < 0.15;
784    (is_degenerate, toc_entry_count, unique_count, coverage_ratio)
785}
786
787fn collect_input_epubs(input: &Path) -> Result<Vec<PathBuf>> {
788    let metadata = std::fs::metadata(input)
789        .with_context(|| format!("Failed to access {}", input.display()))?;
790
791    if metadata.is_file() {
792        if input.extension().and_then(|ext| ext.to_str()) == Some("epub") {
793            return Ok(vec![input.to_path_buf()]);
794        }
795        anyhow::bail!(
796            "Input path {} is a file, but not an .epub file",
797            input.display()
798        );
799    }
800
801    if !metadata.is_dir() {
802        anyhow::bail!(
803            "Input path {} is neither a regular file nor a directory",
804            input.display()
805        );
806    }
807
808    let mut epub_paths = Vec::new();
809    for entry in WalkDir::new(input)
810        .follow_links(false)
811        .into_iter()
812        .filter_map(|entry| entry.ok())
813    {
814        if entry.file_type().is_file() {
815            let path = entry.path();
816            if path.extension().and_then(|ext| ext.to_str()) == Some("epub") {
817                epub_paths.push(path.to_path_buf());
818            }
819        }
820    }
821
822    if epub_paths.is_empty() {
823        anyhow::bail!("No EPUB files found under {}", input.display());
824    }
825
826    Ok(epub_paths)
827}
828
/// Collapse all runs of whitespace to single spaces and trim the ends.
fn normalize_space(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for (idx, word) in text.split_whitespace().enumerate() {
        if idx > 0 {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
832
/// Normalize whitespace in a heading, then strip surrounding punctuation
/// (anything that is not alphanumeric, `_`, or `-`) from both ends.
fn clean_heading_label(text: &str) -> String {
    let collapsed = text.split_whitespace().collect::<Vec<_>>().join(" ");
    collapsed
        .trim_matches(|c: char| !(c.is_alphanumeric() || c == '_' || c == '-'))
        .to_string()
}
839
840fn extract_major_heading_label(text: &str) -> Option<String> {
841    MAJOR_HEADING_LABEL_RE
842        .find(text)
843        .map(|m| clean_heading_label(m.as_str()))
844        .filter(|label| !label.is_empty())
845}
846
/// Heuristic: does `line` look like a section heading?
///
/// A heading candidate is short (at most 80 chars after whitespace
/// normalization), contains at least one alphabetic word, and is either
/// ALL CAPS or Title Case (at least 80 % of its words — and at least
/// one — start with an uppercase letter).
fn is_heading_like_line(line: &str) -> bool {
    let normalized = line.split_whitespace().collect::<Vec<_>>().join(" ");
    if normalized.is_empty() || normalized.chars().count() > 80 {
        return false;
    }

    // Only words containing at least one alphabetic character count.
    let words: Vec<&str> = normalized
        .split_whitespace()
        .filter(|word| word.chars().any(char::is_alphabetic))
        .collect();
    if words.is_empty() {
        return false;
    }

    let letters: Vec<char> = normalized.chars().filter(|c| c.is_alphabetic()).collect();
    if letters.is_empty() {
        return false;
    }

    // ALL CAPS: no alphabetic character is lowercase.
    if letters.iter().all(|c| !c.is_lowercase()) {
        return true;
    }

    // Title Case: at least 80 % of the words start with an uppercase letter.
    let threshold = std::cmp::max(1, (words.len() * 8) / 10);
    let capitalized = words
        .iter()
        .filter(|word| matches!(word.chars().next(), Some(c) if c.is_uppercase()))
        .count();
    capitalized >= threshold
}
876
877fn resolve_href(base_href: &str, rel: &str) -> String {
878    if rel.starts_with('/') {
879        normalize_path(rel)
880    } else {
881        let base_dir = base_href.rsplit_once('/').map(|(dir, _)| dir).unwrap_or("");
882        let combined = format!("{base_dir}/{rel}");
883        normalize_path(&combined)
884    }
885}
886
/// Collapse `.`/`..`/empty segments in a slash-separated path, preserving
/// whether it was absolute. `..` at the root is silently dropped.
fn normalize_path(path: &str) -> String {
    let absolute = path.starts_with('/');
    let mut segments: Vec<&str> = Vec::new();
    for segment in path.split('/') {
        match segment {
            "" | "." => continue,
            ".." => {
                segments.pop();
            }
            other => segments.push(other),
        }
    }
    if absolute {
        format!("/{}", segments.join("/"))
    } else {
        segments.join("/")
    }
}
906
907fn decode_path(path: &str) -> String {
908    let trimmed = path.trim_start_matches('/');
909    urlencoding::decode(trimmed)
910        .map(|s| s.into_owned())
911        .unwrap_or_else(|_| trimmed.to_string())
912}
913
/// True when `value` points outside the EPUB package: an http(s) URL or
/// a `data:` URI (compared case-insensitively).
fn is_external(value: &str) -> bool {
    let lower = value.to_lowercase();
    ["http://", "https://", "data:"]
        .iter()
        .any(|prefix| lower.starts_with(prefix))
}
918
/// Make a filesystem-friendly slug: ASCII alphanumerics, `.` and `-` are
/// kept, every other run of characters collapses to a single `_`, and
/// leading/trailing `_`, `.`, `-` are stripped. Empty results become
/// `"book"`.
fn slugify(value: &str) -> String {
    let mut slug = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-') {
            slug.push(ch);
        } else if !slug.ends_with('_') {
            // Collapse consecutive replaced characters into one underscore.
            slug.push('_');
        }
    }
    let cleaned = slug.trim_matches(|c: char| matches!(c, '_' | '.' | '-'));
    if cleaned.is_empty() {
        "book".to_string()
    } else {
        cleaned.to_string()
    }
}