rbook-utils 0.0.1

A high-level wrapper over `rbook` for easy ebook parsing/conversion/rendering
Documentation
use rbook::Epub;
use std::collections::HashMap;

use super::collect::load_content;
use super::{
    ContentDoc, HeadingCandidate, MAJOR_HEADING_RE, OCR_NOISE_RE, clean_heading_label,
    extract_major_heading_label, is_heading_like_line, normalize_space,
};

/// Scans the spine in order and returns the documents that look like section headings.
///
/// Each href is loaded through `load_content` (using `cache`); documents that fail to
/// load are skipped. A document becomes a candidate when `score_heading_candidate`
/// gives it a score of at least `1.0`. The very first spine document is additionally
/// required to be a "true heading" (an actual major-heading match).
///
/// Candidates that sit fewer than two spine positions after the previously accepted
/// one are not added separately: the new candidate replaces the previous one only if
/// its score is strictly higher, otherwise it is dropped.
pub(super) fn detect_heading_candidates(
    spine_hrefs: &[String],
    cache: &mut HashMap<String, ContentDoc>,
    epub: &Epub,
) -> Vec<HeadingCandidate> {
    // Minimum distance (in spine positions) between two accepted candidates.
    const MIN_GAP_DOCS: usize = 2;

    let mut accepted: Vec<HeadingCandidate> = Vec::new();

    for (idx, href) in spine_hrefs.iter().enumerate() {
        let Ok(content) = load_content(epub, href, cache) else {
            continue;
        };

        let (score, label, true_heading) = score_heading_candidate(content);
        let first_doc_without_heading = idx == 0 && !true_heading;
        if score < 1.0 || first_doc_without_heading {
            continue;
        }

        let candidate = HeadingCandidate {
            spine_idx: idx,
            score,
            label: clean_heading_label(&label),
        };

        match accepted.last_mut() {
            // Too close to the previous hit: keep whichever of the two scores higher.
            Some(prev) if idx.saturating_sub(prev.spine_idx) < MIN_GAP_DOCS => {
                if candidate.score > prev.score {
                    *prev = candidate;
                }
            }
            _ => accepted.push(candidate),
        }
    }

    accepted
}

/// Turns a path-like `value` (e.g. a spine href) into a readable section name.
///
/// The last `/`-separated segment is taken as the file name, its final `.extension`
/// (if any) is removed, `_` and `-` are replaced by spaces, and the result is passed
/// through `normalize_space`. If nothing is left after cleaning, `value` is returned
/// unchanged.
pub(super) fn prettify_section_name(value: &str) -> String {
    let file_name = value.rsplit('/').next().unwrap_or(value);
    // When the file name has no extension, fall back to the file name itself rather
    // than the whole input, so directory components never leak into the section name.
    let stem = file_name
        .rsplit_once('.')
        .map_or(file_name, |(stem, _)| stem);

    let cleaned = normalize_space(&stem.replace(['_', '-'], " "));
    if cleaned.is_empty() {
        value.to_string()
    } else {
        cleaned
    }
}

/// Scores how strongly a content document looks like the start of a major section.
///
/// Returns `(score, label, true_heading)`:
/// * `score` is the sum of the signals below, clamped to `0.0..=2.0`:
///   `+0.9` if any collected heading text matches `MAJOR_HEADING_RE`,
///   `+0.8` if the top-of-body text window matches it,
///   `+0.4` if the first non-empty line is heading-like or matches it,
///   `-0.5` if the top window matches `OCR_NOISE_RE`.
/// * `label` comes from the first signal that can provide one (heading element, then
///   first line / top window); it stays empty when none does.
/// * `true_heading` is set when a heading element or the top window matched
///   `MAJOR_HEADING_RE`.
fn score_heading_candidate(content: &ContentDoc) -> (f32, String, bool) {
    let (window, first_line, headings) = extract_heading_features(content);

    // Preferred label extraction with a cleaned-up copy of the text as fallback.
    let label_from = |text: &str| -> String {
        extract_major_heading_label(text).unwrap_or_else(|| clean_heading_label(text))
    };

    let first_line_is_major = !first_line.is_empty() && MAJOR_HEADING_RE.is_match(&first_line);

    let mut score = 0.0f32;
    let mut label = String::new();

    // Signal 1: the first heading element whose text matches the major-heading pattern.
    let major_heading = headings
        .iter()
        .find(|text| MAJOR_HEADING_RE.is_match(text.as_str()));
    let heading_match = major_heading.is_some();
    if let Some(text) = major_heading {
        score += 0.9;
        label = label_from(text.as_str());
    }

    // Signal 2: the pattern occurs somewhere in the top-of-body window.
    let top_match = MAJOR_HEADING_RE.find(&window);
    let has_top_match = top_match.is_some();
    if let Some(found) = top_match {
        score += 0.8;
        if label.is_empty() {
            label = if first_line_is_major {
                label_from(first_line.as_str())
            } else {
                extract_major_heading_label(&window)
                    .unwrap_or_else(|| clean_heading_label(found.as_str()))
            };
        }
    }

    // Signal 3: the first non-empty line itself reads like a heading.
    if !first_line.is_empty() && (is_heading_like_line(&first_line) || first_line_is_major) {
        score += 0.4;
        if label.is_empty() && first_line_is_major {
            label = label_from(first_line.as_str());
        }
    }

    // Penalty: the top window matches the OCR-noise pattern.
    if OCR_NOISE_RE.is_match(&window) {
        score -= 0.5;
    }

    (score.clamp(0.0, 2.0), label, heading_match || has_top_match)
}

/// Collects the raw text features that heading scoring works on.
///
/// Returns `(top_window_text, first_nonempty_line, heading_texts)`:
/// * `top_window_text` — the first 1500 characters of the body text, run through
///   `normalize_space`;
/// * `first_nonempty_line` — the first line of that window that is non-empty after
///   normalization, or the first 80 characters of `top_window_text` if no line qualifies;
/// * `heading_texts` — normalized, non-empty text of every `h1`/`h2`/`h3` under the body.
///
/// If the document has no `body` element, all three values are empty.
fn extract_heading_features(content: &ContentDoc) -> (String, String, Vec<String>) {
    let body = match content.document.select_first("body") {
        Ok(body) => body,
        Err(_) => return (String::new(), String::new(), Vec::new()),
    };
    let body_node = body.as_node();

    // Only the beginning of the document is relevant for heading detection.
    let top_window_raw: String = body_node.text_contents().chars().take(1500).collect();
    let top_window_text = normalize_space(&top_window_raw);

    let first_nonempty_line = top_window_raw
        .lines()
        .map(|line| normalize_space(line))
        .find(|line| !line.is_empty())
        // No usable line: fall back to a short prefix of the normalized window
        // (which yields an empty string when the window itself is empty).
        .unwrap_or_else(|| top_window_text.chars().take(80).collect());

    let heading_texts: Vec<String> = match body_node.select("h1, h2, h3") {
        Ok(headings) => headings
            .map(|heading| normalize_space(&heading.text_contents()))
            .filter(|text| !text.is_empty())
            .collect(),
        Err(_) => Vec::new(),
    };

    (top_window_text, first_nonempty_line, heading_texts)
}