// agent-book-translate 0.1.2

//! EPUB ZIP parser & HTML AST scraper.

use crate::error::{AppError, Result};
use scraper::{ElementRef, Html, Selector};
use std::cmp::Reverse;
use std::collections::HashMap;
use std::fs::File;
use std::io::{Read, Write};
use std::path::Path;
use zip::write::FileOptions;
use zip::{ZipArchive, ZipWriter};

/// One piece of translatable text pulled out of an HTML document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// Path of the file the chunk came from; `extract_text_chunks` leaves it
    /// empty so the caller can fill it in with [`TextChunk::with_source_path`].
    pub source_path: String,
    /// Optional identifier of the originating node.
    pub node_id: Option<String>,
    /// The flattened text content of the node.
    pub text: String,
}

impl TextChunk {
    /// Consumes the chunk and returns it with `source_path` replaced.
    ///
    /// All other fields are carried over unchanged.
    pub fn with_source_path(self, source_path: impl Into<String>) -> Self {
        Self {
            source_path: source_path.into(),
            ..self
        }
    }
}

/// A translated chunk ready to be written back into its source document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RenderedChunk {
    /// Name of the file the chunk belongs to.
    // NOTE(review): presumably the EPUB entry name; nothing in this module reads it — confirm with callers.
    pub file_name: String,
    /// Position of the chunk; `render_file_from_chunks` uses it to look up the
    /// block span to overwrite.
    pub chunk_index: usize,
    /// Source text; used as the search needle when no block span exists for
    /// `chunk_index`.
    pub original: String,
    /// Translated text. It is escaped before insertion, and a blank value
    /// causes the chunk to be skipped by `render_file_from_chunks`.
    pub translated: String,
}

/// A single archive member of an EPUB, held fully in memory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EpubEntry {
    /// Entry name as stored in the ZIP archive.
    pub name: String,
    /// Raw bytes of the entry.
    pub data: Vec<u8>,
    /// Set by `parse_epub` when `name` ends with `.xhtml`, `.html` or `.htm`.
    pub is_text: bool,
}

/// Collects the text of every `p`, `h1`, `h2`, `h3` and `li` element in `document`.
///
/// Elements whose flattened text is empty or whitespace-only are skipped.
/// Each chunk's `node_id` is `"<tag>-<index>"`, where `index` is the element's
/// position among *all* selected elements (skipped ones still consume an
/// index). `source_path` is left empty for the caller to fill in.
///
/// # Panics
/// Never in practice: the selector string is a constant known to be valid.
pub fn extract_text_chunks(document: &Html) -> Vec<TextChunk> {
    let block_selector = Selector::parse("p, h1, h2, h3, li").expect("valid selector");

    document
        .select(&block_selector)
        .enumerate()
        .filter_map(|(position, node)| {
            let text = flatten_inline_text(&node);
            if text.trim().is_empty() {
                return None;
            }
            Some(TextChunk {
                source_path: String::new(),
                node_id: Some(format!("{}-{position}", node.value().name())),
                text,
            })
        })
        .collect()
}

/// Concatenates the text of all chunks found by `extract_text_chunks`.
///
/// Chunk texts are joined in document order with no separator between them.
pub fn extract_and_flatten_text(document: &Html) -> String {
    let mut combined = String::new();
    for chunk in extract_text_chunks(document) {
        combined.push_str(&chunk.text);
    }
    combined
}

/// Renders the original markup followed by its translation.
///
/// `original_html` is emitted verbatim. When `translated` is not blank, a
/// `<p class="translation">` paragraph containing the translation is appended.
///
/// `translated` is treated as plain text: `&`, `<` and `>` are escaped so the
/// result stays well-formed and the translation cannot inject markup. This
/// matches what `render_translation_node` and `render_file_from_chunks` do.
pub fn render_bilingual_node(original_html: &str, translated: &str) -> String {
    if translated.trim().is_empty() {
        return original_html.to_string();
    }

    const OPEN: &str = r#"<p class="translation">"#;
    const CLOSE: &str = "</p>";

    let mut output =
        String::with_capacity(original_html.len() + OPEN.len() + translated.len() + CLOSE.len());
    output.push_str(original_html);
    output.push_str(OPEN);
    // Same replacements as the module's `escape_text`, applied in one pass.
    for ch in translated.chars() {
        match ch {
            '&' => output.push_str("&amp;"),
            '<' => output.push_str("&lt;"),
            '>' => output.push_str("&gt;"),
            plain => output.push(plain),
        }
    }
    output.push_str(CLOSE);
    output
}

/// Renders a node with its inner content replaced by the translation.
///
/// Returns an empty string when `translated` is blank. Otherwise the escaped
/// translation is placed between the leading and trailing markup located by
/// `find_first_block_tag`; when no such markup is found, the escaped
/// translation is returned on its own.
pub fn render_translation_node(original_html: &str, translated: &str) -> String {
    if translated.trim().is_empty() {
        return String::new();
    }

    let safe_text = escape_text(translated);
    match find_first_block_tag(original_html) {
        Some((inner_from, inner_to)) => {
            let head = &original_html[..inner_from];
            let tail = &original_html[inner_to..];
            [head, safe_text.as_str(), tail].concat()
        }
        None => safe_text,
    }
}

/// Writes translated chunks back into a copy of `original_html`.
///
/// Chunks are applied from the highest `chunk_index` to the lowest so that
/// replacing one block does not shift the byte offsets of blocks before it.
/// For each chunk with a non-blank translation:
///
/// * if a translatable block span exists at `chunk_index`, the block's inner
///   content is replaced with the escaped translation;
/// * otherwise the `chunk_index`-th occurrence of `chunk.original` in the
///   current output is replaced, if there is one.
///
/// Span offsets are computed on `original_html`, so a span is only applied
/// while the output is still byte-identical to the original up to the span's
/// end. A span that overlaps an edit already made (an outer block enclosing an
/// already-translated inner block, a repeated `chunk_index`, or a fallback
/// replacement earlier in the text) is skipped rather than applied with stale
/// offsets, which would corrupt the markup or panic on a non-boundary index.
pub fn render_file_from_chunks(original_html: &str, chunks: &[RenderedChunk]) -> String {
    let mut rendered = original_html.to_string();
    let block_spans = find_translatable_block_spans(original_html);
    let mut ordered = chunks.iter().collect::<Vec<_>>();
    ordered.sort_by_key(|chunk| Reverse(chunk.chunk_index));

    // Length of the prefix of `rendered` that is still byte-identical to
    // `original_html`; span offsets are only trustworthy inside it.
    let mut intact_prefix = rendered.len();

    for chunk in ordered {
        if chunk.translated.trim().is_empty() {
            continue;
        }
        let escaped = escape_text(&chunk.translated);

        if let Some(span) = block_spans.get(chunk.chunk_index) {
            if span.inner_end <= intact_prefix {
                rendered.replace_range(span.inner_start..span.inner_end, &escaped);
                intact_prefix = span.inner_start;
            }
            continue;
        }

        if let Some(updated) =
            replace_nth_occurrence(&rendered, &chunk.original, &escaped, chunk.chunk_index)
        {
            // The fallback may edit anywhere; everything before the first
            // differing byte is still untouched.
            let shared = rendered
                .bytes()
                .zip(updated.bytes())
                .take_while(|(before, after)| before == after)
                .count();
            intact_prefix = intact_prefix.min(shared);
            rendered = updated;
        }
    }

    rendered
}

/// Reads every entry of the EPUB (ZIP) archive at `input` into memory.
///
/// Entries are returned in archive index order. An entry is flagged `is_text`
/// when its name ends with `.xhtml`, `.html` or `.htm` (case-sensitive).
///
/// # Errors
/// I/O failures while opening the file or reading an entry are propagated.
/// Failures opening the archive or looking up an entry are reported as
/// `AppError::Parse` carrying the underlying error message.
pub fn parse_epub(input: &Path) -> Result<Vec<EpubEntry>> {
    const TEXT_SUFFIXES: [&str; 3] = [".xhtml", ".html", ".htm"];

    let source = File::open(input)?;
    let mut archive = ZipArchive::new(source).map_err(|e| AppError::Parse(e.to_string()))?;

    let mut entries = Vec::new();
    for position in 0..archive.len() {
        let mut zipped = archive
            .by_index(position)
            .map_err(|e| AppError::Parse(e.to_string()))?;

        let name = zipped.name().to_string();
        let mut data = Vec::new();
        zipped.read_to_end(&mut data)?;

        let is_text = TEXT_SUFFIXES.iter().any(|suffix| name.ends_with(*suffix));
        entries.push(EpubEntry {
            name,
            data,
            is_text,
        });
    }

    Ok(entries)
}

pub fn write_epub(
    output: &Path,
    files: &[EpubEntry],
    rendered: &HashMap<String, String>,
) -> Result<()> {
    let file = File::create(output)?;
    let mut writer = ZipWriter::new(file);
    let options: FileOptions<'static, ()> =
        FileOptions::default().compression_method(zip::CompressionMethod::Stored);
    writer
        .start_file("mimetype", options)
        .map_err(|e| AppError::Parse(e.to_string()))?;
    writer
        .write_all(b"application/epub+zip")
        .map_err(AppError::from)?;

    let deflated: FileOptions<'static, ()> =
        FileOptions::default().compression_method(zip::CompressionMethod::Deflated);
    for entry in files {
        if entry.name == "mimetype" {
            continue;
        }
        let content = rendered
            .get(&entry.name)
            .map(String::as_bytes)
            .unwrap_or(&entry.data);
        writer
            .start_file(&entry.name, deflated)
            .map_err(|e| AppError::Parse(e.to_string()))?;
        writer.write_all(content)?;
    }

    writer
        .finish()
        .map_err(|e| AppError::Parse(e.to_string()))?;
    Ok(())
}

/// Recursively concatenates the text beneath `element`.
///
/// Text nodes are appended as-is. Child elements named `rt`, `img`, `svg`,
/// `figure` or `video` are skipped entirely; every other child element
/// (including `ruby`) is flattened recursively. Non-text, non-element children
/// contribute nothing.
fn flatten_inline_text(element: &ElementRef<'_>) -> String {
    let mut flattened = String::new();
    for node in element.children() {
        if let Some(fragment) = node.value().as_text() {
            flattened.push_str(fragment);
            continue;
        }
        let Some(nested) = ElementRef::wrap(node) else {
            continue;
        };
        let skipped = matches!(
            nested.value().name(),
            "rt" | "img" | "svg" | "figure" | "video"
        );
        if !skipped {
            flattened.push_str(&flatten_inline_text(&nested));
        }
    }
    flattened
}

/// Locates the content between the first `>` and the last `<` of a fragment.
///
/// Returns `(inner_start, inner_end)` as byte offsets, where `inner_start` is
/// just past the first `>` and `inner_end` is the position of the last `<`
/// (or the end of the string when there is no `<`).
///
/// Returns `None` when the fragment contains no `>`, or when the last `<`
/// comes before the first `>` ends (for example a lone `<br/>`): such input
/// has no inner region, and an inverted span would make callers duplicate
/// markup.
fn find_first_block_tag(original_html: &str) -> Option<(usize, usize)> {
    let inner_start = original_html.find('>')? + 1;
    let inner_end = original_html.rfind('<').unwrap_or(original_html.len());
    (inner_start <= inner_end).then_some((inner_start, inner_end))
}

/// Byte range of the content between a block element's start and end tags.
#[derive(Debug, Clone, PartialEq, Eq)]
struct BlockSpan {
    /// Byte offset just past the `>` that ends the start tag.
    inner_start: usize,
    /// Byte offset of the `<` that begins the matching closing tag.
    inner_end: usize,
}

/// Scans `html` for translatable block elements and returns their inner spans.
///
/// Spans are listed in the order their start tags appear. Nested translatable
/// blocks each get their own span, because scanning resumes right after every
/// start tag rather than after its closing tag. A start tag that is
/// self-closing or has no matching closing tag contributes no span.
fn find_translatable_block_spans(html: &str) -> Vec<BlockSpan> {
    let mut found = Vec::new();
    let mut pos = 0usize;

    while let Some(offset) = html[pos..].find('<') {
        let open_at = pos + offset;
        pos = match parse_start_tag(html, open_at) {
            // Not a start tag (closing tag, comment, stray `<`): step past it.
            None => open_at + 1,
            Some((name, after_tag, self_closing)) => {
                if !self_closing && is_translatable_block(&name) {
                    if let Some((close_at, _)) = find_matching_close_tag(html, after_tag, &name) {
                        found.push(BlockSpan {
                            inner_start: after_tag,
                            inner_end: close_at,
                        });
                    }
                }
                after_tag
            }
        };
    }

    found
}

/// Parses the start tag whose `<` is at byte offset `tag_start`.
///
/// Returns `(name, tag_end, self_closing)` where `name` is the ASCII-lowercased
/// tag name, `tag_end` is the byte offset just past the tag's closing `>`, and
/// `self_closing` is true when the tag ends with `/>`.
///
/// Returns `None` when the character after `<` is not an ASCII letter, when
/// the name runs to the end of the input, or when no closing `>` is found.
fn parse_start_tag(html: &str, tag_start: usize) -> Option<(String, usize, bool)> {
    let name_start = tag_start.checked_add(1)?;
    let lead = *html.as_bytes().get(name_start)?;
    if !lead.is_ascii_alphabetic() {
        return None;
    }

    // The name ends at the first character that cannot be part of it; a name
    // that reaches the end of the input cannot belong to a complete tag.
    let name_len = html[name_start..]
        .find(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '-' || ch == ':'))?;
    let name_end = name_start + name_len;

    let after_tag = find_tag_end(html, name_end)? + 1;
    let name = html[name_start..name_end].to_ascii_lowercase();
    let self_closing = html[..after_tag].ends_with("/>");
    Some((name, after_tag, self_closing))
}

/// Finds the closing tag that balances an already-opened `tag_name` element.
///
/// Scanning starts at byte offset `from` (just past the start tag) with a
/// nesting depth of one. Nested non-self-closing start tags of the same name
/// raise the depth; closing tags of that name lower it. Returns the byte
/// offsets `(start, end)` of the closing tag that brings the depth to zero,
/// where `end` is just past its `>`.
///
/// Returns `None` when the input ends first, or when a closing tag of that
/// name has no terminating `>`.
fn find_matching_close_tag(html: &str, from: usize, tag_name: &str) -> Option<(usize, usize)> {
    let mut pos = from;
    let mut open_count = 1usize;

    loop {
        let lt = pos + html[pos..].find('<')?;

        if is_close_tag(html, lt, tag_name) {
            let after = find_tag_end(html, lt)? + 1;
            open_count = open_count.saturating_sub(1);
            if open_count == 0 {
                return Some((lt, after));
            }
            pos = after;
        } else {
            pos = match parse_start_tag(html, lt) {
                Some((name, after, self_closing)) => {
                    if name == tag_name && !self_closing {
                        open_count += 1;
                    }
                    after
                }
                // Not a start tag either: step past this `<` and keep looking.
                None => lt + 1,
            };
        }
    }
}

/// Reports whether a closing tag for `tag_name` starts at byte `tag_start`.
///
/// The name comparison is ASCII case-insensitive. The name must be followed by
/// something that cannot continue a tag name (or by the end of the input), so
/// `</pre>` or `</picture>` is not mistaken for the closing tag of `p`.
///
/// Returns `false`, never panicking, when `tag_start` is out of range or not
/// on a character boundary.
fn is_close_tag(html: &str, tag_start: usize, tag_name: &str) -> bool {
    let Some(after_slash) = html
        .get(tag_start..)
        .and_then(|rest| rest.strip_prefix("</"))
    else {
        return false;
    };
    let Some(candidate) = after_slash.get(..tag_name.len()) else {
        return false;
    };
    if !candidate.eq_ignore_ascii_case(tag_name) {
        return false;
    }

    // Same name-character set as `parse_start_tag`: a longer name is a
    // different element.
    match after_slash[tag_name.len()..].chars().next() {
        Some(next) => !(next.is_ascii_alphanumeric() || next == '-' || next == ':'),
        None => true,
    }
}

/// Returns the byte offset of the first `>` at or after `from` that is not
/// inside a quoted attribute value.
///
/// A `"` or `'` opens a quoted region that lasts until the same quote
/// character appears again; `>` inside such a region is ignored. Returns
/// `None` when no unquoted `>` is found.
fn find_tag_end(html: &str, from: usize) -> Option<usize> {
    let mut open_quote: Option<char> = None;

    for (idx, c) in html[from..].char_indices() {
        if let Some(delimiter) = open_quote {
            if c == delimiter {
                open_quote = None;
            }
            continue;
        }
        if c == '>' {
            return Some(from + idx);
        }
        if c == '"' || c == '\'' {
            open_quote = Some(c);
        }
    }

    None
}

/// Reports whether `tag_name` is one of the block elements whose content is
/// translated: `p`, `h1`, `h2`, `h3` or `li`. The comparison is exact
/// (case-sensitive).
fn is_translatable_block(tag_name: &str) -> bool {
    const TRANSLATABLE: [&str; 5] = ["p", "h1", "h2", "h3", "li"];
    TRANSLATABLE.contains(&tag_name)
}

/// Escapes `&`, `<` and `>` so `text` can be placed in element content.
///
/// Existing entities are not recognised: an input `&lt;` becomes `&amp;lt;`.
/// Quote characters are left untouched.
fn escape_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            plain => escaped.push(plain),
        }
    }
    escaped
}

/// Replaces the `nth` (zero-based) non-overlapping occurrence of `needle` in
/// `haystack` with `replacement`.
///
/// Occurrences are counted left to right, each search resuming after the end
/// of the previous match. Returns `None` when `needle` is empty or when there
/// are fewer than `nth + 1` occurrences.
fn replace_nth_occurrence(
    haystack: &str,
    needle: &str,
    replacement: &str,
    nth: usize,
) -> Option<String> {
    if needle.is_empty() {
        return None;
    }

    let (hit, matched) = haystack.match_indices(needle).nth(nth)?;
    let before = &haystack[..hit];
    let after = &haystack[hit + matched.len()..];
    Some([before, replacement, after].concat())
}