trans-epub 0.0.23

Translate EPUB files from the command line
use crate::translate::translator::Translator;
use log::{debug, info};
use quick_xml::events::{BytesText, Event};
use quick_xml::{Reader, Writer, escape::unescape};
use regex::Regex;
use std::fs::File;
use std::io::{Cursor, Read, Write};
use std::path::PathBuf;
use zip::ZipArchive;
use zip::write::SimpleFileOptions;

/// An EPUB translation job: the archive to read and the archive to write.
#[derive(Debug, Clone)]
pub struct Epub {
    /// Path of the EPUB archive to read.
    input_path: PathBuf,
    /// Path the translated EPUB archive is written to.
    output_path: PathBuf,
}

impl Epub {
    /// Creates a job that reads the EPUB at `input_path` and writes the
    /// translated EPUB to `output_path`.
    pub fn new(input_path: PathBuf, output_path: PathBuf) -> Self {
        Self {
            input_path,
            output_path,
        }
    }

    /// Translates every markup entry of the input archive and writes the
    /// result to the output path.
    ///
    /// Entries whose name ends in `.xhtml`, `.xml`, `.html` or `.htm` are
    /// stripped of ruby annotations, their block texts are sent to
    /// `translator`, and the translations are merged back into the markup.
    /// All other entries are copied through unchanged. Entry order is kept.
    ///
    /// # Panics
    ///
    /// Panics if the input archive cannot be opened or read, or if the output
    /// archive cannot be created or written.
    pub async fn translate(self, translator: Translator) {
        // Entry-name suffixes that are treated as translatable markup.
        const MARKUP_SUFFIXES: [&str; 4] = [".xhtml", ".xml", ".html", ".htm"];

        debug!("translate start");
        let input_file = File::open(self.input_path).expect("input file open fail");
        let mut archive = ZipArchive::new(input_file).expect("input file unzip fail");

        // Load every entry into memory first so the archive borrow ends
        // before any translation work starts.
        let mut file_contents = Vec::with_capacity(archive.len());
        for i in 0..archive.len() {
            let mut file = archive.by_index(i).expect("input file entry open fail");
            let mut buffer = Vec::new();
            file.read_to_end(&mut buffer)
                .expect("input file entry read fail");
            file_contents.push((file.name().to_string(), buffer));
        }

        let size = file_contents.len();
        let mut translated_contents = Vec::with_capacity(size);
        for (position, (name, content)) in file_contents.into_iter().enumerate() {
            info!("{}/{size} {name}", position + 1);
            let is_markup = MARKUP_SUFFIXES
                .iter()
                .any(|suffix| name.ends_with(*suffix));
            if is_markup {
                let content = strip_xml_content(&content);
                let lines = translate_lines(&content).await;
                let lines = translator.translate(lines).await;
                let translated_content = translate_xml_content(lines, &content).await;
                translated_contents.push((name, translated_content));
            } else {
                translated_contents.push((name, content));
            }
        }
        debug!("translate end");

        debug!("output file start");
        let file = File::create(self.output_path).expect("output file open fail");
        let mut zip = zip::ZipWriter::new(file);

        for (name, content) in translated_contents {
            // The EPUB container format requires the `mimetype` entry to be
            // stored without compression; the default options would deflate it.
            let options = if name == "mimetype" {
                SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored)
            } else {
                SimpleFileOptions::default()
            };
            zip.start_file(name, options)
                .expect("output file zip fail");
            zip.write_all(&content).expect("output file zip fail");
        }

        zip.finish().expect("output file zip fail");
        debug!("output file end");
    }
}

/// Removes ruby annotations from an XML/XHTML document.
///
/// `<ruby>` start and end tags are dropped while their base text is kept.
/// `<rt>` and `<rp>` elements are dropped together with everything inside
/// them, so neither the reading nor the fallback parentheses leak into the
/// text. Every other event is copied through. Text nodes are trimmed because
/// the reader runs with `trim_text(true)`.
///
/// # Panics
///
/// Panics if `content` cannot be parsed or the output cannot be written.
fn strip_xml_content(content: &[u8]) -> Vec<u8> {
    let mut reader = Reader::from_reader(content);
    reader.config_mut().trim_text(true);

    let mut writer = Writer::new(Cursor::new(Vec::new()));
    // Nesting depth inside <rt>/<rp>; every event seen at depth > 0 is discarded.
    let mut annotation_depth = 0usize;
    loop {
        let event = reader.read_event().expect("xml parse fail");
        match event {
            Event::Eof => break,
            Event::Start(ref e) if matches!(e.name().0, b"rt" | b"rp") => {
                annotation_depth += 1;
            }
            Event::End(ref e) if matches!(e.name().0, b"rt" | b"rp") => {
                annotation_depth = annotation_depth.saturating_sub(1);
            }
            // Inside an annotation: drop text and nested markup alike.
            _ if annotation_depth > 0 => {}
            // Unwrap <ruby>: drop the tags, keep the base text that follows.
            Event::Start(ref e) if matches!(e.name().0, b"ruby") => {}
            Event::End(ref e) if matches!(e.name().0, b"ruby") => {}
            // Self-closing ruby markup carries no content worth keeping.
            Event::Empty(ref e) if matches!(e.name().0, b"ruby" | b"rt" | b"rp") => {}
            other => writer.write_event(other).expect("xml write fail"),
        }
    }
    writer.into_inner().into_inner()
}

/// Collects the text of every translatable block in an XML/XHTML document.
///
/// A block is an outermost `p`, `h1`–`h6` or `li` element. The text nodes
/// inside it, including those of nested elements, are concatenated into one
/// string. Blocks whose text consists only of whitespace, control characters,
/// symbols, digits and punctuation are skipped. The result is in document
/// order.
///
/// # Panics
///
/// Panics if the document cannot be parsed, or if a text node is not valid
/// UTF-8 or cannot be unescaped.
async fn translate_lines(content: &[u8]) -> Vec<String> {
    /// Returns `true` for element names that delimit a translatable block.
    fn is_block_tag(tag: &[u8]) -> bool {
        matches!(
            tag,
            b"p" | b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" | b"li"
        )
    }

    let ignore_text = Regex::new(r"^[\s\p{Cc}\p{So}0-9[:punct:]–]*$").unwrap();
    let mut reader = Reader::from_reader(content);
    reader.config_mut().trim_text(true);

    let mut is_translate = false;
    // Name of the element that opened the current block.
    let mut open_tag: Vec<u8> = Vec::new();
    // How many elements named `open_tag` are currently open inside the block.
    let mut depth = 0usize;
    let mut block_text = String::new();
    let mut result = Vec::new();

    loop {
        match reader.read_event() {
            Ok(Event::Eof) => break,
            Ok(Event::Start(e)) => {
                let tag = e.name().0;
                if is_block_tag(tag) {
                    if !is_translate {
                        open_tag.clear();
                        open_tag.extend_from_slice(tag);
                        is_translate = true;
                        block_text.clear();
                    }
                    if tag == open_tag.as_slice() {
                        depth += 1;
                    }
                }
            }
            Ok(Event::End(e)) => {
                // Only the element kind that opened the block can close it.
                if is_translate && e.name().0 == open_tag.as_slice() {
                    depth -= 1;
                    if depth == 0 {
                        is_translate = false;
                        if !ignore_text.is_match(&block_text) {
                            result.push(std::mem::take(&mut block_text));
                        }
                    }
                }
            }
            Ok(Event::Text(e)) => {
                // Every text node is decoded, not just those inside a block,
                // so malformed text is reported here.
                let raw =
                    std::str::from_utf8(e.as_ref()).expect("xml text is not valid UTF-8");
                let text = unescape(raw).expect("xml text unescape fail");
                if is_translate {
                    block_text.push_str(&text);
                }
            }
            // Fail loudly instead of silently skipping a parse error.
            Err(e) => panic!("xml parse fail: {e:?}"),
            Ok(_) => {}
        }
    }
    result
}

/// Rewrites an XML/XHTML document with a translation appended to each
/// translatable block.
///
/// Blocks are detected by the same rule as in `translate_lines`: an outermost
/// `p`, `h1`–`h6` or `li` element whose text is not only whitespace, control
/// characters, symbols, digits and punctuation. The n-th such block receives
/// the n-th entry of `lines`, wrapped in `<<` and `>>`, just before its
/// closing tag. The original text is kept.
///
/// If `lines` has fewer entries than the document has translatable blocks,
/// the remaining blocks are left untranslated.
///
/// # Panics
///
/// Panics if the document cannot be parsed, if a text node is not valid UTF-8
/// or cannot be unescaped, or if the output cannot be written.
async fn translate_xml_content(lines: Vec<String>, content: &[u8]) -> Vec<u8> {
    /// Returns `true` for element names that delimit a translatable block.
    fn is_block_tag(tag: &[u8]) -> bool {
        matches!(
            tag,
            b"p" | b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" | b"li"
        )
    }

    let ignore_text = Regex::new(r"^[\s\p{Cc}\p{So}0-9[:punct:]–]*$").unwrap();
    let mut reader = Reader::from_reader(content);
    reader.config_mut().trim_text(true);

    let mut writer = Writer::new(Cursor::new(Vec::new()));
    let mut is_translate = false;
    // Name of the element that opened the current block.
    let mut open_tag: Vec<u8> = Vec::new();
    // How many elements named `open_tag` are currently open inside the block.
    let mut depth = 0usize;
    let mut block_text = String::new();
    // Translations are consumed in document order, one per translatable block.
    let mut translations = lines.into_iter();

    loop {
        match reader.read_event() {
            Ok(Event::Eof) => break,
            Ok(Event::Start(e)) => {
                let tag = e.name().0;
                if is_block_tag(tag) {
                    if !is_translate {
                        open_tag.clear();
                        open_tag.extend_from_slice(tag);
                        is_translate = true;
                        block_text.clear();
                    }
                    if tag == open_tag.as_slice() {
                        depth += 1;
                    }
                }
                writer
                    .write_event(Event::Start(e))
                    .expect("xml write fail");
            }
            Ok(Event::End(e)) => {
                // Only the element kind that opened the block can close it.
                if is_translate && e.name().0 == open_tag.as_slice() {
                    depth -= 1;
                    if depth == 0 {
                        is_translate = false;
                        if !ignore_text.is_match(&block_text) {
                            // A missing translation leaves the block as-is
                            // rather than aborting the whole document.
                            if let Some(line) = translations.next() {
                                let marked = format!("<<{line}>>");
                                writer
                                    .write_event(Event::Text(BytesText::new(&marked)))
                                    .expect("xml write fail");
                            }
                        }
                    }
                }
                writer.write_event(Event::End(e)).expect("xml write fail");
            }
            Ok(Event::Text(e)) => {
                let raw =
                    std::str::from_utf8(e.as_ref()).expect("xml text is not valid UTF-8");
                let text = unescape(raw).expect("xml text unescape fail");
                if is_translate {
                    block_text.push_str(&text);
                }
                // `BytesText::new` re-escapes the decoded text on output.
                writer
                    .write_event(Event::Text(BytesText::new(&text)))
                    .expect("xml write fail");
            }
            event => writer
                .write_event(event.expect("xml parse fail"))
                .expect("xml write fail"),
        }
    }
    writer.into_inner().into_inner()
}