use anyhow::{Error, anyhow};
use regex::Regex;
use std::borrow::Cow;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::{
OnceLock,
atomic::{AtomicBool, Ordering},
};
use url::Url;
pub(crate) mod epub;
pub(crate) mod html1;
pub(crate) mod html2;
pub(crate) mod html3;
pub(crate) mod llm;
pub(crate) mod markdown1;
pub(crate) mod markdown2;
pub(crate) mod text;
use llm::LlmConnector;
/// Document formats the translator knows about.
#[derive(Clone, Debug)]
pub enum DocFormat {
    /// An HTML document.
    Html,
    /// An EPUB e-book.
    Epub,
    /// A Markdown document.
    MarkDown,
    /// A plain-text document.
    Text,
}

impl FromStr for DocFormat {
    type Err = String;

    /// Parses a format name case-insensitively.
    ///
    /// Accepted names are `html`, `epub`, `md` and `text`.
    ///
    /// # Errors
    ///
    /// Returns a message naming the rejected input when it is not one of the
    /// accepted names.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_lowercase().as_str() {
            "html" => Ok(Self::Html),
            // Previously mapped to `Html`, which made `Epub` unreachable via parsing.
            "epub" => Ok(Self::Epub),
            "md" => Ok(Self::MarkDown),
            "text" => Ok(Self::Text),
            _ => Err(format!("Unsupported document format: {}", s)),
        }
    }
}
/// Wire protocol ("API dialect") spoken by an LLM endpoint.
#[derive(Clone, Debug)]
pub enum LlmApiStyle {
    OLLAMA,
    OPENAI,
    GEMINI,
    ANTHROPIC,
}

impl FromStr for LlmApiStyle {
    type Err = String;

    /// Parses an API style name (`ollama`, `openai`, `gemini`, `anthropic`),
    /// ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns a message echoing the original input when the name is not
    /// recognised.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        let style = match normalized.as_str() {
            "ollama" => Self::OLLAMA,
            "openai" => Self::OPENAI,
            "gemini" => Self::GEMINI,
            "anthropic" => Self::ANTHROPIC,
            _ => return Err(format!("Unsupported LLM API style: {}", s)),
        };
        Ok(style)
    }
}
/// LLM service to talk to.
///
/// The built-in variants carry no URL; `Custom` and `OLLAMA` let the caller
/// supply one.
#[derive(Clone, Debug)]
pub enum LlmProvider {
    /// A user-supplied endpoint together with the API dialect it speaks.
    Custom {
        api_style: LlmApiStyle,
        full_url: String,
    },
    /// An Ollama server, optionally at a user-supplied URL.
    OLLAMA { full_url: Option<String> },
    OPENAI,
    GEMINI,
    ANTHROPIC,
    ZHIPU,
    DEEPSEEK,
    QWEN,
}

impl FromStr for LlmProvider {
    type Err = String;

    /// Parses a provider specification.
    ///
    /// Accepted forms (the provider name is case-insensitive):
    ///
    /// * `openai`, `gemini`, `anthropic`, `zhipu`, `deepseek`, `qwen`
    /// * `ollama` or `ollama;<url>`
    /// * `custom;<api_style>;<url>`
    ///
    /// Only the leading fields are split on `;`, so a URL that itself
    /// contains `;` is kept intact. The URL is not validated here.
    ///
    /// # Errors
    ///
    /// Returns a message when the provider name is unknown, when a `custom`
    /// specification lacks the style or URL field, or when the API style
    /// fails to parse.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Separate the provider name from whatever follows the first ';'.
        let (name, rest) = match s.split_once(';') {
            Some((name, rest)) => (name, Some(rest)),
            None => (s, None),
        };
        let main_part = name.to_lowercase();
        match main_part.as_str() {
            "openai" => Ok(Self::OPENAI),
            "gemini" => Ok(Self::GEMINI),
            "anthropic" => Ok(Self::ANTHROPIC),
            "zhipu" => Ok(Self::ZHIPU),
            "deepseek" => Ok(Self::DEEPSEEK),
            "qwen" => Ok(Self::QWEN),
            "ollama" => Ok(Self::OLLAMA {
                full_url: rest.map(str::to_string),
            }),
            "custom" => {
                let (style, url) = rest.and_then(|r| r.split_once(';')).ok_or_else(|| {
                    String::from(
                        "Wrong custom provider format. It should be 'custom;<api_style>;<url>'",
                    )
                })?;
                let api_style = style.parse::<LlmApiStyle>()?;
                Ok(Self::Custom {
                    api_style,
                    full_url: url.to_string(),
                })
            }
            _ => Err(format!("Unknown provider: {}", main_part)),
        }
    }
}
/// User-facing LLM settings.
///
/// Optional fields left as `None` are replaced by defaults when a `TransBot`
/// is built (temperature `0.1`, time-out `300`).
#[derive(Clone, Debug)]
pub struct LlmConfig {
    /// Name of the model to request.
    pub model_name: String,
    /// Which service to talk to.
    pub provider: LlmProvider,
    /// API key, if any.
    pub api_key: Option<String>,
    /// Sampling temperature override.
    pub temperature: Option<f64>,
    /// Request time-out override.
    pub time_out: Option<u64>,
}

impl LlmConfig {
    /// Creates a configuration for `model_name` on `provider` with every
    /// optional setting unset.
    pub fn new(model_name: &str, provider: LlmProvider) -> Self {
        let model_name = String::from(model_name);
        Self {
            model_name,
            provider,
            api_key: None,
            temperature: None,
            time_out: None,
        }
    }

    /// Stores the API key and returns `self` for chaining.
    pub fn set_api_key(&mut self, api_key: &str) -> &mut Self {
        self.api_key = Some(String::from(api_key));
        self
    }

    /// Stores the temperature and returns `self` for chaining.
    pub fn set_temperature(&mut self, temperature: f64) -> &mut Self {
        self.temperature = Some(temperature);
        self
    }

    /// Stores the time-out and returns `self` for chaining.
    pub fn set_time_out(&mut self, time_out: u64) -> &mut Self {
        self.time_out = Some(time_out);
        self
    }
}
/// Fully resolved LLM settings used inside the crate.
///
/// Built by `TransBot::new` from an `LlmConfig`: the provider is expanded
/// into a concrete URL and API style, and unset options get their defaults.
#[derive(Clone, Debug)]
pub(crate) struct LlmConfigInner {
/// Model name copied from `LlmConfig::model_name`.
pub model_name: String,
/// Endpoint URL (for the Gemini style it already includes the model name).
pub full_url: String,
/// API dialect spoken by the endpoint.
pub api_style: LlmApiStyle,
/// API key, if one was configured.
pub api_key: Option<String>,
/// Sampling temperature (defaults to `0.1` in `TransBot::new`).
pub temperature: f64,
/// Time-out (defaults to `300` in `TransBot::new`); presumably seconds — confirm in the connector.
pub time_out: u64,
}
/// How document markup is handled during translation.
///
/// The variant selects which translation module `TransBot` dispatches to.
#[derive(Clone, Debug)]
pub enum SyntaxStrategy {
    MaintainedByLlm,
    MaintainedByTransBot,
    Stripped,
}

impl FromStr for SyntaxStrategy {
    type Err = String;

    /// Parses `byllm`, `bytransbot` or `stripped`, ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns a message echoing the original input for any other value.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let key = s.to_lowercase();
        let strategy = match &*key {
            "byllm" => Self::MaintainedByLlm,
            "bytransbot" => Self::MaintainedByTransBot,
            "stripped" => Self::Stripped,
            _ => return Err(format!("Unsupported syntax strategy: {}", s)),
        };
        Ok(strategy)
    }
}
/// Optional hints used when the system prompt is assembled.
///
/// When `full_prompt` is set it replaces the generated prompt entirely;
/// otherwise `topic` and `extra_prompt` are woven into the default prompt.
#[derive(Clone, Debug, Default)]
pub struct PromptHint {
    /// Subject matter of the text being translated.
    pub topic: Option<String>,
    /// Additional instructions appended to the generated prompt.
    pub extra_prompt: Option<String>,
    /// Complete prompt that overrides the generated one.
    pub full_prompt: Option<String>,
}

impl PromptHint {
    /// Creates a hint with nothing set.
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores the topic and returns `self` for chaining.
    pub fn set_topic(&mut self, topic: &str) -> &mut Self {
        self.topic = Some(String::from(topic));
        self
    }

    /// Stores the extra instructions and returns `self` for chaining.
    pub fn set_extra_prompt(&mut self, extra_prompt: &str) -> &mut Self {
        self.extra_prompt = Some(String::from(extra_prompt));
        self
    }

    /// Stores the full prompt override and returns `self` for chaining.
    pub fn set_full_prompt(&mut self, full_prompt: &str) -> &mut Self {
        self.full_prompt = Some(String::from(full_prompt));
        self
    }
}
/// User-facing translation settings.
///
/// Every field is optional; `TransBot::new` substitutes a default for each
/// field left as `None`.
#[derive(Clone, Debug, Default)]
pub struct TransConfig {
    /// Target language (default `"Chinese(zh-Hans)"`).
    pub dest_lang: Option<String>,
    /// Selects the single-prompt wording of the translation request (default `false`).
    pub single_prompt: Option<bool>,
    /// Selector of HTML elements to translate (default `"p,h1,h2,h3,li"`).
    /// The value `"whole"` (any letter case) sends the entire HTML document to the LLM.
    pub html_elem_selector: Option<String>,
    /// Markup handling strategy (default `SyntaxStrategy::MaintainedByLlm`).
    pub syntax_strategy: Option<SyntaxStrategy>,
    /// Hints used to build the prompt.
    pub prompt_hint: Option<PromptHint>,
    /// Forwarded to the LLM connector (default `false`).
    pub print_translating_text: Option<bool>,
    /// Forwarded to the LLM connector (default `false`).
    pub clean_cjk_ascii_spacing: Option<bool>,
    /// Send the whole document to the LLM in a single interaction (default `false`).
    pub whole_doc_to_llm: Option<bool>,
    /// Default `false`; presumably controls translation of code in Markdown —
    /// the consumer is outside this file.
    pub trans_code_in_md: Option<bool>,
    /// Default `400`; the unit is defined by the text translator outside this file.
    pub text_chunk_size: Option<usize>,
}

impl TransConfig {
    /// Creates a configuration with every option unset.
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores the target language and returns `self` for chaining.
    pub fn set_dest_lang(&mut self, dest_lang: &str) -> &mut Self {
        self.dest_lang = Some(dest_lang.to_string());
        self
    }

    /// Stores the single-prompt flag and returns `self` for chaining.
    pub fn set_single_prompt(&mut self, single_prompt: bool) -> &mut Self {
        self.single_prompt = Some(single_prompt);
        self
    }

    /// Stores the HTML element selector and returns `self` for chaining.
    pub fn set_html_elem_selector(&mut self, html_elem_selector: &str) -> &mut Self {
        self.html_elem_selector = Some(html_elem_selector.to_string());
        self
    }

    /// Stores the syntax strategy and returns `self` for chaining.
    pub fn set_syntax_strategy(&mut self, syntax_strategy: SyntaxStrategy) -> &mut Self {
        self.syntax_strategy = Some(syntax_strategy);
        self
    }

    /// Stores the prompt hint and returns `self` for chaining.
    pub fn set_prompt_hint(&mut self, prompt_hint: PromptHint) -> &mut Self {
        self.prompt_hint = Some(prompt_hint);
        self
    }

    /// Stores the print-translating-text flag and returns `self` for chaining.
    pub fn set_print_translating_text(&mut self, print_translating_text: bool) -> &mut Self {
        self.print_translating_text = Some(print_translating_text);
        self
    }

    /// Stores the CJK/ASCII spacing clean-up flag and returns `self` for chaining.
    pub fn set_clean_cjk_ascii_spacing(&mut self, clean_cjk_ascii_spacing: bool) -> &mut Self {
        self.clean_cjk_ascii_spacing = Some(clean_cjk_ascii_spacing);
        self
    }

    /// Stores the whole-document flag and returns `self` for chaining.
    pub fn set_whole_doc_to_llm(&mut self, whole_doc_to_llm: bool) -> &mut Self {
        self.whole_doc_to_llm = Some(whole_doc_to_llm);
        self
    }

    /// Stores the translate-code-in-Markdown flag and returns `self` for chaining.
    pub fn set_trans_code_in_md(&mut self, trans_code_in_md: bool) -> &mut Self {
        self.trans_code_in_md = Some(trans_code_in_md);
        self
    }

    /// Stores the text chunk size and returns `self` for chaining.
    pub fn set_text_chunk_size(&mut self, text_chunk_size: usize) -> &mut Self {
        self.text_chunk_size = Some(text_chunk_size);
        self
    }
}
/// Fully resolved translation settings held by `TransBot`.
///
/// Built in `TransBot::new` from a `TransConfig`, with defaults filled in for
/// every unset option.
pub(crate) struct TransConfigInner {
// Target language inserted into the prompt.
dest_lang: String,
// Selects the single-prompt wording in `get_prompt`; also passed to the LLM connector.
single_prompt: bool,
// HTML element selector; the value "whole" (any case) forces whole-document mode for HTML.
html_elem_selector: String,
// Chooses which html*/markdown* module performs the translation.
syntax_strategy: SyntaxStrategy,
// Passed to the LLM connector at construction.
print_translating_text: bool,
// Resolved from `TransConfig::clean_cjk_ascii_spacing`; passed to the LLM connector.
clean_spacing: bool,
// When true the entire document is sent to the LLM in one interaction.
whole_doc_to_llm: bool,
// Not read in this file; presumably consumed by the Markdown translators — confirm.
trans_code_in_md: bool,
// Not read in this file; presumably consumed by the text translator — confirm.
text_chunk_size: usize,
}
fn verify_url(url_str: &str) -> Result<String, Error> {
let url = Url::parse(url_str)?;
Ok(url.to_string())
}
/// Builds the prompt sent to the LLM.
///
/// * If the hint carries a `full_prompt`, that text is returned verbatim.
/// * Otherwise a default prompt is generated for `dest_lang`, mentioning the
///   hint's `topic` and appending its `extra_prompt` when present.
/// * `single_prompt` only changes the wording of the final request sentence.
fn get_prompt(dest_lang: &str, prompt_hint: &Option<PromptHint>, single_prompt: bool) -> String {
    let hint = prompt_hint.as_ref();

    // A complete user-supplied prompt wins over everything else.
    if let Some(full) = hint.and_then(|h| h.full_prompt.as_ref()) {
        return full.clone();
    }

    let topic = hint
        .and_then(|h| h.topic.as_ref())
        .map(|t| format!(" related to '{}'", t))
        .unwrap_or_default();
    let extra_prompt = hint
        .and_then(|h| h.extra_prompt.as_ref())
        .map(|e| format!(" {}\n", e))
        .unwrap_or_default();

    let translate_request = match single_prompt {
        true => format!("Please translate below text into {}:", dest_lang),
        false => format!("Please translate the provided text into {}.", dest_lang),
    };

    format!(
        "You are a professional translator. \
         Your task is to translate the provided text{} into {}. \
         Strictly maintain the original format, including HTML/XML tags and entities. \
         Return the translated text only.{} {}",
        topic, dest_lang, extra_prompt, translate_request,
    )
}
/// Derives a sibling path of `src_path` whose file name contains `to_extend`.
///
/// * `at_end == true`: `to_extend` is appended as a new trailing extension
///   (`book.epub` → `book.epub.<to_extend>`).
/// * `at_end == false`: `to_extend` is inserted before the existing extension
///   (`book.epub` → `book.<to_extend>.epub`). When the source has no
///   extension nothing is appended after `to_extend`
///   (`README` → `README.<to_extend>`), so no trailing dot is produced.
///
/// The file name is assembled as an `OsString`, so non-UTF-8 names are kept
/// instead of being replaced by an empty string.
pub(crate) fn get_extended_path<P: AsRef<Path>>(
    src_path: P,
    to_extend: &str,
    at_end: bool,
) -> PathBuf {
    let path = src_path.as_ref();
    let parent = path.parent().unwrap_or_else(|| Path::new(""));

    // Leading part of the new name: the whole file name or just its stem.
    let base = if at_end {
        path.file_name()
    } else {
        path.file_stem()
    };

    let mut new_filename = std::ffi::OsString::new();
    if let Some(base) = base {
        new_filename.push(base);
    }
    new_filename.push(".");
    new_filename.push(to_extend);

    // Re-attach the original extension only when there is one.
    if !at_end {
        if let Some(ext) = path.extension() {
            new_filename.push(".");
            new_filename.push(ext);
        }
    }

    parent.join(new_filename)
}
/// Translator façade: holds the resolved configuration and the LLM connector
/// and exposes the `translate_*` entry points.
pub struct TransBot {
// Resolved translation settings.
trans_config: TransConfigInner,
// Connector used for every LLM interaction.
llm_interactor: LlmConnector,
// When false, state files are neither passed on nor removed (see `translate_bytes` / `translate_file`).
resuming_enabled: bool,
// Set by `set_interrupted`, read by `is_interrupted`.
is_interrupted: AtomicBool,
}
impl TransBot {
/// Builds a translator from user-facing configuration.
///
/// The provider is resolved into an API style and endpoint URL, unset options
/// receive their defaults, and the LLM connector is created with the prompt
/// generated from `trans_config`. Resume support starts disabled.
///
/// # Errors
///
/// Fails when a user-supplied URL (or the assembled Gemini URL) does not
/// parse, or when the LLM connector cannot be created.
pub fn new(llm_config: &LlmConfig, trans_config: &TransConfig) -> Result<Self, Error> {
    // Resolve the API dialect and the base endpoint in one pass over the provider.
    let (api_style, base_url) = match &llm_config.provider {
        LlmProvider::Custom {
            api_style,
            full_url,
        } => (api_style.clone(), verify_url(full_url)?),
        LlmProvider::OLLAMA { full_url } => {
            let endpoint = match full_url {
                Some(custom) => verify_url(custom)?,
                None => String::from("http://localhost:11434/api/chat"),
            };
            (LlmApiStyle::OLLAMA, endpoint)
        }
        LlmProvider::GEMINI => (
            LlmApiStyle::GEMINI,
            String::from("https://generativelanguage.googleapis.com/v1beta/models"),
        ),
        LlmProvider::ANTHROPIC => (
            LlmApiStyle::ANTHROPIC,
            String::from("https://api.anthropic.com/v1/messages"),
        ),
        LlmProvider::OPENAI => (
            LlmApiStyle::OPENAI,
            String::from("https://api.openai.com/v1/chat/completions"),
        ),
        LlmProvider::ZHIPU => (
            LlmApiStyle::OPENAI,
            String::from("https://open.bigmodel.cn/api/paas/v4/chat/completions"),
        ),
        LlmProvider::DEEPSEEK => (
            LlmApiStyle::OPENAI,
            String::from("https://api.deepseek.com/chat/completions"),
        ),
        LlmProvider::QWEN => (
            LlmApiStyle::OPENAI,
            String::from("https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"),
        ),
    };

    // The Gemini style addresses the model through the URL path.
    let full_url = if matches!(api_style, LlmApiStyle::GEMINI) {
        verify_url(&format!(
            "{}/{}:generateContent",
            base_url, llm_config.model_name
        ))?
    } else {
        base_url
    };

    let llm_config_inner = LlmConfigInner {
        model_name: llm_config.model_name.clone(),
        full_url,
        api_style,
        api_key: llm_config.api_key.clone(),
        temperature: llm_config.temperature.unwrap_or(0.1),
        time_out: llm_config.time_out.unwrap_or(300),
    };

    let trans_config_inner = TransConfigInner {
        dest_lang: trans_config
            .dest_lang
            .clone()
            .unwrap_or_else(|| String::from("Chinese(zh-Hans)")),
        single_prompt: trans_config.single_prompt.unwrap_or(false),
        html_elem_selector: trans_config
            .html_elem_selector
            .clone()
            .unwrap_or_else(|| String::from("p,h1,h2,h3,li")),
        syntax_strategy: trans_config
            .syntax_strategy
            .clone()
            .unwrap_or(SyntaxStrategy::MaintainedByLlm),
        print_translating_text: trans_config.print_translating_text.unwrap_or(false),
        clean_spacing: trans_config.clean_cjk_ascii_spacing.unwrap_or(false),
        whole_doc_to_llm: trans_config.whole_doc_to_llm.unwrap_or(false),
        trans_code_in_md: trans_config.trans_code_in_md.unwrap_or(false),
        text_chunk_size: trans_config.text_chunk_size.unwrap_or(400),
    };

    let prompt = get_prompt(
        &trans_config_inner.dest_lang,
        &trans_config.prompt_hint,
        trans_config_inner.single_prompt,
    );
    let llm_interactor = LlmConnector::new(
        llm_config_inner,
        trans_config_inner.single_prompt,
        prompt,
        trans_config_inner.print_translating_text,
        trans_config_inner.clean_spacing,
    )?;

    Ok(Self {
        trans_config: trans_config_inner,
        llm_interactor,
        resuming_enabled: false,
        is_interrupted: AtomicBool::new(false),
    })
}
/// Turns resume support on or off.
///
/// While disabled, `translate_bytes` ignores any state file path it is given
/// and `translate_file` does not create one.
pub fn set_resuming_support(&mut self, enabled: bool) {
self.resuming_enabled = enabled;
}
/// Raises the interruption flag (stored with `Release` ordering).
///
/// Takes `&self`, so it can be called through a shared reference. Nothing in
/// this file clears the flag again.
pub fn set_interrupted(&self) {
self.is_interrupted.store(true, Ordering::Release);
}
/// Returns whether the interruption flag has been raised (loaded with
/// `Acquire` ordering, pairing with the store in `set_interrupted`).
pub(crate) fn is_interrupted(&self) -> bool {
self.is_interrupted.load(Ordering::Acquire)
}
/// Creates the error value used to report an interrupted translation job.
pub(crate) fn get_interrupted_error() -> Error {
anyhow!("The translation job is interrupted.")
}
/// Regenerates the prompt from `prompt_hint` and installs it on the LLM
/// connector, using the already configured target language and
/// single-prompt mode.
pub fn set_prompt(&mut self, prompt_hint: &PromptHint) {
    // `get_prompt` expects an `&Option<PromptHint>`, hence the owned copy.
    let hint = Some(prompt_hint.clone());
    let config = &self.trans_config;
    let prompt = get_prompt(&config.dest_lang, &hint, config.single_prompt);
    self.llm_interactor.set_prompt(prompt);
}
/// Translates an in-memory HTML document and returns the translated bytes.
///
/// No state file is used; see `translate_html_resumable` for that.
pub fn translate_html(&self, orig_html: &[u8]) -> Result<Vec<u8>, Error> {
// The turbofish only pins the otherwise uninferable path type of the `None`.
self.translate_bytes::<&str>(DocFormat::Html, orig_html, None)
}
/// Translates an in-memory HTML document, optionally using a state file.
///
/// `state_file_path` is honoured only when resume support is enabled; in
/// that case the file is removed (best effort) after a successful run.
pub fn translate_html_resumable<P: AsRef<Path>>(
&self,
orig_html: &[u8],
state_file_path: Option<P>,
) -> Result<Vec<u8>, Error> {
self.translate_bytes(DocFormat::Html, orig_html, state_file_path.as_ref())
}
/// Translates the HTML file at `src_path` and writes the result to
/// `dest_path`.
///
/// When `dest_path` is `None` the output path is derived from `src_path` by
/// inserting `transbot` before the extension.
pub fn translate_html_file<P: AsRef<Path>>(
&self,
src_path: P,
dest_path: Option<P>,
) -> Result<(), Error> {
self.translate_file(DocFormat::Html, src_path, dest_path)
}
/// Translates the EPUB file at `src_path` via the `epub` module.
///
/// When `dest_path` is `None` the output path is derived from `src_path` by
/// inserting `transbot` before the extension.
pub fn translate_epub_file<P: AsRef<Path>>(
    &self,
    src_path: P,
    dest_path: Option<P>,
) -> Result<(), Error> {
    match dest_path {
        Some(dest) => epub::epub(self, src_path, dest),
        None => {
            let dest = get_extended_path(src_path.as_ref(), "transbot", false);
            epub::epub(self, src_path, dest)
        }
    }
}
/// Translates an in-memory Markdown document and returns the translated
/// bytes.
///
/// `state_file_path` is honoured only when resume support is enabled.
pub fn translate_markdown<P: AsRef<Path>>(
&self,
orig_markdown: &[u8],
state_file_path: Option<P>,
) -> Result<Vec<u8>, Error> {
self.translate_bytes(DocFormat::MarkDown, orig_markdown, state_file_path.as_ref())
}
/// Translates the Markdown file at `src_path` and writes the result to
/// `dest_path`.
///
/// When `dest_path` is `None` the output path is derived from `src_path` by
/// inserting `transbot` before the extension.
pub fn translate_markdown_file<P: AsRef<Path>>(
&self,
src_path: P,
dest_path: Option<P>,
) -> Result<(), Error> {
self.translate_file(DocFormat::MarkDown, src_path, dest_path)
}
/// Translates an in-memory plain-text document and returns the translated
/// bytes.
///
/// Note: despite its name, `orig_markdown` carries the plain text here; it is
/// processed as `DocFormat::Text`. `state_file_path` is honoured only when
/// resume support is enabled.
pub fn translate_text<P: AsRef<Path>>(
&self,
orig_markdown: &[u8],
state_file_path: Option<P>,
) -> Result<Vec<u8>, Error> {
self.translate_bytes(DocFormat::Text, orig_markdown, state_file_path.as_ref())
}
/// Translates the plain-text file at `src_path` and writes the result to
/// `dest_path`.
///
/// When `dest_path` is `None` the output path is derived from `src_path` by
/// inserting `transbot` before the extension.
pub fn translate_text_file<P: AsRef<Path>>(
&self,
src_path: P,
dest_path: Option<P>,
) -> Result<(), Error> {
self.translate_file(DocFormat::Text, src_path, dest_path)
}
/// Translates `orig_doc` according to `format` and the configured syntax
/// strategy, leaving any state file in place.
///
/// In whole-document mode (configured explicitly, or HTML with the selector
/// `"whole"`) the input is decoded lossily as UTF-8 and sent to the LLM in a
/// single interaction. Otherwise the work is dispatched to the matching
/// `html*`, `markdown*` or `text` module.
///
/// # Errors
///
/// Besides errors from the LLM and the translation modules, returns an error
/// for Markdown with the `Stripped` strategy and for formats that have no
/// byte-level translator.
fn translate_bytes_no_delete<P: AsRef<Path>>(
    &self,
    format: DocFormat,
    orig_doc: &[u8],
    state_file_path: Option<P>,
) -> Result<Vec<u8>, Error> {
    let config = &self.trans_config;

    let html_as_whole = matches!(format, DocFormat::Html)
        && config.html_elem_selector.to_lowercase() == "whole";
    if config.whole_doc_to_llm || html_as_whole {
        let input = String::from_utf8_lossy(orig_doc);
        let translated = self.llm_interactor.interact(&input)?;
        return Ok(translated.into());
    }

    match (format, &config.syntax_strategy) {
        (DocFormat::Html, SyntaxStrategy::MaintainedByTransBot) => {
            html1::translate_html(self, orig_doc, state_file_path)
        }
        (DocFormat::Html, SyntaxStrategy::MaintainedByLlm) => {
            html2::translate_html(self, orig_doc, state_file_path)
        }
        (DocFormat::Html, SyntaxStrategy::Stripped) => {
            html3::translate_html(self, orig_doc, state_file_path)
        }
        (DocFormat::MarkDown, SyntaxStrategy::MaintainedByTransBot) => {
            markdown1::translate_markdown(self, orig_doc, state_file_path)
        }
        (DocFormat::MarkDown, SyntaxStrategy::MaintainedByLlm) => {
            markdown2::translate_markdown(self, orig_doc, state_file_path)
        }
        (DocFormat::MarkDown, SyntaxStrategy::Stripped) => Err(anyhow!(
            "Syntax strategy 'stripped' is not supported yet for MarkDown files."
        )),
        (DocFormat::Text, _) => text::translate_text(self, orig_doc, state_file_path),
        (DocFormat::Epub, _) => Err(anyhow!("Unexpected format.")),
    }
}
fn translate_bytes<P: AsRef<Path>>(
&self,
format: DocFormat,
orig_doc: &[u8],
state_file_path: Option<P>,
) -> Result<Vec<u8>, Error> {
let state_file_path = if !self.resuming_enabled {
None
} else {
state_file_path
};
let out = self.translate_bytes_no_delete(format, orig_doc, state_file_path.as_ref())?;
if self.resuming_enabled
&& let Some(path) = state_file_path
{
let _ = std::fs::remove_file(path);
}
Ok(out)
}
/// Reads `src_path`, translates its contents and writes the result.
///
/// The destination is `dest_path` when given, otherwise `src_path` with
/// `transbot` inserted before the extension. With resume support enabled a
/// state file named `<destination>.temp` is used; it is removed (best effort)
/// only after the output has been written successfully.
fn translate_file<P: AsRef<Path>>(
    &self,
    format: DocFormat,
    src_path: P,
    dest_path: Option<P>,
) -> Result<(), Error> {
    let source = src_path.as_ref();
    let input = std::fs::read(source)?;

    let dest = match dest_path {
        Some(explicit) => explicit.as_ref().to_path_buf(),
        None => get_extended_path(source, "transbot", false),
    };
    let state_file_path = self
        .resuming_enabled
        .then(|| get_extended_path(&dest, "temp", true));

    let output = self.translate_bytes_no_delete(format, &input, state_file_path.as_ref())?;
    std::fs::write(&dest, &output)?;

    // The state file exists only when resuming is enabled.
    if let Some(stale) = &state_file_path {
        let _ = std::fs::remove_file(stale);
    }
    Ok(())
}
/// Gives crate-internal code shared access to the LLM connector.
pub(crate) fn get_llm_interactor(&self) -> &LlmConnector {
&self.llm_interactor
}
}
/// Deletes whitespace runs that sit directly between an ASCII character and a
/// non-ASCII character, in either order.
///
/// Two passes are made: first "ASCII, whitespace, non-ASCII", then
/// "non-ASCII, whitespace, ASCII". Whitespace between two characters of the
/// same class is left alone.
///
/// Returns a `Cow` so that the input is handed back borrowed when neither
/// pass replaces anything.
pub(crate) fn remove_boundary_spaces<'a>(text: &'a str) -> Cow<'a, str> {
// Both patterns are compiled once and cached for the life of the process.
static RE_ASCII_NON: OnceLock<Regex> = OnceLock::new();
static RE_NON_ASCII: OnceLock<Regex> = OnceLock::new();
let re_ascii_non =
RE_ASCII_NON.get_or_init(|| Regex::new(r"(\p{ASCII})\s+(\P{ASCII})").unwrap());
let re_non_ascii =
RE_NON_ASCII.get_or_init(|| Regex::new(r"(\P{ASCII})\s+(\p{ASCII})").unwrap());
// "$1$2" keeps the two captured characters and drops the whitespace between them.
let step1 = re_ascii_non.replace_all(text, "$1$2");
match step1 {
// First pass changed nothing: the second pass can still borrow from `text`.
Cow::Borrowed(b) => re_non_ascii.replace_all(b, "$1$2"),
Cow::Owned(s) => {
let step2 = re_non_ascii.replace_all(&s, "$1$2");
match step2 {
// Second pass changed nothing: reuse the first pass's allocation
// (a borrow of the local `s` could not be returned).
Cow::Borrowed(_) => Cow::Owned(s),
Cow::Owned(s2) => Cow::Owned(s2),
}
}
}
}
/// Tag name shared across the crate.
// NOTE(review): presumably the element name used to mark up syntax during
// translation; its consumers live outside this file — confirm.
pub(crate) const SYNTAX_TAG: &str = "span";