use anyhow::Error;
use regex::Regex;
use std::borrow::Cow;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::OnceLock;
use url::Url;
pub(crate) mod epub;
pub(crate) mod html1;
pub(crate) mod html2;
pub(crate) mod html3;
pub(crate) mod llm;
use llm::LlmConnector;
/// Wire protocol ("API style") used when talking to an LLM endpoint.
///
/// Can be parsed case-insensitively from the strings `ollama`, `openai`,
/// `gemini` and `anthropic`.
#[derive(Clone, Debug)]
pub enum LlmApiStyle {
    /// Selected by the name `ollama`.
    OLLAMA,
    /// Selected by the name `openai`.
    OPENAI,
    /// Selected by the name `gemini`.
    GEMINI,
    /// Selected by the name `anthropic`.
    ANTHROPIC,
}

impl FromStr for LlmApiStyle {
    type Err = String;

    /// Parses an API style name, ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns `"Unsupported LLM API style: <input>"` for any other name.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_lowercase().as_str() {
            "ollama" => Ok(Self::OLLAMA),
            "openai" => Ok(Self::OPENAI),
            "gemini" => Ok(Self::GEMINI),
            "anthropic" => Ok(Self::ANTHROPIC),
            _ => Err(format!("Unsupported LLM API style: {}", s)),
        }
    }
}

/// LLM service to translate with.
///
/// Can be parsed from a specification string of the form `<name>`,
/// `ollama;<url>` or `custom;<api_style>;<url>`; see the [`FromStr`] impl.
#[derive(Clone, Debug)]
pub enum LlmProvider {
    /// A user-supplied endpoint together with the API style it speaks.
    Custom {
        api_style: LlmApiStyle,
        full_url: String,
    },
    /// An Ollama server; `full_url` is optional.
    OLLAMA { full_url: Option<String> },
    OPENAI,
    GEMINI,
    ANTHROPIC,
    ZHIPU,
    DEEPSEEK,
    QWEN,
}

impl FromStr for LlmProvider {
    type Err = String;

    /// Parses a provider specification.
    ///
    /// The provider name (case-insensitive) is everything before the first
    /// `;`. Accepted forms:
    ///
    /// * `openai`, `gemini`, `anthropic`, `zhipu`, `deepseek`, `qwen`
    /// * `ollama` or `ollama;<url>`
    /// * `custom;<api_style>;<url>`
    ///
    /// The URL is always the *remainder* of the string, so a URL that itself
    /// contains `;` is kept intact instead of being cut at the delimiter.
    ///
    /// # Errors
    ///
    /// Returns a message when the provider name is unknown, when a `custom`
    /// specification lacks the API style or URL, or when the API style is not
    /// a valid [`LlmApiStyle`].
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Only the first ';' separates the name from its arguments.
        let (name, args) = match s.split_once(';') {
            Some((name, args)) => (name, Some(args)),
            None => (s, None),
        };
        let name = name.to_lowercase();
        match name.as_str() {
            "openai" => Ok(Self::OPENAI),
            "gemini" => Ok(Self::GEMINI),
            "anthropic" => Ok(Self::ANTHROPIC),
            "zhipu" => Ok(Self::ZHIPU),
            "deepseek" => Ok(Self::DEEPSEEK),
            "qwen" => Ok(Self::QWEN),
            "ollama" => Ok(Self::OLLAMA {
                full_url: args.map(String::from),
            }),
            "custom" => {
                let (style, url) = args
                    .and_then(|rest| rest.split_once(';'))
                    .ok_or_else(|| {
                        String::from(
                            "Wrong custom provider format. It should be 'custom;<api_style>;<url>'",
                        )
                    })?;
                let api_style = style.parse::<LlmApiStyle>()?;
                Ok(Self::Custom {
                    api_style,
                    full_url: url.to_string(),
                })
            }
            _ => Err(format!("Unknown provider: {}", name)),
        }
    }
}
/// User-facing description of the LLM to use.
///
/// Only the model name and provider are mandatory; the remaining fields are
/// optional and are filled in through the chainable `set_*` methods.
#[derive(Debug, Clone)]
pub struct LlmConfig {
    /// Name of the model to request.
    pub model_name: String,
    /// Service that hosts the model.
    pub provider: LlmProvider,
    /// API key, if one has been set.
    pub api_key: Option<String>,
    /// Sampling temperature, if one has been set.
    pub temperature: Option<f64>,
    /// Request time-out, if one has been set.
    /// NOTE(review): the unit is not visible in this file (presumably seconds) — confirm.
    pub time_out: Option<u64>,
}

impl LlmConfig {
    /// Creates a configuration for `model_name` on `provider` with every
    /// optional setting left unset.
    pub fn new(model_name: &str, provider: LlmProvider) -> Self {
        let model_name = String::from(model_name);
        Self {
            model_name,
            provider,
            api_key: None,
            temperature: None,
            time_out: None,
        }
    }

    /// Stores the API key and returns `self` for chaining.
    pub fn set_api_key(&mut self, api_key: &str) -> &mut Self {
        self.api_key = Some(String::from(api_key));
        self
    }

    /// Stores the sampling temperature and returns `self` for chaining.
    pub fn set_temperature(&mut self, temperature: f64) -> &mut Self {
        self.temperature = Some(temperature);
        self
    }

    /// Stores the request time-out and returns `self` for chaining.
    pub fn set_time_out(&mut self, time_out: u64) -> &mut Self {
        self.time_out = Some(time_out);
        self
    }
}
/// Fully resolved LLM settings used inside the crate: every optional value
/// of the public configuration has been replaced by a concrete one.
#[derive(Debug, Clone)]
pub(crate) struct LlmConfigInner {
    /// Name of the model to request.
    pub model_name: String,
    /// Endpoint URL the requests are sent to.
    pub full_url: String,
    /// Wire protocol spoken by the endpoint.
    pub api_style: LlmApiStyle,
    /// API key, if any.
    pub api_key: Option<String>,
    /// Sampling temperature.
    pub temperature: f64,
    /// Request time-out.
    /// NOTE(review): the unit is not visible in this file (presumably seconds) — confirm.
    pub time_out: u64,
}
/// How inline markup inside the translated elements is handled.
///
/// Can be parsed case-insensitively from `byllm`, `bytransbot` and
/// `stripped`.
#[derive(Debug, Clone)]
pub enum SyntaxStrategy {
    /// Parsed from `byllm`.
    MaintainedByLlm,
    /// Parsed from `bytransbot`.
    MaintainedByTransBot,
    /// Parsed from `stripped`.
    Stripped,
}

impl FromStr for SyntaxStrategy {
    type Err = String;

    /// Parses a strategy name, ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns `"Unsupported syntax strategy: <input>"` for any other name.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        let strategy = match normalized.as_str() {
            "byllm" => Self::MaintainedByLlm,
            "bytransbot" => Self::MaintainedByTransBot,
            "stripped" => Self::Stripped,
            _ => return Err(format!("Unsupported syntax strategy: {}", s)),
        };
        Ok(strategy)
    }
}
/// Optional pieces used to build the prompt sent to the LLM.
///
/// All fields start out unset; use the chainable `set_*` methods to fill
/// them in.
#[derive(Debug, Clone, Default)]
pub struct PromptHint {
    /// Subject matter of the text being translated.
    pub topic: Option<String>,
    /// Additional instructions to add to the prompt.
    pub extra_prompt: Option<String>,
    /// A complete prompt supplied by the caller.
    pub full_prompt: Option<String>,
}

impl PromptHint {
    /// Creates a hint with every field unset.
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores the topic and returns `self` for chaining.
    pub fn set_topic(&mut self, topic: &str) -> &mut Self {
        self.topic = Some(String::from(topic));
        self
    }

    /// Stores the extra prompt text and returns `self` for chaining.
    pub fn set_extra_prompt(&mut self, extra_prompt: &str) -> &mut Self {
        self.extra_prompt = Some(String::from(extra_prompt));
        self
    }

    /// Stores the full prompt and returns `self` for chaining.
    pub fn set_full_prompt(&mut self, full_prompt: &str) -> &mut Self {
        self.full_prompt = Some(String::from(full_prompt));
        self
    }
}
/// User-facing translation settings.
///
/// Every field is optional and starts out unset; use the chainable `set_*`
/// methods to fill them in.
#[derive(Debug, Clone, Default)]
pub struct TransConfig {
    /// Language to translate into.
    pub dest_lang: Option<String>,
    /// Selector string naming the HTML elements to translate.
    pub html_elem_selector: Option<String>,
    /// How inline markup is handled.
    pub syntax_strategy: Option<SyntaxStrategy>,
    /// Hints used when building the LLM prompt.
    pub prompt_hint: Option<PromptHint>,
    /// NOTE(review): presumably enables echoing the text being translated — confirm in the `llm` module.
    pub print_translating_text: Option<bool>,
    /// NOTE(review): presumably enables clean-up of spacing between CJK and ASCII text — confirm in the `llm` module.
    pub clean_cjk_ascii_spacing: Option<bool>,
}

impl TransConfig {
    /// Creates a configuration with every setting unset.
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores the destination language and returns `self` for chaining.
    pub fn set_dest_lang(&mut self, dest_lang: &str) -> &mut Self {
        self.dest_lang = Some(String::from(dest_lang));
        self
    }

    /// Stores the element selector and returns `self` for chaining.
    pub fn set_html_elem_selector(&mut self, html_elem_selector: &str) -> &mut Self {
        self.html_elem_selector = Some(String::from(html_elem_selector));
        self
    }

    /// Stores the syntax strategy and returns `self` for chaining.
    pub fn set_syntax_strategy(&mut self, syntax_strategy: SyntaxStrategy) -> &mut Self {
        self.syntax_strategy = Some(syntax_strategy);
        self
    }

    /// Stores the prompt hint and returns `self` for chaining.
    pub fn set_prompt_hint(&mut self, prompt_hint: PromptHint) -> &mut Self {
        self.prompt_hint = Some(prompt_hint);
        self
    }

    /// Stores the `print_translating_text` flag and returns `self` for chaining.
    pub fn set_print_translating_text(&mut self, print_translating_text: bool) -> &mut Self {
        self.print_translating_text = Some(print_translating_text);
        self
    }

    /// Stores the `clean_cjk_ascii_spacing` flag and returns `self` for chaining.
    pub fn set_clean_cjk_ascii_spacing(&mut self, clean_cjk_ascii_spacing: bool) -> &mut Self {
        self.clean_cjk_ascii_spacing = Some(clean_cjk_ascii_spacing);
        self
    }
}
/// Fully resolved translation settings used inside the crate: the optional
/// values of the public configuration replaced by concrete ones.
pub(crate) struct TransConfigInner {
    /// Language to translate into.
    dest_lang: String,
    /// Selector string naming the HTML elements to translate.
    html_elem_selector: String,
    /// How inline markup is handled.
    syntax_strategy: SyntaxStrategy,
    /// Resolved value of the public `print_translating_text` option.
    print_translating_text: bool,
    /// Resolved value of the public `clean_cjk_ascii_spacing` option.
    clean_spacing: bool,
}
fn verify_url(url_str: &str) -> Result<String, Error> {
let url = Url::parse(url_str)?;
Ok(url.to_string())
}
/// Builds the prompt sent to the LLM.
///
/// * When the hint carries a `full_prompt`, it is returned unchanged.
/// * Otherwise the built-in prompt is produced for `dest_lang`, with
///   `" related to <topic>"` inserted when a topic is set and the
///   `extra_prompt` (if any) appended at the end.
fn get_prompt(dest_lang: &str, prompt_hint: &Option<PromptHint>) -> String {
    let hint = prompt_hint.as_ref();

    // A caller-supplied full prompt overrides everything else.
    if let Some(full_prompt) = hint.and_then(|h| h.full_prompt.as_ref()) {
        return full_prompt.clone();
    }

    let topic = match hint.and_then(|h| h.topic.as_ref()) {
        Some(t) => format!(" related to {}", t),
        None => String::new(),
    };
    let extra_prompt = hint
        .and_then(|h| h.extra_prompt.as_deref())
        .unwrap_or("");

    format!(
        "You are a professional translator. \
         Translate the provided text{} into {}. \
         Strictly maintain the original HTML tags and HTML entities. \
         Return the translated text only. {}",
        topic, dest_lang, extra_prompt
    )
}
/// Derives a sibling path of `src_path` whose file name carries the extra
/// component `to_extend`.
///
/// * `at_end == true`: the component is appended after the whole file name,
///   e.g. `book.epub` + `bak` -> `book.epub.bak`.
/// * `at_end == false`: the component is inserted before the extension,
///   e.g. `book.epub` + `transbot` -> `book.transbot.epub`. When the file has
///   no extension no trailing dot is produced (`README` -> `README.transbot`).
///
/// The name is assembled as an `OsString`, so file names that are not valid
/// UTF-8 are preserved instead of being replaced by an empty string.
pub(crate) fn get_extended_path<P: AsRef<Path>>(
    src_path: P,
    to_extend: &str,
    at_end: bool,
) -> PathBuf {
    let path = src_path.as_ref();

    // The part of the original name that goes in front of the new component.
    let leading = if at_end {
        path.file_name()
    } else {
        path.file_stem()
    };

    let mut new_filename = std::ffi::OsString::new();
    if let Some(leading) = leading {
        new_filename.push(leading);
    }
    new_filename.push(".");
    new_filename.push(to_extend);

    // Re-attach the extension only when there is one; otherwise the result
    // would end in a stray '.'.
    if !at_end {
        if let Some(ext) = path.extension() {
            new_filename.push(".");
            new_filename.push(ext);
        }
    }

    path.parent()
        .unwrap_or_else(|| Path::new(""))
        .join(new_filename)
}
pub struct TransBot {
trans_config: TransConfigInner,
llm_interactor: LlmConnector,
}
impl TransBot {
pub fn new(llm_config: &LlmConfig, trans_config: &TransConfig) -> Result<Self, Error> {
let mut llm_config_inner = LlmConfigInner {
model_name: llm_config.model_name.to_owned(),
api_style: match &llm_config.provider {
LlmProvider::Custom {
full_url: _,
api_style: style,
} => style.to_owned(),
LlmProvider::OLLAMA { full_url: _ } => LlmApiStyle::OLLAMA,
LlmProvider::GEMINI => LlmApiStyle::GEMINI,
LlmProvider::ANTHROPIC => LlmApiStyle::ANTHROPIC,
_ => LlmApiStyle::OPENAI,
},
full_url: match &llm_config.provider {
LlmProvider::Custom {
full_url: url,
api_style: _,
} => verify_url(url)?,
LlmProvider::OLLAMA { full_url: url } => match url {
Some(url) => verify_url(url)?,
None => String::from("http://localhost:11434/api/chat"),
},
LlmProvider::GEMINI => {
String::from("https://generativelanguage.googleapis.com/v1beta/models")
}
LlmProvider::OPENAI => String::from("https://api.openai.com/v1/chat/completions"),
LlmProvider::ANTHROPIC => String::from("https://api.anthropic.com/v1/messages"),
LlmProvider::ZHIPU => {
String::from("https://open.bigmodel.cn/api/paas/v4/chat/completions")
}
LlmProvider::DEEPSEEK => String::from("https://api.deepseek.com/chat/completions"),
LlmProvider::QWEN => String::from(
"https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
),
},
api_key: llm_config.api_key.to_owned(),
temperature: llm_config.temperature.unwrap_or(0.1),
time_out: llm_config.time_out.unwrap_or(300),
};
if let LlmApiStyle::GEMINI = &llm_config_inner.api_style {
llm_config_inner.full_url = verify_url(&format!(
"{}/{}:generateContent",
llm_config_inner.full_url, llm_config_inner.model_name
))?;
}
let trans_config_inner = TransConfigInner {
dest_lang: trans_config
.dest_lang
.to_owned()
.unwrap_or("Chinese(汉语)".into()),
html_elem_selector: trans_config
.html_elem_selector
.to_owned()
.unwrap_or("p,h1,h2,h3,li".into()),
syntax_strategy: trans_config
.syntax_strategy
.to_owned()
.unwrap_or(SyntaxStrategy::MaintainedByLlm),
print_translating_text: trans_config.print_translating_text.unwrap_or(false),
clean_spacing: trans_config.clean_cjk_ascii_spacing.unwrap_or(false),
};
let llm_interactor = LlmConnector::new(
llm_config_inner,
get_prompt(&trans_config_inner.dest_lang, &trans_config.prompt_hint),
trans_config_inner.print_translating_text,
trans_config_inner.clean_spacing,
)?;
Ok(Self {
trans_config: trans_config_inner,
llm_interactor,
})
}
pub fn set_prompt(&mut self, prompt_hint: &PromptHint) {
let hint = Some(prompt_hint.to_owned());
self.llm_interactor
.set_prompt(get_prompt(&self.trans_config.dest_lang, &hint));
}
pub fn translate_html(&self, orig_html: &[u8]) -> Result<Vec<u8>, Error> {
if self.trans_config.html_elem_selector.to_lowercase() == "whole" {
let input = String::from_utf8_lossy(orig_html);
let output = self.llm_interactor.interact(&input)?;
return Ok(output.into());
}
match self.trans_config.syntax_strategy {
SyntaxStrategy::MaintainedByTransBot => html1::translate_html(
&self.llm_interactor,
&self.trans_config.html_elem_selector,
orig_html,
),
SyntaxStrategy::MaintainedByLlm => html2::translate_html(
&self.llm_interactor,
&self.trans_config.html_elem_selector,
orig_html,
),
SyntaxStrategy::Stripped => html3::translate_html(
&self.llm_interactor,
&self.trans_config.html_elem_selector,
orig_html,
),
}
}
pub fn translate_html_file<P: AsRef<Path>>(
&self,
src_path: P,
dest_path: Option<P>,
) -> Result<(), Error> {
let input = std::fs::read(src_path.as_ref())?;
let output = self.translate_html(&input)?;
if let Some(dest) = dest_path {
std::fs::write(dest, &output)?;
} else {
let dest = get_extended_path(src_path, "transbot", false);
std::fs::write(dest, &output)?;
}
Ok(())
}
pub fn translate_epub_file<P: AsRef<Path>>(
&self,
src_path: P,
dest_path: Option<P>,
) -> Result<(), Error> {
if let Some(dest) = dest_path {
epub::epub(self, src_path, dest)
} else {
let dest = get_extended_path(src_path.as_ref(), "transbot", false);
epub::epub(self, src_path, dest)
}
}
pub(crate) fn get_llm_interactor(&self) -> &LlmConnector {
&self.llm_interactor
}
}
/// Removes whitespace runs that sit between an ASCII character and a
/// non-ASCII character, in either order.
///
/// Two passes are made: first "ASCII, whitespace, non-ASCII", then
/// "non-ASCII, whitespace, ASCII". The input is returned borrowed when
/// neither pass changes anything.
pub(crate) fn remove_boundary_spaces<'a>(text: &'a str) -> Cow<'a, str> {
    static RE_ASCII_NON: OnceLock<Regex> = OnceLock::new();
    static RE_NON_ASCII: OnceLock<Regex> = OnceLock::new();

    // Both patterns are compiled once and reused on later calls.
    let ascii_then_other =
        RE_ASCII_NON.get_or_init(|| Regex::new(r"(\p{ASCII})\s+(\P{ASCII})").unwrap());
    let other_then_ascii =
        RE_NON_ASCII.get_or_init(|| Regex::new(r"(\P{ASCII})\s+(\p{ASCII})").unwrap());

    match ascii_then_other.replace_all(text, "$1$2") {
        // First pass changed nothing: the second pass can still borrow `text`.
        Cow::Borrowed(untouched) => other_then_ascii.replace_all(untouched, "$1$2"),
        Cow::Owned(first_pass) => {
            // Keep the first-pass string when the second pass is a no-op, so
            // no extra copy is made.
            let second_pass = match other_then_ascii.replace_all(&first_pass, "$1$2") {
                Cow::Owned(changed) => Some(changed),
                Cow::Borrowed(_) => None,
            };
            Cow::Owned(second_pass.unwrap_or(first_pass))
        }
    }
}
/// Tag name `a`, shared across the crate.
/// NOTE(review): no use of this constant is visible in this file — presumably
/// the `html*` modules use it as the tag that marks inline syntax; confirm.
pub(crate) const SYNTAX_TAG: &str = "a";