// clitrans 0.4.1
//
// Yet another command-line translator
#[cfg(test)]
mod test;

use std::sync::LazyLock;

use super::*;
use itertools::Itertools;
use regex::Regex;
use scraper::{ElementRef, Html, Selector};
use url::Url;

/// Translator backed by the Youdao web dictionary (`dict.youdao.com`).
///
/// The type carries no state; the lookup itself lives in its `Translate` implementation.
#[derive(Clone)]
pub struct Translator;

impl Translate for Translator {
    fn translate(&self, query: &str) -> Result<Option<Translation>> {
        let url: Url = format!("http://dict.youdao.com/w/{query}").parse()?;
        let resp = ureq::request_url("GET", &url)
            .set("Accept-Encoding", "gzip")
            .set("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7")
            .call()?
            .into_string()?;
        Ok(parse(url, &resp))
    }
}

/// Builds a [`Translation`] from the HTML `body` of a Youdao result page.
///
/// `url` is recorded (as a string) in the resulting translation. Returns `None` when
/// the page has no `#results-contents` element, or when neither the dictionary
/// headword nor the first `#fanyiToggle` paragraph can be found.
fn parse(url: Url, body: &str) -> Option<Translation> {
    let document = Html::parse_document(body);
    let results = get_element(&document, "#results-contents")?;

    // Prefer the dictionary headword; otherwise fall back to the first paragraph of
    // the `#fanyiToggle` section (presumably the echoed source text — TODO confirm).
    let headword = match get_text(results, "#phrsListTab > h2 > .keyword")
        .into_iter()
        .next()
    {
        Some(word) => word,
        None => get_text(results, "#fanyiToggle > div > p:nth-child(1)")
            .into_iter()
            .next()?,
    };

    let pronunciations = parse_pronounciations(results);
    let explanations = parse_explanation(results);
    let phrases = parse_phrases(results);

    let translation = Translation::new(headword, url.to_string())
        .pronunciations(pronunciations)
        .explanations(explanations)
        .phrases(phrases);
    Some(translation)
}

/// Extracts the phrase groups listed under `#webPhrase`.
///
/// Every `p.wordGroup` that has a `.contentTitle` yields one `(title, items)` pair:
/// the title is removed once from the group's text and the remainder is split on
/// semicolons. Each item has its whitespace collapsed to single spaces, and duplicate
/// items are dropped while keeping first-seen order. Groups without a title are skipped.
fn parse_phrases(content: ElementRef) -> Vec<(String, Vec<String>)> {
    let group_selector = Selector::parse("#webPhrase > p.wordGroup").unwrap();
    content
        .select(&group_selector)
        .filter_map(|group| {
            let title = get_text(group, ".contentTitle").into_iter().next()?;
            let text: String = group.text().collect();
            let items = text
                .replacen(&title, "", 1)
                // Split on both the ASCII semicolon and the fullwidth semicolon (U+FF1B).
                // The second delimiter had been reduced to an empty char literal, which
                // does not compile; it is written as an escape so it survives re-encoding.
                // NOTE(review): U+FF1B is inferred from the Chinese page content — confirm.
                .split(&[';', '\u{FF1B}'][..])
                .map(|s| s.split_whitespace().join(" "))
                .unique()
                .collect();
            Some((title, items))
        })
        .collect()
}

/// Collects the pronunciations shown in the headword line of the result page.
///
/// Each `.pronounce` entry whose text matches the `美 [...]` pattern becomes a US
/// pronunciation, otherwise one matching `英 [...]` becomes a UK pronunciation; entries
/// matching neither are ignored. When the entry contains an `a.dictvoice` link with a
/// `data-rel` attribute, an audio URL built from it is attached. Finally, the first
/// `span.phonetic` text (with surrounding square brackets trimmed) is appended as pinyin.
fn parse_pronounciations(detail: ElementRef) -> Vec<Pronunciation> {
    static RE_US: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*美\s*\[(.*?)]").unwrap());
    static RE_UK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*英\s*\[(.*?)]").unwrap());

    let entry_selector = Selector::parse("#phrsListTab > h2 > div.baav > .pronounce").unwrap();
    let voice_selector = Selector::parse("a.dictvoice").unwrap();

    let mut result: Vec<Pronunciation> = detail
        .select(&entry_selector)
        .filter_map(|entry| {
            let label: String = entry.text().collect();
            let audio = entry
                .select(&voice_selector)
                .next()
                .and_then(|link| link.value().attr("data-rel"))
                .map(|rel| format!("https://dict.youdao.com/dictvoice?audio={rel}"));
            // The US pattern takes precedence over the UK one.
            if let Some(us) = RE_US.captures(&label) {
                Some(Pronunciation::us(us[1].to_owned()).audio(audio))
            } else {
                RE_UK
                    .captures(&label)
                    .map(|uk| Pronunciation::uk(uk[1].to_owned()).audio(audio))
            }
        })
        .collect();

    let phonetics = get_text(detail, "#phrsListTab > h2 > span.phonetic");
    result.extend(phonetics.first().map(|raw| {
        Pronunciation::pinyin(raw.as_str().trim_matches(&['[', ']'][..]).to_owned())
    }));
    result
}

/// Gathers every kind of explanation found on the page.
///
/// The result is the concatenation, in this order, of the outputs of
/// `parse_explanation_en`, `parse_explanation_cn`, `parse_explanation_machine`
/// and `parse_explanation_web`.
fn parse_explanation(detail: ElementRef) -> Vec<Explanation> {
    parse_explanation_en(detail)
        .into_iter()
        .chain(parse_explanation_cn(detail))
        .chain(parse_explanation_machine(detail))
        .chain(parse_explanation_web(detail))
        .collect()
}

/// Parses the `li` entries of the dictionary translation list.
///
/// Each entry's text is matched against an optional leading part-of-speech marker
/// (word characters followed by a dot, e.g. `n.`) and the remaining text. Entries with
/// a marker are tagged `ExpTag::Pos`, the others `ExpTag::Phrase`. The remaining text
/// is split on semicolons and each piece is trimmed.
fn parse_explanation_en(detail: ElementRef) -> Vec<Explanation> {
    static RE_EXP: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r#"(?P<pos>\w+\.)?(?P<exp>.*)"#).unwrap());
    let selector = Selector::parse("#phrsListTab > div.trans-container > ul > li").unwrap();
    let mut exps = vec![];
    for li in detail.select(&selector) {
        let text: String = li.text().collect();
        let Some(caps) = RE_EXP.captures(&text) else {
            continue;
        };
        let Some(exp) = caps.name("exp") else {
            continue;
        };
        let tag = caps
            .name("pos")
            .map(|pos| ExpTag::Pos(pos.as_str().trim().to_owned()))
            .unwrap_or(ExpTag::Phrase);
        let items = exp
            .as_str()
            // Split on the fullwidth semicolon (U+FF1B) as well as the ASCII one.
            // The fullwidth delimiter had been reduced to an empty char literal, which
            // does not compile; it is written as an escape so it survives re-encoding.
            // NOTE(review): U+FF1B is inferred from the Chinese page content — confirm.
            .split(&['\u{FF1B}', ';'][..])
            .map(|v| v.trim().to_owned())
            .collect();
        exps.push(Explanation { tag, items });
    }
    exps
}

/// Parses the `p` records of the dictionary translation list.
///
/// For every record, the texts of its `span .search-js` elements become the items;
/// records without any such text are skipped. The tag is `ExpTag::Pos` built from the
/// first-child `span` that is not a `.contentTitle`, or `ExpTag::Phrase` when there is
/// no such span.
fn parse_explanation_cn(detail: ElementRef) -> Vec<Explanation> {
    let mut exps = vec![];
    let selector = Selector::parse("#phrsListTab > div.trans-container > ul > p").unwrap();
    for record in detail.select(&selector) {
        // Query the items once and reuse them, instead of running the same selector
        // a second time after the emptiness check.
        let items = get_text(record, "span .search-js");
        if items.is_empty() {
            continue;
        }
        let tag = get_text(record, "span:nth-child(1):not(.contentTitle)")
            .into_iter()
            .next()
            .map(ExpTag::Pos)
            .unwrap_or(ExpTag::Phrase);
        exps.push(Explanation { tag, items });
    }
    exps
}

/// Reads the machine-translation result, if the page has one.
///
/// Takes the first text matched by the second paragraph of the `#fanyiToggle` section
/// and wraps it in a single-item `ExpTag::Machine` explanation; returns `None` when
/// nothing matches.
fn parse_explanation_machine(detail: ElementRef) -> Option<Explanation> {
    get_text(detail, "#fanyiToggle > div > p:nth-child(2)")
        .into_iter()
        .next()
        .map(|translated| Explanation {
            tag: ExpTag::Machine,
            items: vec![translated],
        })
}

/// Collects the titles of the web-translation section into one `ExpTag::Web` entry.
///
/// Every title has its whitespace collapsed to single spaces. Exactly one explanation
/// is always returned, even when no title is found (its item list is then empty).
fn parse_explanation_web(detail: ElementRef) -> Vec<Explanation> {
    let items = get_text(detail, "#tWebTrans > div.wt-container > .title")
        .into_iter()
        .map(|title| title.split_whitespace().join(" "))
        .collect();
    let web = Explanation {
        tag: ExpTag::Web,
        items,
    };
    vec![web]
}