use crate::traits::{DictParser, ValidationReport};
use dictx_core::{
clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Phrase,
RelatedWord, RelatedWordItem, Result, Synonym,
};
use serde::Deserialize;
use serde_json::json;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{BufRead, BufReader, Lines};
use std::path::Path;
/// Parser for Anki-style JSONL dictionary exports: one JSON object per line,
/// each describing a single headword (see `AnkiRawEntry` for the schema).
pub struct AnkiJsonlParser;
impl DictParser for AnkiJsonlParser {
    /// Human-readable parser name.
    fn name(&self) -> &'static str {
        "Anki JSONL"
    }

    /// Stable machine identifier for this input format.
    fn format_id(&self) -> &'static str {
        "anki-jsonl"
    }

    /// Cheap pre-flight check: the file must contain at least one non-blank
    /// line, and that line must deserialize as an `AnkiRawEntry`.
    ///
    /// Blank lines are tolerated here because `parse` skips them too; the
    /// previous implementation only looked at line 1 and reported an empty
    /// file when the content merely started with a blank line.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let file = File::open(path)?;
        let reader = BufReader::new(file);
        // Scan for the first non-blank line instead of assuming it is line 1.
        let mut first = None;
        for line in reader.lines() {
            let line = line?;
            if !line.trim().is_empty() {
                first = Some(line);
                break;
            }
        }
        let first = match first {
            Some(line) => line,
            None => return Ok(ValidationReport::invalid(self.format_id(), "文件为空")),
        };
        // Malformed JSON on the first entry propagates as an error,
        // matching the original behavior.
        serde_json::from_str::<AnkiRawEntry>(first.trim())?;
        Ok(ValidationReport::ok(
            self.format_id(),
            count_lines(path).ok(),
        ))
    }

    /// Build a lazy, streaming iterator over the file's entries.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let file = File::open(path)?;
        let reader = BufReader::new(file);
        Ok(Box::new(AnkiIter {
            lines: reader.lines(),
        }))
    }
}
/// Streaming iterator over the entries of an Anki JSONL file.
struct AnkiIter {
    // Buffered line reader over the source file, consumed lazily.
    lines: Lines<BufReader<File>>,
}
impl Iterator for AnkiIter {
    type Item = Result<DictEntry>;

    /// Yield the next dictionary entry, silently skipping blank lines.
    /// I/O failures and malformed JSON are surfaced as `Err` items.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let text = match self.lines.next()? {
                Ok(text) => text,
                Err(err) => return Some(Err(err.into())),
            };
            if text.trim().is_empty() {
                continue;
            }
            let parsed = serde_json::from_str::<AnkiRawEntry>(&text)
                .map_err(Into::into)
                .and_then(AnkiRawEntry::into_entry);
            return Some(parsed);
        }
    }
}
/// One raw line of the Anki JSONL export, deserialized as-is from JSON.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct AnkiRawEntry {
    word_rank: Option<u32>,
    head_word: String,
    book_id: Option<String>,
    content: Option<OuterContent>,
}

/// Wrapper object holding the nested `word` node.
#[derive(Debug, Deserialize)]
struct OuterContent {
    word: Option<WordNode>,
}

/// The `word` node: identifiers plus the dictionary payload.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct WordNode {
    word_id: Option<String>,
    word_head: Option<String>,
    content: Option<WordContent>,
}

/// Dictionary payload for one headword: phonetics, translations,
/// sentences, synonyms, phrases, related words and mnemonic data.
#[derive(Debug, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct WordContent {
    usphone: Option<String>,
    ukphone: Option<String>,
    trans: Option<Vec<Trans>>,
    sentence: Option<SentenceBlock>,
    syno: Option<SynoBlock>,
    phrase: Option<PhraseBlock>,
    rel_word: Option<RelWordBlock>,
    // Free-form payload; shape varies between exports, so keep it untyped.
    rem_method: Option<serde_json::Value>,
}

/// One translation: Chinese gloss, other-language gloss, part of speech.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Trans {
    tran_cn: Option<String>,
    tran_other: Option<String>,
    pos: Option<String>,
}

#[derive(Debug, Deserialize)]
struct SentenceBlock {
    sentences: Option<Vec<SentenceRaw>>,
}

/// One example sentence (content + Chinese translation).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SentenceRaw {
    s_content: Option<String>,
    s_cn: Option<String>,
}

#[derive(Debug, Deserialize)]
struct SynoBlock {
    synos: Option<Vec<SynoRaw>>,
}

/// One synonym group: part of speech, gloss and the synonym headwords.
#[derive(Debug, Deserialize)]
struct SynoRaw {
    pos: Option<String>,
    tran: Option<String>,
    hwds: Option<Vec<SynoWordRaw>>,
}

#[derive(Debug, Deserialize)]
struct SynoWordRaw {
    w: Option<String>,
}

#[derive(Debug, Deserialize)]
struct PhraseBlock {
    phrases: Option<Vec<PhraseRaw>>,
}

/// One phrase (content + Chinese translation).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct PhraseRaw {
    p_content: Option<String>,
    p_cn: Option<String>,
}

#[derive(Debug, Deserialize)]
struct RelWordBlock {
    rels: Option<Vec<RelRaw>>,
}

/// One group of related words sharing a part of speech.
#[derive(Debug, Deserialize)]
struct RelRaw {
    pos: Option<String>,
    words: Option<Vec<RelWordRaw>>,
}

#[derive(Debug, Deserialize)]
struct RelWordRaw {
    hwd: Option<String>,
    tran: Option<String>,
}

impl AnkiRawEntry {
    /// Convert a raw JSONL record into a normalized `DictEntry`.
    ///
    /// A missing `bookId` falls back to `"anki"`; a missing `word` node
    /// falls back to the top-level `headWord` and an empty payload.
    fn into_entry(self) -> Result<DictEntry> {
        let book_id = self.book_id.unwrap_or_else(|| "anki".to_string());
        // Destructure the word node by value so no field needs cloning.
        // (This also removes the need for the hand-written `Clone` impls the
        // raw structs used to carry solely for one `.cloned()` call.)
        let (word_id, word_head, word_content) =
            match self.content.and_then(|content| content.word) {
                Some(node) => (
                    node.word_id,
                    node.word_head,
                    node.content.unwrap_or_default(),
                ),
                None => (None, None, WordContent::default()),
            };
        let word = word_head.unwrap_or(self.head_word);
        let mut entry = DictEntry::new(
            DictSource::Anki {
                deck_name: book_id.clone(),
            },
            clean_text(word),
        );
        // A stable id is only possible when the source provides a word id;
        // otherwise keep whatever default id `DictEntry::new` assigned.
        if let Some(word_id) = word_id {
            entry.id = format!("anki:{}:{}", book_id, word_id);
        }
        entry.phonetic_us = clean_optional(word_content.usphone);
        entry.phonetic_uk = clean_optional(word_content.ukphone);
        entry.definitions = parse_trans(word_content.trans.unwrap_or_default());
        entry.pos = collect_pos(&entry.definitions);
        // NOTE(review): the "kao_yan" tag is hard-coded for every deck —
        // confirm this is intended for non-KaoYan books as well.
        entry.tags = vec![normalize_tag("kao_yan"), book_id.to_ascii_lowercase()];
        entry.examples = parse_examples(word_content.sentence);
        entry.synonyms = parse_synonyms(word_content.syno);
        entry.phrases = parse_phrases(word_content.phrase);
        entry.related_words = parse_related(word_content.rel_word);
        entry.mnemonic = parse_mnemonic(word_content.rem_method);
        entry.extra = json!({
            "rank": self.word_rank,
            "book_id": book_id,
        });
        Ok(entry)
    }
}
/// Count the non-blank lines of `path`, i.e. the number of entries the
/// parser would attempt to read (blank lines are skipped by `AnkiIter`).
///
/// The previous version counted every line — including blank lines and even
/// failed reads — so the entry-count estimate could overstate. I/O errors
/// are now propagated; `validate` turns them into `None` via `.ok()`.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let file = File::open(path)?;
    let mut count = 0;
    for line in BufReader::new(file).lines() {
        if !line?.trim().is_empty() {
            count += 1;
        }
    }
    Ok(count)
}
fn clean_optional(value: Option<String>) -> Option<String> {
value
.map(clean_text)
.filter(|value| !value.trim().is_empty())
}
/// Convert raw translations into definitions, dropping entries where both
/// the Chinese and the other-language gloss are empty after cleaning.
fn parse_trans(trans: Vec<Trans>) -> Vec<Definition> {
    let mut definitions = Vec::new();
    for raw in trans {
        let zh = clean_optional(raw.tran_cn).unwrap_or_default();
        let en = clean_optional(raw.tran_other).unwrap_or_default();
        if zh.is_empty() && en.is_empty() {
            continue;
        }
        definitions.push(Definition::new(en, zh, raw.pos.map(clean_pos)));
    }
    definitions
}
/// Collect the distinct part-of-speech labels across all definitions,
/// returned sorted and deduplicated (via `BTreeSet` ordering).
fn collect_pos(definitions: &[Definition]) -> Vec<String> {
    let unique: BTreeSet<String> = definitions
        .iter()
        .filter_map(|definition| definition.pos.clone())
        .collect();
    unique.into_iter().collect()
}
/// Convert the raw sentence block into examples, dropping entries that are
/// empty in both languages after cleaning.
fn parse_examples(block: Option<SentenceBlock>) -> Vec<Example> {
    let sentences = block.and_then(|block| block.sentences).unwrap_or_default();
    let mut examples = Vec::new();
    for raw in sentences {
        let en = clean_optional(raw.s_content).unwrap_or_default();
        let zh = clean_optional(raw.s_cn).unwrap_or_default();
        if !(en.is_empty() && zh.is_empty()) {
            examples.push(Example { en, zh });
        }
    }
    examples
}
/// Convert the raw synonym block, keeping only groups that still contain at
/// least one non-empty headword after cleaning.
fn parse_synonyms(block: Option<SynoBlock>) -> Vec<Synonym> {
    let synos = block.and_then(|block| block.synos).unwrap_or_default();
    let mut synonyms = Vec::new();
    for raw in synos {
        let words: Vec<String> = raw
            .hwds
            .unwrap_or_default()
            .into_iter()
            .filter_map(|word| clean_optional(word.w))
            .collect();
        if words.is_empty() {
            continue;
        }
        synonyms.push(Synonym {
            pos: raw.pos.map(clean_pos),
            zh_meaning: clean_optional(raw.tran).unwrap_or_default(),
            words,
        });
    }
    synonyms
}
/// Convert the raw phrase block into phrases, dropping entries that are
/// empty in both languages after cleaning.
fn parse_phrases(block: Option<PhraseBlock>) -> Vec<Phrase> {
    let phrases = block.and_then(|block| block.phrases).unwrap_or_default();
    let mut result = Vec::new();
    for raw in phrases {
        let en = clean_optional(raw.p_content).unwrap_or_default();
        let zh = clean_optional(raw.p_cn).unwrap_or_default();
        if !(en.is_empty() && zh.is_empty()) {
            result.push(Phrase { en, zh });
        }
    }
    result
}
/// Convert the raw related-word block. Items without a headword are dropped;
/// groups left with no items are dropped entirely.
fn parse_related(block: Option<RelWordBlock>) -> Vec<RelatedWord> {
    let rels = block.and_then(|block| block.rels).unwrap_or_default();
    let mut related = Vec::new();
    for raw in rels {
        let mut words = Vec::new();
        for word in raw.words.unwrap_or_default() {
            let head = clean_optional(word.hwd).unwrap_or_default();
            if head.is_empty() {
                continue;
            }
            words.push(RelatedWordItem {
                word: head,
                translation: clean_optional(word.tran).unwrap_or_default(),
            });
        }
        if !words.is_empty() {
            related.push(RelatedWord {
                pos: raw.pos.map(clean_pos).unwrap_or_default(),
                words,
            });
        }
    }
    related
}
/// Extract a mnemonic string from the free-form `remMethod` payload.
/// Accepts either a bare JSON string or an object keyed by one of
/// "val" / "value" / "text" (first match wins).
fn parse_mnemonic(value: Option<serde_json::Value>) -> Option<String> {
    let value = value?;
    let text = value.as_str().or_else(|| {
        ["val", "value", "text"]
            .iter()
            .copied()
            .find_map(|key| value.get(key).and_then(serde_json::Value::as_str))
    })?;
    clean_optional(Some(text.to_string()))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    // End-to-end smoke test: write one JSONL record (KaoYan-style schema) to
    // a temp file, then verify the parser recovers the headword, the first
    // definition/example, and the normalized tags.
    #[test]
    fn parses_anki_jsonl_entry() {
        let mut file = tempfile::NamedTempFile::new().unwrap();
        writeln!(
            file,
            r#"{{"wordRank":1,"headWord":"cancel","content":{{"word":{{"wordHead":"cancel","wordId":"KaoYan_3_1","content":{{"usphone":"'kænsl","ukphone":"'kænsl","trans":[{{"tranCn":"取消","pos":"vt","tranOther":"to decide something will not happen"}}],"sentence":{{"sentences":[{{"sContent":"Cancel it.","sCn":"取消它。"}}]}},"phrase":{{"phrases":[{{"pContent":"cancel out","pCn":"抵消"}}]}}}}}}}},"bookId":"KaoYan_3"}}"#
        )
        .unwrap();
        let parser = AnkiJsonlParser;
        // Collecting through Result<Vec<_>> fails the test on any parse error.
        let entries = parser
            .parse(file.path())
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].word, "cancel");
        assert_eq!(entries[0].definitions[0].zh, "取消");
        assert_eq!(entries[0].examples[0].zh, "取消它。");
        assert!(entries[0].tags.contains(&"kao_yan".to_string()));
    }
}
}