//! Anki JSONL dictionary parser (dictx_parser/anki.rs).
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{
    clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Phrase,
    RelatedWord, RelatedWordItem, Result, Synonym,
};
use serde::Deserialize;
use serde_json::json;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{BufRead, BufReader, Lines};
use std::path::Path;

/// Parser for the Anki JSONL export format: one JSON object per line.
pub struct AnkiJsonlParser;

15impl DictParser for AnkiJsonlParser {
16    fn name(&self) -> &'static str {
17        "Anki JSONL"
18    }
19
20    fn format_id(&self) -> &'static str {
21        "anki-jsonl"
22    }
23
24    fn validate(&self, path: &Path) -> Result<ValidationReport> {
25        let file = File::open(path)?;
26        let mut reader = BufReader::new(file);
27        let mut first = String::new();
28        reader.read_line(&mut first)?;
29        if first.trim().is_empty() {
30            return Ok(ValidationReport::invalid(self.format_id(), "文件为空"));
31        }
32        serde_json::from_str::<AnkiRawEntry>(first.trim())?;
33        Ok(ValidationReport::ok(
34            self.format_id(),
35            count_lines(path).ok(),
36        ))
37    }
38
39    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
40        let file = File::open(path)?;
41        let reader = BufReader::new(file);
42        Ok(Box::new(AnkiIter {
43            lines: reader.lines(),
44        }))
45    }
46}
47
/// Streaming iterator over the JSONL file: yields one parsed `DictEntry`
/// (or error) per non-blank line.
struct AnkiIter {
    // Line iterator over the opened dictionary file.
    lines: Lines<BufReader<File>>,
}

52impl Iterator for AnkiIter {
53    type Item = Result<DictEntry>;
54
55    fn next(&mut self) -> Option<Self::Item> {
56        for line in self.lines.by_ref() {
57            match line {
58                Ok(line) if line.trim().is_empty() => continue,
59                Ok(line) => {
60                    return Some(
61                        serde_json::from_str::<AnkiRawEntry>(&line)
62                            .map_err(Into::into)
63                            .and_then(AnkiRawEntry::into_entry),
64                    );
65                }
66                Err(err) => return Some(Err(err.into())),
67            }
68        }
69        None
70    }
71}
72
73#[derive(Debug, Deserialize)]
74#[serde(rename_all = "camelCase")]
75struct AnkiRawEntry {
76    word_rank: Option<u32>,
77    head_word: String,
78    book_id: Option<String>,
79    content: Option<OuterContent>,
80}
81
82#[derive(Debug, Deserialize)]
83struct OuterContent {
84    word: Option<WordNode>,
85}
86
87#[derive(Debug, Deserialize)]
88#[serde(rename_all = "camelCase")]
89struct WordNode {
90    word_id: Option<String>,
91    word_head: Option<String>,
92    content: Option<WordContent>,
93}
94
95#[derive(Debug, Deserialize, Default)]
96#[serde(rename_all = "camelCase")]
97struct WordContent {
98    usphone: Option<String>,
99    ukphone: Option<String>,
100    trans: Option<Vec<Trans>>,
101    sentence: Option<SentenceBlock>,
102    syno: Option<SynoBlock>,
103    phrase: Option<PhraseBlock>,
104    rel_word: Option<RelWordBlock>,
105    rem_method: Option<serde_json::Value>,
106}
107
108#[derive(Debug, Deserialize)]
109#[serde(rename_all = "camelCase")]
110struct Trans {
111    tran_cn: Option<String>,
112    tran_other: Option<String>,
113    pos: Option<String>,
114}
115
116#[derive(Debug, Deserialize)]
117struct SentenceBlock {
118    sentences: Option<Vec<SentenceRaw>>,
119}
120
121#[derive(Debug, Deserialize)]
122#[serde(rename_all = "camelCase")]
123struct SentenceRaw {
124    s_content: Option<String>,
125    s_cn: Option<String>,
126}
127
128#[derive(Debug, Deserialize)]
129struct SynoBlock {
130    synos: Option<Vec<SynoRaw>>,
131}
132
133#[derive(Debug, Deserialize)]
134struct SynoRaw {
135    pos: Option<String>,
136    tran: Option<String>,
137    hwds: Option<Vec<SynoWordRaw>>,
138}
139
140#[derive(Debug, Deserialize)]
141struct SynoWordRaw {
142    w: Option<String>,
143}
144
145#[derive(Debug, Deserialize)]
146struct PhraseBlock {
147    phrases: Option<Vec<PhraseRaw>>,
148}
149
150#[derive(Debug, Deserialize)]
151#[serde(rename_all = "camelCase")]
152struct PhraseRaw {
153    p_content: Option<String>,
154    p_cn: Option<String>,
155}
156
157#[derive(Debug, Deserialize)]
158struct RelWordBlock {
159    rels: Option<Vec<RelRaw>>,
160}
161
162#[derive(Debug, Deserialize)]
163struct RelRaw {
164    pos: Option<String>,
165    words: Option<Vec<RelWordRaw>>,
166}
167
168#[derive(Debug, Deserialize)]
169struct RelWordRaw {
170    hwd: Option<String>,
171    tran: Option<String>,
172}
173
174impl AnkiRawEntry {
175    fn into_entry(self) -> Result<DictEntry> {
176        let book_id = self.book_id.unwrap_or_else(|| "anki".to_string());
177        let word_node = self.content.and_then(|content| content.word);
178        let word_content = word_node
179            .as_ref()
180            .and_then(|word| word.content.as_ref())
181            .cloned()
182            .unwrap_or_default();
183        let word = word_node
184            .as_ref()
185            .and_then(|node| node.word_head.clone())
186            .unwrap_or(self.head_word);
187
188        let mut entry = DictEntry::new(
189            DictSource::Anki {
190                deck_name: book_id.clone(),
191            },
192            clean_text(word),
193        );
194
195        if let Some(word_id) = word_node.and_then(|node| node.word_id) {
196            entry.id = format!("anki:{}:{}", book_id, word_id);
197        }
198
199        entry.phonetic_us = clean_optional(word_content.usphone);
200        entry.phonetic_uk = clean_optional(word_content.ukphone);
201        entry.definitions = parse_trans(word_content.trans.unwrap_or_default());
202        entry.pos = collect_pos(&entry.definitions);
203        entry.tags = vec![normalize_tag("kao_yan"), book_id.to_ascii_lowercase()];
204        entry.examples = parse_examples(word_content.sentence);
205        entry.synonyms = parse_synonyms(word_content.syno);
206        entry.phrases = parse_phrases(word_content.phrase);
207        entry.related_words = parse_related(word_content.rel_word);
208        entry.mnemonic = parse_mnemonic(word_content.rem_method);
209        entry.extra = json!({
210            "rank": self.word_rank,
211            "book_id": book_id,
212        });
213
214        Ok(entry)
215    }
216}
217
218impl Clone for WordContent {
219    fn clone(&self) -> Self {
220        Self {
221            usphone: self.usphone.clone(),
222            ukphone: self.ukphone.clone(),
223            trans: self.trans.clone(),
224            sentence: self.sentence.clone(),
225            syno: self.syno.clone(),
226            phrase: self.phrase.clone(),
227            rel_word: self.rel_word.clone(),
228            rem_method: self.rem_method.clone(),
229        }
230    }
231}
232
233impl Clone for Trans {
234    fn clone(&self) -> Self {
235        Self {
236            tran_cn: self.tran_cn.clone(),
237            tran_other: self.tran_other.clone(),
238            pos: self.pos.clone(),
239        }
240    }
241}
242
243impl Clone for SentenceBlock {
244    fn clone(&self) -> Self {
245        Self {
246            sentences: self.sentences.clone(),
247        }
248    }
249}
250
251impl Clone for SentenceRaw {
252    fn clone(&self) -> Self {
253        Self {
254            s_content: self.s_content.clone(),
255            s_cn: self.s_cn.clone(),
256        }
257    }
258}
259
260impl Clone for SynoBlock {
261    fn clone(&self) -> Self {
262        Self {
263            synos: self.synos.clone(),
264        }
265    }
266}
267
268impl Clone for SynoRaw {
269    fn clone(&self) -> Self {
270        Self {
271            pos: self.pos.clone(),
272            tran: self.tran.clone(),
273            hwds: self.hwds.clone(),
274        }
275    }
276}
277
278impl Clone for SynoWordRaw {
279    fn clone(&self) -> Self {
280        Self { w: self.w.clone() }
281    }
282}
283
284impl Clone for PhraseBlock {
285    fn clone(&self) -> Self {
286        Self {
287            phrases: self.phrases.clone(),
288        }
289    }
290}
291
292impl Clone for PhraseRaw {
293    fn clone(&self) -> Self {
294        Self {
295            p_content: self.p_content.clone(),
296            p_cn: self.p_cn.clone(),
297        }
298    }
299}
300
301impl Clone for RelWordBlock {
302    fn clone(&self) -> Self {
303        Self {
304            rels: self.rels.clone(),
305        }
306    }
307}
308
309impl Clone for RelRaw {
310    fn clone(&self) -> Self {
311        Self {
312            pos: self.pos.clone(),
313            words: self.words.clone(),
314        }
315    }
316}
317
318impl Clone for RelWordRaw {
319    fn clone(&self) -> Self {
320        Self {
321            hwd: self.hwd.clone(),
322            tran: self.tran.clone(),
323        }
324    }
325}
326
/// Counts the non-blank lines in `path`.
///
/// Blank lines are excluded because the parser skips them, so this matches
/// the number of records the iterator will actually yield (the previous
/// version counted blanks too, overstating the estimate). A single buffer
/// is reused instead of allocating a `String` per line.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let mut reader = BufReader::new(File::open(path)?);
    let mut count = 0;
    let mut buf = String::new();
    loop {
        buf.clear();
        if reader.read_line(&mut buf)? == 0 {
            break;
        }
        if !buf.trim().is_empty() {
            count += 1;
        }
    }
    Ok(count)
}

332fn clean_optional(value: Option<String>) -> Option<String> {
333    value
334        .map(clean_text)
335        .filter(|value| !value.trim().is_empty())
336}
337
338fn parse_trans(trans: Vec<Trans>) -> Vec<Definition> {
339    trans
340        .into_iter()
341        .filter_map(|item| {
342            let zh = clean_optional(item.tran_cn).unwrap_or_default();
343            let en = clean_optional(item.tran_other).unwrap_or_default();
344            let pos = item.pos.map(clean_pos);
345            if zh.is_empty() && en.is_empty() {
346                None
347            } else {
348                Some(Definition::new(en, zh, pos))
349            }
350        })
351        .collect()
352}
353
354fn collect_pos(definitions: &[Definition]) -> Vec<String> {
355    let mut set = BTreeSet::new();
356    for definition in definitions {
357        if let Some(pos) = &definition.pos {
358            set.insert(pos.clone());
359        }
360    }
361    set.into_iter().collect()
362}
363
364fn parse_examples(block: Option<SentenceBlock>) -> Vec<Example> {
365    block
366        .and_then(|block| block.sentences)
367        .unwrap_or_default()
368        .into_iter()
369        .filter_map(|item| {
370            let en = clean_optional(item.s_content).unwrap_or_default();
371            let zh = clean_optional(item.s_cn).unwrap_or_default();
372            if en.is_empty() && zh.is_empty() {
373                None
374            } else {
375                Some(Example { en, zh })
376            }
377        })
378        .collect()
379}
380
381fn parse_synonyms(block: Option<SynoBlock>) -> Vec<Synonym> {
382    block
383        .and_then(|block| block.synos)
384        .unwrap_or_default()
385        .into_iter()
386        .filter_map(|item| {
387            let words: Vec<String> = item
388                .hwds
389                .unwrap_or_default()
390                .into_iter()
391                .filter_map(|word| clean_optional(word.w))
392                .collect();
393            if words.is_empty() {
394                None
395            } else {
396                Some(Synonym {
397                    pos: item.pos.map(clean_pos),
398                    zh_meaning: clean_optional(item.tran).unwrap_or_default(),
399                    words,
400                })
401            }
402        })
403        .collect()
404}
405
406fn parse_phrases(block: Option<PhraseBlock>) -> Vec<Phrase> {
407    block
408        .and_then(|block| block.phrases)
409        .unwrap_or_default()
410        .into_iter()
411        .filter_map(|item| {
412            let en = clean_optional(item.p_content).unwrap_or_default();
413            let zh = clean_optional(item.p_cn).unwrap_or_default();
414            if en.is_empty() && zh.is_empty() {
415                None
416            } else {
417                Some(Phrase { en, zh })
418            }
419        })
420        .collect()
421}
422
423fn parse_related(block: Option<RelWordBlock>) -> Vec<RelatedWord> {
424    block
425        .and_then(|block| block.rels)
426        .unwrap_or_default()
427        .into_iter()
428        .filter_map(|item| {
429            let words: Vec<RelatedWordItem> = item
430                .words
431                .unwrap_or_default()
432                .into_iter()
433                .filter_map(|word| {
434                    let item = RelatedWordItem {
435                        word: clean_optional(word.hwd).unwrap_or_default(),
436                        translation: clean_optional(word.tran).unwrap_or_default(),
437                    };
438                    if item.word.is_empty() {
439                        None
440                    } else {
441                        Some(item)
442                    }
443                })
444                .collect();
445            if words.is_empty() {
446                None
447            } else {
448                Some(RelatedWord {
449                    pos: item.pos.map(clean_pos).unwrap_or_default(),
450                    words,
451                })
452            }
453        })
454        .collect()
455}
456
457fn parse_mnemonic(value: Option<serde_json::Value>) -> Option<String> {
458    let value = value?;
459    if let Some(text) = value.as_str() {
460        return clean_optional(Some(text.to_string()));
461    }
462    for key in ["val", "value", "text"] {
463        if let Some(text) = value.get(key).and_then(|value| value.as_str()) {
464            return clean_optional(Some(text.to_string()));
465        }
466    }
467    None
468}
469
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    // End-to-end check: one real-world JSONL record parses into a single
    // entry with the head word, definition, example, and tags populated.
    #[test]
    fn parses_anki_jsonl_entry() {
        let mut file = tempfile::NamedTempFile::new().unwrap();
        writeln!(
            file,
            r#"{{"wordRank":1,"headWord":"cancel","content":{{"word":{{"wordHead":"cancel","wordId":"KaoYan_3_1","content":{{"usphone":"'kænsl","ukphone":"'kænsl","trans":[{{"tranCn":"取消","pos":"vt","tranOther":"to decide something will not happen"}}],"sentence":{{"sentences":[{{"sContent":"Cancel it.","sCn":"取消它。"}}]}},"phrase":{{"phrases":[{{"pContent":"cancel out","pCn":"抵消"}}]}}}}}}}},"bookId":"KaoYan_3"}}"#
        )
        .unwrap();

        let parser = AnkiJsonlParser;
        // Collect the lazy iterator, failing the test on any parse error.
        let entries = parser
            .parse(file.path())
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();

        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].word, "cancel");
        assert_eq!(entries[0].definitions[0].zh, "取消");
        assert_eq!(entries[0].examples[0].zh, "取消它。");
        assert!(entries[0].tags.contains(&"kao_yan".to_string()));
    }
}
497}