1use crate::traits::{DictParser, ValidationReport};
2use dictx_core::{
3 clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Phrase,
4 RelatedWord, RelatedWordItem, Result, Synonym,
5};
6use serde::Deserialize;
7use serde_json::json;
8use std::collections::BTreeSet;
9use std::fs::File;
10use std::io::{BufRead, BufReader, Lines};
11use std::path::Path;
12
/// Parser for Anki-style JSONL dictionary exports: one JSON object per line.
pub struct AnkiJsonlParser;
14
15impl DictParser for AnkiJsonlParser {
16 fn name(&self) -> &'static str {
17 "Anki JSONL"
18 }
19
20 fn format_id(&self) -> &'static str {
21 "anki-jsonl"
22 }
23
24 fn validate(&self, path: &Path) -> Result<ValidationReport> {
25 let file = File::open(path)?;
26 let mut reader = BufReader::new(file);
27 let mut first = String::new();
28 reader.read_line(&mut first)?;
29 if first.trim().is_empty() {
30 return Ok(ValidationReport::invalid(self.format_id(), "文件为空"));
31 }
32 serde_json::from_str::<AnkiRawEntry>(first.trim())?;
33 Ok(ValidationReport::ok(
34 self.format_id(),
35 count_lines(path).ok(),
36 ))
37 }
38
39 fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
40 let file = File::open(path)?;
41 let reader = BufReader::new(file);
42 Ok(Box::new(AnkiIter {
43 lines: reader.lines(),
44 }))
45 }
46}
47
/// Streaming iterator over one JSONL file; yields entries lazily so a large
/// deck never has to be held in memory at once.
struct AnkiIter {
    lines: Lines<BufReader<File>>,
}
51
52impl Iterator for AnkiIter {
53 type Item = Result<DictEntry>;
54
55 fn next(&mut self) -> Option<Self::Item> {
56 for line in self.lines.by_ref() {
57 match line {
58 Ok(line) if line.trim().is_empty() => continue,
59 Ok(line) => {
60 return Some(
61 serde_json::from_str::<AnkiRawEntry>(&line)
62 .map_err(Into::into)
63 .and_then(AnkiRawEntry::into_entry),
64 );
65 }
66 Err(err) => return Some(Err(err.into())),
67 }
68 }
69 None
70 }
71}
72
/// One line of the Anki JSONL export, mirroring its camelCase JSON schema.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct AnkiRawEntry {
    /// Frequency rank within the deck, when present.
    word_rank: Option<u32>,
    /// Top-level headword; fallback when the nested `wordHead` is absent.
    head_word: String,
    /// Deck/book identifier; `into_entry` defaults it to "anki" when missing.
    book_id: Option<String>,
    /// Nested payload carrying the full word record.
    content: Option<OuterContent>,
}
81
/// Wrapper level around the word node (the export's outer "content" key).
#[derive(Debug, Deserialize)]
struct OuterContent {
    word: Option<WordNode>,
}

/// "word" node: identifier, display head and the detail payload.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct WordNode {
    word_id: Option<String>,
    word_head: Option<String>,
    content: Option<WordContent>,
}

/// Detail payload: phonetics, translations, sentences, synonyms, phrases,
/// related words and the mnemonic. Everything is optional in the export.
#[derive(Debug, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct WordContent {
    usphone: Option<String>,
    ukphone: Option<String>,
    trans: Option<Vec<Trans>>,
    sentence: Option<SentenceBlock>,
    syno: Option<SynoBlock>,
    phrase: Option<PhraseBlock>,
    rel_word: Option<RelWordBlock>,
    // Free-form: `parse_mnemonic` accepts a plain string or an object
    // exposing one of the keys "val" / "value" / "text".
    rem_method: Option<serde_json::Value>,
}
107
/// Single translation: Chinese gloss, optional English gloss, optional POS.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Trans {
    tran_cn: Option<String>,
    tran_other: Option<String>,
    pos: Option<String>,
}

/// Container for example sentences.
#[derive(Debug, Deserialize)]
struct SentenceBlock {
    sentences: Option<Vec<SentenceRaw>>,
}

/// One example sentence: English text plus Chinese translation.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SentenceRaw {
    s_content: Option<String>,
    s_cn: Option<String>,
}

/// Container for synonym groups.
#[derive(Debug, Deserialize)]
struct SynoBlock {
    synos: Option<Vec<SynoRaw>>,
}

/// One synonym group: shared POS and meaning plus the member headwords.
#[derive(Debug, Deserialize)]
struct SynoRaw {
    pos: Option<String>,
    tran: Option<String>,
    hwds: Option<Vec<SynoWordRaw>>,
}

/// Single synonym headword (the export's "w" key).
#[derive(Debug, Deserialize)]
struct SynoWordRaw {
    w: Option<String>,
}
144
/// Container for phrases.
#[derive(Debug, Deserialize)]
struct PhraseBlock {
    phrases: Option<Vec<PhraseRaw>>,
}

/// One phrase: English form plus Chinese translation.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct PhraseRaw {
    p_content: Option<String>,
    p_cn: Option<String>,
}

/// Container for related-word groups.
#[derive(Debug, Deserialize)]
struct RelWordBlock {
    rels: Option<Vec<RelRaw>>,
}

/// Related words sharing one part of speech.
#[derive(Debug, Deserialize)]
struct RelRaw {
    pos: Option<String>,
    words: Option<Vec<RelWordRaw>>,
}

/// Single related word: headword plus translation.
#[derive(Debug, Deserialize)]
struct RelWordRaw {
    hwd: Option<String>,
    tran: Option<String>,
}
173
174impl AnkiRawEntry {
175 fn into_entry(self) -> Result<DictEntry> {
176 let book_id = self.book_id.unwrap_or_else(|| "anki".to_string());
177 let word_node = self.content.and_then(|content| content.word);
178 let word_content = word_node
179 .as_ref()
180 .and_then(|word| word.content.as_ref())
181 .cloned()
182 .unwrap_or_default();
183 let word = word_node
184 .as_ref()
185 .and_then(|node| node.word_head.clone())
186 .unwrap_or(self.head_word);
187
188 let mut entry = DictEntry::new(
189 DictSource::Anki {
190 deck_name: book_id.clone(),
191 },
192 clean_text(word),
193 );
194
195 if let Some(word_id) = word_node.and_then(|node| node.word_id) {
196 entry.id = format!("anki:{}:{}", book_id, word_id);
197 }
198
199 entry.phonetic_us = clean_optional(word_content.usphone);
200 entry.phonetic_uk = clean_optional(word_content.ukphone);
201 entry.definitions = parse_trans(word_content.trans.unwrap_or_default());
202 entry.pos = collect_pos(&entry.definitions);
203 entry.tags = vec![normalize_tag("kao_yan"), book_id.to_ascii_lowercase()];
204 entry.examples = parse_examples(word_content.sentence);
205 entry.synonyms = parse_synonyms(word_content.syno);
206 entry.phrases = parse_phrases(word_content.phrase);
207 entry.related_words = parse_related(word_content.rel_word);
208 entry.mnemonic = parse_mnemonic(word_content.rem_method);
209 entry.extra = json!({
210 "rank": self.word_rank,
211 "book_id": book_id,
212 });
213
214 Ok(entry)
215 }
216}
217
218impl Clone for WordContent {
219 fn clone(&self) -> Self {
220 Self {
221 usphone: self.usphone.clone(),
222 ukphone: self.ukphone.clone(),
223 trans: self.trans.clone(),
224 sentence: self.sentence.clone(),
225 syno: self.syno.clone(),
226 phrase: self.phrase.clone(),
227 rel_word: self.rel_word.clone(),
228 rem_method: self.rem_method.clone(),
229 }
230 }
231}
232
233impl Clone for Trans {
234 fn clone(&self) -> Self {
235 Self {
236 tran_cn: self.tran_cn.clone(),
237 tran_other: self.tran_other.clone(),
238 pos: self.pos.clone(),
239 }
240 }
241}
242
243impl Clone for SentenceBlock {
244 fn clone(&self) -> Self {
245 Self {
246 sentences: self.sentences.clone(),
247 }
248 }
249}
250
251impl Clone for SentenceRaw {
252 fn clone(&self) -> Self {
253 Self {
254 s_content: self.s_content.clone(),
255 s_cn: self.s_cn.clone(),
256 }
257 }
258}
259
260impl Clone for SynoBlock {
261 fn clone(&self) -> Self {
262 Self {
263 synos: self.synos.clone(),
264 }
265 }
266}
267
268impl Clone for SynoRaw {
269 fn clone(&self) -> Self {
270 Self {
271 pos: self.pos.clone(),
272 tran: self.tran.clone(),
273 hwds: self.hwds.clone(),
274 }
275 }
276}
277
278impl Clone for SynoWordRaw {
279 fn clone(&self) -> Self {
280 Self { w: self.w.clone() }
281 }
282}
283
284impl Clone for PhraseBlock {
285 fn clone(&self) -> Self {
286 Self {
287 phrases: self.phrases.clone(),
288 }
289 }
290}
291
292impl Clone for PhraseRaw {
293 fn clone(&self) -> Self {
294 Self {
295 p_content: self.p_content.clone(),
296 p_cn: self.p_cn.clone(),
297 }
298 }
299}
300
301impl Clone for RelWordBlock {
302 fn clone(&self) -> Self {
303 Self {
304 rels: self.rels.clone(),
305 }
306 }
307}
308
309impl Clone for RelRaw {
310 fn clone(&self) -> Self {
311 Self {
312 pos: self.pos.clone(),
313 words: self.words.clone(),
314 }
315 }
316}
317
318impl Clone for RelWordRaw {
319 fn clone(&self) -> Self {
320 Self {
321 hwd: self.hwd.clone(),
322 tran: self.tran.clone(),
323 }
324 }
325}
326
/// Count newline-delimited records in `path` with a single buffered pass.
///
/// Returns `Err` only when the file cannot be opened; per-line read errors
/// are still tallied by `count`, matching a plain line count.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let reader = BufReader::new(File::open(path)?);
    Ok(reader.lines().count())
}
331
332fn clean_optional(value: Option<String>) -> Option<String> {
333 value
334 .map(clean_text)
335 .filter(|value| !value.trim().is_empty())
336}
337
338fn parse_trans(trans: Vec<Trans>) -> Vec<Definition> {
339 trans
340 .into_iter()
341 .filter_map(|item| {
342 let zh = clean_optional(item.tran_cn).unwrap_or_default();
343 let en = clean_optional(item.tran_other).unwrap_or_default();
344 let pos = item.pos.map(clean_pos);
345 if zh.is_empty() && en.is_empty() {
346 None
347 } else {
348 Some(Definition::new(en, zh, pos))
349 }
350 })
351 .collect()
352}
353
354fn collect_pos(definitions: &[Definition]) -> Vec<String> {
355 let mut set = BTreeSet::new();
356 for definition in definitions {
357 if let Some(pos) = &definition.pos {
358 set.insert(pos.clone());
359 }
360 }
361 set.into_iter().collect()
362}
363
364fn parse_examples(block: Option<SentenceBlock>) -> Vec<Example> {
365 block
366 .and_then(|block| block.sentences)
367 .unwrap_or_default()
368 .into_iter()
369 .filter_map(|item| {
370 let en = clean_optional(item.s_content).unwrap_or_default();
371 let zh = clean_optional(item.s_cn).unwrap_or_default();
372 if en.is_empty() && zh.is_empty() {
373 None
374 } else {
375 Some(Example { en, zh })
376 }
377 })
378 .collect()
379}
380
381fn parse_synonyms(block: Option<SynoBlock>) -> Vec<Synonym> {
382 block
383 .and_then(|block| block.synos)
384 .unwrap_or_default()
385 .into_iter()
386 .filter_map(|item| {
387 let words: Vec<String> = item
388 .hwds
389 .unwrap_or_default()
390 .into_iter()
391 .filter_map(|word| clean_optional(word.w))
392 .collect();
393 if words.is_empty() {
394 None
395 } else {
396 Some(Synonym {
397 pos: item.pos.map(clean_pos),
398 zh_meaning: clean_optional(item.tran).unwrap_or_default(),
399 words,
400 })
401 }
402 })
403 .collect()
404}
405
406fn parse_phrases(block: Option<PhraseBlock>) -> Vec<Phrase> {
407 block
408 .and_then(|block| block.phrases)
409 .unwrap_or_default()
410 .into_iter()
411 .filter_map(|item| {
412 let en = clean_optional(item.p_content).unwrap_or_default();
413 let zh = clean_optional(item.p_cn).unwrap_or_default();
414 if en.is_empty() && zh.is_empty() {
415 None
416 } else {
417 Some(Phrase { en, zh })
418 }
419 })
420 .collect()
421}
422
423fn parse_related(block: Option<RelWordBlock>) -> Vec<RelatedWord> {
424 block
425 .and_then(|block| block.rels)
426 .unwrap_or_default()
427 .into_iter()
428 .filter_map(|item| {
429 let words: Vec<RelatedWordItem> = item
430 .words
431 .unwrap_or_default()
432 .into_iter()
433 .filter_map(|word| {
434 let item = RelatedWordItem {
435 word: clean_optional(word.hwd).unwrap_or_default(),
436 translation: clean_optional(word.tran).unwrap_or_default(),
437 };
438 if item.word.is_empty() {
439 None
440 } else {
441 Some(item)
442 }
443 })
444 .collect();
445 if words.is_empty() {
446 None
447 } else {
448 Some(RelatedWord {
449 pos: item.pos.map(clean_pos).unwrap_or_default(),
450 words,
451 })
452 }
453 })
454 .collect()
455}
456
457fn parse_mnemonic(value: Option<serde_json::Value>) -> Option<String> {
458 let value = value?;
459 if let Some(text) = value.as_str() {
460 return clean_optional(Some(text.to_string()));
461 }
462 for key in ["val", "value", "text"] {
463 if let Some(text) = value.get(key).and_then(|value| value.as_str()) {
464 return clean_optional(Some(text.to_string()));
465 }
466 }
467 None
468}
469
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    // End-to-end check: one JSONL line round-trips into a DictEntry with the
    // headword, definition, example and the fixed "kao_yan" tag populated.
    #[test]
    fn parses_anki_jsonl_entry() {
        let mut file = tempfile::NamedTempFile::new().unwrap();
        writeln!(
            file,
            r#"{{"wordRank":1,"headWord":"cancel","content":{{"word":{{"wordHead":"cancel","wordId":"KaoYan_3_1","content":{{"usphone":"'kænsl","ukphone":"'kænsl","trans":[{{"tranCn":"取消","pos":"vt","tranOther":"to decide something will not happen"}}],"sentence":{{"sentences":[{{"sContent":"Cancel it.","sCn":"取消它。"}}]}},"phrase":{{"phrases":[{{"pContent":"cancel out","pCn":"抵消"}}]}}}}}}}},"bookId":"KaoYan_3"}}"#
        )
        .unwrap();

        let parser = AnkiJsonlParser;
        // Collect through the fallible iterator; any parse error fails here.
        let entries = parser
            .parse(file.path())
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();

        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].word, "cancel");
        assert_eq!(entries[0].definitions[0].zh, "取消");
        assert_eq!(entries[0].examples[0].zh, "取消它。");
        assert!(entries[0].tags.contains(&"kao_yan".to_string()));
    }
}