use crate::traits::{DictParser, ValidationReport};
use dictx_core::{
clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Phrase,
RelatedWord, RelatedWordItem, Result, Synonym,
};
use serde::Deserialize;
use serde_json::json;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{BufRead, BufReader, Lines};
use std::path::Path;
/// Parser for Anki-style JSONL dictionary exports: one JSON object per line,
/// each describing a single headword (see `AnkiRawEntry` for the schema).
pub struct AnkiJsonlParser;
impl DictParser for AnkiJsonlParser {
    /// Human-readable parser name.
    fn name(&self) -> &'static str {
        "Anki JSONL"
    }

    /// Stable machine identifier for this input format.
    fn format_id(&self) -> &'static str {
        "anki-jsonl"
    }

    /// Cheap pre-flight check: the file must contain at least one non-blank
    /// line, and that line must deserialize as an `AnkiRawEntry`.
    ///
    /// Blank lines are tolerated here because `parse` skips them too; the
    /// previous implementation only looked at line 1 and reported an empty
    /// file when the content merely started with a blank line.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let file = File::open(path)?;
        let reader = BufReader::new(file);
        // Scan for the first non-blank line instead of assuming it is line 1.
        let mut first = None;
        for line in reader.lines() {
            let line = line?;
            if !line.trim().is_empty() {
                first = Some(line);
                break;
            }
        }
        let first = match first {
            Some(line) => line,
            None => return Ok(ValidationReport::invalid(self.format_id(), "文件为空")),
        };
        // Malformed JSON on the first entry propagates as an error,
        // matching the original behavior.
        serde_json::from_str::<AnkiRawEntry>(first.trim())?;
        Ok(ValidationReport::ok(
            self.format_id(),
            count_lines(path).ok(),
        ))
    }

    /// Build a lazy, streaming iterator over the file's entries.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let file = File::open(path)?;
        let reader = BufReader::new(file);
        Ok(Box::new(AnkiIter {
            lines: reader.lines(),
        }))
    }
}
/// Streaming iterator over the entries of an Anki JSONL file.
struct AnkiIter {
    // Buffered line reader over the source file, consumed lazily.
    lines: Lines<BufReader<File>>,
}
impl Iterator for AnkiIter {
    type Item = Result<DictEntry>;

    /// Yield the next dictionary entry, silently skipping blank lines.
    /// I/O failures and malformed JSON are surfaced as `Err` items.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let text = match self.lines.next()? {
                Ok(text) => text,
                Err(err) => return Some(Err(err.into())),
            };
            if text.trim().is_empty() {
                continue;
            }
            let parsed = serde_json::from_str::<AnkiRawEntry>(&text)
                .map_err(Into::into)
                .and_then(AnkiRawEntry::into_entry);
            return Some(parsed);
        }
    }
}
/// One raw line of the Anki JSONL export, deserialized as-is from JSON.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct AnkiRawEntry {
    word_rank: Option<u32>,
    head_word: String,
    book_id: Option<String>,
    content: Option<OuterContent>,
}

/// Wrapper object holding the nested `word` node.
#[derive(Debug, Deserialize)]
struct OuterContent {
    word: Option<WordNode>,
}

/// The `word` node: identifiers plus the dictionary payload.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct WordNode {
    word_id: Option<String>,
    word_head: Option<String>,
    content: Option<WordContent>,
}

/// Dictionary payload for one headword: phonetics, translations,
/// sentences, synonyms, phrases, related words and mnemonic data.
#[derive(Debug, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct WordContent {
    usphone: Option<String>,
    ukphone: Option<String>,
    trans: Option<Vec<Trans>>,
    sentence: Option<SentenceBlock>,
    syno: Option<SynoBlock>,
    phrase: Option<PhraseBlock>,
    rel_word: Option<RelWordBlock>,
    // Free-form payload; shape varies between exports, so keep it untyped.
    rem_method: Option<serde_json::Value>,
}

/// One translation: Chinese gloss, other-language gloss, part of speech.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Trans {
    tran_cn: Option<String>,
    tran_other: Option<String>,
    pos: Option<String>,
}

#[derive(Debug, Deserialize)]
struct SentenceBlock {
    sentences: Option<Vec<SentenceRaw>>,
}

/// One example sentence (content + Chinese translation).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SentenceRaw {
    s_content: Option<String>,
    s_cn: Option<String>,
}

#[derive(Debug, Deserialize)]
struct SynoBlock {
    synos: Option<Vec<SynoRaw>>,
}

/// One synonym group: part of speech, gloss and the synonym headwords.
#[derive(Debug, Deserialize)]
struct SynoRaw {
    pos: Option<String>,
    tran: Option<String>,
    hwds: Option<Vec<SynoWordRaw>>,
}

#[derive(Debug, Deserialize)]
struct SynoWordRaw {
    w: Option<String>,
}

#[derive(Debug, Deserialize)]
struct PhraseBlock {
    phrases: Option<Vec<PhraseRaw>>,
}

/// One phrase (content + Chinese translation).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct PhraseRaw {
    p_content: Option<String>,
    p_cn: Option<String>,
}

#[derive(Debug, Deserialize)]
struct RelWordBlock {
    rels: Option<Vec<RelRaw>>,
}

/// One group of related words sharing a part of speech.
#[derive(Debug, Deserialize)]
struct RelRaw {
    pos: Option<String>,
    words: Option<Vec<RelWordRaw>>,
}

#[derive(Debug, Deserialize)]
struct RelWordRaw {
    hwd: Option<String>,
    tran: Option<String>,
}

impl AnkiRawEntry {
    /// Convert a raw JSONL record into a normalized `DictEntry`.
    ///
    /// A missing `bookId` falls back to `"anki"`; a missing `word` node
    /// falls back to the top-level `headWord` and an empty payload.
    fn into_entry(self) -> Result<DictEntry> {
        let book_id = self.book_id.unwrap_or_else(|| "anki".to_string());
        // Destructure the word node by value so no field needs cloning.
        // (This also removes the need for the hand-written `Clone` impls the
        // raw structs used to carry solely for one `.cloned()` call.)
        let (word_id, word_head, word_content) =
            match self.content.and_then(|content| content.word) {
                Some(node) => (
                    node.word_id,
                    node.word_head,
                    node.content.unwrap_or_default(),
                ),
                None => (None, None, WordContent::default()),
            };
        let word = word_head.unwrap_or(self.head_word);
        let mut entry = DictEntry::new(
            DictSource::Anki {
                deck_name: book_id.clone(),
            },
            clean_text(word),
        );
        // A stable id is only possible when the source provides a word id;
        // otherwise keep whatever default id `DictEntry::new` assigned.
        if let Some(word_id) = word_id {
            entry.id = format!("anki:{}:{}", book_id, word_id);
        }
        entry.phonetic_us = clean_optional(word_content.usphone);
        entry.phonetic_uk = clean_optional(word_content.ukphone);
        entry.definitions = parse_trans(word_content.trans.unwrap_or_default());
        entry.pos = collect_pos(&entry.definitions);
        // NOTE(review): the "kao_yan" tag is hard-coded for every deck —
        // confirm this is intended for non-KaoYan books as well.
        entry.tags = vec![normalize_tag("kao_yan"), book_id.to_ascii_lowercase()];
        entry.examples = parse_examples(word_content.sentence);
        entry.synonyms = parse_synonyms(word_content.syno);
        entry.phrases = parse_phrases(word_content.phrase);
        entry.related_words = parse_related(word_content.rel_word);
        entry.mnemonic = parse_mnemonic(word_content.rem_method);
        entry.extra = json!({
            "rank": self.word_rank,
            "book_id": book_id,
        });
        Ok(entry)
    }
}
/// Count the non-blank lines of `path`, i.e. the number of entries the
/// parser would attempt to read (blank lines are skipped by `AnkiIter`).
///
/// The previous version counted every line — including blank lines and even
/// failed reads — so the entry-count estimate could overstate. I/O errors
/// are now propagated; `validate` turns them into `None` via `.ok()`.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let file = File::open(path)?;
    let mut count = 0;
    for line in BufReader::new(file).lines() {
        if !line?.trim().is_empty() {
            count += 1;
        }
    }
    Ok(count)
}
fn clean_optional(value: Option<String>) -> Option<String> {
value
.map(clean_text)
.filter(|value| !value.trim().is_empty())
}
/// Convert raw translations into definitions, dropping entries where both
/// the Chinese and the other-language gloss are empty after cleaning.
fn parse_trans(trans: Vec<Trans>) -> Vec<Definition> {
    let mut definitions = Vec::new();
    for raw in trans {
        let zh = clean_optional(raw.tran_cn).unwrap_or_default();
        let en = clean_optional(raw.tran_other).unwrap_or_default();
        if zh.is_empty() && en.is_empty() {
            continue;
        }
        definitions.push(Definition::new(en, zh, raw.pos.map(clean_pos)));
    }
    definitions
}
/// Collect the distinct part-of-speech labels across all definitions,
/// returned sorted and deduplicated (via `BTreeSet` ordering).
fn collect_pos(definitions: &[Definition]) -> Vec<String> {
    let unique: BTreeSet<String> = definitions
        .iter()
        .filter_map(|definition| definition.pos.clone())
        .collect();
    unique.into_iter().collect()
}
/// Convert the raw sentence block into examples, dropping entries that are
/// empty in both languages after cleaning.
fn parse_examples(block: Option<SentenceBlock>) -> Vec<Example> {
    let sentences = block.and_then(|block| block.sentences).unwrap_or_default();
    let mut examples = Vec::new();
    for raw in sentences {
        let en = clean_optional(raw.s_content).unwrap_or_default();
        let zh = clean_optional(raw.s_cn).unwrap_or_default();
        if !(en.is_empty() && zh.is_empty()) {
            examples.push(Example { en, zh });
        }
    }
    examples
}
/// Convert the raw synonym block, keeping only groups that still contain at
/// least one non-empty headword after cleaning.
fn parse_synonyms(block: Option<SynoBlock>) -> Vec<Synonym> {
    let synos = block.and_then(|block| block.synos).unwrap_or_default();
    let mut synonyms = Vec::new();
    for raw in synos {
        let words: Vec<String> = raw
            .hwds
            .unwrap_or_default()
            .into_iter()
            .filter_map(|word| clean_optional(word.w))
            .collect();
        if words.is_empty() {
            continue;
        }
        synonyms.push(Synonym {
            pos: raw.pos.map(clean_pos),
            zh_meaning: clean_optional(raw.tran).unwrap_or_default(),
            words,
        });
    }
    synonyms
}
/// Convert the raw phrase block into phrases, dropping entries that are
/// empty in both languages after cleaning.
fn parse_phrases(block: Option<PhraseBlock>) -> Vec<Phrase> {
    let phrases = block.and_then(|block| block.phrases).unwrap_or_default();
    let mut result = Vec::new();
    for raw in phrases {
        let en = clean_optional(raw.p_content).unwrap_or_default();
        let zh = clean_optional(raw.p_cn).unwrap_or_default();
        if !(en.is_empty() && zh.is_empty()) {
            result.push(Phrase { en, zh });
        }
    }
    result
}
/// Convert the raw related-word block. Items without a headword are dropped;
/// groups left with no items are dropped entirely.
fn parse_related(block: Option<RelWordBlock>) -> Vec<RelatedWord> {
    let rels = block.and_then(|block| block.rels).unwrap_or_default();
    let mut related = Vec::new();
    for raw in rels {
        let mut words = Vec::new();
        for word in raw.words.unwrap_or_default() {
            let head = clean_optional(word.hwd).unwrap_or_default();
            if head.is_empty() {
                continue;
            }
            words.push(RelatedWordItem {
                word: head,
                translation: clean_optional(word.tran).unwrap_or_default(),
            });
        }
        if !words.is_empty() {
            related.push(RelatedWord {
                pos: raw.pos.map(clean_pos).unwrap_or_default(),
                words,
            });
        }
    }
    related
}
/// Extract a mnemonic string from the free-form `remMethod` payload.
/// Accepts either a bare JSON string or an object keyed by one of
/// "val" / "value" / "text" (first match wins).
fn parse_mnemonic(value: Option<serde_json::Value>) -> Option<String> {
    let value = value?;
    let text = value.as_str().or_else(|| {
        ["val", "value", "text"]
            .iter()
            .copied()
            .find_map(|key| value.get(key).and_then(serde_json::Value::as_str))
    })?;
    clean_optional(Some(text.to_string()))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    // End-to-end smoke test: write one JSONL record (KaoYan-style schema) to
    // a temp file, then verify the parser recovers the headword, the first
    // definition/example, and the normalized tags.
    #[test]
    fn parses_anki_jsonl_entry() {
        let mut file = tempfile::NamedTempFile::new().unwrap();
        writeln!(
            file,
            r#"{{"wordRank":1,"headWord":"cancel","content":{{"word":{{"wordHead":"cancel","wordId":"KaoYan_3_1","content":{{"usphone":"'kænsl","ukphone":"'kænsl","trans":[{{"tranCn":"取消","pos":"vt","tranOther":"to decide something will not happen"}}],"sentence":{{"sentences":[{{"sContent":"Cancel it.","sCn":"取消它。"}}]}},"phrase":{{"phrases":[{{"pContent":"cancel out","pCn":"抵消"}}]}}}}}}}},"bookId":"KaoYan_3"}}"#
        )
        .unwrap();
        let parser = AnkiJsonlParser;
        // Collecting through Result<Vec<_>> fails the test on any parse error.
        let entries = parser
            .parse(file.path())
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].word, "cancel");
        assert_eq!(entries[0].definitions[0].zh, "取消");
        assert_eq!(entries[0].examples[0].zh, "取消它。");
        assert!(entries[0].tags.contains(&"kao_yan".to_string()));
    }
}
}