use crate::traits::{DictParser, ValidationReport};
use dictx_core::{clean_text, Definition, DictEntry, DictSource, Result};
use serde_json::json;
use std::fs::File;
use std::io::{BufRead, BufReader, Lines};
use std::path::Path;
/// Parser for dictionary files in the CC-CEDICT text format.
///
/// This is a field-less unit type; its behavior is provided by its
/// [`DictParser`] implementation.
pub struct CedictParser;
/// [`DictParser`] implementation for the line-oriented CC-CEDICT text format.
impl DictParser for CedictParser {
    /// Human-readable name of the dictionary format.
    fn name(&self) -> &'static str {
        "CC-CEDICT"
    }

    /// Short identifier of the format, also embedded in validation reports.
    fn format_id(&self) -> &'static str {
        "cedict"
    }

    /// Samples the file at `path` to decide whether it looks like CC-CEDICT.
    ///
    /// Blank lines and lines starting with `#` are ignored. Every other line
    /// is counted and handed to `parse_line`; scanning stops once more than
    /// 100 such lines have been seen and at least one of them parsed. If no
    /// line ever parses, the whole file is read before giving up.
    ///
    /// The successful report carries the number of candidate lines examined,
    /// not the number of entries in the file.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be opened or a line cannot be read.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let reader = BufReader::new(File::open(path)?);
        let mut seen = 0usize;
        let mut recognized = false;

        for raw in reader.lines() {
            let raw = raw?;
            let candidate = raw.trim();
            if candidate.is_empty() || candidate.starts_with('#') {
                continue;
            }

            seen += 1;
            // Non-short-circuiting on purpose: every sampled line is parsed.
            recognized |= parse_line(candidate).is_some();

            if recognized && seen > 100 {
                break;
            }
        }

        if !recognized {
            return Ok(ValidationReport::invalid(
                self.format_id(),
                "未识别到 CC-CEDICT 格式条目",
            ));
        }
        Ok(ValidationReport::ok(self.format_id(), Some(seen)))
    }

    /// Opens the file at `path` and returns a lazy iterator over its entries.
    ///
    /// Lines are read and parsed only as the iterator is advanced.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be opened; read errors encountered later are
    /// yielded by the iterator itself.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let lines = BufReader::new(File::open(path)?).lines();
        let entries = CedictIter { lines };
        Ok(Box::new(entries))
    }
}
/// Iterator state for reading dictionary entries from an opened file, one line at a time.
struct CedictIter {
/// Buffered line reader over the underlying dictionary file.
lines: Lines<BufReader<File>>,
}
/// Yields one parsed entry for each well-formed dictionary line.
impl Iterator for CedictIter {
    type Item = Result<DictEntry>;

    /// Advances to the next line that parses as an entry.
    ///
    /// Blank lines, `#` comment lines and lines rejected by `parse_line` are
    /// skipped silently. A failure to read a line is returned as
    /// `Some(Err(..))`. `None` is returned once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // `?` ends iteration when the underlying reader has no more lines.
            let raw = match self.lines.next()? {
                Ok(raw) => raw,
                Err(err) => return Some(Err(err.into())),
            };

            let text = raw.trim();
            if text.is_empty() || text.starts_with('#') {
                continue;
            }

            if let Some(entry) = parse_line(text) {
                return Some(Ok(entry));
            }
        }
    }
}
/// Parses one CC-CEDICT line of the shape
/// `traditional simplified [pinyin] /gloss/gloss/.../` into an entry.
///
/// The simplified form becomes the entry's word, the pinyin is stored in
/// `phonetic_uk`, and each non-empty gloss (after `clean_text`) becomes a
/// definition. The traditional form and raw pinyin are kept in `extra`.
///
/// Returns `None` when the line lacks either of the two separating spaces,
/// has no `[`...`]` pair in that order, does not have its gloss section
/// wrapped in `/`, or ends up with no non-empty gloss.
fn parse_line(line: &str) -> Option<DictEntry> {
    // Headwords: the first two space-separated fields.
    let (traditional, tail) = line.split_once(' ')?;
    let (simplified, tail) = tail.trim_start().split_once(' ')?;
    let tail = tail.trim_start();

    // Pinyin sits between the first '[' and the first ']' of the remainder.
    let open = tail.find('[')?;
    let close = tail.find(']')?;
    if open >= close {
        return None;
    }
    let pinyin = &tail[open + 1..close];

    // Everything after the pinyin must be a '/'-delimited gloss list.
    let glosses = tail[close + 1..]
        .trim()
        .strip_prefix('/')?
        .strip_suffix('/')?;

    let source = DictSource::Custom {
        name: "cc-cedict".to_string(),
    };
    let mut entry = DictEntry::new(source, clean_text(simplified));
    entry.phonetic_uk = Some(clean_text(pinyin));
    entry.definitions = glosses
        .split('/')
        .filter_map(|raw| {
            let gloss = clean_text(raw);
            if gloss.is_empty() {
                None
            } else {
                Some(Definition::new(gloss, simplified, None))
            }
        })
        .collect();
    entry.tags = vec!["cc-cedict".to_string()];
    entry.extra = json!({
        "traditional": traditional,
        "pinyin": pinyin,
        "license": "CC BY-SA",
    });

    // A line whose glosses are all empty is not a usable entry.
    if entry.definitions.is_empty() {
        return None;
    }
    Some(entry)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// A single well-formed line yields the simplified headword, the pinyin
    /// and the first gloss.
    #[test]
    fn parses_cedict_line() {
        let sample = "老師 老师 [lao3 shi1] /teacher/CL:個|个[ge4]/";
        let parsed = parse_line(sample).unwrap();

        assert_eq!(parsed.word, "老师");
        assert_eq!(parsed.phonetic(), Some("lao3 shi1"));
        assert_eq!(parsed.definitions[0].en, "teacher");
    }

    /// Parsing a file skips the comment line and returns the one real entry.
    #[test]
    fn parses_cedict_file() {
        let mut dict = tempfile::NamedTempFile::new().unwrap();
        writeln!(dict, "# comment").unwrap();
        writeln!(dict, "老師 老师 [lao3 shi1] /teacher/").unwrap();

        let iter = CedictParser.parse(dict.path()).unwrap();
        let parsed = iter.collect::<Result<Vec<_>>>().unwrap();

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].word, "老师");
    }
}