use crate::html::plain_text_from_html;
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{
clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Result,
};
use flate2::read::ZlibDecoder;
use rusqlite::Connection;
use serde_json::json;
use serde_json::Value;
use std::collections::BTreeSet;
use std::io::Read;
use std::path::Path;
/// Dictionary parser backed by a SQLite database file.
///
/// Every configured table is read through its `query` and `detail` columns.
#[derive(Debug, Clone)]
pub struct SqliteDictParser {
    /// Names of the tables to validate and parse, in iteration order.
    tables: Vec<String>,
}
impl Default for SqliteDictParser {
    /// Builds a parser that reads the `en` and `ch` tables, in that order.
    fn default() -> Self {
        let tables = ["en", "ch"]
            .iter()
            .map(|&name| String::from(name))
            .collect();
        Self { tables }
    }
}
impl DictParser for SqliteDictParser {
fn name(&self) -> &'static str {
"SQLite dictionary"
}
fn format_id(&self) -> &'static str {
"sqlite"
}
fn validate(&self, path: &Path) -> Result<ValidationReport> {
let conn = Connection::open(path)
.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
let mut issues = Vec::new();
let mut total = 0usize;
for table in &self.tables {
let exists: i64 = conn
.query_row(
"select count(*) from sqlite_master where type='table' and name=?1",
[table],
|row| row.get(0),
)
.unwrap_or(0);
if exists == 0 {
issues.push(format!("缺少表: {table}"));
continue;
}
let count: i64 = conn
.query_row(&format!("select count(*) from {table}"), [], |row| {
row.get(0)
})
.unwrap_or(0);
total += count.max(0) as usize;
}
if issues.is_empty() {
Ok(ValidationReport::ok(self.format_id(), Some(total)))
} else {
Ok(ValidationReport {
valid: false,
format: self.format_id().to_string(),
estimated_entries: Some(total),
issues,
})
}
}
fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
let conn = Connection::open(path)
.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
let db_name = path
.file_stem()
.and_then(|name| name.to_str())
.unwrap_or("sqlite")
.to_string();
let mut entries = Vec::new();
for table in &self.tables {
let exists: i64 = conn
.query_row(
"select count(*) from sqlite_master where type='table' and name=?1",
[table],
|row| row.get(0),
)
.unwrap_or(0);
if exists == 0 {
continue;
}
let mut stmt = conn
.prepare(&format!("select query, detail from {table}"))
.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
let rows = stmt
.query_map([], |row| {
let query: String = row.get(0)?;
let detail: Vec<u8> = row.get(1)?;
Ok((query, detail))
})
.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
for row in rows {
let (query, detail) =
row.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
if let Some(entry) = make_entry(&db_name, table, &query, &detail) {
entries.push(Ok(entry));
}
}
}
Ok(Box::new(entries.into_iter()))
}
}
/// Builds a dictionary entry from one `(query, detail)` row.
///
/// `detail` is decoded with `decode_detail`. When the decoded text is a JSON
/// object it is handed to `entry_from_json`; otherwise it is treated as HTML
/// or plain text and becomes a single definition.
///
/// Returns `None` when the cleaned query is empty, when the JSON object yields
/// no definitions or examples, or when no text remains after cleaning.
fn make_entry(db_name: &str, table: &str, query: &str, detail: &[u8]) -> Option<DictEntry> {
    let query = clean_text(query);
    if query.is_empty() {
        return None;
    }
    let detail_text = decode_detail(detail);
    // Only JSON objects carry the structured layout. A detail that merely
    // happens to be valid JSON (a bare number, a quoted string, an array)
    // is plain text and must not be dropped by the structured parser.
    if let Ok(value) = serde_json::from_str::<Value>(&detail_text) {
        if value.is_object() {
            return entry_from_json(db_name, table, &query, &value);
        }
    }
    let text = plain_text_from_html(&detail_text);
    // Fall back to the cleaned raw text when HTML stripping leaves nothing.
    let text = if text.is_empty() {
        clean_text(&detail_text)
    } else {
        text
    };
    if text.is_empty() {
        return None;
    }
    let mut entry = DictEntry::new(
        DictSource::Sqlite {
            name: db_name.to_string(),
            table: table.to_string(),
        },
        query.clone(),
    );
    // Taken before `text` is moved into the definition below.
    let detail_preview: String = text.chars().take(240).collect();
    if table == "ch" || contains_cjk(&query) {
        // Chinese headword: the detail goes first and the headword second.
        entry.definitions.push(Definition::new(text, query, None));
    } else {
        // Otherwise the first argument is left empty and the detail goes second.
        entry.definitions.push(Definition::new("", text, None));
    }
    entry.extra = json!({
        "table": table,
        "detail_preview": detail_preview,
    });
    Some(entry)
}
/// Builds an entry from a structured JSON detail.
///
/// The headword comes from the `k` field when it is a non-empty string, and
/// from `query` otherwise. Pronunciations are read from `pron`, definitions
/// from `para` and `co.li`, examples from `eg`, and tags from `rank`.
///
/// Returns `None` when neither definitions nor examples were found.
fn entry_from_json(db_name: &str, table: &str, query: &str, json: &Value) -> Option<DictEntry> {
    let word = json
        .get("k")
        .and_then(Value::as_str)
        .map(clean_text)
        .filter(|value| !value.is_empty())
        .unwrap_or_else(|| query.to_string());
    let mut entry = DictEntry::new(
        DictSource::Sqlite {
            name: db_name.to_string(),
            table: table.to_string(),
        },
        word,
    );
    if let Some(pron) = json.get("pron").and_then(Value::as_object) {
        for (key, value) in pron {
            // Skip empty or non-string values so they cannot overwrite a
            // phonetic that another key for the same accent already set.
            let Some(value) = value
                .as_str()
                .map(clean_text)
                .filter(|value| !value.is_empty())
            else {
                continue;
            };
            if key.contains('美') || key.eq_ignore_ascii_case("us") {
                entry.phonetic_us = Some(value);
            } else if key.contains('英') || key.eq_ignore_ascii_case("uk") {
                entry.phonetic_uk = Some(value);
            }
        }
    }
    // A sorted set removes duplicate part-of-speech tags across both sources.
    let mut pos = BTreeSet::new();
    parse_para_definitions(table, query, json, &mut entry, &mut pos);
    parse_collins_definitions(table, query, json, &mut entry, &mut pos);
    parse_examples(json, &mut entry);
    parse_tags(json, &mut entry);
    entry.pos = pos.into_iter().collect();
    if entry.definitions.is_empty() && entry.examples.is_empty() {
        return None;
    }
    entry.extra = json!({
        "table": table,
        "source_key": query,
    });
    Some(entry)
}
fn parse_para_definitions(
table: &str,
query: &str,
json: &Value,
entry: &mut DictEntry,
pos_set: &mut BTreeSet<String>,
) {
for item in json
.get("para")
.and_then(Value::as_array)
.into_iter()
.flatten()
.filter_map(Value::as_str)
{
let text = clean_text(item);
if text.is_empty() {
continue;
}
let (pos, body) = split_pos_prefix(&text);
if let Some(pos) = &pos {
pos_set.insert(pos.clone());
}
if table == "ch" || contains_cjk(query) {
entry
.definitions
.push(Definition::new(body, query, pos.clone()));
} else {
entry
.definitions
.push(Definition::new("", body, pos.clone()));
}
}
}
/// Appends definitions and examples found under `co.li`.
///
/// For each item, `a` is the part of speech (also recorded in `pos_set`),
/// `maj` is the definition text, and `eg` is an array of examples. A `maj`
/// that is empty after cleaning is skipped, so no blank definitions are
/// pushed.
fn parse_collins_definitions(
    table: &str,
    query: &str,
    json: &Value,
    entry: &mut DictEntry,
    pos_set: &mut BTreeSet<String>,
) {
    let Some(items) = json
        .get("co")
        .and_then(|co| co.get("li"))
        .and_then(Value::as_array)
    else {
        return;
    };
    for item in items {
        let pos = item
            .get("a")
            .and_then(Value::as_str)
            .map(clean_pos)
            .filter(|value| !value.is_empty());
        if let Some(pos) = &pos {
            pos_set.insert(pos.clone());
        }
        let maj = item
            .get("maj")
            .and_then(Value::as_str)
            .map(clean_text)
            // An empty definition would make an otherwise empty entry look populated.
            .filter(|value| !value.is_empty());
        if let Some(maj) = maj {
            let (en, zh) = split_english_chinese(&maj);
            if table == "ch" || contains_cjk(query) {
                // Fall back to the whole text when there is no part before the first CJK character.
                entry.definitions.push(Definition::new(
                    if en.is_empty() { maj } else { en },
                    query,
                    pos.clone(),
                ));
            } else {
                // Fall back to the whole text when there is no CJK part.
                entry.definitions.push(Definition::new(
                    en,
                    if zh.is_empty() { maj } else { zh },
                    pos.clone(),
                ));
            }
        }
        if let Some(examples) = item.get("eg").and_then(Value::as_array) {
            for example in examples {
                if let Some(example) = parse_example_array(example) {
                    entry.examples.push(example);
                }
            }
        }
    }
}
fn parse_examples(json: &Value, entry: &mut DictEntry) {
let Some(eg) = json.get("eg").and_then(Value::as_object) else {
return;
};
for examples in eg.values().filter_map(Value::as_array) {
for example in examples {
if let Some(example) = parse_example_array(example) {
entry.examples.push(example);
}
}
}
}
/// Converts an `[en, zh]` JSON array into an `Example`.
///
/// A missing or non-string element becomes an empty string. Returns `None`
/// when `value` is not an array or when both texts are empty after cleaning.
fn parse_example_array(value: &Value) -> Option<Example> {
    let parts = value.as_array()?;
    let text_at = |index: usize| -> String {
        parts
            .get(index)
            .and_then(Value::as_str)
            .map(clean_text)
            .unwrap_or_default()
    };
    let (en, zh) = (text_at(0), text_at(1));
    if en.is_empty() && zh.is_empty() {
        return None;
    }
    Some(Example { en, zh })
}
/// Replaces `entry.tags` with the normalized, whitespace-separated words of the rank string.
///
/// `co.rank` takes precedence over the top-level `rank`. The tags are left
/// untouched when the selected value is absent or is not a string.
fn parse_tags(json: &Value, entry: &mut DictEntry) {
    let nested = json.get("co").and_then(|co| co.get("rank"));
    // The top-level key is consulted only when `co.rank` is absent entirely.
    let selected = match nested {
        Some(value) => Some(value),
        None => json.get("rank"),
    };
    if let Some(rank) = selected.and_then(Value::as_str) {
        entry.tags = rank
            .split_whitespace()
            .map(normalize_tag)
            .filter(|tag| !tag.is_empty())
            .collect();
    }
}
/// Splits a leading part-of-speech abbreviation such as `n.` off a definition.
///
/// The text before the first `.` counts as a part of speech only when it is
/// non-empty, at most 8 bytes long, purely ASCII alphabetic, and still
/// non-empty after `clean_pos`. Otherwise `(None, text)` is returned
/// unchanged, so text starting with `.` no longer yields an empty tag.
fn split_pos_prefix(text: &str) -> (Option<String>, String) {
    if let Some((head, tail)) = text.split_once('.') {
        let head = head.trim();
        // `all` is vacuously true for an empty head, so check emptiness first.
        let looks_like_pos = !head.is_empty()
            && head.len() <= 8
            && head.chars().all(|ch| ch.is_ascii_alphabetic());
        if looks_like_pos {
            let pos = clean_pos(head);
            if !pos.is_empty() {
                return (Some(pos), clean_text(tail));
            }
        }
    }
    (None, text.to_string())
}
fn split_english_chinese(text: &str) -> (String, String) {
let Some(idx) = text
.char_indices()
.find_map(|(idx, ch)| contains_cjk_char(ch).then_some(idx))
else {
return (text.to_string(), String::new());
};
let (en, zh) = text.split_at(idx);
(clean_text(en), clean_text(zh))
}
/// Decodes a `detail` payload to text.
///
/// Zlib decompression is tried first; when it fails the raw bytes are
/// interpreted as UTF-8 with invalid sequences replaced.
fn decode_detail(detail: &[u8]) -> String {
    zlib_to_string(detail).unwrap_or_else(|_| String::from_utf8_lossy(detail).into_owned())
}
/// Inflates a zlib stream and returns its content as text.
///
/// Invalid UTF-8 in the inflated data is replaced rather than reported as an
/// error. Otherwise a caller that falls back to the raw bytes on error would
/// end up decoding the still-compressed payload.
///
/// # Errors
///
/// Returns the decoder's I/O error when `detail` cannot be read to the end as
/// a zlib stream.
fn zlib_to_string(detail: &[u8]) -> std::io::Result<String> {
    let mut decoder = ZlibDecoder::new(detail);
    let mut inflated = Vec::new();
    decoder.read_to_end(&mut inflated)?;
    Ok(match String::from_utf8(inflated) {
        // Common case: valid UTF-8, no extra copy.
        Ok(text) => text,
        Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned(),
    })
}
/// Returns `true` when at least one character of `value` satisfies `contains_cjk_char`.
fn contains_cjk(value: &str) -> bool {
    value.contains(contains_cjk_char)
}
/// Returns `true` for code points in U+3400..=U+4DBF or U+4E00..=U+9FFF.
fn contains_cjk_char(ch: char) -> bool {
    let in_lower_range = ('\u{3400}'..='\u{4DBF}').contains(&ch);
    let in_upper_range = ('\u{4E00}'..='\u{9FFF}').contains(&ch);
    in_lower_range || in_upper_range
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An uncompressed HTML detail yields an entry whose first definition holds the stripped text.
    #[test]
    fn decodes_plain_entry_when_not_compressed() {
        let detail = "<b>苹果</b>".as_bytes();
        let parsed = make_entry("test", "en", "apple", detail).unwrap();
        assert_eq!(parsed.word, "apple");
        let first_definition = &parsed.definitions[0];
        assert_eq!(first_definition.zh, "苹果");
    }
}