Skip to main content

dictx_parser/
sqlite_dict.rs

1use crate::html::plain_text_from_html;
2use crate::traits::{DictParser, ValidationReport};
3use dictx_core::{
4    clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Result,
5};
6use flate2::read::ZlibDecoder;
7use rusqlite::Connection;
8use serde_json::json;
9use serde_json::Value;
10use std::collections::BTreeSet;
11use std::io::Read;
12use std::path::Path;
13
/// Parser for dictionaries stored as tables inside a SQLite database file.
///
/// Each configured table is read through its `query` and `detail` columns.
#[derive(Clone, Debug)]
pub struct SqliteDictParser {
    /// Names of the tables that are validated and parsed, in order.
    tables: Vec<String>,
}
18
19impl Default for SqliteDictParser {
20    fn default() -> Self {
21        Self {
22            tables: vec!["en".to_string(), "ch".to_string()],
23        }
24    }
25}
26
27impl DictParser for SqliteDictParser {
28    fn name(&self) -> &'static str {
29        "SQLite dictionary"
30    }
31
32    fn format_id(&self) -> &'static str {
33        "sqlite"
34    }
35
36    fn validate(&self, path: &Path) -> Result<ValidationReport> {
37        let conn = Connection::open(path)
38            .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
39        let mut issues = Vec::new();
40        let mut total = 0usize;
41        for table in &self.tables {
42            let exists: i64 = conn
43                .query_row(
44                    "select count(*) from sqlite_master where type='table' and name=?1",
45                    [table],
46                    |row| row.get(0),
47                )
48                .unwrap_or(0);
49            if exists == 0 {
50                issues.push(format!("缺少表: {table}"));
51                continue;
52            }
53            let count: i64 = conn
54                .query_row(&format!("select count(*) from {table}"), [], |row| {
55                    row.get(0)
56                })
57                .unwrap_or(0);
58            total += count.max(0) as usize;
59        }
60
61        if issues.is_empty() {
62            Ok(ValidationReport::ok(self.format_id(), Some(total)))
63        } else {
64            Ok(ValidationReport {
65                valid: false,
66                format: self.format_id().to_string(),
67                estimated_entries: Some(total),
68                issues,
69            })
70        }
71    }
72
73    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
74        let conn = Connection::open(path)
75            .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
76        let db_name = path
77            .file_stem()
78            .and_then(|name| name.to_str())
79            .unwrap_or("sqlite")
80            .to_string();
81        let mut entries = Vec::new();
82
83        for table in &self.tables {
84            let exists: i64 = conn
85                .query_row(
86                    "select count(*) from sqlite_master where type='table' and name=?1",
87                    [table],
88                    |row| row.get(0),
89                )
90                .unwrap_or(0);
91            if exists == 0 {
92                continue;
93            }
94
95            let mut stmt = conn
96                .prepare(&format!("select query, detail from {table}"))
97                .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
98            let rows = stmt
99                .query_map([], |row| {
100                    let query: String = row.get(0)?;
101                    let detail: Vec<u8> = row.get(1)?;
102                    Ok((query, detail))
103                })
104                .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
105
106            for row in rows {
107                let (query, detail) =
108                    row.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
109                if let Some(entry) = make_entry(&db_name, table, &query, &detail) {
110                    entries.push(Ok(entry));
111                }
112            }
113        }
114
115        Ok(Box::new(entries.into_iter()))
116    }
117}
118
119fn make_entry(db_name: &str, table: &str, query: &str, detail: &[u8]) -> Option<DictEntry> {
120    let query = clean_text(query);
121    if query.is_empty() {
122        return None;
123    }
124
125    let detail_text = decode_detail(detail);
126    if let Ok(json) = serde_json::from_str::<Value>(&detail_text) {
127        return entry_from_json(db_name, table, &query, &json);
128    }
129
130    let text = plain_text_from_html(&detail_text);
131    let text = if text.is_empty() {
132        clean_text(&detail_text)
133    } else {
134        text
135    };
136    if text.is_empty() {
137        return None;
138    }
139
140    let mut entry = DictEntry::new(
141        DictSource::Sqlite {
142            name: db_name.to_string(),
143            table: table.to_string(),
144        },
145        query.clone(),
146    );
147
148    if table == "ch" || contains_cjk(&query) {
149        entry
150            .definitions
151            .push(Definition::new(text.clone(), query, None));
152    } else {
153        entry
154            .definitions
155            .push(Definition::new("", text.clone(), None));
156    }
157
158    entry.extra = json!({
159        "table": table,
160        "detail_preview": text.chars().take(240).collect::<String>(),
161    });
162    Some(entry)
163}
164
165fn entry_from_json(db_name: &str, table: &str, query: &str, json: &Value) -> Option<DictEntry> {
166    let word = json
167        .get("k")
168        .and_then(Value::as_str)
169        .map(clean_text)
170        .filter(|value| !value.is_empty())
171        .unwrap_or_else(|| query.to_string());
172    let mut entry = DictEntry::new(
173        DictSource::Sqlite {
174            name: db_name.to_string(),
175            table: table.to_string(),
176        },
177        word,
178    );
179
180    if let Some(pron) = json.get("pron").and_then(Value::as_object) {
181        for (key, value) in pron {
182            let value = value
183                .as_str()
184                .map(clean_text)
185                .filter(|value| !value.is_empty());
186            if key.contains('美') || key.eq_ignore_ascii_case("us") {
187                entry.phonetic_us = value;
188            } else if key.contains('英') || key.eq_ignore_ascii_case("uk") {
189                entry.phonetic_uk = value;
190            }
191        }
192    }
193
194    let mut pos = BTreeSet::new();
195    parse_para_definitions(table, query, json, &mut entry, &mut pos);
196    parse_collins_definitions(table, query, json, &mut entry, &mut pos);
197    parse_examples(json, &mut entry);
198    parse_tags(json, &mut entry);
199
200    entry.pos = pos.into_iter().collect();
201    if entry.definitions.is_empty() && entry.examples.is_empty() {
202        return None;
203    }
204    entry.extra = json!({
205        "table": table,
206        "source_key": query,
207    });
208    Some(entry)
209}
210
211fn parse_para_definitions(
212    table: &str,
213    query: &str,
214    json: &Value,
215    entry: &mut DictEntry,
216    pos_set: &mut BTreeSet<String>,
217) {
218    for item in json
219        .get("para")
220        .and_then(Value::as_array)
221        .into_iter()
222        .flatten()
223        .filter_map(Value::as_str)
224    {
225        let text = clean_text(item);
226        if text.is_empty() {
227            continue;
228        }
229        let (pos, body) = split_pos_prefix(&text);
230        if let Some(pos) = &pos {
231            pos_set.insert(pos.clone());
232        }
233        if table == "ch" || contains_cjk(query) {
234            entry
235                .definitions
236                .push(Definition::new(body, query, pos.clone()));
237        } else {
238            entry
239                .definitions
240                .push(Definition::new("", body, pos.clone()));
241        }
242    }
243}
244
245fn parse_collins_definitions(
246    table: &str,
247    query: &str,
248    json: &Value,
249    entry: &mut DictEntry,
250    pos_set: &mut BTreeSet<String>,
251) {
252    let Some(items) = json
253        .get("co")
254        .and_then(|co| co.get("li"))
255        .and_then(Value::as_array)
256    else {
257        return;
258    };
259
260    for item in items {
261        let pos = item
262            .get("a")
263            .and_then(Value::as_str)
264            .map(clean_pos)
265            .filter(|value| !value.is_empty());
266        if let Some(pos) = &pos {
267            pos_set.insert(pos.clone());
268        }
269        if let Some(maj) = item.get("maj").and_then(Value::as_str).map(clean_text) {
270            let (en, zh) = split_english_chinese(&maj);
271            if table == "ch" || contains_cjk(query) {
272                entry.definitions.push(Definition::new(
273                    if en.is_empty() { maj.clone() } else { en },
274                    query,
275                    pos.clone(),
276                ));
277            } else {
278                entry.definitions.push(Definition::new(
279                    en,
280                    if zh.is_empty() { maj } else { zh },
281                    pos.clone(),
282                ));
283            }
284        }
285        if let Some(examples) = item.get("eg").and_then(Value::as_array) {
286            for example in examples {
287                if let Some(example) = parse_example_array(example) {
288                    entry.examples.push(example);
289                }
290            }
291        }
292    }
293}
294
295fn parse_examples(json: &Value, entry: &mut DictEntry) {
296    let Some(eg) = json.get("eg").and_then(Value::as_object) else {
297        return;
298    };
299    for examples in eg.values().filter_map(Value::as_array) {
300        for example in examples {
301            if let Some(example) = parse_example_array(example) {
302                entry.examples.push(example);
303            }
304        }
305    }
306}
307
308fn parse_example_array(value: &Value) -> Option<Example> {
309    let array = value.as_array()?;
310    let en = array
311        .first()
312        .and_then(Value::as_str)
313        .map(clean_text)
314        .unwrap_or_default();
315    let zh = array
316        .get(1)
317        .and_then(Value::as_str)
318        .map(clean_text)
319        .unwrap_or_default();
320    if en.is_empty() && zh.is_empty() {
321        None
322    } else {
323        Some(Example { en, zh })
324    }
325}
326
327fn parse_tags(json: &Value, entry: &mut DictEntry) {
328    let Some(rank) = json
329        .get("co")
330        .and_then(|co| co.get("rank"))
331        .or_else(|| json.get("rank"))
332        .and_then(Value::as_str)
333    else {
334        return;
335    };
336    entry.tags = rank
337        .split_whitespace()
338        .map(normalize_tag)
339        .filter(|tag| !tag.is_empty())
340        .collect();
341}
342
343fn split_pos_prefix(text: &str) -> (Option<String>, String) {
344    if let Some((head, tail)) = text.split_once('.') {
345        let head = head.trim();
346        if head.len() <= 8 && head.chars().all(|ch| ch.is_ascii_alphabetic()) {
347            return (Some(clean_pos(head)), clean_text(tail));
348        }
349    }
350    (None, text.to_string())
351}
352
353fn split_english_chinese(text: &str) -> (String, String) {
354    let Some(idx) = text
355        .char_indices()
356        .find_map(|(idx, ch)| contains_cjk_char(ch).then_some(idx))
357    else {
358        return (text.to_string(), String::new());
359    };
360    let (en, zh) = text.split_at(idx);
361    (clean_text(en), clean_text(zh))
362}
363
364fn decode_detail(detail: &[u8]) -> String {
365    if let Ok(text) = zlib_to_string(detail) {
366        return text;
367    }
368    String::from_utf8_lossy(detail).into_owned()
369}
370
371fn zlib_to_string(detail: &[u8]) -> std::io::Result<String> {
372    let mut decoder = ZlibDecoder::new(detail);
373    let mut out = String::new();
374    decoder.read_to_string(&mut out)?;
375    Ok(out)
376}
377
378fn contains_cjk(value: &str) -> bool {
379    value.chars().any(contains_cjk_char)
380}
381
/// Reports whether `ch` is a CJK ideograph.
///
/// Covers the unified ideographs block, extension A, the compatibility
/// ideographs, and the supplementary-plane extensions, so that rare characters
/// outside the Basic Multilingual Plane are recognised too. Kana and CJK
/// punctuation are not ideographs and yield `false`.
fn contains_cjk_char(ch: char) -> bool {
    matches!(
        ch as u32,
        0x4E00..=0x9FFF          // CJK Unified Ideographs
            | 0x3400..=0x4DBF    // Extension A
            | 0xF900..=0xFAFF    // CJK Compatibility Ideographs
            | 0x20000..=0x2EE5F  // Extensions B-F and I
            | 0x2F800..=0x2FA1F  // Compatibility Ideographs Supplement
            | 0x30000..=0x323AF  // Extensions G and H
    )
}
385
#[cfg(test)]
mod tests {
    use super::*;

    /// A payload that is plain HTML rather than compressed JSON still yields
    /// an entry whose first definition carries the Chinese text.
    #[test]
    fn decodes_plain_entry_when_not_compressed() {
        let payload = "<b>苹果</b>".as_bytes();
        let entry = make_entry("test", "en", "apple", payload).unwrap();
        let first_definition = &entry.definitions[0];
        assert_eq!(entry.word, "apple");
        assert_eq!(first_definition.zh, "苹果");
    }
}
396}