1use crate::html::plain_text_from_html;
2use crate::traits::{DictParser, ValidationReport};
3use dictx_core::{
4 clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Result,
5};
6use flate2::read::ZlibDecoder;
7use rusqlite::Connection;
8use serde_json::json;
9use serde_json::Value;
10use std::collections::BTreeSet;
11use std::io::Read;
12use std::path::Path;
13
/// Dictionary parser for SQLite database files.
///
/// Holds the names of the tables that `validate` and `parse` look at.
#[derive(Clone, Debug)]
pub struct SqliteDictParser {
    /// Table names that are checked and read, in this order.
    tables: Vec<String>,
}
18
19impl Default for SqliteDictParser {
20 fn default() -> Self {
21 Self {
22 tables: vec!["en".to_string(), "ch".to_string()],
23 }
24 }
25}
26
27impl DictParser for SqliteDictParser {
28 fn name(&self) -> &'static str {
29 "SQLite dictionary"
30 }
31
32 fn format_id(&self) -> &'static str {
33 "sqlite"
34 }
35
36 fn validate(&self, path: &Path) -> Result<ValidationReport> {
37 let conn = Connection::open(path)
38 .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
39 let mut issues = Vec::new();
40 let mut total = 0usize;
41 for table in &self.tables {
42 let exists: i64 = conn
43 .query_row(
44 "select count(*) from sqlite_master where type='table' and name=?1",
45 [table],
46 |row| row.get(0),
47 )
48 .unwrap_or(0);
49 if exists == 0 {
50 issues.push(format!("缺少表: {table}"));
51 continue;
52 }
53 let count: i64 = conn
54 .query_row(&format!("select count(*) from {table}"), [], |row| {
55 row.get(0)
56 })
57 .unwrap_or(0);
58 total += count.max(0) as usize;
59 }
60
61 if issues.is_empty() {
62 Ok(ValidationReport::ok(self.format_id(), Some(total)))
63 } else {
64 Ok(ValidationReport {
65 valid: false,
66 format: self.format_id().to_string(),
67 estimated_entries: Some(total),
68 issues,
69 })
70 }
71 }
72
73 fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
74 let conn = Connection::open(path)
75 .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
76 let db_name = path
77 .file_stem()
78 .and_then(|name| name.to_str())
79 .unwrap_or("sqlite")
80 .to_string();
81 let mut entries = Vec::new();
82
83 for table in &self.tables {
84 let exists: i64 = conn
85 .query_row(
86 "select count(*) from sqlite_master where type='table' and name=?1",
87 [table],
88 |row| row.get(0),
89 )
90 .unwrap_or(0);
91 if exists == 0 {
92 continue;
93 }
94
95 let mut stmt = conn
96 .prepare(&format!("select query, detail from {table}"))
97 .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
98 let rows = stmt
99 .query_map([], |row| {
100 let query: String = row.get(0)?;
101 let detail: Vec<u8> = row.get(1)?;
102 Ok((query, detail))
103 })
104 .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
105
106 for row in rows {
107 let (query, detail) =
108 row.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
109 if let Some(entry) = make_entry(&db_name, table, &query, &detail) {
110 entries.push(Ok(entry));
111 }
112 }
113 }
114
115 Ok(Box::new(entries.into_iter()))
116 }
117}
118
119fn make_entry(db_name: &str, table: &str, query: &str, detail: &[u8]) -> Option<DictEntry> {
120 let query = clean_text(query);
121 if query.is_empty() {
122 return None;
123 }
124
125 let detail_text = decode_detail(detail);
126 if let Ok(json) = serde_json::from_str::<Value>(&detail_text) {
127 return entry_from_json(db_name, table, &query, &json);
128 }
129
130 let text = plain_text_from_html(&detail_text);
131 let text = if text.is_empty() {
132 clean_text(&detail_text)
133 } else {
134 text
135 };
136 if text.is_empty() {
137 return None;
138 }
139
140 let mut entry = DictEntry::new(
141 DictSource::Sqlite {
142 name: db_name.to_string(),
143 table: table.to_string(),
144 },
145 query.clone(),
146 );
147
148 if table == "ch" || contains_cjk(&query) {
149 entry
150 .definitions
151 .push(Definition::new(text.clone(), query, None));
152 } else {
153 entry
154 .definitions
155 .push(Definition::new("", text.clone(), None));
156 }
157
158 entry.extra = json!({
159 "table": table,
160 "detail_preview": text.chars().take(240).collect::<String>(),
161 });
162 Some(entry)
163}
164
165fn entry_from_json(db_name: &str, table: &str, query: &str, json: &Value) -> Option<DictEntry> {
166 let word = json
167 .get("k")
168 .and_then(Value::as_str)
169 .map(clean_text)
170 .filter(|value| !value.is_empty())
171 .unwrap_or_else(|| query.to_string());
172 let mut entry = DictEntry::new(
173 DictSource::Sqlite {
174 name: db_name.to_string(),
175 table: table.to_string(),
176 },
177 word,
178 );
179
180 if let Some(pron) = json.get("pron").and_then(Value::as_object) {
181 for (key, value) in pron {
182 let value = value
183 .as_str()
184 .map(clean_text)
185 .filter(|value| !value.is_empty());
186 if key.contains('美') || key.eq_ignore_ascii_case("us") {
187 entry.phonetic_us = value;
188 } else if key.contains('英') || key.eq_ignore_ascii_case("uk") {
189 entry.phonetic_uk = value;
190 }
191 }
192 }
193
194 let mut pos = BTreeSet::new();
195 parse_para_definitions(table, query, json, &mut entry, &mut pos);
196 parse_collins_definitions(table, query, json, &mut entry, &mut pos);
197 parse_examples(json, &mut entry);
198 parse_tags(json, &mut entry);
199
200 entry.pos = pos.into_iter().collect();
201 if entry.definitions.is_empty() && entry.examples.is_empty() {
202 return None;
203 }
204 entry.extra = json!({
205 "table": table,
206 "source_key": query,
207 });
208 Some(entry)
209}
210
211fn parse_para_definitions(
212 table: &str,
213 query: &str,
214 json: &Value,
215 entry: &mut DictEntry,
216 pos_set: &mut BTreeSet<String>,
217) {
218 for item in json
219 .get("para")
220 .and_then(Value::as_array)
221 .into_iter()
222 .flatten()
223 .filter_map(Value::as_str)
224 {
225 let text = clean_text(item);
226 if text.is_empty() {
227 continue;
228 }
229 let (pos, body) = split_pos_prefix(&text);
230 if let Some(pos) = &pos {
231 pos_set.insert(pos.clone());
232 }
233 if table == "ch" || contains_cjk(query) {
234 entry
235 .definitions
236 .push(Definition::new(body, query, pos.clone()));
237 } else {
238 entry
239 .definitions
240 .push(Definition::new("", body, pos.clone()));
241 }
242 }
243}
244
245fn parse_collins_definitions(
246 table: &str,
247 query: &str,
248 json: &Value,
249 entry: &mut DictEntry,
250 pos_set: &mut BTreeSet<String>,
251) {
252 let Some(items) = json
253 .get("co")
254 .and_then(|co| co.get("li"))
255 .and_then(Value::as_array)
256 else {
257 return;
258 };
259
260 for item in items {
261 let pos = item
262 .get("a")
263 .and_then(Value::as_str)
264 .map(clean_pos)
265 .filter(|value| !value.is_empty());
266 if let Some(pos) = &pos {
267 pos_set.insert(pos.clone());
268 }
269 if let Some(maj) = item.get("maj").and_then(Value::as_str).map(clean_text) {
270 let (en, zh) = split_english_chinese(&maj);
271 if table == "ch" || contains_cjk(query) {
272 entry.definitions.push(Definition::new(
273 if en.is_empty() { maj.clone() } else { en },
274 query,
275 pos.clone(),
276 ));
277 } else {
278 entry.definitions.push(Definition::new(
279 en,
280 if zh.is_empty() { maj } else { zh },
281 pos.clone(),
282 ));
283 }
284 }
285 if let Some(examples) = item.get("eg").and_then(Value::as_array) {
286 for example in examples {
287 if let Some(example) = parse_example_array(example) {
288 entry.examples.push(example);
289 }
290 }
291 }
292 }
293}
294
295fn parse_examples(json: &Value, entry: &mut DictEntry) {
296 let Some(eg) = json.get("eg").and_then(Value::as_object) else {
297 return;
298 };
299 for examples in eg.values().filter_map(Value::as_array) {
300 for example in examples {
301 if let Some(example) = parse_example_array(example) {
302 entry.examples.push(example);
303 }
304 }
305 }
306}
307
308fn parse_example_array(value: &Value) -> Option<Example> {
309 let array = value.as_array()?;
310 let en = array
311 .first()
312 .and_then(Value::as_str)
313 .map(clean_text)
314 .unwrap_or_default();
315 let zh = array
316 .get(1)
317 .and_then(Value::as_str)
318 .map(clean_text)
319 .unwrap_or_default();
320 if en.is_empty() && zh.is_empty() {
321 None
322 } else {
323 Some(Example { en, zh })
324 }
325}
326
327fn parse_tags(json: &Value, entry: &mut DictEntry) {
328 let Some(rank) = json
329 .get("co")
330 .and_then(|co| co.get("rank"))
331 .or_else(|| json.get("rank"))
332 .and_then(Value::as_str)
333 else {
334 return;
335 };
336 entry.tags = rank
337 .split_whitespace()
338 .map(normalize_tag)
339 .filter(|tag| !tag.is_empty())
340 .collect();
341}
342
343fn split_pos_prefix(text: &str) -> (Option<String>, String) {
344 if let Some((head, tail)) = text.split_once('.') {
345 let head = head.trim();
346 if head.len() <= 8 && head.chars().all(|ch| ch.is_ascii_alphabetic()) {
347 return (Some(clean_pos(head)), clean_text(tail));
348 }
349 }
350 (None, text.to_string())
351}
352
353fn split_english_chinese(text: &str) -> (String, String) {
354 let Some(idx) = text
355 .char_indices()
356 .find_map(|(idx, ch)| contains_cjk_char(ch).then_some(idx))
357 else {
358 return (text.to_string(), String::new());
359 };
360 let (en, zh) = text.split_at(idx);
361 (clean_text(en), clean_text(zh))
362}
363
364fn decode_detail(detail: &[u8]) -> String {
365 if let Ok(text) = zlib_to_string(detail) {
366 return text;
367 }
368 String::from_utf8_lossy(detail).into_owned()
369}
370
371fn zlib_to_string(detail: &[u8]) -> std::io::Result<String> {
372 let mut decoder = ZlibDecoder::new(detail);
373 let mut out = String::new();
374 decoder.read_to_string(&mut out)?;
375 Ok(out)
376}
377
378fn contains_cjk(value: &str) -> bool {
379 value.chars().any(contains_cjk_char)
380}
381
/// Returns `true` when `ch` is a Han ideograph.
///
/// Besides the basic CJK Unified Ideographs block and Extension A, this also
/// accepts the compatibility ideographs and the supplementary-plane
/// extensions, so rare characters outside the Basic Multilingual Plane are
/// recognised too. Kana, Hangul and CJK punctuation are not matched.
fn contains_cjk_char(ch: char) -> bool {
    matches!(
        ch,
        // CJK Unified Ideographs Extension A
        '\u{3400}'..='\u{4DBF}'
            // CJK Unified Ideographs
            | '\u{4E00}'..='\u{9FFF}'
            // CJK Compatibility Ideographs
            | '\u{F900}'..='\u{FAFF}'
            // Plane 2: Extensions B-F and I, Compatibility Ideographs Supplement
            | '\u{20000}'..='\u{2FA1F}'
            // Plane 3: Extensions G and H
            | '\u{30000}'..='\u{323AF}'
    )
}
385
#[cfg(test)]
mod tests {
    use super::*;

    /// An uncompressed HTML payload is reduced to plain text and stored as
    /// the Chinese side of the first definition.
    #[test]
    fn decodes_plain_entry_when_not_compressed() {
        let html = "<b>苹果</b>";
        let parsed = make_entry("test", "en", "apple", html.as_bytes());
        let entry = parsed.unwrap();

        assert_eq!(entry.word, "apple");
        let first = &entry.definitions[0];
        assert_eq!(first.zh, "苹果");
    }
}