// dictx-parser 0.1.1
//
// Dictionary source parsers for DictX.
// Documentation
use crate::html::plain_text_from_html;
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{
    clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Example, Result,
};
use flate2::read::ZlibDecoder;
use rusqlite::Connection;
use serde_json::json;
use serde_json::Value;
use std::collections::BTreeSet;
use std::io::Read;
use std::path::Path;

/// Dictionary parser that reads entries from tables of a SQLite database.
#[derive(Debug, Clone)]
pub struct SqliteDictParser {
    /// Names of the tables to read; each one is queried for `query` and `detail` columns.
    tables: Vec<String>,
}

impl Default for SqliteDictParser {
    /// Builds a parser that reads the `en` and `ch` tables, in that order.
    fn default() -> Self {
        let tables = vec![String::from("en"), String::from("ch")];
        Self { tables }
    }
}

impl DictParser for SqliteDictParser {
    fn name(&self) -> &'static str {
        "SQLite dictionary"
    }

    fn format_id(&self) -> &'static str {
        "sqlite"
    }

    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let conn = Connection::open(path)
            .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
        let mut issues = Vec::new();
        let mut total = 0usize;
        for table in &self.tables {
            let exists: i64 = conn
                .query_row(
                    "select count(*) from sqlite_master where type='table' and name=?1",
                    [table],
                    |row| row.get(0),
                )
                .unwrap_or(0);
            if exists == 0 {
                issues.push(format!("缺少表: {table}"));
                continue;
            }
            let count: i64 = conn
                .query_row(&format!("select count(*) from {table}"), [], |row| {
                    row.get(0)
                })
                .unwrap_or(0);
            total += count.max(0) as usize;
        }

        if issues.is_empty() {
            Ok(ValidationReport::ok(self.format_id(), Some(total)))
        } else {
            Ok(ValidationReport {
                valid: false,
                format: self.format_id().to_string(),
                estimated_entries: Some(total),
                issues,
            })
        }
    }

    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let conn = Connection::open(path)
            .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
        let db_name = path
            .file_stem()
            .and_then(|name| name.to_str())
            .unwrap_or("sqlite")
            .to_string();
        let mut entries = Vec::new();

        for table in &self.tables {
            let exists: i64 = conn
                .query_row(
                    "select count(*) from sqlite_master where type='table' and name=?1",
                    [table],
                    |row| row.get(0),
                )
                .unwrap_or(0);
            if exists == 0 {
                continue;
            }

            let mut stmt = conn
                .prepare(&format!("select query, detail from {table}"))
                .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
            let rows = stmt
                .query_map([], |row| {
                    let query: String = row.get(0)?;
                    let detail: Vec<u8> = row.get(1)?;
                    Ok((query, detail))
                })
                .map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;

            for row in rows {
                let (query, detail) =
                    row.map_err(|err| dictx_core::DictxError::InvalidData(err.to_string()))?;
                if let Some(entry) = make_entry(&db_name, table, &query, &detail) {
                    entries.push(Ok(entry));
                }
            }
        }

        Ok(Box::new(entries.into_iter()))
    }
}

/// Builds a dictionary entry from one `(query, detail)` row.
///
/// Returns `None` when the cleaned query is empty or no usable text can be extracted from
/// the detail. A detail that parses as JSON is delegated to `entry_from_json`; anything else
/// is treated as HTML, falling back to the cleaned raw text when HTML extraction yields
/// nothing.
fn make_entry(db_name: &str, table: &str, query: &str, detail: &[u8]) -> Option<DictEntry> {
    let headword = clean_text(query);
    if headword.is_empty() {
        return None;
    }

    let raw = decode_detail(detail);
    if let Ok(parsed) = serde_json::from_str::<Value>(&raw) {
        return entry_from_json(db_name, table, &headword, &parsed);
    }

    let stripped = plain_text_from_html(&raw);
    let body = if stripped.is_empty() {
        clean_text(&raw)
    } else {
        stripped
    };
    if body.is_empty() {
        return None;
    }

    // Take the preview before `body` is moved into the definition.
    let preview: String = body.chars().take(240).collect();

    let source = DictSource::Sqlite {
        name: db_name.to_string(),
        table: table.to_string(),
    };
    let mut entry = DictEntry::new(source, headword.clone());

    // For the `ch` table or a CJK headword, the text goes in the first slot and the
    // headword in the second; otherwise the text fills the second slot alone.
    let definition = if table == "ch" || contains_cjk(&headword) {
        Definition::new(body, headword, None)
    } else {
        Definition::new("", body, None)
    };
    entry.definitions.push(definition);

    entry.extra = json!({
        "table": table,
        "detail_preview": preview,
    });
    Some(entry)
}

/// Builds a dictionary entry from a JSON detail document.
///
/// The headword is the cleaned `k` field when it is a non-empty string, otherwise `query`.
/// Pronunciations are read from the `pron` object; definitions, examples and tags are
/// filled in by the `parse_*` helpers. Returns `None` when the document yields neither
/// definitions nor examples.
fn entry_from_json(db_name: &str, table: &str, query: &str, json: &Value) -> Option<DictEntry> {
    let word = json
        .get("k")
        .and_then(Value::as_str)
        .map(clean_text)
        .filter(|value| !value.is_empty())
        .unwrap_or_else(|| query.to_string());
    let mut entry = DictEntry::new(
        DictSource::Sqlite {
            name: db_name.to_string(),
            table: table.to_string(),
        },
        word,
    );

    if let Some(pron) = json.get("pron").and_then(Value::as_object) {
        for (key, value) in pron {
            // Skip blank or non-string values so they cannot erase a phonetic that an
            // earlier key already supplied.
            let Some(value) = value
                .as_str()
                .map(clean_text)
                .filter(|value| !value.is_empty())
            else {
                continue;
            };
            // NOTE(review): the two character literals below were empty in the available
            // source; '美' (American) and '英' (British) are a reconstruction — confirm
            // against the key names used in the dictionary data.
            if key.contains('美') || key.eq_ignore_ascii_case("us") {
                entry.phonetic_us = Some(value);
            } else if key.contains('英') || key.eq_ignore_ascii_case("uk") {
                entry.phonetic_uk = Some(value);
            }
        }
    }

    let mut pos = BTreeSet::new();
    parse_para_definitions(table, query, json, &mut entry, &mut pos);
    parse_collins_definitions(table, query, json, &mut entry, &mut pos);
    parse_examples(json, &mut entry);
    parse_tags(json, &mut entry);

    entry.pos = pos.into_iter().collect();
    if entry.definitions.is_empty() && entry.examples.is_empty() {
        return None;
    }
    entry.extra = json!({
        "table": table,
        "source_key": query,
    });
    Some(entry)
}

/// Adds one definition per string in the JSON `para` array.
///
/// Each item is cleaned and split into an optional part-of-speech prefix and a body. Every
/// part of speech found is recorded in `pos_set`. Items whose body is empty (for example an
/// item that is only a part-of-speech marker) contribute no definition. For the `ch` table
/// or a CJK `query`, the body goes in the definition's first slot and `query` in the second;
/// otherwise the body fills the second slot alone.
fn parse_para_definitions(
    table: &str,
    query: &str,
    json: &Value,
    entry: &mut DictEntry,
    pos_set: &mut BTreeSet<String>,
) {
    // Loop-invariant: the direction depends only on the table and the headword.
    let chinese_headword = table == "ch" || contains_cjk(query);

    let items = json
        .get("para")
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
        .filter_map(Value::as_str);

    for item in items {
        let text = clean_text(item);
        if text.is_empty() {
            continue;
        }
        let (pos, body) = split_pos_prefix(&text);
        if let Some(tag) = &pos {
            pos_set.insert(tag.clone());
        }
        // Keep the part of speech, but do not emit a definition with no text.
        if body.is_empty() {
            continue;
        }
        let definition = if chinese_headword {
            Definition::new(body, query, pos)
        } else {
            Definition::new("", body, pos)
        };
        entry.definitions.push(definition);
    }
}

fn parse_collins_definitions(
    table: &str,
    query: &str,
    json: &Value,
    entry: &mut DictEntry,
    pos_set: &mut BTreeSet<String>,
) {
    let Some(items) = json
        .get("co")
        .and_then(|co| co.get("li"))
        .and_then(Value::as_array)
    else {
        return;
    };

    for item in items {
        let pos = item
            .get("a")
            .and_then(Value::as_str)
            .map(clean_pos)
            .filter(|value| !value.is_empty());
        if let Some(pos) = &pos {
            pos_set.insert(pos.clone());
        }
        if let Some(maj) = item.get("maj").and_then(Value::as_str).map(clean_text) {
            let (en, zh) = split_english_chinese(&maj);
            if table == "ch" || contains_cjk(query) {
                entry.definitions.push(Definition::new(
                    if en.is_empty() { maj.clone() } else { en },
                    query,
                    pos.clone(),
                ));
            } else {
                entry.definitions.push(Definition::new(
                    en,
                    if zh.is_empty() { maj } else { zh },
                    pos.clone(),
                ));
            }
        }
        if let Some(examples) = item.get("eg").and_then(Value::as_array) {
            for example in examples {
                if let Some(example) = parse_example_array(example) {
                    entry.examples.push(example);
                }
            }
        }
    }
}

fn parse_examples(json: &Value, entry: &mut DictEntry) {
    let Some(eg) = json.get("eg").and_then(Value::as_object) else {
        return;
    };
    for examples in eg.values().filter_map(Value::as_array) {
        for example in examples {
            if let Some(example) = parse_example_array(example) {
                entry.examples.push(example);
            }
        }
    }
}

/// Reads an example from a JSON array: element 0 becomes `en`, element 1 becomes `zh`.
///
/// A missing or non-string element counts as empty text. Returns `None` when `value` is not
/// an array or when both texts are empty.
fn parse_example_array(value: &Value) -> Option<Example> {
    let parts = value.as_array()?;
    let text_at = |idx: usize| -> String {
        parts
            .get(idx)
            .and_then(Value::as_str)
            .map(clean_text)
            .unwrap_or_default()
    };

    let (en, zh) = (text_at(0), text_at(1));
    if en.is_empty() && zh.is_empty() {
        return None;
    }
    Some(Example { en, zh })
}

/// Replaces `entry.tags` with the normalized, non-empty, whitespace-separated tokens of the
/// rank string.
///
/// The rank is taken from `co.rank` when that key is present, otherwise from the top-level
/// `rank`. Nothing is changed when the chosen value is absent or not a string.
fn parse_tags(json: &Value, entry: &mut DictEntry) {
    let nested = json.get("co").and_then(|co| co.get("rank"));
    let chosen = match nested {
        Some(value) => Some(value),
        None => json.get("rank"),
    };
    if let Some(rank) = chosen.and_then(Value::as_str) {
        entry.tags = rank
            .split_whitespace()
            .map(normalize_tag)
            .filter(|tag| !tag.is_empty())
            .collect();
    }
}

/// Splits a leading part-of-speech marker such as `n.` off `text`.
///
/// The part before the first `.` is treated as a marker when, after trimming, it is 1 to 8
/// ASCII letters and `clean_pos` returns a non-empty value for it. In that case the cleaned
/// marker and the cleaned remainder are returned. Otherwise the result is `None` together
/// with `text` unchanged.
fn split_pos_prefix(text: &str) -> (Option<String>, String) {
    if let Some((head, tail)) = text.split_once('.') {
        let head = head.trim();
        // `all` is vacuously true for an empty head, so the emptiness check must be explicit;
        // without it, text starting with '.' would produce a blank part of speech.
        let looks_like_pos = !head.is_empty()
            && head.len() <= 8
            && head.chars().all(|ch| ch.is_ascii_alphabetic());
        if looks_like_pos {
            let pos = clean_pos(head);
            if !pos.is_empty() {
                return (Some(pos), clean_text(tail));
            }
        }
    }
    (None, text.to_string())
}

/// Splits `text` at its first CJK character into a leading and a trailing part, both
/// cleaned.
///
/// When `text` contains no CJK character, it is returned unchanged as the first part and the
/// second part is empty.
fn split_english_chinese(text: &str) -> (String, String) {
    match text.char_indices().find(|&(_, ch)| contains_cjk_char(ch)) {
        Some((boundary, _)) => {
            let (latin, han) = text.split_at(boundary);
            (clean_text(latin), clean_text(han))
        }
        None => (text.to_string(), String::new()),
    }
}

/// Turns a raw `detail` value into text.
///
/// The bytes are first inflated as a zlib stream; when that fails they are decoded directly
/// as UTF-8, with invalid sequences replaced.
fn decode_detail(detail: &[u8]) -> String {
    match zlib_to_string(detail) {
        Ok(inflated) => inflated,
        Err(_) => String::from_utf8_lossy(detail).into_owned(),
    }
}

/// Inflates `detail` as a zlib stream and decodes the result as UTF-8.
///
/// Invalid UTF-8 sequences in the inflated data are replaced rather than rejected, so a
/// payload that decompresses correctly is never discarded because of a stray byte.
///
/// # Errors
///
/// Returns the decoder's I/O error when `detail` cannot be inflated.
fn zlib_to_string(detail: &[u8]) -> std::io::Result<String> {
    let mut inflated = Vec::new();
    ZlibDecoder::new(detail).read_to_end(&mut inflated)?;
    Ok(match String::from_utf8(inflated) {
        Ok(text) => text,
        // Only the failure path pays for a lossy copy.
        Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned(),
    })
}

fn contains_cjk(value: &str) -> bool {
    value.chars().any(contains_cjk_char)
}

/// Reports whether `ch` is a CJK ideograph.
///
/// Covers the CJK Unified Ideographs block, Extensions A through H, and the two CJK
/// Compatibility Ideographs blocks. Punctuation, kana and other scripts are not matched.
fn contains_cjk_char(ch: char) -> bool {
    matches!(
        ch,
        '\u{3400}'..='\u{4DBF}'         // Extension A
            | '\u{4E00}'..='\u{9FFF}'   // CJK Unified Ideographs
            | '\u{F900}'..='\u{FAFF}'   // CJK Compatibility Ideographs
            | '\u{20000}'..='\u{2A6DF}' // Extension B
            | '\u{2A700}'..='\u{2EBEF}' // Extensions C through F
            | '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement
            | '\u{30000}'..='\u{323AF}' // Extensions G and H
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    /// An uncompressed HTML detail is stripped to text and stored as the `zh` definition of
    /// an `en`-table entry.
    #[test]
    fn decodes_plain_entry_when_not_compressed() {
        let raw_detail = "<b>苹果</b>".as_bytes();
        let parsed = make_entry("test", "en", "apple", raw_detail);
        let entry = parsed.unwrap();
        assert_eq!(entry.word, "apple");
        assert_eq!(entry.definitions[0].zh, "苹果");
    }
}