Skip to main content

ferrous_opencc_compiler/
lib.rs

1use std::{
2    collections::BTreeMap,
3    fs::File,
4    io::{
5        BufRead,
6        BufReader,
7        Write,
8    },
9    path::Path,
10};
11
12use anyhow::{
13    Context,
14    Result,
15};
16use fst::MapBuilder;
17use rkyv::{
18    Archive,
19    Deserialize,
20    Serialize,
21};
22
/// Encoding of a single dictionary value relative to the key it belongs to.
///
/// Values are produced by `compute_delta`, which picks the variant per
/// key/value pair.
#[derive(Archive, Serialize, Deserialize, Debug)]
pub enum Delta {
    /// `(position, replacement)` pairs for a value that has the same number of
    /// `char`s as its key: `position` is the index (counted in `char`s, not
    /// bytes) at which the value differs from the key, and `replacement` is
    /// the value's `char` at that position. An empty list means the value is
    /// identical to the key.
    CharDiffs(Vec<(u16, char)>),
    /// The complete value text, stored verbatim.
    FullReplacement(String),
}
28
/// Metadata that `compile_dictionary` serializes with rkyv and places in
/// front of the FST map bytes.
#[derive(Archive, Serialize, Deserialize, Debug)]
pub struct SerializableFstDict {
    /// Candidate values for every key. The outer index is the `u64` that the
    /// FST map stores for the corresponding key; the inner vector holds one
    /// `Delta` per candidate value of that key.
    pub values: Vec<Vec<Delta>>,
    /// Length of the longest key, counted in `char`s.
    pub max_key_length: u32,
}
34
35fn compute_delta(key: &str, value: &str) -> Delta {
36    if key.chars().count() != value.chars().count() {
37        return Delta::FullReplacement(value.to_string());
38    }
39
40    let diffs: Vec<(u16, char)> = key
41        .chars()
42        .zip(value.chars())
43        .enumerate()
44        .filter_map(|(i, (k, v))| if k != v { Some((i as u16, v)) } else { None })
45        .collect();
46
47    if diffs.len() * 6 > value.len() {
48        return Delta::FullReplacement(value.to_string());
49    }
50
51    Delta::CharDiffs(diffs)
52}
53
54pub fn compile_dictionary(input_path: &Path) -> Result<Vec<u8>> {
55    let file = File::open(input_path)
56        .with_context(|| format!("Failed to open input dictionary: {}", input_path.display()))?;
57    let reader = BufReader::new(file);
58
59    let mut entries = BTreeMap::new();
60    let mut max_key_length = 0u32;
61
62    for line in reader.lines() {
63        let line = line.with_context(|| "Failed to read line from dictionary")?;
64
65        let trimmed_line = line.trim();
66        if trimmed_line.is_empty() || trimmed_line.starts_with('#') {
67            continue;
68        }
69        let parts: Vec<&str> = line.split('\t').collect();
70
71        if parts.len() == 2 {
72            let key = parts[0];
73            let values: Vec<&str> = parts[1].split(' ').collect();
74
75            if !key.is_empty() && !values.is_empty() && !values.iter().any(|s| s.is_empty()) {
76                max_key_length = max_key_length.max(key.chars().count() as u32);
77                let delta_values = values.into_iter().map(|v| compute_delta(key, v)).collect();
78                entries.insert(key.to_string(), delta_values);
79            }
80        }
81    }
82
83    let mut values_vec: Vec<Vec<Delta>> = Vec::with_capacity(entries.len());
84    let mut builder = MapBuilder::memory();
85
86    for (key, values) in entries {
87        let index = values_vec.len() as u64;
88        values_vec.push(values);
89        builder
90            .insert(key, index)
91            .with_context(|| "Failed to insert key-value pair into FST")?;
92    }
93
94    let fst_map_bytes = builder
95        .into_inner()
96        .with_context(|| "Failed to finalize FST construction")?;
97
98    let metadata = SerializableFstDict {
99        values: values_vec,
100        max_key_length,
101    };
102
103    let metadata_bytes = rkyv::to_bytes::<rkyv::rancor::Error>(&metadata)
104        .map_err(|e| anyhow::anyhow!("Rkyv serialization failed: {e}"))?
105        .into_vec();
106
107    let mut final_bytes = Vec::new();
108
109    final_bytes.write_all(&(metadata_bytes.len() as u64).to_le_bytes())?;
110    final_bytes.write_all(&metadata_bytes)?;
111    final_bytes.write_all(&fst_map_bytes)?;
112
113    Ok(final_bytes)
114}