cskk/dictionary/
mod.rs

1pub(crate) mod candidate;
2pub(crate) mod composite_key;
3pub(crate) mod dictentry;
4mod dictionary_candidate;
5mod dictionary_parser;
6pub mod empty_dict;
7pub(crate) mod file_dictionary;
8mod lru_ordered_map;
9pub mod static_dict;
10pub mod user_dictionary;
11
12use crate::dictionary::dictionary_candidate::{CompletionCandidate, DictionaryEntry};
13use crate::error::CskkError;
14use crate::form_changer::numeric_form_changer::{
15    numeric_to_daiji_as_number, numeric_to_kanji_each, numeric_to_simple_kanji_as_number,
16    numeric_to_thousand_separator, numeric_to_zenkaku,
17};
18pub(crate) use candidate::Candidate;
19pub(crate) use composite_key::CompositeKey;
20use dictentry::DictEntry;
21pub(in crate::dictionary) use dictionary_candidate::DictionaryCandidate;
22use empty_dict::EmptyDictionary;
23use log::*;
24use regex::Regex;
25use static_dict::StaticFileDict;
26use std::sync::{Arc, Mutex};
27use user_dictionary::UserDictionary;
28
// The type has to be `Sized` because it is exposed to the C side, so the concrete
// dictionaries are wrapped in an enum instead of being used through `dyn Trait`.
#[derive(Debug)]
pub(crate) enum CskkDictionaryType {
    // Dictionary created by `CskkDictionary::new_static_dict` (static, read-only).
    StaticFile(StaticFileDict),
    // Dictionary created by `CskkDictionary::new_user_dict` (user readable and writable).
    UserFile(UserDictionary),
    // Fallback dictionary created by `CskkDictionary::new_empty_dict`.
    EmptyDict(EmptyDictionary),
}
36
// FIXME: Not sure if this is the correct inner type. Maybe we can remove Arc on other places?
/// A dictionary handle: one concrete dictionary guarded by a mutex, plus a flag
/// telling whether it takes part in completion.
#[derive(Debug)]
pub struct CskkDictionary {
    // When false, `get_all_complete_inner` skips this dictionary.
    is_completable: bool,
    // Every lookup, completion, confirm and purge in this module locks this mutex first.
    pub(crate) mutex: Mutex<CskkDictionaryType>,
}
43
44impl CskkDictionary {
45    fn new(dictionary: CskkDictionaryType, is_completable: bool) -> Self {
46        Self {
47            is_completable,
48            mutex: Mutex::new(dictionary),
49        }
50    }
51
52    /// Library user interface for creating new static read-only dictionary.
53    /// file_path: path string
54    /// encode: label of encoding that encoding_rs can recognize. "utf-8", "euc-jp", "cp866" etc.
55    pub fn new_static_dict(
56        file_path: &str,
57        encode: &str,
58        is_completable: bool,
59    ) -> Result<CskkDictionary, CskkError> {
60        let dictionary = StaticFileDict::new(file_path, encode)?;
61        Ok(CskkDictionary::new(
62            CskkDictionaryType::StaticFile(dictionary),
63            is_completable,
64        ))
65    }
66
67    /// Library user interface for creating new user readable and writable dictionary
68    /// file_path: path string
69    /// encode: label of encoding that encoding_rs can recognize. "utf-8", "euc-jp", "cp866" etc.
70    pub fn new_user_dict(
71        file_path: &str,
72        encode: &str,
73        is_completable: bool,
74    ) -> Result<CskkDictionary, CskkError> {
75        let dictionary = UserDictionary::new(file_path, encode)?;
76        Ok(CskkDictionary::new(
77            CskkDictionaryType::UserFile(dictionary),
78            is_completable,
79        ))
80    }
81
82    /// Library user interface for creating fallback dictionary.
83    /// Dictionary is required to create the context, so this dictionary is useful when no dictionary file is available.
84    pub fn new_empty_dict() -> Result<CskkDictionary, CskkError> {
85        Ok(CskkDictionary::new(
86            CskkDictionaryType::EmptyDict(EmptyDictionary::default()),
87            false,
88        ))
89    }
90}
91
92/// confirm the candidate.
93/// This updates writable dictionaries candidate order or add new entry which confirmed.
94/// Returns true if updated the dictionary.
95pub(crate) fn confirm_candidate(
96    dictionary: &Arc<CskkDictionary>,
97    candidate: &Candidate,
98) -> Result<bool, CskkError> {
99    debug!("confirm: {:?}", candidate);
100    // Using mutex in match on purpose, never acquiring lock again.
101    #[allow(clippy::significant_drop_in_scrutinee)]
102    match *dictionary.mutex.lock().unwrap() {
103        CskkDictionaryType::StaticFile(ref mut dict) => dict.select_candidate(candidate),
104        CskkDictionaryType::UserFile(ref mut dict) => dict.select_candidate(candidate),
105        CskkDictionaryType::EmptyDict(ref mut dict) => dict.select_candidate(candidate),
106    }
107}
108
109/// purge the candidate.
110/// This updates writable dictionaries candidate order or add new entry which confirmed.
111/// Returns true if updated the dictionary.
112pub(crate) fn purge_candidate(
113    dictionary: &Arc<CskkDictionary>,
114    composite_key: &CompositeKey,
115    candidate: &Candidate,
116) -> Result<bool, CskkError> {
117    // Using mutex in match on purpose, never acquiring lock again.
118    #[allow(clippy::significant_drop_in_scrutinee)]
119    match *dictionary.mutex.lock().unwrap() {
120        CskkDictionaryType::StaticFile(ref mut dict) => {
121            dict.purge_candidate(composite_key, candidate)
122        }
123        CskkDictionaryType::UserFile(ref mut dict) => {
124            dict.purge_candidate(composite_key, candidate)
125        }
126        CskkDictionaryType::EmptyDict(ref mut dict) => {
127            dict.purge_candidate(composite_key, candidate)
128        }
129    }
130}
131
132/// 現在ueno/libskk同様にDedupはkouho_textのみ。
133pub(crate) fn get_all_candidates(
134    dictionaries: &[Arc<CskkDictionary>],
135    composite_key: &CompositeKey,
136) -> Vec<Candidate> {
137    get_all_candidates_inner(dictionaries, composite_key, false)
138}
139
140///
141/// 補完候補となる辞書のエントリ列を返す。
142///
143/// [Dictionary]の[complete]に準じて、composite_keyが送りなしのみを想定し、先頭一致する送りなし候補を返す。
144///
145pub(crate) fn get_all_complete(
146    dictionaries: &[Arc<CskkDictionary>],
147    composite_key: &CompositeKey,
148) -> Vec<Candidate> {
149    let dict_candidates = get_all_complete_inner(dictionaries, composite_key);
150    let deduped_completion_candidate = dedup_candidates(dict_candidates);
151
152    deduped_completion_candidate
153        .into_iter()
154        .map(|x| Candidate::from_completion_candidate(&x))
155        .collect()
156}
157///
158/// 補完候補となる辞書のエントリ列を返す。
159///
160/// [Dictionary]の[complete]に準じて、composite_keyが送りなしのみを想定し、先頭一致する送りなし候補を返す。
161///
162/// 使われない想定ではあるが、composite_keyが送りありだと送り仮名まで一致する候補を返すので先頭一致が完全一致と同じになる。
163///
164fn get_all_complete_inner(
165    dictionaries: &[Arc<CskkDictionary>],
166    composite_key: &CompositeKey,
167) -> Vec<CompletionCandidate> {
168    let mut result = Vec::new();
169
170    for cskkdict in dictionaries.iter() {
171        if cskkdict.is_completable {
172            let lock = cskkdict.mutex.lock().unwrap();
173            let dict_entries = match &*lock {
174                CskkDictionaryType::StaticFile(dict) => dict.complete(composite_key),
175                CskkDictionaryType::UserFile(dict) => dict.complete(composite_key),
176                CskkDictionaryType::EmptyDict(dict) => dict.complete(composite_key),
177            };
178            for dict_entry in dict_entries {
179                let candidates = dict_entry.get_candidates(composite_key.get_okuri());
180
181                if let Some(candidates) = candidates {
182                    // result.extend(candidates.to_owned())
183                    result.extend(candidates.iter().map(|x| {
184                        CompletionCandidate::from_dictionary_candidate(
185                            &dict_entry.midashi,
186                            composite_key.get_okuri(),
187                            x,
188                        )
189                    }));
190                }
191            }
192        }
193    }
194
195    result
196}
197
198///
199/// First search the exact match, and then replace numerics to # and search the dict for numeric composition.
200/// If numeric-re-lookup, skip the latter don't replace numerics for the "#4" type entries.
201///
202fn get_all_candidates_inner(
203    dictionaries: &[Arc<CskkDictionary>],
204    composite_key: &CompositeKey,
205    is_numeric_re_lookup: bool,
206) -> Vec<Candidate> {
207    let mut matched_numbers: Vec<String>;
208
209    let exact_match_candidates = get_candidates_in_order(dictionaries, &composite_key);
210    let exact_match_candidates = dedup_candidates(exact_match_candidates);
211    let mut all_candidates: Vec<Candidate> = exact_match_candidates
212        .into_iter()
213        .map(|dictionary_candidate| {
214            Candidate::from_dictionary_candidate(&composite_key, &dictionary_candidate)
215        })
216        .collect();
217
218    if !is_numeric_re_lookup {
219        let replaced_key;
220        (replaced_key, matched_numbers) = to_composite_to_numeric_dict_key(&composite_key);
221        if replaced_key != *composite_key {
222            let numeric_replace_match_candidates =
223                get_candidates_in_order(dictionaries, &replaced_key);
224            let numeric_replace_match_candidates =
225                dedup_candidates(numeric_replace_match_candidates);
226            let mut numeric_replace_match_candidates: Vec<Candidate> =
227                numeric_replace_match_candidates
228                    .into_iter()
229                    .map(|dictionary_candidate| {
230                        Candidate::from_dictionary_candidate(&replaced_key, &dictionary_candidate)
231                    })
232                    .flat_map(|candidate| {
233                        replace_numeric_match(&candidate, &matched_numbers, dictionaries)
234                    })
235                    .collect();
236            all_candidates.append(&mut numeric_replace_match_candidates);
237        }
238    }
239
240    all_candidates
241}
242
243///
244/// dictionary_candidatesの内容からその順番にcandidateを作り、重複を除いて返す。
245///
246fn dedup_candidates<T>(dictionary_candidates: Vec<T>) -> Vec<T>
247where
248    T: DictionaryEntry + Ord + Clone,
249{
250    let mut deduped_candidates = vec![];
251    let mut ordered_candidates = vec![];
252
253    deduped_candidates.extend(dictionary_candidates.to_owned());
254    ordered_candidates.extend(dictionary_candidates);
255
256    if deduped_candidates.is_empty() {
257        return vec![];
258    }
259    deduped_candidates.sort_unstable();
260    // Make Option == some come before None
261    deduped_candidates.reverse();
262    deduped_candidates.dedup_by(|a, b| a.get_kouho_text() == b.get_kouho_text());
263    // reverse back for faster iteration? maybe unneeded.
264    deduped_candidates.reverse();
265
266    let mut result = vec![];
267    for candidate in ordered_candidates {
268        let mut matched_index = usize::MAX;
269        for (pos, deduped) in deduped_candidates.iter().enumerate() {
270            if (*deduped).eq(&candidate) {
271                result.push(deduped.to_owned());
272                matched_index = pos;
273            }
274        }
275        if matched_index < usize::MAX {
276            deduped_candidates.remove(matched_index);
277        }
278    }
279
280    result
281}
282
283/// dictionariesからcompositeKeyに合わせた順序でDictionaryCandidateを返す。
284///
285/// compositeKeyが送り無しの場合、単なる辞書順
286///
287/// compositeKeyが送り有りの場合、まず送り仮名の厳密マッチする候補を辞書順に、その後厳密マッチのない候補を辞書順に。
288/// つまりddskkのskk-henkan-strict-okuri-precedenceが設定された時の動作を行う。
289///
290fn get_candidates_in_order(
291    dictionaries: &[Arc<CskkDictionary>],
292    composite_key: &CompositeKey,
293) -> Vec<DictionaryCandidate> {
294    let mut result = Vec::new();
295
296    for cskkdict in dictionaries.iter() {
297        let lock = cskkdict.mutex.lock().unwrap();
298        if let Some(dict_entry) = match &*lock {
299            CskkDictionaryType::StaticFile(dict) => dict.lookup(composite_key),
300            CskkDictionaryType::UserFile(dict) => dict.lookup(composite_key),
301            CskkDictionaryType::EmptyDict(dict) => dict.lookup(composite_key),
302        } {
303            let strict_okuri_cands = if composite_key.has_okuri() {
304                dict_entry.get_candidates(composite_key.get_okuri())
305            } else {
306                None
307            };
308            if let Some(candidates) = strict_okuri_cands {
309                result.extend(candidates.to_owned());
310            }
311
312            let non_strict_okuri_cands = dict_entry.get_candidates(&None);
313            if let Some(candidates) = non_strict_okuri_cands {
314                result.extend(candidates.to_owned());
315            }
316        }
317    }
318
319    result
320}
321
// Shared pattern for a run of one or more digits (`\d+`); compiled once on first use.
lazy_static! {
    static ref NUM_REGEX: Regex = Regex::new(r"\d+").unwrap();
}
325///
326/// 数字が含まれていた場合#に置きかえて数字と共にかえす。
327///
328/// 12がつ6にち -> (#がつ#にち, [12,6])
329///
330pub(crate) fn to_composite_to_numeric_dict_key(
331    to_composite: &CompositeKey,
332) -> (CompositeKey, Vec<String>) {
333    let mut dict_key = to_composite.get_to_composite().to_owned();
334    let mut matched_numbers = vec![];
335    for numeric_match in NUM_REGEX.find_iter(to_composite.get_to_composite()) {
336        let new_dict_key = dict_key.replacen(numeric_match.as_str(), "#", 1);
337        dict_key = new_dict_key;
338        matched_numbers.push(numeric_match.as_str().to_owned());
339    }
340    (
341        CompositeKey::new(&dict_key, to_composite.get_okuri().to_owned()),
342        matched_numbers,
343    )
344}
345
/// Returns how many numeric strings (runs of digits matched by `NUM_REGEX`)
/// are in the string to composite.
///
/// The example is marked compile_fail because this is a crate-private fn.
/// ```compile_fail
/// use cskk::dictionary::numeric_string_count;
/// assert_eq!(numeric_string_count("2かい"), 1);
/// assert_eq!(numeric_string_count("2がつ13にち"), 2);
/// ```
pub(crate) fn numeric_string_count(to_composite: &str) -> usize {
    NUM_REGEX.find_iter(to_composite).count()
}
357
/// Returns how many numeric special strings (`#` followed by one of 0, 1, 2, 3, 4, 5, 8)
/// are in the kouho string.
///
/// The example is marked compile_fail because this is a crate-private fn.
/// ```compile_fail
/// assert_eq!(numeric_entry_count("#1回"), 1);
/// assert_eq!(numeric_entry_count("#3日"), 1);
/// ```
///
pub(crate) fn numeric_entry_count(entry: &str) -> usize {
    // Same pattern as the one used by `replace_numeric_string`.
    lazy_static! {
        static ref NUM_ENTRY_REGEX: Regex = Regex::new(r"#[0123458]").unwrap();
    }
    NUM_ENTRY_REGEX.find_iter(entry).count()
}
372
373// もし候補に#0等の数値マッチが入るならば元の数字でおきかえる。
374fn replace_numeric_match(
375    candidate: &Candidate,
376    matched_numbers: &[String],
377    dictionaries: &[Arc<CskkDictionary>],
378) -> Vec<Candidate> {
379    let output_text_list =
380        replace_numeric_string(&candidate.kouho_text, matched_numbers, dictionaries);
381
382    let mut result = vec![];
383    for output_text in output_text_list {
384        let mut new_candidate = candidate.clone();
385        new_candidate.output = output_text;
386        result.push(new_candidate)
387    }
388    result
389}
390
391/// given kouho_text that includes #[0123458], return the replaced text to be used for outputs.
392pub(crate) fn replace_numeric_string(
393    kouho_text: &str,
394    numbers: &[String],
395    dictionaries: &[Arc<CskkDictionary>],
396) -> Vec<String> {
397    lazy_static! {
398        static ref NUMERIC_ENTRY_REGEX: Regex = Regex::new(r"#[0123458]").unwrap();
399    }
400    let mut current_output_texts = vec![kouho_text.to_string()];
401    for (n, entry_match) in NUMERIC_ENTRY_REGEX.find_iter(kouho_text).enumerate() {
402        if n < numbers.len() {
403            match entry_match.as_str() {
404                "#0" => {
405                    let mut replaced_output_texts = vec![];
406                    for output_text in &current_output_texts {
407                        replaced_output_texts.push(output_text.replacen("#0", &numbers[n], 1));
408                    }
409                    current_output_texts = replaced_output_texts;
410                }
411                "#1" => {
412                    let mut replaced_output_texts = vec![];
413                    for kouho_text in &current_output_texts {
414                        replaced_output_texts.push(kouho_text.replacen(
415                            "#1",
416                            &numeric_to_zenkaku(&numbers[n]),
417                            1,
418                        ));
419                    }
420                    current_output_texts = replaced_output_texts;
421                }
422                "#2" => {
423                    let mut replaced_output_texts = vec![];
424                    for kouho_text in &current_output_texts {
425                        replaced_output_texts.push(kouho_text.replacen(
426                            "#2",
427                            &numeric_to_kanji_each(&numbers[n]),
428                            1,
429                        ));
430                    }
431                    current_output_texts = replaced_output_texts;
432                }
433                "#3" => {
434                    let mut replaced_output_texts = vec![];
435                    for output_text in &current_output_texts {
436                        replaced_output_texts.push(output_text.replacen(
437                            "#3",
438                            &numeric_to_simple_kanji_as_number(&numbers[n]),
439                            1,
440                        ));
441                    }
442                    current_output_texts = replaced_output_texts;
443                }
444                "#4" => {
445                    let mut replaced_output_texts = vec![];
446                    let numeric_lookup_results = get_all_candidates_inner(
447                        dictionaries,
448                        &CompositeKey::new(&numbers[n], None),
449                        true,
450                    );
451                    for kouho_text in &current_output_texts {
452                        for numeric_lookup in &numeric_lookup_results {
453                            replaced_output_texts.push(kouho_text.replacen(
454                                "#4",
455                                &numeric_lookup.kouho_text,
456                                1,
457                            ));
458                        }
459                    }
460                    current_output_texts = replaced_output_texts;
461                }
462                "#5" => {
463                    let mut replaced_output_texts = vec![];
464                    for kouho_text in &current_output_texts {
465                        replaced_output_texts.push(kouho_text.replacen(
466                            "#5",
467                            &numeric_to_daiji_as_number(&numbers[n], false),
468                            1,
469                        ));
470                        replaced_output_texts.push(kouho_text.replacen(
471                            "#5",
472                            &numeric_to_daiji_as_number(&numbers[n], true),
473                            1,
474                        ));
475                    }
476                    current_output_texts = replaced_output_texts;
477                }
478                "#8" => {
479                    let mut replaced_output_texts = vec![];
480                    for kouho_text in &current_output_texts {
481                        replaced_output_texts.push(kouho_text.replacen(
482                            "#8",
483                            &numeric_to_thousand_separator(&numbers[n]),
484                            1,
485                        ));
486                    }
487                    current_output_texts = replaced_output_texts;
488                }
489                _ => {}
490            }
491        }
492    }
493    current_output_texts
494}
495
496///
497/// Returns the nth candidate.
498/// first selection_pointer == 0
499///
500#[allow(dead_code)]
501pub(crate) fn get_nth_candidate(
502    dictionaries: &[Arc<CskkDictionary>],
503    composite_key: &CompositeKey,
504    selection_pointer: usize,
505) -> Option<Candidate> {
506    let candidates = get_all_candidates(dictionaries, composite_key);
507    candidates.get(selection_pointer).cloned()
508}
509
/// Common interface of the dictionaries wrapped by `CskkDictionaryType`.
///
/// Only `lookup` and `complete` are required; every other method has a default
/// that does nothing and reports that nothing changed.
pub(crate) trait Dictionary {
    /// Returns the entry that matches the midashi.
    fn lookup(&self, composite_key: &CompositeKey) -> Option<&DictEntry>;

    /// Whether the dictionary is read-only. Defaults to true.
    fn is_read_only(&self) -> bool {
        true
    }
    ///
    /// When `midashi_head` has no okurigana, returns the sequence of okuri-less entries
    /// that start with it. Entries with okuri are ignored.
    ///
    /// Not expected to be used for completion, but as far as this function is concerned,
    /// when `midashi_head` has okurigana the matching entry is returned,
    /// because with okurigana only an exactly matching entry can exist.
    ///
    /// e.g.
    /// complete(("あい", None)) -> (("あい", "/愛/相/"), ("あいさつ", "/挨拶/"), ...)
    /// complete(("あ", Some("い")) -> (("あi", "/開/合/\[い/合/開/\]/"), ...)
    fn complete<'a>(
        &'a self,
        midashi_head: &'a CompositeKey,
    ) -> Box<dyn Iterator<Item = &'a DictEntry> + 'a>;
    /// Returns true if saved, false if kindly ignored.
    /// Safe to call to read_only dictionary.
    fn save_dictionary(&mut self) -> Result<bool, CskkError> {
        Ok(false)
    }

    /// Select that candidate.
    /// Supporting dictionary will add and move that candidate to the first place so that next time it comes to candidate early.
    /// Safe to call to read_only dictionary.
    fn select_candidate(&mut self, _candidate: &Candidate) -> Result<bool, CskkError> {
        Ok(false)
    }
    /// Remove that candidate if dictionary supports editing.
    /// Safe to call to read_only dictionary
    fn purge_candidate(
        &mut self,
        _composite_key: &CompositeKey,
        _candidate: &Candidate,
    ) -> Result<bool, CskkError> {
        Ok(false)
    }

    /// Reload dictionary. The default does nothing and succeeds.
    fn reload(&mut self) -> Result<(), CskkError> {
        Ok(())
    }
}
557
// Unit tests. The lookup tests read dictionary files under tests/data/dictionaries.
#[cfg(test)]
mod test {
    use super::*;

    // Each run of digits counts once, however long it is.
    #[test]
    fn test_numeric_string_count() {
        assert_eq!(numeric_string_count("123つぶ"), 1);
        assert_eq!(numeric_string_count("1にち1かい"), 2);
        assert_eq!(numeric_string_count("1じつせんしゅう"), 1);
    }

    // Plain lookup of a key without numbers.
    #[test]
    fn get_all_candidates_basic() {
        let test_dictionary =
            CskkDictionary::new_static_dict("tests/data/dictionaries/SKK-JISYO.S", "euc-jp", false)
                .unwrap();
        let dictionaries = vec![Arc::new(test_dictionary)];
        let key = CompositeKey::new("あい", None);
        let result = get_all_candidates(&dictionaries, &key);

        assert_eq!(result[0].kouho_text, "愛");
    }

    // The numbers in the key are replaced with # for the lookup and put back into the output.
    #[test]
    fn get_all_candidates_numeric_match() {
        let test_dictionary = CskkDictionary::new_static_dict(
            "tests/data/dictionaries/number_jisyo.dat",
            "utf-8",
            false,
        )
        .unwrap();
        let dictionaries = vec![Arc::new(test_dictionary)];
        let key = CompositeKey::new("5/1", None);
        let result = get_all_candidates(&dictionaries, &key);

        assert_eq!(result[0].kouho_text, "#0月#0日");
        assert_eq!(result[0].midashi, "#/#");
        assert_eq!(result[0].output, "5月1日");
    }

    // A key that contains a number is still looked up as is first.
    #[test]
    fn get_all_candidates_numeric_exact_match() {
        let test_dictionary =
            CskkDictionary::new_static_dict("tests/data/dictionaries/maruichi.dat", "utf-8", false)
                .unwrap();
        let dictionaries = vec![Arc::new(test_dictionary)];
        let key = CompositeKey::new("まる1", None);
        let result = get_all_candidates(&dictionaries, &key);

        assert_eq!(result[0].kouho_text, "①"); // 0xE291A0 (U+02460)
        assert_eq!(result[1].kouho_text, "❶");
        assert_eq!(result[2].kouho_text, "⓵"); // 0xE293B5 (U+024F5)
    }
}