1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
//! Dictionary configuration options framework
//!
//! This module contains everything related to the configuration of a
//! [`crate::Dictionary`], which can then be used for checking words.
//!
//! Usually this configuration is loaded via an affix file (typically ending in
//! `.aff`), but it can also be created programmatically if desired.
//!
//! This module should be considered unstable until its usage is finalized.

mod serde;
mod types;

use unicode_segmentation::UnicodeSegmentation;

use crate::errors::AffixError;
use crate::graph_vec;

use serde::t_data_unwrap;
pub use serde::{load_affix_from_str, ProcessedToken, ProcessedTokenData};
pub use types::{Conversion, EncodingType, Rule, RuleType, TokenType};

/// Dictionary configuration object that holds affix file data
///
/// This holds the entire contents of the affix file as an AST representation
/// and is intended to be used throughout program lifetime. If you are
/// uninterested in modifying an existing dictionary structure, you are likely
/// interested in just using [`crate::Dictionary`] and its methods.
///
/// # Internal working
///
/// Generally within this struct, a "string" is represented as a `Vec<String>`
/// or `Vec<&str>`, i.e. a vector of string graphemes. This is because many
/// languages may require accurate unicode segmentation to work properly. It is
/// not yet understood whether this is the best practice, so this may change in
/// the future.
///
/// Any type that can be modified must be owned (e.g. `String`, `Vec`), others
/// may be borrowed.
#[non_exhaustive]
#[derive(Debug, PartialEq, Eq)]
pub struct Config {
    /// Charset to use, reference to an [`EncodingType`]. Currently this is
    /// unused; only UTF-8 is supported. However, the affix file must still have
    /// an accurate definition.
    pub encoding: EncodingType,

    /// Twofold prefix skipping for e.g. right-to-left languages
    pub complex_prefixes: bool,

    /// Language code, currently unused. Consider this unstable as it may change
    /// to be an object reference.
    pub lang: String,

    /// List of characters to ignore
    pub ignore_chars: Vec<String>,

    /// List of usable flag vectors. Defaults to all things after "/" in a dict.
    pub afx_flag_vector: Vec<String>,

    // ## Suggestion-related items
    /// List of e.g. "qwerty", "asdfg" that define neighbors. Each inner vector
    /// holds the graphemes of one such group.
    pub keys: Vec<Vec<String>>,

    /// Suggest words that differ by 1 try character
    pub try_characters: Vec<String>,

    /// Flag used to indicate words that should not be suggested
    pub nosuggest_flag: String,

    /// Maximum compound word suggestions
    pub compound_suggestions_max: u16,

    /// Max number of ngram suggestions
    pub ngram_suggestions_max: u16,

    /// N-gram similarity limit
    pub ngram_diff_max: u16,

    /// Remove all suggestions except the diff max
    pub ngram_limit_to_diff_max: bool,

    /// Don't suggest anything with spaces
    pub no_split_suggestions: bool,

    /// If a dot comes with the spellcheck, return one with a suggestion word
    pub keep_termination_dots: bool,

    /// Note rare (i.e. commonly misspelled) words with this flag
    pub warn_rare_flag: String,

    /// Whether to never suggest words with the warn flag (above)
    pub forbid_warn_words: bool,

    // Not yet implemented; kept as placeholders for future fields:
    // pub replacements: Vec<&'a ReplaceRule<'a>>,
    // maps: Vec<>, // MAP
    // phones: Vec<>

    // ## Compounding-related items

    // Not yet implemented:
    // break_points: Vec<>
    // compound_rules: Vec<>
    /// Minimum length of words used in a compound
    pub compound_min_length: u16,

    /// Words with this flag may be in compounds
    pub compound_flag: Option<String>,

    /// Words with this flag may start a compound
    pub compound_begin_flag: Option<String>,

    /// Words with this flag may end a compound
    pub compound_end_flag: Option<String>,

    /// Words with this flag may be in the middle of a compound
    pub compound_middle_flag: Option<String>,

    /// Words with this flag can't be on their own, only in compounds
    pub compound_only_flag: Option<String>,
    // There are lots of compound flags that haven't yet been implemented

    // ## Affix-related items
    /// Conversions associated with input.
    ///
    /// NOTE(review): presumably applied to words before checking (the affix
    /// file's input conversion table) - confirm against the parser.
    pub input_conversions: Vec<Conversion>,

    /// Conversions associated with output.
    ///
    /// NOTE(review): presumably applied to words before they are returned to
    /// the caller - confirm against the parser.
    pub output_conversions: Vec<Conversion>,

    /// Rules for setting prefixes and suffixes. These are the rules iterated
    /// by [`Config::create_affixed_words`].
    pub affix_rules: Vec<Rule>,

    /// Rules for suggestion replacements to try
    pub replacements: Vec<Conversion>,
}

impl Config {
    /// Build a blank configuration
    ///
    /// All collections and strings start out empty, every optional compound
    /// flag is `None`, every boolean switch is off, and the encoding is UTF-8.
    /// The numeric limits receive small fixed starting values (see the body).
    /// Being a `const fn`, this can be evaluated in constant contexts.
    #[inline]
    pub const fn new() -> Self {
        Self {
            // General settings
            encoding: EncodingType::Utf8,
            complex_prefixes: false,
            lang: String::new(),
            ignore_chars: Vec::new(),
            afx_flag_vector: Vec::new(),

            // Suggestion settings
            keys: Vec::new(),
            try_characters: Vec::new(),
            nosuggest_flag: String::new(),
            compound_suggestions_max: 2,
            ngram_suggestions_max: 2,
            ngram_diff_max: 5,
            ngram_limit_to_diff_max: false,
            no_split_suggestions: false,
            keep_termination_dots: false,
            warn_rare_flag: String::new(),
            forbid_warn_words: false,

            // Compounding settings
            compound_min_length: 3,
            compound_flag: None,
            compound_begin_flag: None,
            compound_end_flag: None,
            compound_middle_flag: None,
            compound_only_flag: None,

            // Affix and conversion tables
            input_conversions: Vec::new(),
            output_conversions: Vec::new(),
            affix_rules: Vec::new(),
            replacements: Vec::new(),
        }
    }

    /// Populate this configuration from affix-file text
    ///
    /// This is a thin wrapper that hands `self` and `s` to
    /// [`load_affix_from_str`].
    ///
    /// # Parameters
    ///
    /// - `s`: The affix data, e.g. the full contents of an `.aff` file
    ///
    /// # Errors
    ///
    /// Returns the [`AffixError`] produced by [`load_affix_from_str`] if the
    /// text could not be loaded.
    #[inline]
    pub fn load_from_str(&mut self, s: &str) -> Result<(), AffixError> {
        load_affix_from_str(self, s)
    }

    /// Expand one root word into all forms reachable through the selected
    /// affix rules
    ///
    /// The result always begins with `rootword` itself. It is followed by one
    /// entry per selected rule that applies to the root, in rule order, and
    /// finally by suffix-on-prefix combinations for rules that permit
    /// combining. Duplicates are not removed.
    ///
    /// # Parameters
    ///
    /// - `rootword`: The word to have prefixes/suffixes applied to
    /// - `keys`: Prefix and suffix keys to apply. The string is uppercased and
    ///   split into graphemes; a rule is selected when its key equals one of
    ///   those graphemes.
    #[inline]
    pub fn create_affixed_words(&self, rootword: &str, keys: &str) -> Vec<String> {
        // Flags requested for this root, one grapheme per entry
        let wanted: Vec<String> = graph_vec!(keys.to_uppercase());

        // The unmodified root is always the first result
        let mut words = vec![rootword.to_owned()];
        // Prefixed forms that a combinable suffix may later be stacked onto
        let mut chain_bases: Vec<String> = Vec::new();

        // First pass: apply every selected rule directly to the root
        for rule in &self.affix_rules {
            if !wanted.contains(&rule.key) {
                continue;
            }

            if let Some(derived) = rule.apply(rootword) {
                // Remember combinable prefix results for the second pass
                if rule.atype == RuleType::Prefix && rule.combine_pfx_sfx {
                    chain_bases.push(derived.clone());
                }
                words.push(derived);
            }
        }

        // Second pass: stack each selected, combinable suffix rule on top of
        // every prefixed form collected above
        for rule in &self.affix_rules {
            let is_chaining_suffix = rule.atype == RuleType::Suffix
                && rule.combine_pfx_sfx
                && wanted.contains(&rule.key);
            if !is_chaining_suffix {
                continue;
            }

            words.extend(chain_bases.iter().filter_map(|base| rule.apply(base)));
        }

        words
    }
}

impl Default for Config {
    /// Baseline configuration tuned for a QWERTY keyboard layout
    ///
    /// Starts from [`Config::new`] and overrides three fields: the keyboard
    /// neighbor rows, the list of try characters, and the no-suggest flag
    /// (`"!"`).
    #[inline]
    fn default() -> Self {
        Self {
            // The three letter rows of a QWERTY keyboard
            keys: vec![
                graph_vec!("qwertyuiop"),
                graph_vec!("asdfghjkl"),
                graph_vec!("zxcvbnm"),
            ],
            try_characters: graph_vec!("esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'"),
            nosuggest_flag: String::from("!"),
            // Everything else keeps the blank-configuration value
            ..Self::new()
        }
    }
}