Skip to main content

mecab_ko_dict_validator/
rules.rs

1//! Validation rules for `MeCab` dictionary entries.
2//!
3//! This module defines the validation rules and constraints for dictionary entries,
4//! including POS tags, cost ranges, CSV format, and encoding validation.
5
6#![allow(clippy::struct_excessive_bools)]
7#![allow(clippy::missing_const_for_fn)]
8
9use serde::{Deserialize, Serialize};
10use std::collections::HashSet;
11use std::ops::RangeInclusive;
12
13/// Configuration for validation rules.
14#[derive(Debug, Clone, Serialize, Deserialize, Default)]
15pub struct ValidationConfig {
16    /// Rules for CSV format validation
17    pub csv_rules: CsvRules,
18    /// Rules for POS tag validation
19    pub pos_rules: PosRules,
20    /// Rules for cost validation
21    pub cost_rules: CostRules,
22    /// Rules for encoding validation
23    pub encoding_rules: EncodingRules,
24    /// Rules for duplicate detection
25    pub duplicate_rules: DuplicateRules,
26    /// Rules for surface form normalization
27    pub normalization_rules: NormalizationRules,
28}
29
30/// Rules for CSV format validation.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct CsvRules {
33    /// Expected number of fields in a CSV row
34    pub expected_field_count: usize,
35    /// Whether to allow empty fields
36    pub allow_empty_fields: bool,
37    /// Whether to trim whitespace from fields
38    pub trim_fields: bool,
39    /// Maximum field length (0 = unlimited)
40    pub max_field_length: usize,
41}
42
43impl Default for CsvRules {
44    fn default() -> Self {
45        Self {
46            expected_field_count: 13, // MeCab standard format
47            allow_empty_fields: false,
48            trim_fields: true,
49            max_field_length: 0,
50        }
51    }
52}
53
54/// Rules for POS (Part-of-Speech) tag validation.
55#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct PosRules {
57    /// Valid POS tags (empty = accept all)
58    pub valid_tags: HashSet<String>,
59    /// Whether to validate tag hierarchy
60    pub validate_hierarchy: bool,
61    /// Maximum tag depth
62    pub max_tag_depth: usize,
63    /// Tag separator
64    pub tag_separator: char,
65}
66
67impl Default for PosRules {
68    fn default() -> Self {
69        Self {
70            valid_tags: Self::default_korean_pos_tags(),
71            validate_hierarchy: true,
72            max_tag_depth: 4,
73            tag_separator: '+',
74        }
75    }
76}
77
78impl PosRules {
79    /// Returns the default set of Korean POS tags.
80    #[must_use]
81    pub fn default_korean_pos_tags() -> HashSet<String> {
82        [
83            // 체언 (Nominals)
84            "NNG", "NNP", "NNB", "NP", "NR", // 용언 (Predicates)
85            "VV", "VA", "VX", "VCP", "VCN", // 관형사 (Determiners)
86            "MM",  // 부사 (Adverbs)
87            "MAG", "MAJ", // 감탄사 (Interjections)
88            "IC",  // 조사 (Particles)
89            "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
90            // 선어말어미 (Pre-final endings)
91            "EP", // 어말어미 (Final endings)
92            "EF", "EC", "ETN", "ETM", // 접두사 (Prefixes)
93            "XPN", // 접미사 (Suffixes)
94            "XSN", "XSV", "XSA", // 어근 (Roots)
95            "XR",  // 부호 (Symbols)
96            "SF", "SE", "SSO", "SSC", "SC", "SY", // 외국어 (Foreign words)
97            "SL", // 한자 (Chinese characters)
98            "SH", // 숫자 (Numbers)
99            "SN", // 기타 (Others)
100            "UNA", "NNBC", "NA", "NV", "NF",
101        ]
102        .iter()
103        .map(|s| (*s).to_string())
104        .collect()
105    }
106
107    /// Validates a POS tag.
108    #[must_use]
109    pub fn is_valid_tag(&self, tag: &str) -> bool {
110        if self.valid_tags.is_empty() {
111            return true;
112        }
113
114        // Handle compound tags (e.g., "NNG+JKS")
115        if tag.contains(self.tag_separator) {
116            let parts: Vec<&str> = tag.split(self.tag_separator).collect();
117
118            if self.validate_hierarchy && parts.len() > self.max_tag_depth {
119                return false;
120            }
121
122            parts.iter().all(|part| self.valid_tags.contains(*part))
123        } else {
124            self.valid_tags.contains(tag)
125        }
126    }
127}
128
129/// Rules for cost validation.
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct CostRules {
132    /// Valid range for left context ID
133    pub left_context_range: RangeInclusive<i32>,
134    /// Valid range for right context ID
135    pub right_context_range: RangeInclusive<i32>,
136    /// Valid range for word cost
137    pub word_cost_range: RangeInclusive<i32>,
138    /// Whether to warn on unusual costs
139    pub warn_unusual_costs: bool,
140    /// Threshold for unusual high costs
141    pub unusual_high_cost: i32,
142    /// Threshold for unusual low costs
143    pub unusual_low_cost: i32,
144}
145
146impl Default for CostRules {
147    fn default() -> Self {
148        Self {
149            left_context_range: 0..=10000,
150            right_context_range: 0..=10000,
151            word_cost_range: -10000..=10000,
152            warn_unusual_costs: true,
153            unusual_high_cost: 8000,
154            unusual_low_cost: -8000,
155        }
156    }
157}
158
159impl CostRules {
160    /// Validates costs for a dictionary entry.
161    #[must_use]
162    pub fn validate_costs(&self, left_id: i32, right_id: i32, cost: i32) -> CostValidationResult {
163        let mut result = CostValidationResult::default();
164
165        if !self.left_context_range.contains(&left_id) {
166            result.errors.push(format!(
167                "Left context ID {left_id} is outside valid range {:?}",
168                self.left_context_range
169            ));
170        }
171
172        if !self.right_context_range.contains(&right_id) {
173            result.errors.push(format!(
174                "Right context ID {right_id} is outside valid range {:?}",
175                self.right_context_range
176            ));
177        }
178
179        if !self.word_cost_range.contains(&cost) {
180            result.errors.push(format!(
181                "Word cost {cost} is outside valid range {:?}",
182                self.word_cost_range
183            ));
184        }
185
186        if self.warn_unusual_costs {
187            if cost > self.unusual_high_cost {
188                result.warnings.push(format!(
189                    "Word cost {cost} is unusually high (threshold: {})",
190                    self.unusual_high_cost
191                ));
192            } else if cost < self.unusual_low_cost {
193                result.warnings.push(format!(
194                    "Word cost {cost} is unusually low (threshold: {})",
195                    self.unusual_low_cost
196                ));
197            }
198        }
199
200        result
201    }
202}
203
/// Outcome of a cost validation: collected error and warning messages.
#[derive(Clone, Debug, Default)]
pub struct CostValidationResult {
    /// Messages for problems that make the entry invalid.
    pub errors: Vec<String>,
    /// Messages for suspicious values that do not make the entry invalid.
    pub warnings: Vec<String>,
}

impl CostValidationResult {
    /// Returns `true` when no errors were recorded; warnings are ignored.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        let Self { errors, .. } = self;
        errors.is_empty()
    }

    /// Returns `true` when at least one warning was recorded.
    #[must_use]
    pub fn has_warnings(&self) -> bool {
        let Self { warnings, .. } = self;
        !warnings.is_empty()
    }
}
226
227/// Rules for encoding validation.
228#[derive(Debug, Clone, Serialize, Deserialize)]
229pub struct EncodingRules {
230    /// Expected encoding (e.g., "UTF-8")
231    pub expected_encoding: String,
232    /// Whether to validate UTF-8 correctness
233    pub validate_utf8: bool,
234    /// Whether to detect and report encoding issues
235    pub detect_encoding_issues: bool,
236    /// Whether to allow BOM (Byte Order Mark)
237    pub allow_bom: bool,
238}
239
240impl Default for EncodingRules {
241    fn default() -> Self {
242        Self {
243            expected_encoding: "UTF-8".to_string(),
244            validate_utf8: true,
245            detect_encoding_issues: true,
246            allow_bom: false,
247        }
248    }
249}
250
251/// Rules for duplicate entry detection.
252#[derive(Debug, Clone, Serialize, Deserialize)]
253pub struct DuplicateRules {
254    /// Whether to detect exact duplicates
255    pub detect_exact_duplicates: bool,
256    /// Whether to detect semantic duplicates (same surface + POS)
257    pub detect_semantic_duplicates: bool,
258    /// Whether to allow duplicates with different costs
259    pub allow_cost_variants: bool,
260}
261
262impl Default for DuplicateRules {
263    fn default() -> Self {
264        Self {
265            detect_exact_duplicates: true,
266            detect_semantic_duplicates: true,
267            allow_cost_variants: true,
268        }
269    }
270}
271
272/// Rules for surface form normalization.
273#[derive(Debug, Clone, Serialize, Deserialize)]
274pub struct NormalizationRules {
275    /// Whether to check for Unicode normalization (NFC/NFD)
276    pub check_unicode_normalization: bool,
277    /// Preferred Unicode normalization form
278    pub preferred_normalization: NormalizationForm,
279    /// Whether to check for full-width/half-width consistency
280    pub check_width_consistency: bool,
281    /// Whether to check for Hangul jamo composition
282    pub check_hangul_composition: bool,
283    /// Whether to warn on whitespace in surface forms
284    pub warn_on_whitespace: bool,
285}
286
287impl Default for NormalizationRules {
288    fn default() -> Self {
289        Self {
290            check_unicode_normalization: true,
291            preferred_normalization: NormalizationForm::Nfc,
292            check_width_consistency: true,
293            check_hangul_composition: true,
294            warn_on_whitespace: true,
295        }
296    }
297}
298
299/// Unicode normalization forms.
300#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
301pub enum NormalizationForm {
302    /// Normalization Form C (Canonical Composition)
303    Nfc,
304    /// Normalization Form D (Canonical Decomposition)
305    Nfd,
306    /// Normalization Form KC (Compatibility Composition)
307    Nfkc,
308    /// Normalization Form KD (Compatibility Decomposition)
309    Nfkd,
310}
311
312impl NormalizationForm {
313    /// Normalizes a string according to this form.
314    #[must_use]
315    pub fn normalize(&self, text: &str) -> String {
316        use unicode_normalization::UnicodeNormalization;
317
318        match self {
319            Self::Nfc => text.nfc().collect(),
320            Self::Nfd => text.nfd().collect(),
321            Self::Nfkc => text.nfkc().collect(),
322            Self::Nfkd => text.nfkd().collect(),
323        }
324    }
325}
326
#[cfg(test)]
#[allow(
    clippy::expect_used,
    clippy::unwrap_used,
    clippy::field_reassign_with_default
)]
mod tests {
    use super::*;

    /// The default configuration carries the expected per-group defaults.
    #[test]
    fn test_default_config() {
        let ValidationConfig {
            csv_rules,
            pos_rules,
            encoding_rules,
            ..
        } = ValidationConfig::default();

        assert_eq!(csv_rules.expected_field_count, 13);
        assert!(pos_rules.validate_hierarchy);
        assert!(encoding_rules.validate_utf8);
    }

    /// Known single and compound tags pass; unknown components fail.
    #[test]
    fn test_pos_tag_validation() {
        let rules = PosRules::default();

        // Valid single tags followed by valid compound tags.
        for accepted in ["NNG", "VV", "JKS", "NNG+JKS", "VV+EC"] {
            assert!(rules.is_valid_tag(accepted), "{accepted} should be valid");
        }

        // An unknown tag, alone or as a component, is rejected.
        for rejected in ["XXX", "NNG+XXX"] {
            assert!(!rules.is_valid_tag(rejected), "{rejected} should be invalid");
        }
    }

    /// In-range values pass, out-of-range IDs fail, high costs only warn.
    #[test]
    fn test_cost_validation() {
        let rules = CostRules::default();

        // Everything within range and below the warning thresholds.
        let ok = rules.validate_costs(100, 200, 500);
        assert!(ok.is_valid());
        assert!(!ok.has_warnings());

        // A negative left context ID is outside the default range.
        let bad_left = rules.validate_costs(-1, 200, 500);
        assert!(!bad_left.is_valid());

        // A cost above the high threshold is still valid but warns.
        let high_cost = rules.validate_costs(100, 200, 9000);
        assert!(high_cost.is_valid());
        assert!(high_cost.has_warnings());
    }

    /// NFC leaves composed Hangul alone and composes decomposed Hangul.
    #[test]
    fn test_normalization_form() {
        let form = NormalizationForm::Nfc;

        // Already-composed text is returned unchanged.
        let composed = "한글";
        assert_eq!(composed, form.normalize(composed));

        // The same word spelled with conjoining jamo (NFD) composes to NFC.
        let decomposed = "\u{1112}\u{1161}\u{11AB}\u{1100}\u{1173}\u{11AF}";
        assert_eq!("한글", form.normalize(decomposed));
    }

    /// Compound tags with more components than `max_tag_depth` are rejected.
    #[test]
    fn test_max_tag_depth() {
        let rules = PosRules {
            max_tag_depth: 2,
            ..PosRules::default()
        };

        assert!(rules.is_valid_tag("NNG+JKS"));
        assert!(!rules.is_valid_tag("NNG+JKS+EC"));
    }

    /// Custom ranges are honoured by `validate_costs`.
    #[test]
    fn test_cost_ranges() {
        let rules = CostRules {
            left_context_range: 0..=100,
            right_context_range: 0..=100,
            word_cost_range: -1000..=1000,
            warn_unusual_costs: false,
            unusual_high_cost: 800,
            unusual_low_cost: -800,
        };

        // All three values inside their custom ranges.
        assert!(rules.validate_costs(50, 75, 500).is_valid());

        // Left context ID above the custom upper bound.
        assert!(!rules.validate_costs(150, 75, 500).is_valid());
    }

    /// With empty valid_tags, any tag should be valid.
    #[test]
    fn test_empty_valid_tags() {
        let mut rules = PosRules::default();
        rules.valid_tags.clear();

        for anything in ["ANYTHING", "XXX+YYY"] {
            assert!(rules.is_valid_tag(anything), "{anything} should be valid");
        }
    }
}