1#![allow(clippy::struct_excessive_bools)]
7#![allow(clippy::missing_const_for_fn)]
8
9use serde::{Deserialize, Serialize};
10use std::collections::HashSet;
11use std::ops::RangeInclusive;
12
13#[derive(Debug, Clone, Serialize, Deserialize, Default)]
15pub struct ValidationConfig {
16 pub csv_rules: CsvRules,
18 pub pos_rules: PosRules,
20 pub cost_rules: CostRules,
22 pub encoding_rules: EncodingRules,
24 pub duplicate_rules: DuplicateRules,
26 pub normalization_rules: NormalizationRules,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct CsvRules {
33 pub expected_field_count: usize,
35 pub allow_empty_fields: bool,
37 pub trim_fields: bool,
39 pub max_field_length: usize,
41}
42
43impl Default for CsvRules {
44 fn default() -> Self {
45 Self {
46 expected_field_count: 13, allow_empty_fields: false,
48 trim_fields: true,
49 max_field_length: 0,
50 }
51 }
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct PosRules {
57 pub valid_tags: HashSet<String>,
59 pub validate_hierarchy: bool,
61 pub max_tag_depth: usize,
63 pub tag_separator: char,
65}
66
67impl Default for PosRules {
68 fn default() -> Self {
69 Self {
70 valid_tags: Self::default_korean_pos_tags(),
71 validate_hierarchy: true,
72 max_tag_depth: 4,
73 tag_separator: '+',
74 }
75 }
76}
77
78impl PosRules {
79 #[must_use]
81 pub fn default_korean_pos_tags() -> HashSet<String> {
82 [
83 "NNG", "NNP", "NNB", "NP", "NR", "VV", "VA", "VX", "VCP", "VCN", "MM", "MAG", "MAJ", "IC", "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
90 "EP", "EF", "EC", "ETN", "ETM", "XPN", "XSN", "XSV", "XSA", "XR", "SF", "SE", "SSO", "SSC", "SC", "SY", "SL", "SH", "SN", "UNA", "NNBC", "NA", "NV", "NF",
101 ]
102 .iter()
103 .map(|s| (*s).to_string())
104 .collect()
105 }
106
107 #[must_use]
109 pub fn is_valid_tag(&self, tag: &str) -> bool {
110 if self.valid_tags.is_empty() {
111 return true;
112 }
113
114 if tag.contains(self.tag_separator) {
116 let parts: Vec<&str> = tag.split(self.tag_separator).collect();
117
118 if self.validate_hierarchy && parts.len() > self.max_tag_depth {
119 return false;
120 }
121
122 parts.iter().all(|part| self.valid_tags.contains(*part))
123 } else {
124 self.valid_tags.contains(tag)
125 }
126 }
127}
128
129#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct CostRules {
132 pub left_context_range: RangeInclusive<i32>,
134 pub right_context_range: RangeInclusive<i32>,
136 pub word_cost_range: RangeInclusive<i32>,
138 pub warn_unusual_costs: bool,
140 pub unusual_high_cost: i32,
142 pub unusual_low_cost: i32,
144}
145
146impl Default for CostRules {
147 fn default() -> Self {
148 Self {
149 left_context_range: 0..=10000,
150 right_context_range: 0..=10000,
151 word_cost_range: -10000..=10000,
152 warn_unusual_costs: true,
153 unusual_high_cost: 8000,
154 unusual_low_cost: -8000,
155 }
156 }
157}
158
159impl CostRules {
160 #[must_use]
162 pub fn validate_costs(&self, left_id: i32, right_id: i32, cost: i32) -> CostValidationResult {
163 let mut result = CostValidationResult::default();
164
165 if !self.left_context_range.contains(&left_id) {
166 result.errors.push(format!(
167 "Left context ID {left_id} is outside valid range {:?}",
168 self.left_context_range
169 ));
170 }
171
172 if !self.right_context_range.contains(&right_id) {
173 result.errors.push(format!(
174 "Right context ID {right_id} is outside valid range {:?}",
175 self.right_context_range
176 ));
177 }
178
179 if !self.word_cost_range.contains(&cost) {
180 result.errors.push(format!(
181 "Word cost {cost} is outside valid range {:?}",
182 self.word_cost_range
183 ));
184 }
185
186 if self.warn_unusual_costs {
187 if cost > self.unusual_high_cost {
188 result.warnings.push(format!(
189 "Word cost {cost} is unusually high (threshold: {})",
190 self.unusual_high_cost
191 ));
192 } else if cost < self.unusual_low_cost {
193 result.warnings.push(format!(
194 "Word cost {cost} is unusually low (threshold: {})",
195 self.unusual_low_cost
196 ));
197 }
198 }
199
200 result
201 }
202}
203
/// Outcome of a cost validation pass: hard errors plus advisory warnings.
///
/// The derived `Default` yields a result with no errors and no warnings.
#[derive(Clone, Debug, Default)]
pub struct CostValidationResult {
    /// Messages for violations that make the entry invalid.
    pub errors: Vec<String>,
    /// Advisory messages; these do not affect `is_valid`.
    pub warnings: Vec<String>,
}

impl CostValidationResult {
    /// Returns `true` when no error was recorded. Warnings are ignored.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        let Self { errors, .. } = self;
        errors.is_empty()
    }

    /// Returns `true` when at least one warning was recorded.
    #[must_use]
    pub fn has_warnings(&self) -> bool {
        let Self { warnings, .. } = self;
        !warnings.is_empty()
    }
}
226
227#[derive(Debug, Clone, Serialize, Deserialize)]
229pub struct EncodingRules {
230 pub expected_encoding: String,
232 pub validate_utf8: bool,
234 pub detect_encoding_issues: bool,
236 pub allow_bom: bool,
238}
239
240impl Default for EncodingRules {
241 fn default() -> Self {
242 Self {
243 expected_encoding: "UTF-8".to_string(),
244 validate_utf8: true,
245 detect_encoding_issues: true,
246 allow_bom: false,
247 }
248 }
249}
250
251#[derive(Debug, Clone, Serialize, Deserialize)]
253pub struct DuplicateRules {
254 pub detect_exact_duplicates: bool,
256 pub detect_semantic_duplicates: bool,
258 pub allow_cost_variants: bool,
260}
261
262impl Default for DuplicateRules {
263 fn default() -> Self {
264 Self {
265 detect_exact_duplicates: true,
266 detect_semantic_duplicates: true,
267 allow_cost_variants: true,
268 }
269 }
270}
271
272#[derive(Debug, Clone, Serialize, Deserialize)]
274pub struct NormalizationRules {
275 pub check_unicode_normalization: bool,
277 pub preferred_normalization: NormalizationForm,
279 pub check_width_consistency: bool,
281 pub check_hangul_composition: bool,
283 pub warn_on_whitespace: bool,
285}
286
287impl Default for NormalizationRules {
288 fn default() -> Self {
289 Self {
290 check_unicode_normalization: true,
291 preferred_normalization: NormalizationForm::Nfc,
292 check_width_consistency: true,
293 check_hangul_composition: true,
294 warn_on_whitespace: true,
295 }
296 }
297}
298
299#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
301pub enum NormalizationForm {
302 Nfc,
304 Nfd,
306 Nfkc,
308 Nfkd,
310}
311
312impl NormalizationForm {
313 #[must_use]
315 pub fn normalize(&self, text: &str) -> String {
316 use unicode_normalization::UnicodeNormalization;
317
318 match self {
319 Self::Nfc => text.nfc().collect(),
320 Self::Nfd => text.nfd().collect(),
321 Self::Nfkc => text.nfkc().collect(),
322 Self::Nfkd => text.nfkd().collect(),
323 }
324 }
325}
326
#[cfg(test)]
#[allow(
    clippy::expect_used,
    clippy::unwrap_used,
    clippy::field_reassign_with_default
)]
mod tests {
    use super::*;

    /// The derived top-level default exposes each rule group's defaults.
    #[test]
    fn test_default_config() {
        let ValidationConfig {
            csv_rules,
            pos_rules,
            encoding_rules,
            ..
        } = ValidationConfig::default();

        assert_eq!(csv_rules.expected_field_count, 13);
        assert!(pos_rules.validate_hierarchy);
        assert!(encoding_rules.validate_utf8);
    }

    /// Known simple and compound tags pass; anything containing an unknown
    /// component fails.
    #[test]
    fn test_pos_tag_validation() {
        let rules = PosRules::default();

        // Simple tags first, then compound tags built from known components.
        for accepted in ["NNG", "VV", "JKS", "NNG+JKS", "VV+EC"] {
            assert!(
                rules.is_valid_tag(accepted),
                "expected {accepted:?} to be accepted"
            );
        }

        // An unknown simple tag, and a compound tag with an unknown part.
        for rejected in ["XXX", "NNG+XXX"] {
            assert!(
                !rules.is_valid_tag(rejected),
                "expected {rejected:?} to be rejected"
            );
        }
    }

    /// Default cost rules: in-range values pass, a negative context ID is an
    /// error, and a cost above the high threshold only warns.
    #[test]
    fn test_cost_validation() {
        let rules = CostRules::default();

        let in_range = rules.validate_costs(100, 200, 500);
        assert!(in_range.is_valid());
        assert!(!in_range.has_warnings());

        let negative_left_id = rules.validate_costs(-1, 200, 500);
        assert!(!negative_left_id.is_valid());

        let high_cost = rules.validate_costs(100, 200, 9000);
        assert!(high_cost.is_valid());
        assert!(high_cost.has_warnings());
    }

    /// NFC leaves precomposed Hangul alone and composes conjoining jamo.
    #[test]
    fn test_normalization_form() {
        let form = NormalizationForm::Nfc;

        // Already-composed syllables come back unchanged.
        let precomposed = "한글";
        assert_eq!(precomposed, form.normalize(precomposed));

        // The same two syllables spelled out as conjoining jamo.
        let conjoining_jamo = "\u{1112}\u{1161}\u{11AB}\u{1100}\u{1173}\u{11AF}";
        assert_eq!("한글", form.normalize(conjoining_jamo));
    }

    /// Compound tags deeper than `max_tag_depth` are rejected.
    #[test]
    fn test_max_tag_depth() {
        let rules = PosRules {
            max_tag_depth: 2,
            ..PosRules::default()
        };

        assert!(rules.is_valid_tag("NNG+JKS"));
        assert!(!rules.is_valid_tag("NNG+JKS+EC"));
    }

    /// Custom ranges are honoured by `validate_costs`.
    #[test]
    fn test_cost_ranges() {
        let narrow = CostRules {
            left_context_range: 0..=100,
            right_context_range: 0..=100,
            word_cost_range: -1000..=1000,
            warn_unusual_costs: false,
            unusual_high_cost: 800,
            unusual_low_cost: -800,
        };

        let within = narrow.validate_costs(50, 75, 500);
        assert!(within.is_valid());

        let left_id_too_large = narrow.validate_costs(150, 75, 500);
        assert!(!left_id_too_large.is_valid());
    }

    /// An empty tag set disables validation, so any tag is accepted.
    #[test]
    fn test_empty_valid_tags() {
        let mut permissive = PosRules::default();
        permissive.valid_tags.clear();

        assert!(permissive.is_valid_tag("ANYTHING"));
        assert!(permissive.is_valid_tag("XXX+YYY"));
    }
}