Skip to main content

mecab_ko_dict_validator/
validator.rs

1//! Main dictionary validation logic.
2//!
3//! This module provides the core validation functionality for `MeCab` dictionary files.
4
5#![allow(clippy::cast_precision_loss)]
6#![allow(clippy::collapsible_if)]
7#![allow(clippy::redundant_closure_for_method_calls)]
8
9use crate::report::{IssueCategory, Location, ValidationIssue, ValidationReport};
10use crate::rules::ValidationConfig;
11use csv::StringRecord;
12use rayon::prelude::*;
13use std::collections::{HashMap, HashSet};
14use std::fs::File;
15use std::io::{BufRead, BufReader};
16use std::path::Path;
17
/// Dictionary entry validator.
///
/// Owns the [`ValidationConfig`] whose rule groups (`encoding_rules`, `csv_rules`,
/// `pos_rules`, `cost_rules`, `normalization_rules`, `duplicate_rules`) are consulted
/// by the validation methods on this type.
pub struct DictValidator {
    /// Rule configuration read by every validation step.
    config: ValidationConfig,
}
22
23impl DictValidator {
    /// Creates a new validator with the given configuration.
    ///
    /// The configuration is moved into the validator and stored as-is; no
    /// validation of the configuration itself happens here.
    #[must_use]
    pub const fn new(config: ValidationConfig) -> Self {
        Self { config }
    }
29
30    /// Creates a validator with default configuration.
31    #[must_use]
32    pub fn with_defaults() -> Self {
33        Self::new(ValidationConfig::default())
34    }
35
36    /// Validates a dictionary file.
37    ///
38    /// # Errors
39    ///
40    /// Returns an error if the file cannot be read or processed.
41    pub fn validate_file<P: AsRef<Path>>(
42        &self,
43        path: P,
44    ) -> Result<ValidationReport, ValidationError> {
45        let path = path.as_ref();
46        let mut report = ValidationReport::new(path.to_path_buf());
47
48        // Read and validate file encoding
49        let file = File::open(path).map_err(|e| ValidationError::IoError(e.to_string()))?;
50
51        let reader = BufReader::new(file);
52
53        // Check for BOM
54        if !self.config.encoding_rules.allow_bom {
55            let mut first_bytes = [0u8; 3];
56            let mut peek_reader = BufReader::new(File::open(path)?);
57            if std::io::Read::read_exact(&mut peek_reader, &mut first_bytes).is_ok() {
58                if first_bytes == [0xEF, 0xBB, 0xBF] {
59                    report.add_issue(ValidationIssue::warning(
60                        IssueCategory::Encoding,
61                        "File contains UTF-8 BOM".to_string(),
62                    ));
63                }
64            }
65        }
66
67        // Process entries
68        let entries = self.read_entries(reader)?;
69        report.total_entries = entries.len();
70
71        // Validate entries in parallel
72        let issues: Vec<_> = entries
73            .par_iter()
74            .enumerate()
75            .flat_map(|(line_num, entry)| self.validate_entry(entry, line_num + 1))
76            .collect();
77
78        // Detect duplicates
79        let duplicate_issues = self.detect_duplicates(&entries);
80
81        // Collect all issues
82        for issue in issues.into_iter().chain(duplicate_issues) {
83            report.add_issue(issue);
84        }
85
86        // Calculate statistics and store entries for analysis
87        report.statistics = Self::calculate_statistics(&entries);
88        report.entries = Some(entries);
89        report.valid_entries = report.total_entries.saturating_sub(report.error_entries);
90
91        Ok(report)
92    }
93
94    /// Reads all entries from the reader.
95    fn read_entries<R: BufRead>(&self, reader: R) -> Result<Vec<DictEntry>, ValidationError> {
96        let mut entries = Vec::new();
97        let mut line_num = 0;
98
99        for line in reader.lines() {
100            line_num += 1;
101            let line = line.map_err(|e| ValidationError::IoError(e.to_string()))?;
102
103            // Validate UTF-8
104            if self.config.encoding_rules.validate_utf8 {
105                // line is already validated as valid UTF-8 by the lines() iterator
106                // But we can check for other encoding issues
107                if line.chars().any(|c| c == '\u{FFFD}') {
108                    return Err(ValidationError::EncodingError(format!(
109                        "Invalid UTF-8 sequence at line {line_num}"
110                    )));
111                }
112            }
113
114            if line.trim().is_empty() || line.starts_with('#') {
115                continue; // Skip empty lines and comments
116            }
117
118            match Self::parse_entry(&line, line_num) {
119                Ok(entry) => entries.push(entry),
120                Err(e) => return Err(e),
121            }
122        }
123
124        Ok(entries)
125    }
126
127    /// Parses a single entry from a CSV line.
128    fn parse_entry(line: &str, line_num: usize) -> Result<DictEntry, ValidationError> {
129        let mut rdr = csv::ReaderBuilder::new()
130            .has_headers(false)
131            .flexible(true)
132            .from_reader(line.as_bytes());
133
134        let mut records = rdr.records();
135        let record = records
136            .next()
137            .ok_or_else(|| ValidationError::ParseError(format!("Empty line at {line_num}")))?
138            .map_err(|e| {
139                ValidationError::ParseError(format!("CSV parse error at line {line_num}: {e}"))
140            })?;
141
142        Self::record_to_entry(&record, line_num)
143    }
144
145    /// Converts a CSV record to a dictionary entry.
146    fn record_to_entry(
147        record: &StringRecord,
148        line_num: usize,
149    ) -> Result<DictEntry, ValidationError> {
150        let field_count = record.len();
151
152        if field_count < 4 {
153            return Err(ValidationError::ParseError(format!(
154                "Insufficient fields at line {line_num}: expected at least 4, got {field_count}"
155            )));
156        }
157
158        let surface = record
159            .get(0)
160            .ok_or_else(|| {
161                ValidationError::ParseError(format!("Missing surface form at line {line_num}"))
162            })?
163            .to_string();
164
165        let left_id = record
166            .get(1)
167            .and_then(|s| s.parse::<i32>().ok())
168            .ok_or_else(|| {
169                ValidationError::ParseError(format!("Invalid left context ID at line {line_num}"))
170            })?;
171
172        let right_id = record
173            .get(2)
174            .and_then(|s| s.parse::<i32>().ok())
175            .ok_or_else(|| {
176                ValidationError::ParseError(format!("Invalid right context ID at line {line_num}"))
177            })?;
178
179        let cost = record
180            .get(3)
181            .and_then(|s| s.parse::<i32>().ok())
182            .ok_or_else(|| {
183                ValidationError::ParseError(format!("Invalid cost at line {line_num}"))
184            })?;
185
186        let pos_tag = record.get(4).unwrap_or("").to_string();
187
188        // Collect additional features
189        let features: Vec<String> = (5..field_count)
190            .filter_map(|i| record.get(i).map(|s| s.to_string()))
191            .collect();
192
193        Ok(DictEntry {
194            surface,
195            left_id,
196            right_id,
197            cost,
198            pos_tag,
199            features,
200            line_num,
201        })
202    }
203
204    /// Validates a single entry.
205    fn validate_entry(&self, entry: &DictEntry, line_num: usize) -> Vec<ValidationIssue> {
206        let mut issues = Vec::new();
207
208        // CSV format validation
209        let total_fields = 5 + entry.features.len();
210        if total_fields != self.config.csv_rules.expected_field_count {
211            issues.push(
212                ValidationIssue::error(
213                    IssueCategory::CsvFormat,
214                    format!(
215                        "Invalid field count: expected {}, got {total_fields}",
216                        self.config.csv_rules.expected_field_count
217                    ),
218                )
219                .with_location(Location::new(line_num)),
220            );
221        }
222
223        // Check for empty fields
224        if !self.config.csv_rules.allow_empty_fields {
225            if entry.surface.is_empty() {
226                issues.push(
227                    ValidationIssue::error(
228                        IssueCategory::CsvFormat,
229                        "Empty surface form".to_string(),
230                    )
231                    .with_location(Location::new(line_num)),
232                );
233            }
234
235            if entry.pos_tag.is_empty() {
236                issues.push(
237                    ValidationIssue::error(IssueCategory::PosTag, "Empty POS tag".to_string())
238                        .with_location(Location::new(line_num)),
239                );
240            }
241        }
242
243        // POS tag validation
244        if !entry.pos_tag.is_empty() && !self.config.pos_rules.is_valid_tag(&entry.pos_tag) {
245            issues.push(
246                ValidationIssue::error(
247                    IssueCategory::PosTag,
248                    format!("Invalid POS tag: '{}'", entry.pos_tag),
249                )
250                .with_location(Location::new(line_num))
251                .with_suggestion("Check against valid Korean POS tags".to_string()),
252            );
253        }
254
255        // Cost validation
256        let cost_result =
257            self.config
258                .cost_rules
259                .validate_costs(entry.left_id, entry.right_id, entry.cost);
260
261        for error in cost_result.errors {
262            issues.push(
263                ValidationIssue::error(IssueCategory::Cost, error)
264                    .with_location(Location::new(line_num)),
265            );
266        }
267
268        for warning in cost_result.warnings {
269            issues.push(
270                ValidationIssue::warning(IssueCategory::Cost, warning)
271                    .with_location(Location::new(line_num)),
272            );
273        }
274
275        // Normalization validation
276        issues.extend(self.validate_normalization(entry, line_num));
277
278        issues
279    }
280
281    /// Validates Unicode normalization for an entry.
282    fn validate_normalization(&self, entry: &DictEntry, line_num: usize) -> Vec<ValidationIssue> {
283        let mut issues = Vec::new();
284        let rules = &self.config.normalization_rules;
285
286        if rules.check_unicode_normalization {
287            let normalized = rules.preferred_normalization.normalize(&entry.surface);
288            if normalized != entry.surface {
289                issues.push(
290                    ValidationIssue::warning(
291                        IssueCategory::Normalization,
292                        format!(
293                            "Surface form '{}' is not in {:?} form",
294                            entry.surface, rules.preferred_normalization
295                        ),
296                    )
297                    .with_location(Location::new(line_num))
298                    .with_suggestion(format!("Use: '{normalized}'")),
299                );
300            }
301        }
302
303        if rules.check_hangul_composition {
304            // Check if Hangul characters are properly composed
305            let has_decomposed_hangul = entry.surface.chars().any(|c| {
306                matches!(c,
307                    '\u{1100}'..='\u{11FF}' | // Hangul Jamo
308                    '\u{3130}'..='\u{318F}'   // Hangul Compatibility Jamo
309                )
310            });
311
312            if has_decomposed_hangul {
313                issues.push(
314                    ValidationIssue::warning(
315                        IssueCategory::Normalization,
316                        "Surface form contains decomposed Hangul jamo".to_string(),
317                    )
318                    .with_location(Location::new(line_num))
319                    .with_suggestion("Use composed Hangul syllables".to_string()),
320                );
321            }
322        }
323
324        if rules.warn_on_whitespace && entry.surface.contains(char::is_whitespace) {
325            issues.push(
326                ValidationIssue::warning(
327                    IssueCategory::Normalization,
328                    "Surface form contains whitespace".to_string(),
329                )
330                .with_location(Location::new(line_num)),
331            );
332        }
333
334        issues
335    }
336
337    /// Detects duplicate entries.
338    fn detect_duplicates(&self, entries: &[DictEntry]) -> Vec<ValidationIssue> {
339        let mut issues = Vec::new();
340        let rules = &self.config.duplicate_rules;
341
342        if rules.detect_exact_duplicates {
343            let mut seen = HashMap::new();
344
345            for entry in entries {
346                let key = format!(
347                    "{}|{}|{}|{}|{}",
348                    entry.surface, entry.left_id, entry.right_id, entry.cost, entry.pos_tag
349                );
350
351                if let Some(&first_line) = seen.get(&key) {
352                    issues.push(
353                        ValidationIssue::error(
354                            IssueCategory::Duplicate,
355                            format!("Exact duplicate of line {first_line}"),
356                        )
357                        .with_location(Location::new(entry.line_num))
358                        .with_context(format!("Surface: '{}'", entry.surface)),
359                    );
360                } else {
361                    seen.insert(key, entry.line_num);
362                }
363            }
364        }
365
366        if rules.detect_semantic_duplicates && !rules.allow_cost_variants {
367            let mut seen = HashMap::new();
368
369            for entry in entries {
370                let key = format!("{}|{}", entry.surface, entry.pos_tag);
371
372                if let Some(&first_line) = seen.get(&key) {
373                    issues.push(
374                        ValidationIssue::warning(
375                            IssueCategory::Duplicate,
376                            format!("Semantic duplicate of line {first_line} (same surface+POS)"),
377                        )
378                        .with_location(Location::new(entry.line_num))
379                        .with_context(format!(
380                            "Surface: '{}', POS: '{}'",
381                            entry.surface, entry.pos_tag
382                        )),
383                    );
384                } else {
385                    seen.insert(key, entry.line_num);
386                }
387            }
388        }
389
390        issues
391    }
392
393    /// Calculates validation statistics.
394    fn calculate_statistics(entries: &[DictEntry]) -> crate::report::ValidationStatistics {
395        let mut stats = crate::report::ValidationStatistics::default();
396
397        let mut costs = Vec::new();
398        let mut surface_forms = HashSet::new();
399
400        for entry in entries {
401            // POS tag counts
402            *stats
403                .pos_tag_counts
404                .entry(entry.pos_tag.clone())
405                .or_insert(0) += 1;
406
407            // Cost statistics
408            costs.push(entry.cost);
409
410            // Unique surface forms
411            surface_forms.insert(entry.surface.clone());
412        }
413
414        stats.unique_surface_forms = surface_forms.len();
415
416        if !costs.is_empty() {
417            stats.min_cost = costs.iter().min().copied();
418            stats.max_cost = costs.iter().max().copied();
419            stats.average_cost =
420                Some(costs.iter().map(|&c| f64::from(c)).sum::<f64>() / costs.len() as f64);
421        }
422
423        // Duplicate count
424        stats.duplicate_count = entries.len() - surface_forms.len();
425
426        stats
427    }
428}
429
/// A dictionary entry.
///
/// One parsed CSV line of a dictionary file. Fields map to CSV columns in
/// order: surface (0), left ID (1), right ID (2), cost (3), POS tag (4) and
/// any remaining columns as `features`.
#[derive(Debug, Clone)]
pub struct DictEntry {
    /// Surface form (CSV column 0), stored exactly as read.
    pub surface: String,
    /// Left context ID (CSV column 1).
    pub left_id: i32,
    /// Right context ID (CSV column 2).
    pub right_id: i32,
    /// Word cost (CSV column 3).
    pub cost: i32,
    /// POS tag (CSV column 4); empty when the column is missing.
    pub pos_tag: String,
    /// Additional features: every CSV column after the POS tag, in order.
    pub features: Vec<String>,
    /// 1-based line number in the source file, used when reporting issue locations.
    pub line_num: usize,
}
448
/// Validation error.
///
/// Fatal problems that abort reading a dictionary file. Each variant carries a
/// human-readable message rather than the underlying error value.
#[derive(Debug, thiserror::Error)]
pub enum ValidationError {
    /// I/O error: the file could not be opened or a line could not be read.
    /// Holds the stringified `std::io::Error`.
    #[error("I/O error: {0}")]
    IoError(String),

    /// Parse error: a line was not valid CSV, had too few fields, or had a
    /// non-integer context ID or cost. The message includes the line number.
    #[error("Parse error: {0}")]
    ParseError(String),

    /// Encoding error: raised when a line contains the Unicode replacement
    /// character (U+FFFD) while UTF-8 validation is enabled.
    #[error("Encoding error: {0}")]
    EncodingError(String),
}
464
465impl From<std::io::Error> for ValidationError {
466    fn from(err: std::io::Error) -> Self {
467        Self::IoError(err.to_string())
468    }
469}
470
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::needless_collect)]
mod tests {
    use super::*;

    /// Well-formed entry on line 1 with eight `*` features and the given POS tag.
    fn entry_with_pos(pos_tag: &str) -> DictEntry {
        DictEntry {
            surface: "테스트".to_string(),
            left_id: 100,
            right_id: 200,
            cost: 500,
            pos_tag: pos_tag.to_string(),
            features: vec!["*".to_string(); 8],
            line_num: 1,
        }
    }

    /// Feature-less entry that differs from its siblings only by line number.
    fn duplicate_entry(line_num: usize) -> DictEntry {
        DictEntry {
            surface: "중복".to_string(),
            left_id: 1,
            right_id: 2,
            cost: 100,
            pos_tag: "NNG".to_string(),
            features: vec![],
            line_num,
        }
    }

    /// Returns `true` when at least one issue has error severity.
    fn has_errors(issues: &[ValidationIssue]) -> bool {
        let errors: Vec<_> = issues
            .iter()
            .filter(|i| i.severity == crate::report::Severity::Error)
            .collect();
        !errors.is_empty()
    }

    #[test]
    fn test_parse_valid_entry() {
        let line = "한글,1,2,100,NNG,*,F,한글,*,*,*,*,*";
        let entry = DictValidator::parse_entry(line, 1).expect("Failed to parse entry");

        assert_eq!(entry.surface, "한글");
        assert_eq!(entry.left_id, 1);
        assert_eq!(entry.right_id, 2);
        assert_eq!(entry.cost, 100);
        assert_eq!(entry.pos_tag, "NNG");
    }

    #[test]
    fn test_validate_entry_valid() {
        let validator = DictValidator::with_defaults();
        let issues = validator.validate_entry(&entry_with_pos("NNG"), 1);

        // Should have no errors, might have warnings
        assert!(!has_errors(&issues));
    }

    #[test]
    fn test_validate_entry_invalid_pos() {
        let validator = DictValidator::with_defaults();
        let issues = validator.validate_entry(&entry_with_pos("INVALID"), 1);

        assert!(has_errors(&issues));
    }

    #[test]
    fn test_detect_exact_duplicates() {
        let entries = vec![duplicate_entry(1), duplicate_entry(2)];

        let validator = DictValidator::with_defaults();
        let issues = validator.detect_duplicates(&entries);

        assert!(!issues.is_empty());
        assert!(issues
            .iter()
            .any(|i| matches!(i.category, IssueCategory::Duplicate)));
    }
}