1#![allow(clippy::cast_precision_loss)]
6#![allow(clippy::collapsible_if)]
7#![allow(clippy::redundant_closure_for_method_calls)]
8
9use crate::report::{IssueCategory, Location, ValidationIssue, ValidationReport};
10use crate::rules::ValidationConfig;
11use csv::StringRecord;
12use rayon::prelude::*;
13use std::collections::{HashMap, HashSet};
14use std::fs::File;
15use std::io::{BufRead, BufReader};
16use std::path::Path;
17
/// Validator for dictionary files whose entries are CSV records.
///
/// All checks performed by this type read their rules from the stored
/// [`ValidationConfig`].
pub struct DictValidator {
    /// Rule set consulted by every validation method.
    config: ValidationConfig,
}
22
23impl DictValidator {
    /// Creates a validator that applies the rules in `config`.
    #[must_use]
    pub const fn new(config: ValidationConfig) -> Self {
        Self { config }
    }
29
30 #[must_use]
32 pub fn with_defaults() -> Self {
33 Self::new(ValidationConfig::default())
34 }
35
36 pub fn validate_file<P: AsRef<Path>>(
42 &self,
43 path: P,
44 ) -> Result<ValidationReport, ValidationError> {
45 let path = path.as_ref();
46 let mut report = ValidationReport::new(path.to_path_buf());
47
48 let file = File::open(path).map_err(|e| ValidationError::IoError(e.to_string()))?;
50
51 let reader = BufReader::new(file);
52
53 if !self.config.encoding_rules.allow_bom {
55 let mut first_bytes = [0u8; 3];
56 let mut peek_reader = BufReader::new(File::open(path)?);
57 if std::io::Read::read_exact(&mut peek_reader, &mut first_bytes).is_ok() {
58 if first_bytes == [0xEF, 0xBB, 0xBF] {
59 report.add_issue(ValidationIssue::warning(
60 IssueCategory::Encoding,
61 "File contains UTF-8 BOM".to_string(),
62 ));
63 }
64 }
65 }
66
67 let entries = self.read_entries(reader)?;
69 report.total_entries = entries.len();
70
71 let issues: Vec<_> = entries
73 .par_iter()
74 .enumerate()
75 .flat_map(|(line_num, entry)| self.validate_entry(entry, line_num + 1))
76 .collect();
77
78 let duplicate_issues = self.detect_duplicates(&entries);
80
81 for issue in issues.into_iter().chain(duplicate_issues) {
83 report.add_issue(issue);
84 }
85
86 report.statistics = Self::calculate_statistics(&entries);
88 report.entries = Some(entries);
89 report.valid_entries = report.total_entries.saturating_sub(report.error_entries);
90
91 Ok(report)
92 }
93
94 fn read_entries<R: BufRead>(&self, reader: R) -> Result<Vec<DictEntry>, ValidationError> {
96 let mut entries = Vec::new();
97 let mut line_num = 0;
98
99 for line in reader.lines() {
100 line_num += 1;
101 let line = line.map_err(|e| ValidationError::IoError(e.to_string()))?;
102
103 if self.config.encoding_rules.validate_utf8 {
105 if line.chars().any(|c| c == '\u{FFFD}') {
108 return Err(ValidationError::EncodingError(format!(
109 "Invalid UTF-8 sequence at line {line_num}"
110 )));
111 }
112 }
113
114 if line.trim().is_empty() || line.starts_with('#') {
115 continue; }
117
118 match Self::parse_entry(&line, line_num) {
119 Ok(entry) => entries.push(entry),
120 Err(e) => return Err(e),
121 }
122 }
123
124 Ok(entries)
125 }
126
127 fn parse_entry(line: &str, line_num: usize) -> Result<DictEntry, ValidationError> {
129 let mut rdr = csv::ReaderBuilder::new()
130 .has_headers(false)
131 .flexible(true)
132 .from_reader(line.as_bytes());
133
134 let mut records = rdr.records();
135 let record = records
136 .next()
137 .ok_or_else(|| ValidationError::ParseError(format!("Empty line at {line_num}")))?
138 .map_err(|e| {
139 ValidationError::ParseError(format!("CSV parse error at line {line_num}: {e}"))
140 })?;
141
142 Self::record_to_entry(&record, line_num)
143 }
144
145 fn record_to_entry(
147 record: &StringRecord,
148 line_num: usize,
149 ) -> Result<DictEntry, ValidationError> {
150 let field_count = record.len();
151
152 if field_count < 4 {
153 return Err(ValidationError::ParseError(format!(
154 "Insufficient fields at line {line_num}: expected at least 4, got {field_count}"
155 )));
156 }
157
158 let surface = record
159 .get(0)
160 .ok_or_else(|| {
161 ValidationError::ParseError(format!("Missing surface form at line {line_num}"))
162 })?
163 .to_string();
164
165 let left_id = record
166 .get(1)
167 .and_then(|s| s.parse::<i32>().ok())
168 .ok_or_else(|| {
169 ValidationError::ParseError(format!("Invalid left context ID at line {line_num}"))
170 })?;
171
172 let right_id = record
173 .get(2)
174 .and_then(|s| s.parse::<i32>().ok())
175 .ok_or_else(|| {
176 ValidationError::ParseError(format!("Invalid right context ID at line {line_num}"))
177 })?;
178
179 let cost = record
180 .get(3)
181 .and_then(|s| s.parse::<i32>().ok())
182 .ok_or_else(|| {
183 ValidationError::ParseError(format!("Invalid cost at line {line_num}"))
184 })?;
185
186 let pos_tag = record.get(4).unwrap_or("").to_string();
187
188 let features: Vec<String> = (5..field_count)
190 .filter_map(|i| record.get(i).map(|s| s.to_string()))
191 .collect();
192
193 Ok(DictEntry {
194 surface,
195 left_id,
196 right_id,
197 cost,
198 pos_tag,
199 features,
200 line_num,
201 })
202 }
203
204 fn validate_entry(&self, entry: &DictEntry, line_num: usize) -> Vec<ValidationIssue> {
206 let mut issues = Vec::new();
207
208 let total_fields = 5 + entry.features.len();
210 if total_fields != self.config.csv_rules.expected_field_count {
211 issues.push(
212 ValidationIssue::error(
213 IssueCategory::CsvFormat,
214 format!(
215 "Invalid field count: expected {}, got {total_fields}",
216 self.config.csv_rules.expected_field_count
217 ),
218 )
219 .with_location(Location::new(line_num)),
220 );
221 }
222
223 if !self.config.csv_rules.allow_empty_fields {
225 if entry.surface.is_empty() {
226 issues.push(
227 ValidationIssue::error(
228 IssueCategory::CsvFormat,
229 "Empty surface form".to_string(),
230 )
231 .with_location(Location::new(line_num)),
232 );
233 }
234
235 if entry.pos_tag.is_empty() {
236 issues.push(
237 ValidationIssue::error(IssueCategory::PosTag, "Empty POS tag".to_string())
238 .with_location(Location::new(line_num)),
239 );
240 }
241 }
242
243 if !entry.pos_tag.is_empty() && !self.config.pos_rules.is_valid_tag(&entry.pos_tag) {
245 issues.push(
246 ValidationIssue::error(
247 IssueCategory::PosTag,
248 format!("Invalid POS tag: '{}'", entry.pos_tag),
249 )
250 .with_location(Location::new(line_num))
251 .with_suggestion("Check against valid Korean POS tags".to_string()),
252 );
253 }
254
255 let cost_result =
257 self.config
258 .cost_rules
259 .validate_costs(entry.left_id, entry.right_id, entry.cost);
260
261 for error in cost_result.errors {
262 issues.push(
263 ValidationIssue::error(IssueCategory::Cost, error)
264 .with_location(Location::new(line_num)),
265 );
266 }
267
268 for warning in cost_result.warnings {
269 issues.push(
270 ValidationIssue::warning(IssueCategory::Cost, warning)
271 .with_location(Location::new(line_num)),
272 );
273 }
274
275 issues.extend(self.validate_normalization(entry, line_num));
277
278 issues
279 }
280
281 fn validate_normalization(&self, entry: &DictEntry, line_num: usize) -> Vec<ValidationIssue> {
283 let mut issues = Vec::new();
284 let rules = &self.config.normalization_rules;
285
286 if rules.check_unicode_normalization {
287 let normalized = rules.preferred_normalization.normalize(&entry.surface);
288 if normalized != entry.surface {
289 issues.push(
290 ValidationIssue::warning(
291 IssueCategory::Normalization,
292 format!(
293 "Surface form '{}' is not in {:?} form",
294 entry.surface, rules.preferred_normalization
295 ),
296 )
297 .with_location(Location::new(line_num))
298 .with_suggestion(format!("Use: '{normalized}'")),
299 );
300 }
301 }
302
303 if rules.check_hangul_composition {
304 let has_decomposed_hangul = entry.surface.chars().any(|c| {
306 matches!(c,
307 '\u{1100}'..='\u{11FF}' | '\u{3130}'..='\u{318F}' )
310 });
311
312 if has_decomposed_hangul {
313 issues.push(
314 ValidationIssue::warning(
315 IssueCategory::Normalization,
316 "Surface form contains decomposed Hangul jamo".to_string(),
317 )
318 .with_location(Location::new(line_num))
319 .with_suggestion("Use composed Hangul syllables".to_string()),
320 );
321 }
322 }
323
324 if rules.warn_on_whitespace && entry.surface.contains(char::is_whitespace) {
325 issues.push(
326 ValidationIssue::warning(
327 IssueCategory::Normalization,
328 "Surface form contains whitespace".to_string(),
329 )
330 .with_location(Location::new(line_num)),
331 );
332 }
333
334 issues
335 }
336
337 fn detect_duplicates(&self, entries: &[DictEntry]) -> Vec<ValidationIssue> {
339 let mut issues = Vec::new();
340 let rules = &self.config.duplicate_rules;
341
342 if rules.detect_exact_duplicates {
343 let mut seen = HashMap::new();
344
345 for entry in entries {
346 let key = format!(
347 "{}|{}|{}|{}|{}",
348 entry.surface, entry.left_id, entry.right_id, entry.cost, entry.pos_tag
349 );
350
351 if let Some(&first_line) = seen.get(&key) {
352 issues.push(
353 ValidationIssue::error(
354 IssueCategory::Duplicate,
355 format!("Exact duplicate of line {first_line}"),
356 )
357 .with_location(Location::new(entry.line_num))
358 .with_context(format!("Surface: '{}'", entry.surface)),
359 );
360 } else {
361 seen.insert(key, entry.line_num);
362 }
363 }
364 }
365
366 if rules.detect_semantic_duplicates && !rules.allow_cost_variants {
367 let mut seen = HashMap::new();
368
369 for entry in entries {
370 let key = format!("{}|{}", entry.surface, entry.pos_tag);
371
372 if let Some(&first_line) = seen.get(&key) {
373 issues.push(
374 ValidationIssue::warning(
375 IssueCategory::Duplicate,
376 format!("Semantic duplicate of line {first_line} (same surface+POS)"),
377 )
378 .with_location(Location::new(entry.line_num))
379 .with_context(format!(
380 "Surface: '{}', POS: '{}'",
381 entry.surface, entry.pos_tag
382 )),
383 );
384 } else {
385 seen.insert(key, entry.line_num);
386 }
387 }
388 }
389
390 issues
391 }
392
393 fn calculate_statistics(entries: &[DictEntry]) -> crate::report::ValidationStatistics {
395 let mut stats = crate::report::ValidationStatistics::default();
396
397 let mut costs = Vec::new();
398 let mut surface_forms = HashSet::new();
399
400 for entry in entries {
401 *stats
403 .pos_tag_counts
404 .entry(entry.pos_tag.clone())
405 .or_insert(0) += 1;
406
407 costs.push(entry.cost);
409
410 surface_forms.insert(entry.surface.clone());
412 }
413
414 stats.unique_surface_forms = surface_forms.len();
415
416 if !costs.is_empty() {
417 stats.min_cost = costs.iter().min().copied();
418 stats.max_cost = costs.iter().max().copied();
419 stats.average_cost =
420 Some(costs.iter().map(|&c| f64::from(c)).sum::<f64>() / costs.len() as f64);
421 }
422
423 stats.duplicate_count = entries.len() - surface_forms.len();
425
426 stats
427 }
428}
429
/// One parsed dictionary entry, i.e. one CSV record of the dictionary file.
///
/// Derives `PartialEq`/`Eq` so whole entries can be compared directly (for
/// example in tests) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictEntry {
    /// Surface form (CSV field 0).
    pub surface: String,
    /// Left context ID (CSV field 1).
    pub left_id: i32,
    /// Right context ID (CSV field 2).
    pub right_id: i32,
    /// Cost value (CSV field 3).
    pub cost: i32,
    /// POS tag (CSV field 4); empty when the record has no such field.
    pub pos_tag: String,
    /// All remaining CSV fields (index 5 and up), in record order.
    pub features: Vec<String>,
    /// 1-based line number of the record in the source file.
    pub line_num: usize,
}
448
/// Fatal errors that abort validation of a file.
///
/// Each variant carries a human-readable message; the `Display` output prefixes
/// it with the kind of failure.
#[derive(Debug, thiserror::Error)]
pub enum ValidationError {
    /// The file could not be opened or a line could not be read.
    #[error("I/O error: {0}")]
    IoError(String),

    /// A line could not be parsed into a dictionary entry.
    #[error("Parse error: {0}")]
    ParseError(String),

    /// A line failed the UTF-8 validity check.
    #[error("Encoding error: {0}")]
    EncodingError(String),
}
464
/// Lets `?` turn a `std::io::Error` into [`ValidationError::IoError`].
impl From<std::io::Error> for ValidationError {
    /// Wraps the error's `Display` text; the original error value is not kept.
    fn from(err: std::io::Error) -> Self {
        Self::IoError(err.to_string())
    }
}
470
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::needless_collect)]
mod tests {
    use super::*;
    use crate::report::Severity;

    /// Builds a 13-field entry (5 fixed columns + 8 features) with the given POS tag.
    fn entry_with_pos(pos_tag: &str) -> DictEntry {
        DictEntry {
            surface: "테스트".to_string(),
            left_id: 100,
            right_id: 200,
            cost: 500,
            pos_tag: pos_tag.to_string(),
            features: vec!["*".to_string(); 8],
            line_num: 1,
        }
    }

    /// True when at least one issue has error severity.
    fn has_error(issues: &[ValidationIssue]) -> bool {
        issues.iter().any(|issue| issue.severity == Severity::Error)
    }

    #[test]
    fn test_parse_valid_entry() {
        let line = "한글,1,2,100,NNG,*,F,한글,*,*,*,*,*";
        let entry = DictValidator::parse_entry(line, 1).expect("Failed to parse entry");

        assert_eq!(entry.surface, "한글");
        assert_eq!(entry.left_id, 1);
        assert_eq!(entry.right_id, 2);
        assert_eq!(entry.cost, 100);
        assert_eq!(entry.pos_tag, "NNG");
    }

    #[test]
    fn test_validate_entry_valid() {
        let validator = DictValidator::with_defaults();
        let issues = validator.validate_entry(&entry_with_pos("NNG"), 1);

        assert!(!has_error(&issues));
    }

    #[test]
    fn test_validate_entry_invalid_pos() {
        let validator = DictValidator::with_defaults();
        let issues = validator.validate_entry(&entry_with_pos("INVALID"), 1);

        assert!(has_error(&issues));
    }

    #[test]
    fn test_detect_exact_duplicates() {
        let first = DictEntry {
            surface: "중복".to_string(),
            left_id: 1,
            right_id: 2,
            cost: 100,
            pos_tag: "NNG".to_string(),
            features: Vec::new(),
            line_num: 1,
        };
        // Identical content on the next line.
        let second = DictEntry {
            line_num: 2,
            ..first.clone()
        };

        let validator = DictValidator::with_defaults();
        let issues = validator.detect_duplicates(&[first, second]);

        assert!(!issues.is_empty());
        assert!(issues
            .iter()
            .any(|i| matches!(i.category, IssueCategory::Duplicate)));
    }
}