Skip to main content

mecab_ko_dict_builder/
lib.rs

1//! # mecab-ko-dict-builder
2//!
3//! 한국어 형태소 사전 빌더
4//!
5//! mecab-ko-dic CSV 형식에서 바이너리 사전을 생성합니다.
6//!
7//! ## 주요 기능
8//!
9//! - CSV 파싱 (12컬럼 형식)
10//! - Double Array Trie 구축
11//! - 연접 비용 매트릭스 생성
12//! - 미등록어 정의 처리
13//! - 바이너리 사전 압축
14//!
15//! ## 사용법
16//!
17//! ```bash
18//! mecab-ko-dict-builder --input ./mecab-ko-dic --output ./dict.bin
19//! ```
20
21#![warn(missing_docs)]
22#![deny(unsafe_code)]
23
24pub use builder::DictionaryBuilder;
25pub use error::{BuildError, Result};
26
27pub mod char_def_parser;
28pub mod inflect_gen;
29pub mod unk_def_parser;
30
/// Error types produced by the dictionary builder.
pub mod error {
    use thiserror::Error;

    /// Errors that can occur while building a dictionary.
    #[derive(Error, Debug)]
    pub enum BuildError {
        /// An I/O operation failed; wraps the underlying `std::io::Error`.
        #[error("IO error: {0}")]
        Io(#[from] std::io::Error),

        /// The CSV reader reported an error; wraps the underlying `csv::Error`.
        #[error("CSV parsing error: {0}")]
        Csv(#[from] csv::Error),

        /// Input did not have the expected format; the payload is a description.
        #[error("Invalid format: {0}")]
        Format(String),

        /// A text encoding problem; the payload is a description.
        #[error("Encoding error: {0}")]
        Encoding(String),

        /// Building the trie failed; the payload is a description.
        #[error("Trie build error: {0}")]
        Trie(String),

        /// An error propagated from the `mecab_ko_dict` crate.
        #[error("Dictionary error: {0}")]
        Dict(#[from] mecab_ko_dict::error::DictError),
    }

    /// Result alias whose error type is [`BuildError`].
    pub type Result<T> = std::result::Result<T, BuildError>;
}
66
67/// mecab-ko-dic CSV 형식 파서
68///
69/// 12컬럼 CSV 형식을 파싱합니다.
70#[allow(clippy::mixed_attributes_style)]
71pub mod csv_parser {
72    //! mecab-ko-dic CSV 형식 파서 구현
73
74    use super::{BuildError, Result};
75    use std::fs::File;
76    use std::io::BufReader;
77    use std::path::Path;
78
79    use encoding_rs::{EUC_KR, UTF_8};
80    use mecab_ko_hangul::has_jongseong;
81
    /// One CSV entry (12 columns).
    ///
    /// mecab-ko-dic column order:
    /// surface, left ID, right ID, cost, POS, POS detail, jongseong flag,
    /// reading, type, first POS, last POS, expression
    #[derive(Debug, Clone)]
    pub struct CsvEntry {
        /// Surface form
        pub surface: String,
        /// Left-context ID
        pub left_id: u16,
        /// Right-context ID
        pub right_id: u16,
        /// Cost
        pub cost: i16,
        /// Part of speech
        pub pos: String,
        /// Part-of-speech sub-classification
        pub pos_detail: String,
        /// Whether the surface ends in a final consonant (T/F/*)
        pub jongseong: String,
        /// Reading
        pub reading: String,
        /// Entry type
        pub entry_type: String,
        /// First part of speech
        pub first_pos: String,
        /// Last part of speech
        pub last_pos: String,
        /// Expression (compound-word analysis)
        pub expression: String,
    }
113
114    impl CsvEntry {
115        /// Feature 문자열 생성 (`MeCab` 형식)
116        ///
117        /// 형식: 품사,품사세분류,종성유무,읽기,타입,첫품사,마지막품사,표현
118        #[must_use]
119        pub fn to_feature(&self) -> String {
120            format!(
121                "{},{},{},{},{},{},{},{}",
122                self.pos,
123                self.pos_detail,
124                self.jongseong,
125                self.reading,
126                self.entry_type,
127                self.first_pos,
128                self.last_pos,
129                self.expression
130            )
131        }
132
133        /// 종성 유무 자동 설정
134        ///
135        /// CSV에서 '*'로 되어있는 경우 표면형에서 자동 판별
136        pub fn normalize_jongseong(&mut self) {
137            if self.jongseong == "*" && !self.surface.is_empty() {
138                // 마지막 문자의 종성 확인
139                if let Some(last_char) = self.surface.chars().last() {
140                    self.jongseong = match has_jongseong(last_char) {
141                        Some(true) => "T".to_string(),
142                        Some(false) => "F".to_string(),
143                        None => "*".to_string(), // 한글이 아닌 경우
144                    };
145                }
146            }
147        }
148    }
149
    /// Parser for mecab-ko-dic CSV files.
    pub struct CsvParser {
        /// Directory path that `parse_all` scans for CSV files
        dir_path: String,
        /// Input encoding (UTF-8, EUC-KR, or auto-detect)
        encoding: Encoding,
        /// Whether to emit progress logging
        verbose: bool,
    }
159
    /// File encoding selection.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub enum Encoding {
        /// UTF-8
        Utf8,
        /// EUC-KR (cp949)
        EucKr,
        /// Detect automatically
        Auto,
    }
170
171    impl CsvParser {
172        /// 새 파서 생성
173        #[must_use]
174        pub fn new<P: AsRef<Path>>(dir_path: P) -> Self {
175            Self {
176                dir_path: dir_path.as_ref().to_string_lossy().to_string(),
177                encoding: Encoding::Auto,
178                verbose: false,
179            }
180        }
181
        /// Sets the input encoding and returns the updated parser.
        #[must_use]
        pub const fn with_encoding(mut self, encoding: Encoding) -> Self {
            self.encoding = encoding;
            self
        }
188
        /// Enables or disables verbose logging and returns the updated parser.
        #[must_use]
        pub const fn verbose(mut self, verbose: bool) -> Self {
            self.verbose = verbose;
            self
        }
195
196        /// 모든 CSV 파일 파싱
197        ///
198        /// 디렉토리 내의 모든 *.csv 파일을 파싱합니다.
199        ///
200        /// # Errors
201        ///
202        /// Returns an error if:
203        /// - The directory cannot be read
204        /// - Any CSV file cannot be opened or parsed
205        /// - CSV entries have invalid format or field values
206        pub fn parse_all(&self) -> Result<Vec<CsvEntry>> {
207            let dir = Path::new(&self.dir_path);
208            if !dir.is_dir() {
209                return Err(BuildError::Format(format!(
210                    "Directory not found: {}",
211                    self.dir_path
212                )));
213            }
214
215            let mut all_entries = Vec::new();
216            let csv_files = Self::find_csv_files(dir)?;
217
218            if self.verbose {
219                tracing::info!("Found {} CSV files", csv_files.len());
220            }
221
222            for csv_file in csv_files {
223                if self.verbose {
224                    tracing::debug!("Parsing {}", csv_file.display());
225                }
226
227                let entries = self.parse_file(&csv_file)?;
228                all_entries.extend(entries);
229            }
230
231            if self.verbose {
232                tracing::info!("Parsed {} total entries", all_entries.len());
233            }
234
235            Ok(all_entries)
236        }
237
238        /// CSV 파일 목록 찾기
239        fn find_csv_files(dir: &Path) -> Result<Vec<std::path::PathBuf>> {
240            let mut csv_files = Vec::new();
241
242            for entry in std::fs::read_dir(dir).map_err(BuildError::Io)? {
243                let entry = entry.map_err(BuildError::Io)?;
244                let path = entry.path();
245
246                if path.is_file() {
247                    if let Some(ext) = path.extension() {
248                        if ext == "csv" {
249                            csv_files.push(path);
250                        }
251                    }
252                }
253            }
254
255            csv_files.sort();
256            Ok(csv_files)
257        }
258
259        /// 단일 CSV 파일 파싱
260        ///
261        /// # Errors
262        ///
263        /// Returns an error if:
264        /// - The file cannot be opened or read
265        /// - The file encoding cannot be detected or decoded
266        /// - CSV content is malformed or has invalid field values
267        pub fn parse_file<P: AsRef<Path>>(&self, path: P) -> Result<Vec<CsvEntry>> {
268            let file = File::open(path.as_ref()).map_err(BuildError::Io)?;
269            let content = self.read_with_encoding(file)?;
270
271            self.parse_csv_content(&content)
272        }
273
274        /// 인코딩을 고려하여 파일 읽기
275        fn read_with_encoding(&self, file: File) -> Result<String> {
276            use std::io::Read;
277
278            let mut reader = BufReader::new(file);
279            let mut first_bytes = vec![0u8; 1024];
280            let n = reader
281                .by_ref()
282                .take(1024)
283                .read(&mut first_bytes)
284                .map_err(BuildError::Io)?;
285            first_bytes.truncate(n);
286
287            // 전체 파일 읽기
288            let mut all_bytes = first_bytes.clone();
289            reader.read_to_end(&mut all_bytes).map_err(BuildError::Io)?;
290
291            // 인코딩 감지 및 변환
292            let encoding = match self.encoding {
293                Encoding::Utf8 => UTF_8,
294                Encoding::EucKr => EUC_KR,
295                Encoding::Auto => {
296                    // UTF-8 유효성 검사
297                    if std::str::from_utf8(&first_bytes).is_ok() {
298                        UTF_8
299                    } else {
300                        EUC_KR
301                    }
302                }
303            };
304
305            let (decoded, _, had_errors) = encoding.decode(&all_bytes);
306            if had_errors && self.verbose {
307                tracing::warn!("Encoding errors detected during decoding");
308            }
309
310            Ok(decoded.into_owned())
311        }
312
313        /// CSV 내용 파싱
314        ///
315        /// # Errors
316        ///
317        /// Returns an error if:
318        /// - CSV content is malformed
319        /// - Fields cannot be parsed to the expected types (`left_id`, `right_id`, cost)
320        pub fn parse_csv_content(&self, content: &str) -> Result<Vec<CsvEntry>> {
321            let mut entries = Vec::new();
322            let mut csv_reader = csv::ReaderBuilder::new()
323                .has_headers(false)
324                .flexible(true)
325                .comment(Some(b'#'))
326                .from_reader(content.as_bytes());
327
328            for (line_num, result) in csv_reader.records().enumerate() {
329                let record = result.map_err(BuildError::Csv)?;
330
331                if record.len() < 12 {
332                    if self.verbose {
333                        tracing::warn!(
334                            "Line {}: Expected 12 fields, got {}. Skipping.",
335                            line_num + 1,
336                            record.len()
337                        );
338                    }
339                    continue;
340                }
341
342                let mut entry = CsvEntry {
343                    surface: record[0].to_string(),
344                    left_id: record[1].parse().map_err(|_| {
345                        BuildError::Format(format!("Invalid left_id at line {}", line_num + 1))
346                    })?,
347                    right_id: record[2].parse().map_err(|_| {
348                        BuildError::Format(format!("Invalid right_id at line {}", line_num + 1))
349                    })?,
350                    cost: record[3].parse().map_err(|_| {
351                        BuildError::Format(format!("Invalid cost at line {}", line_num + 1))
352                    })?,
353                    pos: record[4].to_string(),
354                    pos_detail: record[5].to_string(),
355                    jongseong: record[6].to_string(),
356                    reading: record[7].to_string(),
357                    entry_type: record[8].to_string(),
358                    first_pos: record[9].to_string(),
359                    last_pos: record[10].to_string(),
360                    expression: record[11].to_string(),
361                };
362
363                entry.normalize_jongseong();
364                entries.push(entry);
365            }
366
367            Ok(entries)
368        }
369    }
370}
371
372/// 사전 빌드 파이프라인
373///
374/// CSV → 바이너리 사전 변환
375#[allow(clippy::mixed_attributes_style)]
376pub mod builder {
377    //! 사전 빌더 구현
378
379    use super::char_def_parser::CharDef;
380    use super::csv_parser::{CsvEntry, CsvParser, Encoding};
381    use super::unk_def_parser::UnkDef;
382    use super::{BuildError, Result};
383    use mecab_ko_dict::dictionary::DictEntry;
384    use mecab_ko_dict::matrix::{DenseMatrix, Matrix};
385    use mecab_ko_dict::trie::TrieBuilder;
386    use std::collections::HashMap;
387    use std::path::Path;
388
    /// Build configuration.
    #[derive(Debug, Clone)]
    pub struct BuildConfig {
        /// Input directory
        pub input_dir: String,
        /// Output directory
        pub output_dir: String,
        /// Compression level (0-22, 0 = no compression)
        pub compression_level: i32,
        /// Input file encoding
        pub encoding: Encoding,
        /// Whether to emit verbose logging
        pub verbose: bool,
    }
403
404    impl Default for BuildConfig {
405        fn default() -> Self {
406            Self {
407                input_dir: ".".to_string(),
408                output_dir: "./dict".to_string(),
409                compression_level: 3,
410                encoding: Encoding::Auto,
411                verbose: false,
412            }
413        }
414    }
415
    /// Dictionary builder; holds the configuration used by every build step.
    pub struct DictionaryBuilder {
        config: BuildConfig,
    }
420
421    impl DictionaryBuilder {
        /// Creates a builder that uses the given configuration.
        #[must_use]
        pub const fn new(config: BuildConfig) -> Self {
            Self { config }
        }
427
428        /// 사전 빌드 실행
429        ///
430        /// # Errors
431        ///
432        /// Returns an error if:
433        /// - CSV files cannot be parsed
434        /// - `matrix.def` is missing or malformed
435        /// - Trie building fails
436        /// - Output files cannot be written
437        pub fn build(&self) -> Result<BuildResult> {
438            if self.config.verbose {
439                tracing::info!("Starting dictionary build");
440                tracing::info!("  Input: {}", self.config.input_dir);
441                tracing::info!("  Output: {}", self.config.output_dir);
442            }
443
444            // 1. CSV 파싱
445            let csv_entries = self.parse_csv_files()?;
446
447            // 2. matrix.def 파싱
448            let matrix = self.build_matrix()?;
449
450            // 3. char.def 파싱 (선택적)
451            let char_def = self.build_char_def().ok();
452
453            // 4. unk.def 파싱 (선택적)
454            let unk_def = self.build_unk_def().ok();
455
456            // 5. Trie 및 엔트리 빌드
457            let (trie_bytes, dict_entries) = self.build_trie_and_entries(&csv_entries)?;
458
459            // 6. 바이너리 출력
460            self.save_dictionary(
461                &trie_bytes,
462                &matrix,
463                &dict_entries,
464                char_def.as_ref(),
465                unk_def.as_ref(),
466            )?;
467
468            Ok(BuildResult {
469                entry_count: dict_entries.len(),
470                trie_size: trie_bytes.len(),
471                matrix_size: matrix.left_size() * matrix.right_size(),
472            })
473        }
474
475        /// CSV 파일 파싱
476        fn parse_csv_files(&self) -> Result<Vec<CsvEntry>> {
477            if self.config.verbose {
478                tracing::info!("Parsing CSV files...");
479            }
480
481            let parser = CsvParser::new(&self.config.input_dir)
482                .with_encoding(self.config.encoding)
483                .verbose(self.config.verbose);
484
485            parser.parse_all()
486        }
487
488        /// 연접 비용 행렬 빌드
489        fn build_matrix(&self) -> Result<DenseMatrix> {
490            let matrix_path = Path::new(&self.config.input_dir).join("matrix.def");
491
492            if self.config.verbose {
493                tracing::info!("Loading connection matrix from {}", matrix_path.display());
494            }
495
496            if !matrix_path.exists() {
497                return Err(BuildError::Format(format!(
498                    "matrix.def not found: {}",
499                    matrix_path.display()
500                )));
501            }
502
503            DenseMatrix::from_def_file(&matrix_path).map_err(BuildError::Dict)
504        }
505
506        /// char.def 파싱
507        fn build_char_def(&self) -> Result<CharDef> {
508            let char_def_path = Path::new(&self.config.input_dir).join("char.def");
509
510            if self.config.verbose {
511                tracing::info!("Loading char.def from {}", char_def_path.display());
512            }
513
514            if !char_def_path.exists() {
515                if self.config.verbose {
516                    tracing::warn!("char.def not found, skipping");
517                }
518                return Err(BuildError::Format("char.def not found".to_string()));
519            }
520
521            CharDef::from_file(&char_def_path)
522        }
523
524        /// unk.def 파싱
525        fn build_unk_def(&self) -> Result<UnkDef> {
526            let unk_def_path = Path::new(&self.config.input_dir).join("unk.def");
527
528            if self.config.verbose {
529                tracing::info!("Loading unk.def from {}", unk_def_path.display());
530            }
531
532            if !unk_def_path.exists() {
533                if self.config.verbose {
534                    tracing::warn!("unk.def not found, skipping");
535                }
536                return Err(BuildError::Format("unk.def not found".to_string()));
537            }
538
539            UnkDef::from_file(&unk_def_path)
540        }
541
542        /// Trie 및 사전 엔트리 빌드
543        ///
544        /// # Errors
545        ///
546        /// Returns an error if the trie cannot be built from the entries.
547        pub fn build_trie_and_entries(
548            &self,
549            csv_entries: &[CsvEntry],
550        ) -> Result<(Vec<u8>, Vec<DictEntry>)> {
551            if self.config.verbose {
552                tracing::info!("Building trie and dictionary entries...");
553            }
554
555            // 표면형별로 엔트리 그룹화
556            let mut surface_map: HashMap<String, Vec<&CsvEntry>> = HashMap::new();
557            for entry in csv_entries {
558                surface_map
559                    .entry(entry.surface.clone())
560                    .or_default()
561                    .push(entry);
562            }
563
564            // DictEntry 생성 (정렬하여 인덱스 일관성 보장)
565            let mut dict_entries = Vec::new();
566            let mut trie_entries = Vec::new();
567
568            let mut surfaces: Vec<_> = surface_map.keys().collect();
569            surfaces.sort();
570
571            for surface in surfaces {
572                let entries = &surface_map[surface];
573
574                // 이 표면형의 첫 번째 인덱스를 Trie 값으로 사용
575                #[allow(clippy::cast_possible_truncation)]
576                let first_index = dict_entries.len() as u32;
577                trie_entries.push((surface.as_str(), first_index));
578
579                // 모든 엔트리를 DictEntry로 변환
580                for csv_entry in entries {
581                    let dict_entry = DictEntry::new(
582                        csv_entry.surface.clone(),
583                        csv_entry.left_id,
584                        csv_entry.right_id,
585                        csv_entry.cost,
586                        csv_entry.to_feature(),
587                    );
588                    dict_entries.push(dict_entry);
589                }
590            }
591
592            if self.config.verbose {
593                tracing::info!("  Unique surfaces: {}", trie_entries.len());
594                tracing::info!("  Total entries: {}", dict_entries.len());
595            }
596
597            // Trie 빌드 (이미 정렬됨)
598            let trie_bytes = TrieBuilder::build(&trie_entries)
599                .map_err(|e| BuildError::Trie(format!("Failed to build trie: {e}")))?;
600
601            if self.config.verbose {
602                tracing::info!("  Trie size: {} bytes", trie_bytes.len());
603            }
604
605            Ok((trie_bytes, dict_entries))
606        }
607
608        /// 사전 파일 저장
609        fn save_dictionary(
610            &self,
611            trie_bytes: &[u8],
612            matrix: &DenseMatrix,
613            dict_entries: &[DictEntry],
614            char_def: Option<&CharDef>,
615            unk_def: Option<&UnkDef>,
616        ) -> Result<()> {
617            let output_dir = Path::new(&self.config.output_dir);
618            std::fs::create_dir_all(output_dir).map_err(BuildError::Io)?;
619
620            if self.config.verbose {
621                tracing::info!("Saving dictionary files to {}", output_dir.display());
622            }
623
624            // Trie 저장
625            let trie_path = output_dir.join("sys.dic");
626            if self.config.compression_level > 0 {
627                if self.config.verbose {
628                    tracing::info!(
629                        "  Saving compressed trie (level {})...",
630                        self.config.compression_level
631                    );
632                }
633                let compressed_path = output_dir.join("sys.dic.zst");
634                TrieBuilder::save_to_compressed_file(
635                    trie_bytes,
636                    &compressed_path,
637                    self.config.compression_level,
638                )
639                .map_err(BuildError::Dict)?;
640
641                if self.config.verbose {
642                    let compressed_size = std::fs::metadata(&compressed_path)
643                        .map_err(BuildError::Io)?
644                        .len();
645                    #[allow(clippy::cast_precision_loss)]
646                    let ratio = (compressed_size as f64 / trie_bytes.len() as f64) * 100.0;
647                    tracing::info!(
648                        "  Compressed trie: {} bytes (ratio: {:.2}%)",
649                        compressed_size,
650                        ratio
651                    );
652                }
653            } else {
654                TrieBuilder::save_to_file(trie_bytes, &trie_path).map_err(BuildError::Dict)?;
655            }
656
657            // Matrix 저장
658            let matrix_path = output_dir.join("matrix.bin");
659            if self.config.compression_level > 0 {
660                if self.config.verbose {
661                    tracing::info!("  Saving compressed matrix...");
662                }
663                let compressed_path = output_dir.join("matrix.bin.zst");
664                matrix
665                    .to_compressed_file(&compressed_path, self.config.compression_level)
666                    .map_err(BuildError::Dict)?;
667            } else {
668                matrix.to_bin_file(&matrix_path).map_err(BuildError::Dict)?;
669            }
670
671            // char.def 저장
672            if let Some(char_def) = char_def {
673                if self.config.verbose {
674                    tracing::info!("  Saving char.def...");
675                }
676                let char_def_bytes = char_def.to_bytes();
677                let char_def_path = output_dir.join("char.bin");
678                std::fs::write(&char_def_path, char_def_bytes).map_err(BuildError::Io)?;
679            }
680
681            // unk.def 저장
682            if let Some(unk_def) = unk_def {
683                if self.config.verbose {
684                    tracing::info!("  Saving unk.def...");
685                }
686                let unk_def_bytes = unk_def.to_bytes();
687                let unk_def_path = output_dir.join("unk.bin");
688                std::fs::write(&unk_def_path, unk_def_bytes).map_err(BuildError::Io)?;
689            }
690
691            // Entries 저장 (바이너리 + CSV)
692            if !dict_entries.is_empty() {
693                if self.config.verbose {
694                    tracing::info!("  Saving entries ({} entries)...", dict_entries.len());
695                }
696
697                // entries.bin (바이너리 포맷)
698                let entries_bin_path = output_dir.join("entries.bin");
699                mecab_ko_dict::dictionary::SystemDictionary::save_entries_bin(
700                    dict_entries,
701                    &entries_bin_path,
702                )
703                .map_err(BuildError::Dict)?;
704
705                // entries.csv (텍스트 포맷, 디버깅/검증용)
706                let entries_csv_path = output_dir.join("entries.csv");
707                mecab_ko_dict::dictionary::SystemDictionary::save_entries_csv(
708                    dict_entries,
709                    &entries_csv_path,
710                )
711                .map_err(BuildError::Dict)?;
712            }
713
714            if self.config.verbose {
715                tracing::info!("Dictionary build completed successfully");
716            }
717
718            Ok(())
719        }
720
721        /// 입력 디렉토리 설정
722        #[must_use]
723        pub fn input_dir<P: AsRef<Path>>(mut self, path: P) -> Self {
724            self.config.input_dir = path.as_ref().to_string_lossy().to_string();
725            self
726        }
727
728        /// 출력 디렉토리 설정
729        #[must_use]
730        pub fn output_dir<P: AsRef<Path>>(mut self, path: P) -> Self {
731            self.config.output_dir = path.as_ref().to_string_lossy().to_string();
732            self
733        }
734
        /// Sets the compression level and returns the updated builder.
        #[must_use]
        pub const fn compression_level(mut self, level: i32) -> Self {
            self.config.compression_level = level;
            self
        }
741    }
742
    /// Summary of a completed build.
    #[derive(Debug, Clone)]
    pub struct BuildResult {
        /// Number of dictionary entries
        pub entry_count: usize,
        /// Trie size in bytes
        pub trie_size: usize,
        /// Matrix size (number of cells)
        pub matrix_size: usize,
    }
753
754    impl BuildResult {
755        /// 결과 출력
756        pub fn print_summary(&self) {
757            println!("\n=== Build Summary ===");
758            println!("Entries:      {}", self.entry_count);
759            println!("Trie size:    {} bytes", self.trie_size);
760            println!("Matrix size:  {} entries", self.matrix_size);
761        }
762    }
763}
764
765#[cfg(test)]
766#[allow(clippy::expect_used, clippy::unwrap_used)]
767mod tests {
768    use super::*;
769    use csv_parser::{CsvParser, Encoding};
770    use tempfile::TempDir;
771
772    #[test]
773    fn test_csv_entry_to_feature() {
774        let entry = csv_parser::CsvEntry {
775            surface: "가다".to_string(),
776            left_id: 1,
777            right_id: 1,
778            cost: 100,
779            pos: "VV".to_string(),
780            pos_detail: "*".to_string(),
781            jongseong: "F".to_string(),
782            reading: "가다".to_string(),
783            entry_type: "*".to_string(),
784            first_pos: "VV".to_string(),
785            last_pos: "VV".to_string(),
786            expression: "*".to_string(),
787        };
788
789        let feature = entry.to_feature();
790        assert!(feature.contains("VV"));
791        assert!(feature.contains("가다"));
792        assert_eq!(feature, "VV,*,F,가다,*,VV,VV,*");
793    }
794
795    #[test]
796    fn test_csv_entry_normalize_jongseong() {
797        let mut entry = csv_parser::CsvEntry {
798            surface: "가방".to_string(),
799            left_id: 1,
800            right_id: 1,
801            cost: 100,
802            pos: "NNG".to_string(),
803            pos_detail: "*".to_string(),
804            jongseong: "*".to_string(),
805            reading: "가방".to_string(),
806            entry_type: "*".to_string(),
807            first_pos: "NNG".to_string(),
808            last_pos: "NNG".to_string(),
809            expression: "*".to_string(),
810        };
811
812        entry.normalize_jongseong();
813        assert_eq!(entry.jongseong, "T"); // 가방 has 종성
814
815        entry.surface = "가다".to_string();
816        entry.jongseong = "*".to_string();
817        entry.normalize_jongseong();
818        assert_eq!(entry.jongseong, "F"); // 가다 no 종성
819    }
820
821    #[test]
822    fn test_csv_parser_basic() {
823        let temp_dir = TempDir::new().expect("failed to create temp dir");
824        let csv_path = temp_dir.path().join("test.csv");
825
826        let csv_content = "가,1,2,100,NNG,*,T,가,*,NNG,NNG,*\n\
827                          가다,2,3,200,VV,*,F,가다,*,VV,VV,*\n\
828                          가방,3,4,300,NNG,*,T,가방,*,NNG,NNG,*\n";
829
830        std::fs::write(&csv_path, csv_content).expect("failed to write test csv");
831
832        let parser = CsvParser::new(temp_dir.path());
833        let entries = parser.parse_file(&csv_path).expect("failed to parse");
834
835        assert_eq!(entries.len(), 3);
836        assert_eq!(entries[0].surface, "가");
837        assert_eq!(entries[0].left_id, 1);
838        assert_eq!(entries[0].cost, 100);
839
840        assert_eq!(entries[1].surface, "가다");
841        assert_eq!(entries[2].surface, "가방");
842    }
843
844    #[test]
845    fn test_csv_parser_encoding() {
846        // 인코딩 설정은 private이므로 builder 패턴이 작동하는지만 확인
847        let _parser = CsvParser::new(".").with_encoding(Encoding::Utf8);
848        let _parser = CsvParser::new(".").with_encoding(Encoding::EucKr);
849        let _parser = CsvParser::new(".");
850        // 파서가 생성되면 테스트 성공
851    }
852
853    #[test]
854    fn test_csv_parser_with_comments() {
855        let csv_content = "# This is a comment\n\
856                          가,1,2,100,NNG,*,T,가,*,NNG,NNG,*\n\
857                          # Another comment\n\
858                          가다,2,3,200,VV,*,F,가다,*,VV,VV,*\n";
859
860        let parser = CsvParser::new(".");
861        let entries = parser
862            .parse_csv_content(csv_content)
863            .expect("failed to parse");
864
865        // 주석은 무시되어야 함
866        assert_eq!(entries.len(), 2);
867    }
868
869    #[test]
870    fn test_builder_creation() {
871        let config = builder::BuildConfig::default();
872        let _builder = DictionaryBuilder::new(config);
873    }
874
875    #[test]
876    fn test_builder_config() {
877        let config = builder::BuildConfig {
878            input_dir: "./input".to_string(),
879            output_dir: "./output".to_string(),
880            compression_level: 5,
881            encoding: Encoding::Utf8,
882            verbose: true,
883        };
884
885        assert_eq!(config.input_dir, "./input");
886        assert_eq!(config.output_dir, "./output");
887        assert_eq!(config.compression_level, 5);
888        assert!(config.verbose);
889    }
890
891    #[test]
892    fn test_full_build_pipeline() {
893        let temp_dir = TempDir::new().expect("failed to create temp dir");
894
895        // 테스트 CSV 파일 생성
896        let csv_path = temp_dir.path().join("test.csv");
897        let csv_content = "가,1,1,100,NNG,*,T,가,*,NNG,NNG,*\n\
898                          가다,1,1,200,VV,*,F,가다,*,VV,VV,*\n\
899                          가방,1,1,150,NNG,*,T,가방,*,NNG,NNG,*\n";
900        std::fs::write(&csv_path, csv_content).expect("failed to write csv");
901
902        // 테스트 matrix.def 파일 생성
903        let matrix_path = temp_dir.path().join("matrix.def");
904        let matrix_content = "2 2\n0 0 100\n0 1 200\n1 0 150\n1 1 50\n";
905        std::fs::write(&matrix_path, matrix_content).expect("failed to write matrix");
906
907        // 출력 디렉토리
908        let output_dir = temp_dir.path().join("output");
909
910        // 빌드 설정
911        let config = builder::BuildConfig {
912            input_dir: temp_dir.path().to_string_lossy().to_string(),
913            output_dir: output_dir.to_string_lossy().to_string(),
914            compression_level: 0, // 테스트에서는 압축 안 함
915            encoding: Encoding::Utf8,
916            verbose: false,
917        };
918
919        // 빌더 실행
920        let builder = DictionaryBuilder::new(config);
921        let result = builder.build().expect("build should succeed");
922
923        // 결과 검증
924        assert_eq!(result.entry_count, 3);
925        assert!(result.trie_size > 0);
926        assert_eq!(result.matrix_size, 4); // 2x2 matrix
927
928        // 출력 파일 확인
929        assert!(output_dir.join("sys.dic").exists());
930        assert!(output_dir.join("matrix.bin").exists());
931    }
932
933    #[test]
934    fn test_trie_and_entries_building() {
935        use mecab_ko_dict::trie::Trie;
936
937        let csv_entries = vec![
938            csv_parser::CsvEntry {
939                surface: "가".to_string(),
940                left_id: 1,
941                right_id: 1,
942                cost: 100,
943                pos: "NNG".to_string(),
944                pos_detail: "*".to_string(),
945                jongseong: "T".to_string(),
946                reading: "가".to_string(),
947                entry_type: "*".to_string(),
948                first_pos: "NNG".to_string(),
949                last_pos: "NNG".to_string(),
950                expression: "*".to_string(),
951            },
952            csv_parser::CsvEntry {
953                surface: "가다".to_string(),
954                left_id: 2,
955                right_id: 2,
956                cost: 200,
957                pos: "VV".to_string(),
958                pos_detail: "*".to_string(),
959                jongseong: "F".to_string(),
960                reading: "가다".to_string(),
961                entry_type: "*".to_string(),
962                first_pos: "VV".to_string(),
963                last_pos: "VV".to_string(),
964                expression: "*".to_string(),
965            },
966        ];
967
968        let temp_dir = TempDir::new().expect("failed to create temp dir");
969        let config = builder::BuildConfig {
970            input_dir: temp_dir.path().to_string_lossy().to_string(),
971            output_dir: temp_dir.path().to_string_lossy().to_string(),
972            compression_level: 0,
973            encoding: Encoding::Utf8,
974            verbose: false,
975        };
976
977        let builder = DictionaryBuilder::new(config);
978        let (trie_bytes, dict_entries) = builder
979            .build_trie_and_entries(&csv_entries)
980            .expect("should build trie");
981
982        // Trie 검증
983        assert!(!trie_bytes.is_empty());
984        let trie = Trie::new(&trie_bytes);
985        assert!(trie.exact_match("가").is_some());
986        assert!(trie.exact_match("가다").is_some());
987        assert!(trie.exact_match("없음").is_none());
988
989        // 엔트리 검증
990        assert_eq!(dict_entries.len(), 2);
991        assert_eq!(dict_entries[0].surface, "가");
992        assert_eq!(dict_entries[1].surface, "가다");
993    }
994
995    #[test]
996    fn test_korean_text_processing() {
997        // 한글 종성 감지 테스트
998        let mut entry1 = csv_parser::CsvEntry {
999            surface: "안녕".to_string(),
1000            left_id: 1,
1001            right_id: 1,
1002            cost: 100,
1003            pos: "NNG".to_string(),
1004            pos_detail: "*".to_string(),
1005            jongseong: "*".to_string(),
1006            reading: "안녕".to_string(),
1007            entry_type: "*".to_string(),
1008            first_pos: "NNG".to_string(),
1009            last_pos: "NNG".to_string(),
1010            expression: "*".to_string(),
1011        };
1012
1013        entry1.normalize_jongseong();
1014        assert_eq!(entry1.jongseong, "T"); // 안녕 has 종성 (ㅇ)
1015
1016        let mut entry2 = csv_parser::CsvEntry {
1017            surface: "하세요".to_string(),
1018            left_id: 1,
1019            right_id: 1,
1020            cost: 100,
1021            pos: "VV".to_string(),
1022            pos_detail: "*".to_string(),
1023            jongseong: "*".to_string(),
1024            reading: "하세요".to_string(),
1025            entry_type: "*".to_string(),
1026            first_pos: "VV".to_string(),
1027            last_pos: "VV".to_string(),
1028            expression: "*".to_string(),
1029        };
1030
1031        entry2.normalize_jongseong();
1032        assert_eq!(entry2.jongseong, "F"); // 하세요 no 종성
1033    }
1034
1035    #[test]
1036    fn test_build_result() {
1037        let result = builder::BuildResult {
1038            entry_count: 1000,
1039            trie_size: 50000,
1040            matrix_size: 400,
1041        };
1042
1043        assert_eq!(result.entry_count, 1000);
1044        assert_eq!(result.trie_size, 50000);
1045        assert_eq!(result.matrix_size, 400);
1046
1047        // print_summary는 단순 출력이므로 패닉이 없어야 함
1048        result.print_summary();
1049    }
1050}