1#![warn(missing_docs)]
22#![deny(unsafe_code)]
23
24pub use builder::DictionaryBuilder;
25pub use error::{BuildError, Result};
26
27pub mod char_def_parser;
28pub mod inflect_gen;
29pub mod unk_def_parser;
30
31pub mod error {
33 use thiserror::Error;
34
35 #[derive(Error, Debug)]
37 pub enum BuildError {
38 #[error("IO error: {0}")]
40 Io(#[from] std::io::Error),
41
42 #[error("CSV parsing error: {0}")]
44 Csv(#[from] csv::Error),
45
46 #[error("Invalid format: {0}")]
48 Format(String),
49
50 #[error("Encoding error: {0}")]
52 Encoding(String),
53
54 #[error("Trie build error: {0}")]
56 Trie(String),
57
58 #[error("Dictionary error: {0}")]
60 Dict(#[from] mecab_ko_dict::error::DictError),
61 }
62
63 pub type Result<T> = std::result::Result<T, BuildError>;
65}
66
67#[allow(clippy::mixed_attributes_style)]
71pub mod csv_parser {
72 use super::{BuildError, Result};
75 use std::fs::File;
76 use std::io::BufReader;
77 use std::path::Path;
78
79 use encoding_rs::{EUC_KR, UTF_8};
80 use mecab_ko_hangul::has_jongseong;
81
82 #[derive(Debug, Clone)]
87 pub struct CsvEntry {
88 pub surface: String,
90 pub left_id: u16,
92 pub right_id: u16,
94 pub cost: i16,
96 pub pos: String,
98 pub pos_detail: String,
100 pub jongseong: String,
102 pub reading: String,
104 pub entry_type: String,
106 pub first_pos: String,
108 pub last_pos: String,
110 pub expression: String,
112 }
113
114 impl CsvEntry {
115 #[must_use]
119 pub fn to_feature(&self) -> String {
120 format!(
121 "{},{},{},{},{},{},{},{}",
122 self.pos,
123 self.pos_detail,
124 self.jongseong,
125 self.reading,
126 self.entry_type,
127 self.first_pos,
128 self.last_pos,
129 self.expression
130 )
131 }
132
133 pub fn normalize_jongseong(&mut self) {
137 if self.jongseong == "*" && !self.surface.is_empty() {
138 if let Some(last_char) = self.surface.chars().last() {
140 self.jongseong = match has_jongseong(last_char) {
141 Some(true) => "T".to_string(),
142 Some(false) => "F".to_string(),
143 None => "*".to_string(), };
145 }
146 }
147 }
148 }
149
150 pub struct CsvParser {
152 dir_path: String,
154 encoding: Encoding,
156 verbose: bool,
158 }
159
160 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
162 pub enum Encoding {
163 Utf8,
165 EucKr,
167 Auto,
169 }
170
171 impl CsvParser {
172 #[must_use]
174 pub fn new<P: AsRef<Path>>(dir_path: P) -> Self {
175 Self {
176 dir_path: dir_path.as_ref().to_string_lossy().to_string(),
177 encoding: Encoding::Auto,
178 verbose: false,
179 }
180 }
181
182 #[must_use]
184 pub const fn with_encoding(mut self, encoding: Encoding) -> Self {
185 self.encoding = encoding;
186 self
187 }
188
189 #[must_use]
191 pub const fn verbose(mut self, verbose: bool) -> Self {
192 self.verbose = verbose;
193 self
194 }
195
196 pub fn parse_all(&self) -> Result<Vec<CsvEntry>> {
207 let dir = Path::new(&self.dir_path);
208 if !dir.is_dir() {
209 return Err(BuildError::Format(format!(
210 "Directory not found: {}",
211 self.dir_path
212 )));
213 }
214
215 let mut all_entries = Vec::new();
216 let csv_files = Self::find_csv_files(dir)?;
217
218 if self.verbose {
219 tracing::info!("Found {} CSV files", csv_files.len());
220 }
221
222 for csv_file in csv_files {
223 if self.verbose {
224 tracing::debug!("Parsing {}", csv_file.display());
225 }
226
227 let entries = self.parse_file(&csv_file)?;
228 all_entries.extend(entries);
229 }
230
231 if self.verbose {
232 tracing::info!("Parsed {} total entries", all_entries.len());
233 }
234
235 Ok(all_entries)
236 }
237
238 fn find_csv_files(dir: &Path) -> Result<Vec<std::path::PathBuf>> {
240 let mut csv_files = Vec::new();
241
242 for entry in std::fs::read_dir(dir).map_err(BuildError::Io)? {
243 let entry = entry.map_err(BuildError::Io)?;
244 let path = entry.path();
245
246 if path.is_file() {
247 if let Some(ext) = path.extension() {
248 if ext == "csv" {
249 csv_files.push(path);
250 }
251 }
252 }
253 }
254
255 csv_files.sort();
256 Ok(csv_files)
257 }
258
259 pub fn parse_file<P: AsRef<Path>>(&self, path: P) -> Result<Vec<CsvEntry>> {
268 let file = File::open(path.as_ref()).map_err(BuildError::Io)?;
269 let content = self.read_with_encoding(file)?;
270
271 self.parse_csv_content(&content)
272 }
273
274 fn read_with_encoding(&self, file: File) -> Result<String> {
276 use std::io::Read;
277
278 let mut reader = BufReader::new(file);
279 let mut first_bytes = vec![0u8; 1024];
280 let n = reader
281 .by_ref()
282 .take(1024)
283 .read(&mut first_bytes)
284 .map_err(BuildError::Io)?;
285 first_bytes.truncate(n);
286
287 let mut all_bytes = first_bytes.clone();
289 reader.read_to_end(&mut all_bytes).map_err(BuildError::Io)?;
290
291 let encoding = match self.encoding {
293 Encoding::Utf8 => UTF_8,
294 Encoding::EucKr => EUC_KR,
295 Encoding::Auto => {
296 if std::str::from_utf8(&first_bytes).is_ok() {
298 UTF_8
299 } else {
300 EUC_KR
301 }
302 }
303 };
304
305 let (decoded, _, had_errors) = encoding.decode(&all_bytes);
306 if had_errors && self.verbose {
307 tracing::warn!("Encoding errors detected during decoding");
308 }
309
310 Ok(decoded.into_owned())
311 }
312
313 pub fn parse_csv_content(&self, content: &str) -> Result<Vec<CsvEntry>> {
321 let mut entries = Vec::new();
322 let mut csv_reader = csv::ReaderBuilder::new()
323 .has_headers(false)
324 .flexible(true)
325 .comment(Some(b'#'))
326 .from_reader(content.as_bytes());
327
328 for (line_num, result) in csv_reader.records().enumerate() {
329 let record = result.map_err(BuildError::Csv)?;
330
331 if record.len() < 12 {
332 if self.verbose {
333 tracing::warn!(
334 "Line {}: Expected 12 fields, got {}. Skipping.",
335 line_num + 1,
336 record.len()
337 );
338 }
339 continue;
340 }
341
342 let mut entry = CsvEntry {
343 surface: record[0].to_string(),
344 left_id: record[1].parse().map_err(|_| {
345 BuildError::Format(format!("Invalid left_id at line {}", line_num + 1))
346 })?,
347 right_id: record[2].parse().map_err(|_| {
348 BuildError::Format(format!("Invalid right_id at line {}", line_num + 1))
349 })?,
350 cost: record[3].parse().map_err(|_| {
351 BuildError::Format(format!("Invalid cost at line {}", line_num + 1))
352 })?,
353 pos: record[4].to_string(),
354 pos_detail: record[5].to_string(),
355 jongseong: record[6].to_string(),
356 reading: record[7].to_string(),
357 entry_type: record[8].to_string(),
358 first_pos: record[9].to_string(),
359 last_pos: record[10].to_string(),
360 expression: record[11].to_string(),
361 };
362
363 entry.normalize_jongseong();
364 entries.push(entry);
365 }
366
367 Ok(entries)
368 }
369 }
370}
371
372#[allow(clippy::mixed_attributes_style)]
376pub mod builder {
377 use super::char_def_parser::CharDef;
380 use super::csv_parser::{CsvEntry, CsvParser, Encoding};
381 use super::unk_def_parser::UnkDef;
382 use super::{BuildError, Result};
383 use mecab_ko_dict::dictionary::DictEntry;
384 use mecab_ko_dict::matrix::{DenseMatrix, Matrix};
385 use mecab_ko_dict::trie::TrieBuilder;
386 use std::collections::HashMap;
387 use std::path::Path;
388
389 #[derive(Debug, Clone)]
391 pub struct BuildConfig {
392 pub input_dir: String,
394 pub output_dir: String,
396 pub compression_level: i32,
398 pub encoding: Encoding,
400 pub verbose: bool,
402 }
403
404 impl Default for BuildConfig {
405 fn default() -> Self {
406 Self {
407 input_dir: ".".to_string(),
408 output_dir: "./dict".to_string(),
409 compression_level: 3,
410 encoding: Encoding::Auto,
411 verbose: false,
412 }
413 }
414 }
415
416 pub struct DictionaryBuilder {
418 config: BuildConfig,
419 }
420
421 impl DictionaryBuilder {
422 #[must_use]
424 pub const fn new(config: BuildConfig) -> Self {
425 Self { config }
426 }
427
428 pub fn build(&self) -> Result<BuildResult> {
438 if self.config.verbose {
439 tracing::info!("Starting dictionary build");
440 tracing::info!(" Input: {}", self.config.input_dir);
441 tracing::info!(" Output: {}", self.config.output_dir);
442 }
443
444 let csv_entries = self.parse_csv_files()?;
446
447 let matrix = self.build_matrix()?;
449
450 let char_def = self.build_char_def().ok();
452
453 let unk_def = self.build_unk_def().ok();
455
456 let (trie_bytes, dict_entries) = self.build_trie_and_entries(&csv_entries)?;
458
459 self.save_dictionary(
461 &trie_bytes,
462 &matrix,
463 &dict_entries,
464 char_def.as_ref(),
465 unk_def.as_ref(),
466 )?;
467
468 Ok(BuildResult {
469 entry_count: dict_entries.len(),
470 trie_size: trie_bytes.len(),
471 matrix_size: matrix.left_size() * matrix.right_size(),
472 })
473 }
474
475 fn parse_csv_files(&self) -> Result<Vec<CsvEntry>> {
477 if self.config.verbose {
478 tracing::info!("Parsing CSV files...");
479 }
480
481 let parser = CsvParser::new(&self.config.input_dir)
482 .with_encoding(self.config.encoding)
483 .verbose(self.config.verbose);
484
485 parser.parse_all()
486 }
487
488 fn build_matrix(&self) -> Result<DenseMatrix> {
490 let matrix_path = Path::new(&self.config.input_dir).join("matrix.def");
491
492 if self.config.verbose {
493 tracing::info!("Loading connection matrix from {}", matrix_path.display());
494 }
495
496 if !matrix_path.exists() {
497 return Err(BuildError::Format(format!(
498 "matrix.def not found: {}",
499 matrix_path.display()
500 )));
501 }
502
503 DenseMatrix::from_def_file(&matrix_path).map_err(BuildError::Dict)
504 }
505
506 fn build_char_def(&self) -> Result<CharDef> {
508 let char_def_path = Path::new(&self.config.input_dir).join("char.def");
509
510 if self.config.verbose {
511 tracing::info!("Loading char.def from {}", char_def_path.display());
512 }
513
514 if !char_def_path.exists() {
515 if self.config.verbose {
516 tracing::warn!("char.def not found, skipping");
517 }
518 return Err(BuildError::Format("char.def not found".to_string()));
519 }
520
521 CharDef::from_file(&char_def_path)
522 }
523
524 fn build_unk_def(&self) -> Result<UnkDef> {
526 let unk_def_path = Path::new(&self.config.input_dir).join("unk.def");
527
528 if self.config.verbose {
529 tracing::info!("Loading unk.def from {}", unk_def_path.display());
530 }
531
532 if !unk_def_path.exists() {
533 if self.config.verbose {
534 tracing::warn!("unk.def not found, skipping");
535 }
536 return Err(BuildError::Format("unk.def not found".to_string()));
537 }
538
539 UnkDef::from_file(&unk_def_path)
540 }
541
542 pub fn build_trie_and_entries(
548 &self,
549 csv_entries: &[CsvEntry],
550 ) -> Result<(Vec<u8>, Vec<DictEntry>)> {
551 if self.config.verbose {
552 tracing::info!("Building trie and dictionary entries...");
553 }
554
555 let mut surface_map: HashMap<String, Vec<&CsvEntry>> = HashMap::new();
557 for entry in csv_entries {
558 surface_map
559 .entry(entry.surface.clone())
560 .or_default()
561 .push(entry);
562 }
563
564 let mut dict_entries = Vec::new();
566 let mut trie_entries = Vec::new();
567
568 let mut surfaces: Vec<_> = surface_map.keys().collect();
569 surfaces.sort();
570
571 for surface in surfaces {
572 let entries = &surface_map[surface];
573
574 #[allow(clippy::cast_possible_truncation)]
576 let first_index = dict_entries.len() as u32;
577 trie_entries.push((surface.as_str(), first_index));
578
579 for csv_entry in entries {
581 let dict_entry = DictEntry::new(
582 csv_entry.surface.clone(),
583 csv_entry.left_id,
584 csv_entry.right_id,
585 csv_entry.cost,
586 csv_entry.to_feature(),
587 );
588 dict_entries.push(dict_entry);
589 }
590 }
591
592 if self.config.verbose {
593 tracing::info!(" Unique surfaces: {}", trie_entries.len());
594 tracing::info!(" Total entries: {}", dict_entries.len());
595 }
596
597 let trie_bytes = TrieBuilder::build(&trie_entries)
599 .map_err(|e| BuildError::Trie(format!("Failed to build trie: {e}")))?;
600
601 if self.config.verbose {
602 tracing::info!(" Trie size: {} bytes", trie_bytes.len());
603 }
604
605 Ok((trie_bytes, dict_entries))
606 }
607
608 fn save_dictionary(
610 &self,
611 trie_bytes: &[u8],
612 matrix: &DenseMatrix,
613 dict_entries: &[DictEntry],
614 char_def: Option<&CharDef>,
615 unk_def: Option<&UnkDef>,
616 ) -> Result<()> {
617 let output_dir = Path::new(&self.config.output_dir);
618 std::fs::create_dir_all(output_dir).map_err(BuildError::Io)?;
619
620 if self.config.verbose {
621 tracing::info!("Saving dictionary files to {}", output_dir.display());
622 }
623
624 let trie_path = output_dir.join("sys.dic");
626 if self.config.compression_level > 0 {
627 if self.config.verbose {
628 tracing::info!(
629 " Saving compressed trie (level {})...",
630 self.config.compression_level
631 );
632 }
633 let compressed_path = output_dir.join("sys.dic.zst");
634 TrieBuilder::save_to_compressed_file(
635 trie_bytes,
636 &compressed_path,
637 self.config.compression_level,
638 )
639 .map_err(BuildError::Dict)?;
640
641 if self.config.verbose {
642 let compressed_size = std::fs::metadata(&compressed_path)
643 .map_err(BuildError::Io)?
644 .len();
645 #[allow(clippy::cast_precision_loss)]
646 let ratio = (compressed_size as f64 / trie_bytes.len() as f64) * 100.0;
647 tracing::info!(
648 " Compressed trie: {} bytes (ratio: {:.2}%)",
649 compressed_size,
650 ratio
651 );
652 }
653 } else {
654 TrieBuilder::save_to_file(trie_bytes, &trie_path).map_err(BuildError::Dict)?;
655 }
656
657 let matrix_path = output_dir.join("matrix.bin");
659 if self.config.compression_level > 0 {
660 if self.config.verbose {
661 tracing::info!(" Saving compressed matrix...");
662 }
663 let compressed_path = output_dir.join("matrix.bin.zst");
664 matrix
665 .to_compressed_file(&compressed_path, self.config.compression_level)
666 .map_err(BuildError::Dict)?;
667 } else {
668 matrix.to_bin_file(&matrix_path).map_err(BuildError::Dict)?;
669 }
670
671 if let Some(char_def) = char_def {
673 if self.config.verbose {
674 tracing::info!(" Saving char.def...");
675 }
676 let char_def_bytes = char_def.to_bytes();
677 let char_def_path = output_dir.join("char.bin");
678 std::fs::write(&char_def_path, char_def_bytes).map_err(BuildError::Io)?;
679 }
680
681 if let Some(unk_def) = unk_def {
683 if self.config.verbose {
684 tracing::info!(" Saving unk.def...");
685 }
686 let unk_def_bytes = unk_def.to_bytes();
687 let unk_def_path = output_dir.join("unk.bin");
688 std::fs::write(&unk_def_path, unk_def_bytes).map_err(BuildError::Io)?;
689 }
690
691 if !dict_entries.is_empty() {
693 if self.config.verbose {
694 tracing::info!(" Saving entries ({} entries)...", dict_entries.len());
695 }
696
697 let entries_bin_path = output_dir.join("entries.bin");
699 mecab_ko_dict::dictionary::SystemDictionary::save_entries_bin(
700 dict_entries,
701 &entries_bin_path,
702 )
703 .map_err(BuildError::Dict)?;
704
705 let entries_csv_path = output_dir.join("entries.csv");
707 mecab_ko_dict::dictionary::SystemDictionary::save_entries_csv(
708 dict_entries,
709 &entries_csv_path,
710 )
711 .map_err(BuildError::Dict)?;
712 }
713
714 if self.config.verbose {
715 tracing::info!("Dictionary build completed successfully");
716 }
717
718 Ok(())
719 }
720
721 #[must_use]
723 pub fn input_dir<P: AsRef<Path>>(mut self, path: P) -> Self {
724 self.config.input_dir = path.as_ref().to_string_lossy().to_string();
725 self
726 }
727
728 #[must_use]
730 pub fn output_dir<P: AsRef<Path>>(mut self, path: P) -> Self {
731 self.config.output_dir = path.as_ref().to_string_lossy().to_string();
732 self
733 }
734
735 #[must_use]
737 pub const fn compression_level(mut self, level: i32) -> Self {
738 self.config.compression_level = level;
739 self
740 }
741 }
742
743 #[derive(Debug, Clone)]
745 pub struct BuildResult {
746 pub entry_count: usize,
748 pub trie_size: usize,
750 pub matrix_size: usize,
752 }
753
754 impl BuildResult {
755 pub fn print_summary(&self) {
757 println!("\n=== Build Summary ===");
758 println!("Entries: {}", self.entry_count);
759 println!("Trie size: {} bytes", self.trie_size);
760 println!("Matrix size: {} entries", self.matrix_size);
761 }
762 }
763}
764
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;
    use csv_parser::{CsvEntry, CsvParser, Encoding};
    use tempfile::TempDir;

    /// Builds a fixture entry: the reading mirrors the surface, both context
    /// ids are `id`, first/last POS mirror `pos`, and the remaining feature
    /// columns are `*`.
    fn entry(surface: &str, id: u16, cost: i16, pos: &str, jongseong: &str) -> CsvEntry {
        CsvEntry {
            surface: surface.to_string(),
            left_id: id,
            right_id: id,
            cost,
            pos: pos.to_string(),
            pos_detail: "*".to_string(),
            jongseong: jongseong.to_string(),
            reading: surface.to_string(),
            entry_type: "*".to_string(),
            first_pos: pos.to_string(),
            last_pos: pos.to_string(),
            expression: "*".to_string(),
        }
    }

    /// Builder configuration used by the pipeline tests: no compression,
    /// UTF-8 input, quiet.
    fn plain_config(input: &std::path::Path, output: &std::path::Path) -> builder::BuildConfig {
        builder::BuildConfig {
            input_dir: input.to_string_lossy().to_string(),
            output_dir: output.to_string_lossy().to_string(),
            compression_level: 0,
            encoding: Encoding::Utf8,
            verbose: false,
        }
    }

    #[test]
    fn test_csv_entry_to_feature() {
        let feature = entry("가다", 1, 100, "VV", "F").to_feature();

        assert!(feature.contains("VV"));
        assert!(feature.contains("가다"));
        assert_eq!(feature, "VV,*,F,가다,*,VV,VV,*");
    }

    #[test]
    fn test_csv_entry_normalize_jongseong() {
        // Last syllable has a final consonant.
        let mut sample = entry("가방", 1, 100, "NNG", "*");
        sample.normalize_jongseong();
        assert_eq!(sample.jongseong, "T");

        // Last syllable has no final consonant.
        sample.surface = "가다".to_string();
        sample.jongseong = "*".to_string();
        sample.normalize_jongseong();
        assert_eq!(sample.jongseong, "F");
    }

    #[test]
    fn test_csv_parser_basic() {
        let temp_dir = TempDir::new().expect("failed to create temp dir");
        let csv_path = temp_dir.path().join("test.csv");

        let csv_content = concat!(
            "가,1,2,100,NNG,*,T,가,*,NNG,NNG,*\n",
            "가다,2,3,200,VV,*,F,가다,*,VV,VV,*\n",
            "가방,3,4,300,NNG,*,T,가방,*,NNG,NNG,*\n",
        );
        std::fs::write(&csv_path, csv_content).expect("failed to write test csv");

        let entries = CsvParser::new(temp_dir.path())
            .parse_file(&csv_path)
            .expect("failed to parse");

        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].surface, "가");
        assert_eq!(entries[0].left_id, 1);
        assert_eq!(entries[0].cost, 100);

        assert_eq!(entries[1].surface, "가다");
        assert_eq!(entries[2].surface, "가방");
    }

    #[test]
    fn test_csv_parser_encoding() {
        // Construction-only smoke test for each encoding choice.
        let _parser = CsvParser::new(".").with_encoding(Encoding::Utf8);
        let _parser = CsvParser::new(".").with_encoding(Encoding::EucKr);
        let _parser = CsvParser::new(".");
    }

    #[test]
    fn test_csv_parser_with_comments() {
        let csv_content = concat!(
            "# This is a comment\n",
            "가,1,2,100,NNG,*,T,가,*,NNG,NNG,*\n",
            "# Another comment\n",
            "가다,2,3,200,VV,*,F,가다,*,VV,VV,*\n",
        );

        let entries = CsvParser::new(".")
            .parse_csv_content(csv_content)
            .expect("failed to parse");

        assert_eq!(entries.len(), 2);
    }

    #[test]
    fn test_builder_creation() {
        let _builder = DictionaryBuilder::new(builder::BuildConfig::default());
    }

    #[test]
    fn test_builder_config() {
        let config = builder::BuildConfig {
            input_dir: "./input".to_string(),
            output_dir: "./output".to_string(),
            compression_level: 5,
            encoding: Encoding::Utf8,
            verbose: true,
        };

        assert_eq!(config.input_dir, "./input");
        assert_eq!(config.output_dir, "./output");
        assert_eq!(config.compression_level, 5);
        assert!(config.verbose);
    }

    #[test]
    fn test_full_build_pipeline() {
        let temp_dir = TempDir::new().expect("failed to create temp dir");
        let input = temp_dir.path();

        let csv_content = concat!(
            "가,1,1,100,NNG,*,T,가,*,NNG,NNG,*\n",
            "가다,1,1,200,VV,*,F,가다,*,VV,VV,*\n",
            "가방,1,1,150,NNG,*,T,가방,*,NNG,NNG,*\n",
        );
        std::fs::write(input.join("test.csv"), csv_content).expect("failed to write csv");

        let matrix_content = "2 2\n0 0 100\n0 1 200\n1 0 150\n1 1 50\n";
        std::fs::write(input.join("matrix.def"), matrix_content).expect("failed to write matrix");

        let output_dir = input.join("output");
        let builder = DictionaryBuilder::new(plain_config(input, &output_dir));
        let result = builder.build().expect("build should succeed");

        assert_eq!(result.entry_count, 3);
        assert!(result.trie_size > 0);
        assert_eq!(result.matrix_size, 4);

        // Compression level 0 writes the uncompressed file names.
        assert!(output_dir.join("sys.dic").exists());
        assert!(output_dir.join("matrix.bin").exists());
    }

    #[test]
    fn test_trie_and_entries_building() {
        use mecab_ko_dict::trie::Trie;

        let csv_entries = vec![
            entry("가", 1, 100, "NNG", "T"),
            entry("가다", 2, 200, "VV", "F"),
        ];

        let temp_dir = TempDir::new().expect("failed to create temp dir");
        let builder = DictionaryBuilder::new(plain_config(temp_dir.path(), temp_dir.path()));
        let (trie_bytes, dict_entries) = builder
            .build_trie_and_entries(&csv_entries)
            .expect("should build trie");

        assert!(!trie_bytes.is_empty());

        let trie = Trie::new(&trie_bytes);
        assert!(trie.exact_match("가").is_some());
        assert!(trie.exact_match("가다").is_some());
        assert!(trie.exact_match("없음").is_none());

        assert_eq!(dict_entries.len(), 2);
        assert_eq!(dict_entries[0].surface, "가");
        assert_eq!(dict_entries[1].surface, "가다");
    }

    #[test]
    fn test_korean_text_processing() {
        let mut with_final = entry("안녕", 1, 100, "NNG", "*");
        with_final.normalize_jongseong();
        assert_eq!(with_final.jongseong, "T");

        let mut without_final = entry("하세요", 1, 100, "VV", "*");
        without_final.normalize_jongseong();
        assert_eq!(without_final.jongseong, "F");
    }

    #[test]
    fn test_build_result() {
        let result = builder::BuildResult {
            entry_count: 1000,
            trie_size: 50000,
            matrix_size: 400,
        };

        assert_eq!(result.entry_count, 1000);
        assert_eq!(result.trie_size, 50000);
        assert_eq!(result.matrix_size, 400);

        // Must not panic.
        result.print_summary();
    }
}