//! lindera-dictionary 3.0.7
//!
//! A morphological dictionary library.
use std::collections::HashMap;
use std::io::{BufReader, Read};
use std::path::Path;

use anyhow::Result;

use super::feature_extractor::FeatureExtractor;
use super::feature_rewriter::DictionaryRewriter;
use crate::dictionary::Dictionary;
use crate::dictionary::character_definition::CharacterDefinition;
use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
use crate::dictionary::metadata::Metadata;
use crate::dictionary::prefix_dictionary::PrefixDictionary;
use crate::dictionary::unknown_dictionary::UnknownDictionary;

/// Configuration for training.
///
/// Bundles the parsed seed lexicon, feature templates, rewrite rules, and
/// unknown-word definitions, and preserves the raw definition-file contents
/// so they can be re-exported after training.
pub struct TrainerConfig {
    /// Working dictionary assembled from the training inputs
    pub(crate) dict: Dictionary,
    /// Surface forms extracted from the seed lexicon, in file order
    pub(crate) surfaces: Vec<String>,
    /// Feature strings for each entry (parallel to surfaces)
    pub(crate) features: Vec<String>,
    /// Maps surface forms to their original feature strings from the lexicon
    pub(crate) surface_features: HashMap<String, String>,
    /// User lexicon entries for additional vocabulary
    pub(crate) user_lexicon: HashMap<String, String>,
    /// Extracts unigram/bigram features according to feature.def templates
    pub(crate) feature_extractor: FeatureExtractor,
    /// Applies rewrite.def rules to feature strings
    pub(crate) dictionary_rewriter: DictionaryRewriter,
    /// Cost factor for converting CRF weights to i16 costs (MeCab's cost-factor)
    pub(crate) cost_factor: i32,
    /// Metadata from which encoding and schema information is derived
    pub(crate) metadata: Metadata,
    /// Maps unknown word category names to their feature strings from unk.def
    /// Format: category -> "pos,feature1,feature2,..."
    pub(crate) unk_categories: HashMap<String, String>,
    /// Maps unknown word category names to their costs from unk.def
    /// Format: category -> cost
    pub(crate) unk_costs: HashMap<String, i32>,
    /// Raw content of the character definition file (char.def)
    /// Preserved from training input for export
    pub(crate) char_def_content: String,
    /// Raw content of the feature definition file (feature.def)
    /// Preserved from training input for export
    pub(crate) feature_def_content: String,
    /// Raw content of the rewrite rule definition file (rewrite.def)
    /// Preserved from training input for export
    pub(crate) rewrite_def_content: String,
}

impl TrainerConfig {
    /// Access system lexicon for morphological analysis
    pub fn system_lexicon(&self) -> &PrefixDictionary {
        &self.dict.prefix_dictionary
    }

    /// Access dictionary (for compatibility)
    pub fn dict(&self) -> &Dictionary {
        &self.dict
    }

    /// Access unknown word handler for out-of-vocabulary processing
    pub fn unk_handler(&self) -> &crate::dictionary::unknown_dictionary::UnknownDictionary {
        &self.dict.unknown_dictionary
    }
}

impl TrainerConfig {
    /// Creates a new trainer configuration from readers.
    ///
    /// Reads all five definition files into memory, extracts surfaces and
    /// feature strings from the lexicon, parses feature templates and the
    /// unknown-word definitions, and keeps the raw contents around for later
    /// export.
    ///
    /// # Arguments
    ///
    /// * `lexicon_rdr` - Reader for the seed lexicon file (lex.csv)
    /// * `char_prop_rdr` - Reader for the character property file (char.def)
    /// * `unk_handler_rdr` - Reader for the unknown word file (unk.def)
    /// * `feature_templates_rdr` - Reader for the feature templates file (feature.def)
    /// * `rewrite_rules_rdr` - Reader for the rewrite rules file (rewrite.def)
    ///
    /// # Errors
    ///
    /// Returns an error if any reader fails (including non-UTF-8 content) or
    /// if the rewrite rules cannot be parsed.
    pub fn from_readers<R1, R2, R3, R4, R5>(
        lexicon_rdr: R1,
        char_prop_rdr: R2,
        unk_handler_rdr: R3,
        feature_templates_rdr: R4,
        rewrite_rules_rdr: R5,
    ) -> Result<Self>
    where
        R1: Read,
        R2: Read,
        R3: Read,
        R4: Read,
        R5: Read,
    {
        // Parse lexicon to extract surfaces and features
        let mut surfaces = Vec::new();
        let mut features = Vec::new();
        let mut surface_features = HashMap::new();
        let mut lexicon_content = String::new();
        {
            let mut lexicon_reader = BufReader::new(lexicon_rdr);
            std::io::Read::read_to_string(&mut lexicon_reader, &mut lexicon_content)?;
        }

        for line in lexicon_content.lines() {
            // NOTE(review): `#` is only recognized at column 0; an indented
            // comment line would be treated as data — confirm intended.
            if line.trim().is_empty() || line.starts_with('#') {
                continue;
            }
            // NOTE(review): naive comma split — quoted CSV fields containing
            // commas are not supported; confirm the source formats never quote.
            let parts: Vec<&str> = line.split(',').collect();

            // Accept any dictionary format with at least 5 columns
            // Format: surface,left_id,right_id,cost,feature1,feature2,...
            // - IPADIC:    13 columns (pos + 8 feature fields)
            // - UniDic:    21+ columns (pos + 16+ feature fields)
            // - ko-dic:    8 columns (pos + 3 feature fields)
            // - CC-CEDICT: 8 columns (pos + 3 feature fields)
            if parts.len() >= 5 {
                let surface = parts[0].to_string();
                // Extract features from columns 4 onwards (skip surface,left_id,right_id,cost)
                // This works for any dictionary format
                let feature_str = parts[4..].join(",");
                surfaces.push(surface.clone());
                features.push(feature_str.clone());
                surface_features.insert(surface, feature_str);
            }
        }

        // Create feature extractor from templates
        let mut feature_content = String::new();
        {
            let mut template_reader = BufReader::new(feature_templates_rdr);
            std::io::Read::read_to_string(&mut template_reader, &mut feature_content)?;
        }

        // Parse templates into unigram and bigram categories
        let mut unigram_templates = Vec::new();
        let mut bigram_templates = Vec::new();

        for line in feature_content.lines() {
            if line.trim().is_empty() || line.starts_with('#') {
                continue;
            }
            // Parse template format: MeCab-compatible feature.def
            // UNIGRAM U00:%F[0]  or  UNIGRAM:%F[0]
            // BIGRAM B00:%L[0]/%R[0]  or  BIGRAM:%L[0]/%R[0]
            if let Some(rest) = line.strip_prefix("UNIGRAM") {
                // Extract the template part after optional label (e.g., "U00:")
                // by skipping ahead to the first '%' placeholder.
                let rest = rest.trim_start().trim_start_matches(':').trim_start();
                let template = if let Some(idx) = rest.find('%') {
                    &rest[idx..]
                } else {
                    rest
                };
                unigram_templates.push(template.to_string());
            } else if let Some(rest) = line.strip_prefix("BIGRAM") {
                // Extract the template part after optional label (e.g., "B00:")
                let rest = rest.trim_start().trim_start_matches(':').trim_start();
                let template = if let Some(idx) = rest.find('%') {
                    &rest[idx..]
                } else {
                    rest
                };
                // Bigram templates must be "left/right"; lines without '/' are dropped.
                if let Some((left, right)) = template.split_once('/') {
                    bigram_templates.push((left.to_string(), right.to_string()));
                }
            } else {
                // Default unigram template (bare template without prefix)
                unigram_templates.push(line.to_string());
            }
        }

        // Create feature extractor with parsed templates
        let feature_extractor =
            FeatureExtractor::from_templates(&unigram_templates, &bigram_templates);

        // Read rewrite rules content
        let mut rewrite_def_content = String::new();
        {
            let mut rewrite_reader = BufReader::new(rewrite_rules_rdr);
            std::io::Read::read_to_string(&mut rewrite_reader, &mut rewrite_def_content)?;
        }

        // Create dictionary rewriter with 3-section support
        let dictionary_rewriter =
            DictionaryRewriter::from_reader(std::io::Cursor::new(rewrite_def_content.as_bytes()))?;

        // Parse unk.def to extract category-to-features mapping
        let mut unk_content = String::new();
        {
            let mut unk_reader = BufReader::new(unk_handler_rdr);
            std::io::Read::read_to_string(&mut unk_reader, &mut unk_content)?;
        }

        let mut unk_categories = HashMap::new();
        let mut unk_costs = HashMap::new();
        for line in unk_content.lines() {
            if line.trim().is_empty() || line.starts_with('#') {
                continue;
            }
            let parts: Vec<&str> = line.split(',').collect();
            // Format: category,left_id,right_id,cost,feature1,feature2,...
            if parts.len() >= 5 {
                let category = parts[0].to_string();
                let features = parts[4..].join(",");
                unk_categories.insert(category.clone(), features);

                // Parse cost (4th column); unparsable costs are silently skipped
                // so the category keeps its features but gets no cost entry.
                if let Ok(cost) = parts[3].parse::<i32>() {
                    unk_costs.insert(category, cost);
                }
            }
        }

        // Read character properties content
        let mut char_def_content = String::new();
        {
            let mut char_prop_reader = BufReader::new(char_prop_rdr);
            std::io::Read::read_to_string(&mut char_prop_reader, &mut char_def_content)?;
        }

        // Build dictionary from readers (need to re-create readers from content strings)
        use std::io::Cursor;
        let dict = Self::build_dictionary_from_readers(
            &lexicon_content,
            Cursor::new(char_def_content.as_bytes()),
            Cursor::new(unk_content.as_bytes()),
        )?;

        Ok(Self {
            dict,
            surfaces,
            features,
            surface_features,
            user_lexicon: HashMap::new(), // Initialize empty user lexicon
            feature_extractor,
            dictionary_rewriter,
            cost_factor: 700,              // MeCab default cost-factor
            metadata: Metadata::default(), // Use default metadata for backward compatibility
            unk_categories,
            unk_costs,
            char_def_content,
            feature_def_content: feature_content,
            rewrite_def_content,
        })
    }

    /// Get the surfaces extracted from the lexicon
    ///
    /// Order matches the seed lexicon file; parallel to the `features` field.
    pub fn surfaces(&self) -> &[String] {
        &self.surfaces
    }

    /// Get the surface features mapping
    ///
    /// Maps each surface form to its original lexicon feature string.
    pub fn surface_features(&self) -> &HashMap<String, String> {
        &self.surface_features
    }

    /// Get the user lexicon mapping
    ///
    /// Surface -> feature-string entries added via `add_user_lexicon_entry`
    /// or `load_user_lexicon_from_content`.
    pub fn user_lexicon(&self) -> &HashMap<String, String> {
        &self.user_lexicon
    }

    /// Add user lexicon entry (user dictionary support)
    ///
    /// Replaces any existing entry for the same surface form.
    pub fn add_user_lexicon_entry(&mut self, surface: String, features: String) {
        self.user_lexicon.insert(surface, features);
    }

    /// Looks up the feature string for a surface form.
    ///
    /// The user lexicon takes precedence over the seed-lexicon features;
    /// returns `None` when the surface is unknown to both.
    pub fn get_features(&self, surface: &str) -> Option<String> {
        match self.user_lexicon.get(surface) {
            Some(features) => Some(features.clone()),
            None => self.surface_features.get(surface).cloned(),
        }
    }

    /// Loads user lexicon entries from CSV content.
    ///
    /// Each line must have at least 5 comma-separated columns
    /// (`surface,left_id,right_id,cost,feature1,...`); shorter lines,
    /// blank lines, and `#`-prefixed comment lines are skipped.
    pub fn load_user_lexicon_from_content(&mut self, content: &str) -> Result<()> {
        let entries = content
            .lines()
            .filter(|line| !line.trim().is_empty() && !line.starts_with('#'))
            .filter_map(|line| {
                let cols: Vec<&str> = line.split(',').collect();
                if cols.len() >= 5 {
                    // Columns 4+ form the feature string (skip surface,left_id,right_id,cost).
                    Some((cols[0].to_string(), cols[4..].join(",")))
                } else {
                    None
                }
            });

        for (surface, features) in entries {
            self.user_lexicon.insert(surface, features);
        }
        Ok(())
    }

    /// Creates a new trainer configuration from file paths.
    ///
    /// Opens each definition file and delegates to [`TrainerConfig::from_readers`].
    ///
    /// # Errors
    ///
    /// Returns an error if any file cannot be opened, or if parsing fails.
    pub fn from_paths(
        lexicon_path: &Path,
        char_prop_path: &Path,
        unk_handler_path: &Path,
        feature_templates_path: &Path,
        rewrite_rules_path: &Path,
    ) -> Result<Self> {
        use std::fs::File;

        // Open in the same order as the reader parameters so the first
        // failing file determines the reported error.
        let lexicon = File::open(lexicon_path)?;
        let char_prop = File::open(char_prop_path)?;
        let unk_handler = File::open(unk_handler_path)?;
        let feature_templates = File::open(feature_templates_path)?;
        let rewrite_rules = File::open(rewrite_rules_path)?;

        Self::from_readers(lexicon, char_prop, unk_handler, feature_templates, rewrite_rules)
    }

    /// Get the metadata
    ///
    /// Returns the metadata captured at construction time
    /// (`from_readers` initializes this to `Metadata::default()`).
    pub fn metadata(&self) -> &Metadata {
        &self.metadata
    }

    /// Builds a training-time [`Dictionary`] from raw definition contents.
    ///
    /// Reads char.def and unk.def fully into memory, then assembles the
    /// character definition, unknown dictionary, prefix dictionary, and a
    /// minimal connection cost matrix.
    fn build_dictionary_from_readers<R2, R3>(
        lexicon_content: &str,
        char_prop_rdr: R2,
        unk_handler_rdr: R3,
    ) -> Result<Dictionary>
    where
        R2: Read,
        R3: Read,
    {
        // Slurp character properties (char.def).
        let mut char_prop_content = String::new();
        BufReader::new(char_prop_rdr).read_to_string(&mut char_prop_content)?;

        // Slurp unknown word definitions (unk.def).
        let mut unk_content = String::new();
        BufReader::new(unk_handler_rdr).read_to_string(&mut unk_content)?;

        // Assemble the dictionary components.
        let character_definition = Self::build_char_def_from_content(&char_prop_content)?;
        let unknown_dictionary =
            Self::build_unknown_dict_from_content(&unk_content, &character_definition)?;
        let prefix_dictionary = Self::build_prefix_dict_from_content(lexicon_content)?;
        let connection_cost_matrix = Self::create_minimal_connection_matrix()?;

        Ok(Dictionary {
            prefix_dictionary,
            connection_cost_matrix,
            character_definition,
            unknown_dictionary,
            metadata: Metadata::default(),
        })
    }

    /// Parses char.def content into a [`CharacterDefinition`].
    ///
    /// Two kinds of lines are recognized:
    /// - category definitions: `NAME invoke group length` (e.g. `HIRAGANA 1 1 0`)
    /// - code-point range mappings: `0xSTART..0xEND NAME`
    ///
    /// Category 0 is always `DEFAULT`; characters not covered by any range
    /// map to it.
    fn build_char_def_from_content(content: &str) -> Result<CharacterDefinition> {
        use crate::dictionary::character_definition::{CategoryData, CategoryId, LookupTable};
        use std::collections::HashMap;

        let mut category_definitions = Vec::new();
        let mut category_names = Vec::new();
        let mut category_map = HashMap::new(); // Name -> Index
        let mut char_ranges = Vec::new();

        // Always add DEFAULT as category 0
        category_names.push("DEFAULT".to_string());
        category_map.insert("DEFAULT".to_string(), 0);
        category_definitions.push(CategoryData {
            invoke: false,
            group: true,
            length: 0,
        });

        // Parse the char.def file
        for line in content.lines() {
            let line = line.trim();

            // Skip comments and empty lines
            if line.is_empty() || line.starts_with('#') {
                continue;
            }

            // Parse character range mappings (e.g., "0x3041..0x3096 HIRAGANA")
            if line.starts_with("0x") {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 2 {
                    let range_str = parts[0];
                    let category = parts[1];

                    // Parse range (e.g., "0x3041..0x3096")
                    // NOTE(review): single-codepoint lines (no "..") are silently
                    // skipped, and both endpoints are assumed to carry a "0x"
                    // prefix (the `[2..]` slice strips it unconditionally) —
                    // confirm this matches all supported char.def dialects.
                    if let Some(range_parts) = range_str.split_once("..") {
                        let start = u32::from_str_radix(&range_parts.0[2..], 16)?;
                        let end = u32::from_str_radix(&range_parts.1[2..], 16)?;

                        // Get or create category index
                        let cat_idx =
                            *category_map.entry(category.to_string()).or_insert_with(|| {
                                let idx = category_names.len();
                                category_names.push(category.to_string());
                                // Default category data - will be overridden if defined
                                category_definitions.push(CategoryData {
                                    invoke: true,
                                    group: true,
                                    length: 0,
                                });
                                idx
                            });

                        char_ranges.push((start, end, cat_idx));
                    }
                }
            } else {
                // Parse category definitions (e.g., "HIRAGANA 1 1 0")
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 4 {
                    let name = parts[0];
                    let invoke = parts[1] != "0";
                    let group = parts[2] != "0";
                    let length = parts[3].parse::<u8>().unwrap_or(0);

                    // Get or create category index
                    let cat_idx = *category_map.entry(name.to_string()).or_insert_with(|| {
                        let idx = category_names.len();
                        category_names.push(name.to_string());
                        category_definitions.push(CategoryData {
                            invoke,
                            group,
                            length: length.into(),
                        });
                        idx
                    });

                    // Update category definition if it already exists
                    // (also harmlessly re-writes a just-inserted entry).
                    if cat_idx < category_definitions.len() {
                        category_definitions[cat_idx] = CategoryData {
                            invoke,
                            group,
                            length: length.into(),
                        };
                    }
                }
            }
        }

        // Sort char ranges by start position
        char_ranges.sort_by_key(|&(start, _, _)| start);

        // Build boundaries and mapping function
        // NOTE(review): overlapping or nested ranges can yield duplicate or
        // non-monotonic boundary values here — presumably LookupTable::from_fn
        // tolerates that; verify against its contract.
        let mut boundaries = vec![0u32];
        for &(start, end, _) in &char_ranges {
            if start > boundaries[boundaries.len() - 1] {
                boundaries.push(start);
            }
            boundaries.push(end + 1);
        }
        if boundaries[boundaries.len() - 1] < 0x10FFFF {
            boundaries.push(0x10FFFF);
        }

        // Create lookup table with proper category mappings
        let ranges_clone = char_ranges.clone();
        let mapping = LookupTable::from_fn(boundaries, &|c, buff| {
            let code = c;

            // Find which category this character belongs to
            // (first matching range wins, in ascending start order).
            for &(start, end, cat_idx) in &ranges_clone {
                if code >= start && code <= end {
                    buff.push(CategoryId(cat_idx));
                    return;
                }
            }

            // Default to category 0 (DEFAULT)
            buff.push(CategoryId(0));
        });

        Ok(CharacterDefinition {
            category_definitions,
            category_names,
            mapping,
        })
    }

    /// Builds a minimal placeholder [`UnknownDictionary`] for training.
    ///
    /// The raw unk.def content is parsed separately in `from_readers`;
    /// costs and word data are populated later during training.
    fn build_unknown_dict_from_content(
        _content: &str,
        _char_def: &CharacterDefinition,
    ) -> Result<UnknownDictionary> {
        // One reference list per basic category (6 categories).
        let category_references = (0..6).map(|_| vec![0]).collect();

        Ok(UnknownDictionary {
            category_references,
            costs: Vec::new(),
            words_idx_data: Vec::new(),
            words_data: Vec::new(),
        })
    }

    fn build_prefix_dict_from_content(_content: &str) -> Result<PrefixDictionary> {
        use crate::util::Data;
        use daachorse::DoubleArrayAhoCorasickBuilder;

        // Create minimal prefix dictionary structure for training
        // In production, this would parse the lexicon CSV format
        let keys: &[&str] = &["\0"];
        let da = DoubleArrayAhoCorasickBuilder::new().build(keys).unwrap();

        Ok(PrefixDictionary {
            da,
            vals_data: Data::from(vec![]),
            words_idx_data: Data::from(vec![]),
            words_data: Data::from(vec![]),
            is_system: true,
        })
    }

    /// Creates a minimal all-zero 6x6 connection cost matrix.
    ///
    /// Layout: forward_size (u16 LE) + backward_size (u16 LE), followed by
    /// one little-endian i16 cost per (forward, backward) pair, all zero.
    fn create_minimal_connection_matrix() -> Result<ConnectionCostMatrix> {
        const MATRIX_SIZE: u16 = 6;
        let cost_bytes = (MATRIX_SIZE as usize) * (MATRIX_SIZE as usize) * 2; // 2 bytes per cost

        // 4-byte header + zeroed cost table, allocated in one go.
        let mut matrix_data = Vec::with_capacity(4 + cost_bytes);
        matrix_data.extend_from_slice(&MATRIX_SIZE.to_le_bytes());
        matrix_data.extend_from_slice(&MATRIX_SIZE.to_le_bytes());
        matrix_data.resize(4 + cost_bytes, 0u8);

        Ok(ConnectionCostMatrix::load(matrix_data)?)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_ipadic_format_13_columns() {
        // IPADIC format: 13 columns
        // (surface,left_id,right_id,cost + 9 feature fields)
        let seed_csv = "東京,0,0,5000,名詞,固有名詞,地域,一般,*,*,東京,トウキョウ,トーキョー\n\
                        行く,1,1,4000,動詞,自立,*,*,五段・カ行促音便,基本形,行く,イク,イク\n";
        let char_def = "DEFAULT 0 1 0\nHIRAGANA 1 1 0\n0x3042..0x3096 HIRAGANA\n";
        let unk_def = "DEFAULT,0,0,1500,名詞,一般,*,*,*,*,*,*,*\n";
        let feature_def = "UNIGRAM:%F[0]\nUNIGRAM:%F[1]\n";
        let rewrite_def = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(seed_csv),
            Cursor::new(char_def),
            Cursor::new(unk_def),
            Cursor::new(feature_def),
            Cursor::new(rewrite_def),
        )
        .unwrap();

        // Both lexicon rows should have been accepted.
        assert_eq!(config.surfaces().len(), 2);
        assert!(config.surfaces().contains(&"東京".to_string()));
        assert!(config.surfaces().contains(&"行く".to_string()));

        // Verify features are correctly extracted (9 fields after surface,left_id,right_id,cost)
        let tokyo_features = config.surface_features().get("東京").unwrap();
        assert_eq!(
            tokyo_features,
            "名詞,固有名詞,地域,一般,*,*,東京,トウキョウ,トーキョー"
        );
    }

    #[test]
    fn test_ko_dic_format_8_columns() {
        // ko-dic format: 8 columns
        // (surface,left_id,right_id,cost + 4 feature fields)
        let seed_csv = "한국,0,0,5000,NNG,Korea,F,han-guk\n\
                        안녕,1,1,4000,NNG,hello,F,an-nyeong\n";
        let char_def = "DEFAULT 0 1 0\nHANGUL 1 1 0\n0xAC00..0xD7A3 HANGUL\n";
        let unk_def = "DEFAULT,0,0,1500,NNG,unknown,F,*\n";
        let feature_def = "UNIGRAM:%F[0]\n";
        let rewrite_def = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(seed_csv),
            Cursor::new(char_def),
            Cursor::new(unk_def),
            Cursor::new(feature_def),
            Cursor::new(rewrite_def),
        )
        .unwrap();

        assert_eq!(config.surfaces().len(), 2);
        assert!(config.surfaces().contains(&"한국".to_string()));
        assert!(config.surfaces().contains(&"안녕".to_string()));

        // Verify features (4 fields after surface,left_id,right_id,cost)
        let korea_features = config.surface_features().get("한국").unwrap();
        assert_eq!(korea_features, "NNG,Korea,F,han-guk");
    }

    #[test]
    fn test_cc_cedict_format_8_columns() {
        // CC-CEDICT format: 8 columns
        // (surface,left_id,right_id,cost + 4 feature fields)
        let seed_csv = "中国,0,0,5000,n,China,*,zhong1guo2\n\
                        你好,1,1,4000,x,hello,*,ni3hao3\n";
        let char_def = "DEFAULT 0 1 0\nHANZI 1 1 0\n0x4E00..0x9FFF HANZI\n";
        let unk_def = "DEFAULT,0,0,1500,n,unknown,*,*\n";
        let feature_def = "UNIGRAM:%F[0]\n";
        let rewrite_def = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(seed_csv),
            Cursor::new(char_def),
            Cursor::new(unk_def),
            Cursor::new(feature_def),
            Cursor::new(rewrite_def),
        )
        .unwrap();

        assert_eq!(config.surfaces().len(), 2);
        assert!(config.surfaces().contains(&"中国".to_string()));
        assert!(config.surfaces().contains(&"你好".to_string()));

        // Verify features (4 fields after surface,left_id,right_id,cost)
        let china_features = config.surface_features().get("中国").unwrap();
        assert_eq!(china_features, "n,China,*,zhong1guo2");
    }

    #[test]
    fn test_unidic_format_21_columns() {
        // UniDic format: 21 columns (simplified example)
        // (surface,left_id,right_id,cost + 17 feature fields)
        let seed_csv = "東京,0,0,5000,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,東京,東京,東京,東京,トウキョウ,トーキョー,東京,東京,1\n";
        let char_def = "DEFAULT 0 1 0\nKANJI 0 0 2\n0x4E00..0x9FFF KANJI\n";
        let unk_def = "DEFAULT,0,0,1500,名詞,普通名詞,一般,*,*,*,*,*,*,*,*,*,*,*,*,*,*\n";
        let feature_def = "UNIGRAM:%F[0]\nUNIGRAM:%F[1]\n";
        let rewrite_def = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(seed_csv),
            Cursor::new(char_def),
            Cursor::new(unk_def),
            Cursor::new(feature_def),
            Cursor::new(rewrite_def),
        )
        .unwrap();

        assert_eq!(config.surfaces().len(), 1);
        assert!(config.surfaces().contains(&"東京".to_string()));

        // Verify features (17 fields after surface,left_id,right_id,cost)
        let tokyo_features = config.surface_features().get("東京").unwrap();
        assert_eq!(
            tokyo_features,
            "名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,東京,東京,東京,東京,トウキョウ,トーキョー,東京,東京,1"
        );
    }

    #[test]
    fn test_mixed_column_counts() {
        // Test that we can handle files with varying column counts
        // (13-column IPADIC, 8-column ko-dic, 8-column CC-CEDICT rows mixed).
        let seed_csv = "東京,0,0,5000,名詞,固有名詞,地域,一般,*,*,東京,トウキョウ,トーキョー\n\
                        한국,1,1,4000,NNG,Korea,F,han-guk\n\
                        中国,2,2,3000,n,China,*,zhong1guo2\n";
        let char_def = "DEFAULT 0 1 0\n";
        let unk_def = "DEFAULT,0,0,1500,*,*,*,*\n";
        let feature_def = "UNIGRAM:%F[0]\n";
        let rewrite_def = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(seed_csv),
            Cursor::new(char_def),
            Cursor::new(unk_def),
            Cursor::new(feature_def),
            Cursor::new(rewrite_def),
        )
        .unwrap();

        assert_eq!(config.surfaces().len(), 3);

        // Each row has different number of feature fields, all should be accepted
        assert_eq!(
            config.surface_features().get("東京").unwrap(),
            "名詞,固有名詞,地域,一般,*,*,東京,トウキョウ,トーキョー"
        );
        assert_eq!(
            config.surface_features().get("한국").unwrap(),
            "NNG,Korea,F,han-guk"
        );
        assert_eq!(
            config.surface_features().get("中国").unwrap(),
            "n,China,*,zhong1guo2"
        );
    }

    #[test]
    fn test_trainer_config_creation() {
        // Test that TrainerConfig can be created with minimal valid data
        let lexicon_data = "外国,0,0,5000,名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n人,1,1,5000,名詞,接尾,一般,*,*,*,人,ジン,ジン\n";
        let char_data = "# char.def placeholder\n";
        let unk_data = "# unk.def placeholder\n";
        let feature_data = "UNIGRAM:%F[0]\nLEFT:%L[0]\nRIGHT:%R[0]\n";
        let rewrite_data = "# rewrite.def placeholder\n";

        let result = TrainerConfig::from_readers(
            Cursor::new(lexicon_data.as_bytes()),
            Cursor::new(char_data.as_bytes()),
            Cursor::new(unk_data.as_bytes()),
            Cursor::new(feature_data.as_bytes()),
            Cursor::new(rewrite_data.as_bytes()),
        );

        // Config creation should now succeed with the fixed implementation
        assert!(result.is_ok());
        let config = result.unwrap();
        // Verify that surfaces were extracted correctly using the getter
        assert_eq!(config.surfaces().len(), 2);
        assert!(config.surfaces().contains(&"外国".to_string()));
        // Fixed: assert the second lexicon entry "人" (the previous check for
        // "" could never hold — parts[0] of an accepted row is the surface
        // itself, so surfaces never contain an empty string here).
        assert!(config.surfaces().contains(&"人".to_string()));
    }

    #[test]
    fn test_unk_categories_ipadic() {
        // Test that unk_categories are correctly extracted for IPADIC format
        let lexicon_data = "東京,0,0,5000,名詞,固有名詞,地域,一般,*,*,東京,トウキョウ,トーキョー\n";
        let char_data = "DEFAULT 0 1 0\nHIRAGANA 1 1 0\n";
        let unk_data = "DEFAULT,0,0,1500,名詞,一般,*,*,*,*,*,*,*\nHIRAGANA,1,1,2000,名詞,代名詞,一般,*,*,*,*,*,*\n";
        let feature_data = "UNIGRAM:%F[0]\n";
        let rewrite_data = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(lexicon_data),
            Cursor::new(char_data),
            Cursor::new(unk_data),
            Cursor::new(feature_data),
            Cursor::new(rewrite_data),
        )
        .unwrap();

        // Verify unk_categories extracted correctly
        // (features are columns 4+ of each unk.def row).
        assert_eq!(config.unk_categories.len(), 2);
        assert_eq!(
            config.unk_categories.get("DEFAULT").unwrap(),
            "名詞,一般,*,*,*,*,*,*,*"
        );
        assert_eq!(
            config.unk_categories.get("HIRAGANA").unwrap(),
            "名詞,代名詞,一般,*,*,*,*,*,*"
        );
    }

    #[test]
    fn test_unk_categories_ko_dic() {
        // Test that unk_categories work for Korean dictionary format
        let lexicon_data = "한국,0,0,5000,NNG,Korea,F,han-guk\n";
        let char_data = "DEFAULT 0 1 0\n";
        let unk_data = "DEFAULT,0,0,1500,NNG,unknown,F,*\n";
        let feature_data = "UNIGRAM:%F[0]\n";
        let rewrite_data = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(lexicon_data),
            Cursor::new(char_data),
            Cursor::new(unk_data),
            Cursor::new(feature_data),
            Cursor::new(rewrite_data),
        )
        .unwrap();

        // Single DEFAULT category with its 4 feature columns preserved.
        assert_eq!(config.unk_categories.len(), 1);
        assert_eq!(
            config.unk_categories.get("DEFAULT").unwrap(),
            "NNG,unknown,F,*"
        );
    }

    #[test]
    fn test_unk_categories_cc_cedict() {
        // Test that unk_categories work for Chinese dictionary format
        let lexicon_data = "中国,0,0,5000,n,China,*,zhong1guo2\n";
        let char_data = "DEFAULT 0 1 0\n";
        let unk_data = "DEFAULT,0,0,1500,n,unknown,*,*\n";
        let feature_data = "UNIGRAM:%F[0]\n";
        let rewrite_data = "*\tUNK\n";

        let config = TrainerConfig::from_readers(
            Cursor::new(lexicon_data),
            Cursor::new(char_data),
            Cursor::new(unk_data),
            Cursor::new(feature_data),
            Cursor::new(rewrite_data),
        )
        .unwrap();

        // Single DEFAULT category with its 4 feature columns preserved.
        assert_eq!(config.unk_categories.len(), 1);
        assert_eq!(
            config.unk_categories.get("DEFAULT").unwrap(),
            "n,unknown,*,*"
        );
    }
}