wetext_rs/
config.rs

//! Configuration types for WeText-RS
2
/// Text normalization operation type.
///
/// Selects the direction of the conversion. The derived `Default` is
/// [`Operator::Tn`].
///
/// `Hash` is derived alongside `Eq` so the operator can be used as a key in
/// hashed collections such as `HashMap` and `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum Operator {
    /// Text Normalization: numbers → words (e.g., "123" → "一百二十三")
    #[default]
    Tn,
    /// Inverse Text Normalization: words → numbers (e.g., "一百二十三" → "123")
    Itn,
}
12
/// Language type.
///
/// The derived `Default` is [`Language::Auto`].
///
/// `Hash` is derived alongside `Eq` so the language can be used as a key in
/// hashed collections such as `HashMap` and `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum Language {
    /// Auto-detect language
    #[default]
    Auto,
    /// English
    En,
    /// Chinese
    Zh,
    /// Japanese
    Ja,
}
26
27/// Normalizer configuration
28#[derive(Debug, Clone, Default)]
29pub struct NormalizerConfig {
30    /// Language setting, Auto means auto-detect
31    pub lang: Language,
32
33    /// Operation type: TN or ITN
34    pub operator: Operator,
35
36    /// Whether to fix English contractions (e.g., "don't" → "do not")
37    pub fix_contractions: bool,
38
39    /// Whether to convert Traditional Chinese to Simplified Chinese
40    pub traditional_to_simple: bool,
41
42    /// Whether to convert full-width characters to half-width
43    pub full_to_half: bool,
44
45    /// Whether to remove interjections (e.g., "嗯", "啊")
46    pub remove_interjections: bool,
47
48    /// Whether to remove punctuation marks
49    pub remove_puncts: bool,
50
51    /// Whether to tag OOV (out-of-vocabulary) words
52    pub tag_oov: bool,
53
54    /// Whether to enable 0-9 digit conversion in ITN
55    pub enable_0_to_9: bool,
56
57    /// Whether to remove erhua (儿化音) (e.g., "哪儿" → "哪")
58    pub remove_erhua: bool,
59}
60
61impl NormalizerConfig {
62    /// Create a new configuration with default values
63    pub fn new() -> Self {
64        Self::default()
65    }
66
67    /// Set the language
68    pub fn with_lang(mut self, lang: Language) -> Self {
69        self.lang = lang;
70        self
71    }
72
73    /// Set the operator
74    pub fn with_operator(mut self, operator: Operator) -> Self {
75        self.operator = operator;
76        self
77    }
78
79    /// Set whether to fix contractions
80    pub fn with_fix_contractions(mut self, fix: bool) -> Self {
81        self.fix_contractions = fix;
82        self
83    }
84
85    /// Set whether to convert traditional to simplified Chinese
86    pub fn with_traditional_to_simple(mut self, convert: bool) -> Self {
87        self.traditional_to_simple = convert;
88        self
89    }
90
91    /// Set whether to convert full-width to half-width
92    pub fn with_full_to_half(mut self, convert: bool) -> Self {
93        self.full_to_half = convert;
94        self
95    }
96
97    /// Set whether to remove interjections
98    pub fn with_remove_interjections(mut self, remove: bool) -> Self {
99        self.remove_interjections = remove;
100        self
101    }
102
103    /// Set whether to remove punctuation
104    pub fn with_remove_puncts(mut self, remove: bool) -> Self {
105        self.remove_puncts = remove;
106        self
107    }
108
109    /// Set whether to remove erhua
110    pub fn with_remove_erhua(mut self, remove: bool) -> Self {
111        self.remove_erhua = remove;
112        self
113    }
114
115    /// Set whether to tag OOV words
116    pub fn with_tag_oov(mut self, tag: bool) -> Self {
117        self.tag_oov = tag;
118        self
119    }
120
121    /// Set whether to enable 0-9 conversion in ITN
122    pub fn with_enable_0_to_9(mut self, enable: bool) -> Self {
123        self.enable_0_to_9 = enable;
124        self
125    }
126}