lnmp_codec/
normalizer.rs

1//! Value normalization system for semantic equivalence.
2//!
3//! This module provides value normalization to ensure semantically equivalent values
4//! produce identical checksums. Normalization rules include:
5//! - Boolean: Convert all representations (true/false, yes/no, 1/0) to canonical form
6//! - Float: Convert -0.0 to 0.0, remove trailing zeros
7//! - String: Apply case transformation based on configuration
8
9use lnmp_core::LnmpValue;
10use lnmp_sfe::SemanticDictionary;
11
/// Case transformation that can be applied to string values.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum StringCaseRule {
    /// Fold the string to lowercase.
    Lower,
    /// Fold the string to uppercase.
    Upper,
    /// Leave the string's case untouched. This is the default rule.
    #[default]
    None,
}
23
24// Default implementation derived via #[derive(Default)]
25
26/// Configuration for value normalization
27#[derive(Debug, Clone)]
28pub struct NormalizationConfig {
29    /// String case transformation rule
30    pub string_case: StringCaseRule,
31    /// Optional decimal precision for floats
32    pub float_precision: Option<usize>,
33    /// Whether to remove trailing zeros from floats
34    pub remove_trailing_zeros: bool,
35    /// Optional semantic dictionary for equivalence normalization
36    pub semantic_dictionary: Option<SemanticDictionary>,
37}
38
39impl Default for NormalizationConfig {
40    fn default() -> Self {
41        Self {
42            string_case: StringCaseRule::None,
43            float_precision: None,
44            remove_trailing_zeros: true,
45            semantic_dictionary: None,
46        }
47    }
48}
49
50/// Value normalizer for semantic equivalence
51#[derive(Debug)]
52pub struct ValueNormalizer {
53    config: NormalizationConfig,
54}
55
56impl ValueNormalizer {
57    /// Creates a new normalizer with the given configuration
58    pub fn new(config: NormalizationConfig) -> Self {
59        Self { config }
60    }
61
62    /// Normalizes a value to its canonical form (no field context).
63    pub fn normalize(&self, value: &LnmpValue) -> LnmpValue {
64        self.normalize_with_fid(None, value)
65    }
66
67    /// Normalizes a value with field context for dictionary-based mapping.
68    pub fn normalize_with_fid(&self, fid: Option<u16>, value: &LnmpValue) -> LnmpValue {
69        match value {
70            LnmpValue::Int(i) => LnmpValue::Int(*i),
71            LnmpValue::Float(f) => LnmpValue::Float(self.normalize_float(*f)),
72            LnmpValue::Bool(b) => LnmpValue::Bool(*b),
73            LnmpValue::String(s) => LnmpValue::String(self.normalize_string_for(fid, s)),
74            LnmpValue::StringArray(arr) => LnmpValue::StringArray(
75                arr.iter()
76                    .map(|s| self.normalize_string_for(fid, s))
77                    .collect(),
78            ),
79            LnmpValue::NestedRecord(record) => LnmpValue::NestedRecord(record.clone()),
80            LnmpValue::NestedArray(records) => LnmpValue::NestedArray(records.clone()),
81            LnmpValue::Embedding(vec) => LnmpValue::Embedding(vec.clone()),
82            LnmpValue::EmbeddingDelta(delta) => LnmpValue::EmbeddingDelta(delta.clone()),
83            LnmpValue::QuantizedEmbedding(qv) => LnmpValue::QuantizedEmbedding(qv.clone()),
84            LnmpValue::IntArray(arr) => LnmpValue::IntArray(arr.clone()),
85            LnmpValue::FloatArray(arr) => {
86                // Normalize each float in the array
87                let normalized_arr = arr.iter().map(|f| self.normalize_float(*f)).collect();
88                LnmpValue::FloatArray(normalized_arr)
89            }
90            LnmpValue::BoolArray(arr) => LnmpValue::BoolArray(arr.clone()),
91        }
92    }
93
94    /// Normalizes boolean representations to canonical form
95    ///
96    /// Converts common boolean representations:
97    /// - "true", "yes", "1" → true
98    /// - "false", "no", "0" → false
99    pub fn normalize_bool(&self, value: &str) -> Option<bool> {
100        match value.to_lowercase().as_str() {
101            "true" | "yes" | "1" => Some(true),
102            "false" | "no" | "0" => Some(false),
103            _ => None,
104        }
105    }
106
107    /// Normalizes float representations
108    ///
109    /// - Converts -0.0 to 0.0
110    /// - Removes trailing zeros after decimal point (if configured)
111    /// - Applies precision rounding (if configured)
112    fn normalize_float(&self, f: f64) -> f64 {
113        // Convert -0.0 to 0.0
114        let mut normalized = if f == 0.0 { 0.0 } else { f };
115
116        // Apply precision if configured
117        if let Some(precision) = self.config.float_precision {
118            let multiplier = 10_f64.powi(precision as i32);
119            normalized = (normalized * multiplier).round() / multiplier;
120        }
121
122        normalized
123    }
124
125    /// Normalizes string representations
126    ///
127    /// Applies case transformation based on configuration
128    fn normalize_string_for(&self, fid: Option<u16>, s: &str) -> String {
129        if let (Some(dict), Some(fid)) = (&self.config.semantic_dictionary, fid) {
130            if let Some(eq) = dict.get_equivalence(fid, s) {
131                return eq.to_string();
132            }
133            if let Some(eq) = dict.get_equivalence_normalized(fid, s) {
134                return eq.to_string();
135            }
136        }
137
138        match self.config.string_case {
139            StringCaseRule::Lower => s.to_lowercase(),
140            StringCaseRule::Upper => s.to_uppercase(),
141            StringCaseRule::None => s.to_string(),
142        }
143    }
144
145    /// Formats a normalized float as a string with trailing zeros removed
146    pub fn format_float(&self, f: f64) -> String {
147        if !self.config.remove_trailing_zeros {
148            return f.to_string();
149        }
150
151        let s = f.to_string();
152
153        // If there's no decimal point, return as-is
154        if !s.contains('.') {
155            return s;
156        }
157
158        // Remove trailing zeros after decimal point
159        let trimmed = s.trim_end_matches('0').trim_end_matches('.');
160        trimmed.to_string()
161    }
162}
163
164impl Default for ValueNormalizer {
165    fn default() -> Self {
166        Self::new(NormalizationConfig::default())
167    }
168}
169
#[cfg(test)]
mod tests {
    // Several fixtures use 3.14 on purpose; it is sample data, not an
    // approximation of PI.
    #![allow(clippy::approx_constant)]

    use super::*;
    use lnmp_core::{LnmpField, LnmpRecord};

    /// Builds a normalizer whose only non-default setting is the case rule.
    fn normalizer_with_case(string_case: StringCaseRule) -> ValueNormalizer {
        ValueNormalizer::new(NormalizationConfig {
            string_case,
            ..NormalizationConfig::default()
        })
    }

    /// Builds a record holding a single field: fid 1 with `Int(42)`.
    fn single_field_record() -> LnmpRecord {
        let mut record = LnmpRecord::new();
        record.add_field(LnmpField {
            fid: 1,
            value: LnmpValue::Int(42),
        });
        record
    }

    #[test]
    fn test_default_config() {
        let NormalizationConfig {
            string_case,
            float_precision,
            remove_trailing_zeros,
            ..
        } = NormalizationConfig::default();

        assert_eq!(string_case, StringCaseRule::None);
        assert_eq!(float_precision, None);
        assert!(remove_trailing_zeros);
    }

    #[test]
    fn test_normalize_int() {
        let result = ValueNormalizer::default().normalize(&LnmpValue::Int(42));
        assert_eq!(result, LnmpValue::Int(42));
    }

    #[test]
    fn test_normalize_bool() {
        let result = ValueNormalizer::default().normalize(&LnmpValue::Bool(true));
        assert_eq!(result, LnmpValue::Bool(true));
    }

    #[test]
    fn test_normalize_bool_from_string() {
        let normalizer = ValueNormalizer::default();

        for truthy in ["true", "True", "TRUE", "yes", "Yes", "1"].iter() {
            assert_eq!(
                normalizer.normalize_bool(truthy),
                Some(true),
                "input: {:?}",
                truthy
            );
        }

        for falsy in ["false", "False", "FALSE", "no", "No", "0"].iter() {
            assert_eq!(
                normalizer.normalize_bool(falsy),
                Some(false),
                "input: {:?}",
                falsy
            );
        }

        for unknown in ["invalid", ""].iter() {
            assert_eq!(
                normalizer.normalize_bool(unknown),
                None,
                "input: {:?}",
                unknown
            );
        }
    }

    #[test]
    fn test_normalize_float_negative_zero() {
        let result = ValueNormalizer::default().normalize(&LnmpValue::Float(-0.0));
        assert_eq!(result, LnmpValue::Float(0.0));
    }

    #[test]
    fn test_normalize_float_positive_zero() {
        let result = ValueNormalizer::default().normalize(&LnmpValue::Float(0.0));
        assert_eq!(result, LnmpValue::Float(0.0));
    }

    #[test]
    fn test_normalize_float_regular() {
        let result = ValueNormalizer::default().normalize(&LnmpValue::Float(3.14));
        assert_eq!(result, LnmpValue::Float(3.14));
    }

    #[test]
    fn test_normalize_float_with_precision() {
        let normalizer = ValueNormalizer::new(NormalizationConfig {
            float_precision: Some(2),
            ..NormalizationConfig::default()
        });

        let result = normalizer.normalize(&LnmpValue::Float(3.14159));
        assert_eq!(result, LnmpValue::Float(3.14));
    }

    #[test]
    fn test_format_float_remove_trailing_zeros() {
        let normalizer = ValueNormalizer::default();
        let cases = [
            (3.140, "3.14"),
            (3.100, "3.1"),
            (3.000, "3"),
            (3.14, "3.14"),
            (0.0, "0"),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(normalizer.format_float(*input), *expected);
        }
    }

    #[test]
    fn test_format_float_keep_trailing_zeros() {
        let normalizer = ValueNormalizer::new(NormalizationConfig {
            remove_trailing_zeros: false,
            ..NormalizationConfig::default()
        });

        assert!(normalizer.format_float(3.14).starts_with("3.14"));
    }

    #[test]
    fn test_normalize_string_no_case() {
        let input = LnmpValue::String("Test".to_string());
        let result = ValueNormalizer::default().normalize(&input);
        assert_eq!(result, LnmpValue::String("Test".to_string()));
    }

    #[test]
    fn test_normalize_string_lowercase() {
        let input = LnmpValue::String("TeSt".to_string());
        let result = normalizer_with_case(StringCaseRule::Lower).normalize(&input);
        assert_eq!(result, LnmpValue::String("test".to_string()));
    }

    #[test]
    fn test_normalize_string_uppercase() {
        let input = LnmpValue::String("TeSt".to_string());
        let result = normalizer_with_case(StringCaseRule::Upper).normalize(&input);
        assert_eq!(result, LnmpValue::String("TEST".to_string()));
    }

    #[test]
    fn test_normalize_string_array() {
        let owned = |items: &[&str]| -> Vec<String> {
            items.iter().map(|item| item.to_string()).collect()
        };

        let input = LnmpValue::StringArray(owned(&["Admin", "Developer", "USER"]));
        let result = normalizer_with_case(StringCaseRule::Lower).normalize(&input);

        assert_eq!(
            result,
            LnmpValue::StringArray(owned(&["admin", "developer", "user"]))
        );
    }

    #[test]
    fn test_normalize_nested_record() {
        let record = single_field_record();
        let input = LnmpValue::NestedRecord(Box::new(record.clone()));

        let result = ValueNormalizer::default().normalize(&input);

        // Nested records come back unchanged.
        assert_eq!(result, LnmpValue::NestedRecord(Box::new(record)));
    }

    #[test]
    fn test_normalize_nested_array() {
        let record = single_field_record();
        let input = LnmpValue::NestedArray(vec![record.clone()]);

        let result = ValueNormalizer::default().normalize(&input);

        // Nested arrays come back unchanged.
        assert_eq!(result, LnmpValue::NestedArray(vec![record]));
    }

    #[test]
    fn test_string_case_rule_default() {
        assert_eq!(StringCaseRule::default(), StringCaseRule::None);
    }
}