matchy_data_format/
lib.rs

1//! Data section encoding and decoding for Paraglob v2
2//!
3//! Provides full MMDB-compatible data encoding for storing pattern-associated data.
4//! Implements the complete MaxMind DB data type specification.
5//!
6//! # Supported Types
7//!
8//! Complete MMDB type support:
9//! - **Pointer**: Reference to another data item (with base handling)
10//! - **String**: UTF-8 text data
11//! - **Double**: 64-bit floating point (IEEE 754)
12//! - **Bytes**: Raw byte arrays
13//! - **Uint16**: Unsigned 16-bit integers
14//! - **Uint32**: Unsigned 32-bit integers
15//! - **Map**: Key-value pairs (string keys)
16//! - **Int32**: Signed 32-bit integers
17//! - **Uint64**: Unsigned 64-bit integers
18//! - **Uint128**: Unsigned 128-bit integers
19//! - **Array**: Ordered lists of values
20//! - **Bool**: Boolean values
21//! - **Float**: 32-bit floating point (IEEE 754)
22//!
23//! # Format
24//!
25//! Uses MMDB encoding: control byte(s) followed by data.
26//! Control byte encodes type (3 bits) and size/payload (5 bits).
27//!
28//! See: <https://maxmind.github.io/MaxMind-DB/>
29
30use chrono::{DateTime, TimeZone, Utc};
31use std::collections::HashMap;
32use std::hash::{Hash, Hasher};
33
/// Extended type ID for Timestamp (Matchy extension, avoids collision with MaxMind types 1-15)
///
/// This is the raw byte written after a type-0 control byte. The decoder adds 7
/// to the raw byte to obtain the logical type number, so 121 corresponds to
/// logical type 128.
const TIMESTAMP_EXTENDED_TYPE: u8 = 121; // Type 128 = 7 + 121
36
37fn try_parse_iso8601(s: &str) -> Option<i64> {
38    DateTime::parse_from_rfc3339(s)
39        .ok()
40        .map(|dt| dt.timestamp())
41}
42
43fn format_iso8601(epoch: i64) -> String {
44    Utc.timestamp_opt(epoch, 0)
45        .single()
46        .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true))
47        .unwrap_or_else(|| format!("{epoch}"))
48}
49
50mod validation;
51pub use validation::{
52    validate_data_section, validate_data_value_pointers, validate_data_value_utf8,
53    validate_value_strings_utf8, DataFormatStats, DataFormatValidationResult,
54    PointerValidationError, PointerValidationResult, PointerValidationStats, MAX_POINTER_DEPTH,
55    MAX_TOTAL_DEPTH,
56};
57
/// A single value stored in the data section.
///
/// Every MMDB data type has a variant here, plus the Matchy-specific
/// [`DataValue::Timestamp`]. The same representation serves standalone .pgb
/// files and MMDB-embedded data.
///
/// `Pointer` never appears in JSON: it is a data-section offset, i.e. an
/// internal detail of the binary format rather than a user-facing type, and the
/// custom serde implementations refuse to serialize it.
#[derive(Debug, Clone, PartialEq)]
pub enum DataValue {
    /// Offset of another data item; internal use only, never emitted to JSON.
    #[allow(dead_code)]
    Pointer(u32),
    /// UTF-8 text.
    String(String),
    /// 64-bit IEEE 754 floating point number.
    Double(f64),
    /// Arbitrary binary payload.
    Bytes(Vec<u8>),
    /// 16-bit unsigned integer.
    Uint16(u16),
    /// 32-bit unsigned integer.
    Uint32(u32),
    /// Map from string keys to values (the MMDB spec only allows string keys).
    Map(HashMap<String, DataValue>),
    /// 32-bit signed integer.
    Int32(i32),
    /// 64-bit unsigned integer.
    Uint64(u64),
    /// 128-bit unsigned integer.
    Uint128(u128),
    /// Ordered sequence of values.
    Array(Vec<DataValue>),
    /// Boolean flag.
    Bool(bool),
    /// 32-bit IEEE 754 floating point number.
    Float(f32),
    /// Unix timestamp: seconds since 1970-01-01 00:00:00 UTC.
    ///
    /// Encoded under Matchy extended type 128; the encoder in this file writes
    /// the value as an 8-byte big-endian integer. In JSON it is written as, and
    /// recognized from, an ISO 8601 string (e.g., "2025-10-02T18:44:31Z"), so
    /// API consumers only ever see text.
    ///
    /// Because this is a Matchy extension to the MMDB format, standard MMDB
    /// readers will not recognize this type.
    Timestamp(i64),
}
104
105// Custom serialization that excludes Pointer (internal format detail)
106impl serde::Serialize for DataValue {
107    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
108    where
109        S: serde::Serializer,
110    {
111        match self {
112            Self::Pointer(_) => Err(serde::ser::Error::custom(
113                "Pointer is an internal type and cannot be serialized to JSON",
114            )),
115            Self::String(s) => serializer.serialize_str(s),
116            Self::Double(d) => serializer.serialize_f64(*d),
117            Self::Bytes(b) => serializer.serialize_bytes(b),
118            Self::Uint16(n) => serializer.serialize_u16(*n),
119            Self::Uint32(n) => serializer.serialize_u32(*n),
120            Self::Map(m) => m.serialize(serializer),
121            Self::Int32(n) => serializer.serialize_i32(*n),
122            Self::Uint64(n) => serializer.serialize_u64(*n),
123            Self::Uint128(n) => serializer.serialize_u128(*n),
124            Self::Array(a) => a.serialize(serializer),
125            Self::Bool(b) => serializer.serialize_bool(*b),
126            Self::Float(f) => serializer.serialize_f32(*f),
127            Self::Timestamp(epoch) => serializer.serialize_str(&format_iso8601(*epoch)),
128        }
129    }
130}
131
132// Custom deserialization that properly handles JSON numbers
133impl<'de> serde::Deserialize<'de> for DataValue {
134    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
135    where
136        D: serde::Deserializer<'de>,
137    {
138        struct DataValueVisitor;
139
140        impl<'de> serde::de::Visitor<'de> for DataValueVisitor {
141            type Value = DataValue;
142
143            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
144                formatter.write_str("a valid MMDB data value")
145            }
146
147            fn visit_bool<E>(self, v: bool) -> Result<DataValue, E> {
148                Ok(DataValue::Bool(v))
149            }
150
151            fn visit_i32<E>(self, v: i32) -> Result<DataValue, E> {
152                Ok(DataValue::Int32(v))
153            }
154
155            fn visit_i64<E>(self, v: i64) -> Result<DataValue, E>
156            where
157                E: serde::de::Error,
158            {
159                // Choose appropriate integer type based on value
160                if v >= 0 {
161                    if v <= i64::from(u16::MAX) {
162                        Ok(DataValue::Uint16(u16::try_from(v).unwrap()))
163                    } else if v <= i64::from(u32::MAX) {
164                        Ok(DataValue::Uint32(u32::try_from(v).unwrap()))
165                    } else {
166                        Ok(DataValue::Uint64(u64::try_from(v).unwrap()))
167                    }
168                } else if v >= i64::from(i32::MIN) {
169                    Ok(DataValue::Int32(i32::try_from(v).unwrap()))
170                } else {
171                    // MMDB format only supports Int32 for signed integers.
172                    // Values outside i32 range cannot be stored without precision loss.
173                    Err(serde::de::Error::custom(format!(
174                        "value {v} is outside the supported signed integer range \
175                         ({} to {}). MMDB format only supports Int32. \
176                         Consider using a string or unsigned integer instead.",
177                        i32::MIN,
178                        i32::MAX
179                    )))
180                }
181            }
182
183            fn visit_u64<E>(self, v: u64) -> Result<DataValue, E> {
184                // Choose appropriate unsigned integer type
185                if v <= u64::from(u16::MAX) {
186                    Ok(DataValue::Uint16(u16::try_from(v).unwrap()))
187                } else if v <= u64::from(u32::MAX) {
188                    Ok(DataValue::Uint32(u32::try_from(v).unwrap()))
189                } else {
190                    Ok(DataValue::Uint64(v))
191                }
192            }
193
194            fn visit_f32<E>(self, v: f32) -> Result<DataValue, E> {
195                Ok(DataValue::Float(v))
196            }
197
198            fn visit_f64<E>(self, v: f64) -> Result<DataValue, E> {
199                Ok(DataValue::Double(v))
200            }
201
202            fn visit_str<E>(self, v: &str) -> Result<DataValue, E> {
203                if let Some(epoch) = try_parse_iso8601(v) {
204                    return Ok(DataValue::Timestamp(epoch));
205                }
206                Ok(DataValue::String(v.to_string()))
207            }
208
209            fn visit_string<E>(self, v: String) -> Result<DataValue, E> {
210                if let Some(epoch) = try_parse_iso8601(&v) {
211                    return Ok(DataValue::Timestamp(epoch));
212                }
213                Ok(DataValue::String(v))
214            }
215
216            fn visit_bytes<E>(self, v: &[u8]) -> Result<DataValue, E> {
217                Ok(DataValue::Bytes(v.to_vec()))
218            }
219
220            fn visit_byte_buf<E>(self, v: Vec<u8>) -> Result<DataValue, E> {
221                Ok(DataValue::Bytes(v))
222            }
223
224            fn visit_seq<A>(self, mut seq: A) -> Result<DataValue, A::Error>
225            where
226                A: serde::de::SeqAccess<'de>,
227            {
228                let mut array = Vec::new();
229                while let Some(value) = seq.next_element()? {
230                    array.push(value);
231                }
232                Ok(DataValue::Array(array))
233            }
234
235            fn visit_map<A>(self, mut map: A) -> Result<DataValue, A::Error>
236            where
237                A: serde::de::MapAccess<'de>,
238            {
239                let mut hash_map = HashMap::new();
240                while let Some((key, value)) = map.next_entry()? {
241                    hash_map.insert(key, value);
242                }
243                Ok(DataValue::Map(hash_map))
244            }
245        }
246
247        deserializer.deserialize_any(DataValueVisitor)
248    }
249}
250
251// Implement Hash for DataValue to enable fast deduplication
252impl Hash for DataValue {
253    fn hash<H: Hasher>(&self, state: &mut H) {
254        // Hash the discriminant first
255        std::mem::discriminant(self).hash(state);
256
257        match self {
258            Self::Pointer(v) | Self::Uint32(v) => v.hash(state),
259            Self::String(v) => v.hash(state),
260            Self::Double(v) => {
261                // For floats, hash the bit representation to handle NaN consistently
262                v.to_bits().hash(state);
263            }
264            Self::Bytes(v) => v.hash(state),
265            Self::Uint16(v) => v.hash(state),
266            Self::Map(m) => {
267                // Hash maps require sorted keys for deterministic hashing
268                let mut keys: Vec<&String> = m.keys().collect();
269                keys.sort_unstable();
270                keys.len().hash(state);
271                for key in keys {
272                    key.hash(state);
273                    m[key].hash(state);
274                }
275            }
276            Self::Int32(v) => v.hash(state),
277            Self::Uint64(v) => v.hash(state),
278            Self::Uint128(v) => v.hash(state),
279            Self::Array(v) => {
280                v.len().hash(state);
281                for item in v {
282                    item.hash(state);
283                }
284            }
285            Self::Bool(v) => v.hash(state),
286            Self::Float(v) => {
287                // For floats, hash the bit representation to handle NaN consistently
288                v.to_bits().hash(state);
289            }
290            Self::Timestamp(v) => v.hash(state),
291        }
292    }
293}
294
/// Builder for an encoded data section.
///
/// Values are appended to an internal byte buffer and addressed by their
/// offset in that buffer. Two space-saving mechanisms are supported:
/// whole-value deduplication (encoding an identical value again returns the
/// earlier offset) and string interning (repeated strings are written as
/// pointers to their first occurrence).
pub struct DataEncoder {
    /// The data section bytes produced so far.
    buffer: Vec<u8>,
    /// Deduplication index: serialized form of a value -> offset where it was written.
    dedup_map: HashMap<Vec<u8>, u32>,
    /// Interning index: string content -> offset of its first encoded occurrence.
    string_cache: HashMap<String, u32>,
    /// Whether string interning is active (on by default).
    intern_strings: bool,
}
310
311impl DataEncoder {
312    /// Create a new encoder with string interning enabled
313    #[must_use]
314    pub fn new() -> Self {
315        Self {
316            buffer: Vec::new(),
317            dedup_map: HashMap::new(),
318            string_cache: HashMap::new(),
319            intern_strings: true,
320        }
321    }
322
323    /// Create a new encoder without string interning (legacy behavior)
324    #[must_use]
325    pub fn new_without_interning() -> Self {
326        Self {
327            buffer: Vec::new(),
328            dedup_map: HashMap::new(),
329            string_cache: HashMap::new(),
330            intern_strings: false,
331        }
332    }
333
334    /// Encode a value and return its offset
335    ///
336    /// If the value was previously encoded, returns the existing offset.
337    /// This enables automatic deduplication at the value level.
338    /// String interning happens during encoding for sub-strings within maps/arrays.
339    pub fn encode(&mut self, value: &DataValue) -> u32 {
340        // For whole-value deduplication, we still use the temp buffer approach
341        // But we need to be careful about string interning during serialization
342
343        // Temporarily disable interning for the dedup check
344        let saved_intern = self.intern_strings;
345        self.intern_strings = false;
346
347        let mut temp = Vec::new();
348        Self::encode_to_buffer(value, &mut temp);
349
350        // Restore interning setting
351        self.intern_strings = saved_intern;
352
353        // Check deduplication map
354        if let Some(&offset) = self.dedup_map.get(&temp) {
355            return offset;
356        }
357
358        // New value - encode with interning
359        let offset = u32::try_from(self.buffer.len()).expect("Data section exceeds u32::MAX bytes");
360        self.encode_value_interned(value);
361        self.dedup_map.insert(temp, offset);
362        offset
363    }
364
365    /// Get the final encoded data section
366    #[must_use]
367    pub fn into_bytes(self) -> Vec<u8> {
368        self.buffer
369    }
370
371    /// Get current buffer size
372    #[must_use]
373    pub fn size(&self) -> usize {
374        self.buffer.len()
375    }
376
377    /// Encode a value with string interning
378    ///
379    /// This is the main entry point that handles interning.
380    fn encode_value_interned(&mut self, value: &DataValue) {
381        match value {
382            DataValue::String(s) if self.intern_strings => {
383                // Check if we've seen this string before
384                if let Some(&existing_offset) = self.string_cache.get(s) {
385                    // Use a pointer to the existing string
386                    Self::encode_pointer(existing_offset, &mut self.buffer);
387                } else {
388                    // First occurrence - encode the string and cache its offset
389                    let offset = u32::try_from(self.buffer.len())
390                        .expect("Data section exceeds u32::MAX bytes");
391                    Self::encode_string(s, &mut self.buffer);
392                    self.string_cache.insert(s.clone(), offset);
393                }
394            }
395            DataValue::Map(m) => self.encode_map_interned(m),
396            DataValue::Array(a) => self.encode_array_interned(a),
397            // All other types use the static encoding
398            _ => Self::encode_to_buffer(value, &mut self.buffer),
399        }
400    }
401
402    /// Encode a value to a buffer (static version, no interning)
403    fn encode_to_buffer(value: &DataValue, buffer: &mut Vec<u8>) {
404        match value {
405            DataValue::Pointer(offset) => Self::encode_pointer(*offset, buffer),
406            DataValue::String(s) => Self::encode_string(s, buffer),
407            DataValue::Double(d) => Self::encode_double(*d, buffer),
408            DataValue::Bytes(b) => Self::encode_bytes(b, buffer),
409            DataValue::Uint16(n) => Self::encode_uint16(*n, buffer),
410            DataValue::Uint32(n) => Self::encode_uint32(*n, buffer),
411            DataValue::Map(m) => Self::encode_map(m, buffer),
412            DataValue::Int32(n) => Self::encode_int32(*n, buffer),
413            DataValue::Uint64(n) => Self::encode_uint64(*n, buffer),
414            DataValue::Uint128(n) => Self::encode_uint128(*n, buffer),
415            DataValue::Array(a) => Self::encode_array(a, buffer),
416            DataValue::Bool(b) => Self::encode_bool(*b, buffer),
417            DataValue::Float(f) => Self::encode_float(*f, buffer),
418            DataValue::Timestamp(t) => Self::encode_timestamp(*t, buffer),
419        }
420    }
421
422    // Type 1: Pointer
423    fn encode_pointer(offset: u32, buffer: &mut Vec<u8>) {
424        let size = if offset < 2048 {
425            0 // 11 bits (0-2047)
426        } else if offset < 2048 + 524288 {
427            1 // 19 bits (2048-526335)
428        } else if offset < 2048 + 524288 + 134217728 {
429            2 // 27 bits (526336-134744063)
430        } else {
431            3 // 32 bits
432        };
433
434        match size {
435            0 => {
436                // 11 bits: 3 bits in control byte (high bits) + 8 bits in next byte (low bits)
437                // Decode reconstructs as: (low_3_bits << 8) | next_byte
438                let high_3_bits = ((offset >> 8) & 0x7) as u8;
439                let low_8_bits = (offset & 0xFF) as u8;
440                let ctrl = 0x20 | high_3_bits; // Type 1, size 0, high 3 bits
441                buffer.push(ctrl);
442                buffer.push(low_8_bits);
443            }
444            1 => {
445                // 19 bits: 3 bits in control byte + 16 bits in next 2 bytes, offset by 2048
446                // Decode reconstructs as: 2048 + ((low_3_bits << 16) | (b0 << 8) | b1)
447                let adjusted = offset - 2048;
448                let high_3_bits = ((adjusted >> 16) & 0x7) as u8;
449                let mid_8_bits = ((adjusted >> 8) & 0xFF) as u8;
450                let low_8_bits = (adjusted & 0xFF) as u8;
451                let ctrl = 0x20 | (1 << 3) | high_3_bits; // Type 1, size 1, high 3 bits
452                buffer.push(ctrl);
453                buffer.push(mid_8_bits);
454                buffer.push(low_8_bits);
455            }
456            2 => {
457                // 27 bits: 3 bits in control byte + 24 bits in next 3 bytes, offset by 526336
458                // Decode reconstructs as: 526336 + ((low_3_bits << 24) | (b0 << 16) | (b1 << 8) | b2)
459                let adjusted = offset - 526336;
460                let high_3_bits = ((adjusted >> 24) & 0x7) as u8;
461                let b0 = ((adjusted >> 16) & 0xFF) as u8;
462                let b1 = ((adjusted >> 8) & 0xFF) as u8;
463                let b2 = (adjusted & 0xFF) as u8;
464                let ctrl = 0x20 | (2 << 3) | high_3_bits; // Type 1, size 2, high 3 bits
465                buffer.push(ctrl);
466                buffer.push(b0);
467                buffer.push(b1);
468                buffer.push(b2);
469            }
470            _ => {
471                // 32 bits: payload bits ignored, full 32 bits in next 4 bytes
472                let ctrl = 0x20 | (3 << 3); // Type 1, size 3, payload bits unused
473                buffer.push(ctrl);
474                buffer.extend_from_slice(&offset.to_be_bytes());
475            }
476        }
477    }
478
479    // Type 2: String (UTF-8)
480    fn encode_string(s: &str, buffer: &mut Vec<u8>) {
481        let bytes = s.as_bytes();
482        Self::encode_with_size(2, bytes.len(), buffer);
483        buffer.extend_from_slice(bytes);
484    }
485
486    // Type 3: Double (IEEE 754, 64-bit)
487    fn encode_double(d: f64, buffer: &mut Vec<u8>) {
488        buffer.push(0x68); // Type 3 << 5, size 8
489        buffer.extend_from_slice(&d.to_be_bytes());
490    }
491
492    // Type 4: Bytes (raw binary)
493    fn encode_bytes(b: &[u8], buffer: &mut Vec<u8>) {
494        Self::encode_with_size(4, b.len(), buffer);
495        buffer.extend_from_slice(b);
496    }
497
498    // Type 5: Uint16
499    fn encode_uint16(n: u16, buffer: &mut Vec<u8>) {
500        buffer.push(0xA2); // Type 5 << 5, size 2
501        buffer.extend_from_slice(&n.to_be_bytes());
502    }
503
504    // Type 6: Uint32
505    fn encode_uint32(n: u32, buffer: &mut Vec<u8>) {
506        buffer.push(0xC4); // Type 6 << 5, size 4
507        buffer.extend_from_slice(&n.to_be_bytes());
508    }
509
510    // Type 7: Map (with interning)
511    fn encode_map_interned(&mut self, m: &HashMap<String, DataValue>) {
512        Self::encode_with_size(7, m.len(), &mut self.buffer);
513
514        // Encode key-value pairs (sorted by key for deterministic output)
515        let mut pairs: Vec<_> = m.iter().collect();
516        pairs.sort_by_key(|(k, _)| *k);
517
518        for (key, value) in pairs {
519            // Intern the map key
520            if self.intern_strings {
521                if let Some(&existing_offset) = self.string_cache.get(key) {
522                    Self::encode_pointer(existing_offset, &mut self.buffer);
523                } else {
524                    let offset = u32::try_from(self.buffer.len())
525                        .expect("Data section exceeds u32::MAX bytes");
526                    Self::encode_string(key, &mut self.buffer);
527                    self.string_cache.insert(key.clone(), offset);
528                }
529            } else {
530                Self::encode_string(key, &mut self.buffer);
531            }
532
533            // Recursively encode value with interning
534            self.encode_value_interned(value);
535        }
536    }
537
538    // Type 7: Map (static version, no interning)
539    fn encode_map(m: &HashMap<String, DataValue>, buffer: &mut Vec<u8>) {
540        Self::encode_with_size(7, m.len(), buffer);
541
542        // Encode key-value pairs (sorted by key for deterministic output)
543        let mut pairs: Vec<_> = m.iter().collect();
544        pairs.sort_by_key(|(k, _)| *k);
545
546        for (key, value) in pairs {
547            Self::encode_string(key, buffer);
548            Self::encode_to_buffer(value, buffer);
549        }
550    }
551
552    // Extended types (type 0)
553
554    // Type 8: Int32 (extended type 1)
555    fn encode_int32(n: i32, buffer: &mut Vec<u8>) {
556        buffer.push(0x04); // Type 0 << 5, size 4
557        buffer.push(0x01); // Extended type: 8 - 7 = 1
558        buffer.extend_from_slice(&n.to_be_bytes());
559    }
560
561    // Type 9: Uint64 (extended type 2)
562    fn encode_uint64(n: u64, buffer: &mut Vec<u8>) {
563        buffer.push(0x08); // Type 0 << 5, size 8
564        buffer.push(0x02); // Extended type: 9 - 7 = 2
565        buffer.extend_from_slice(&n.to_be_bytes());
566    }
567
568    // Type 10: Uint128 (extended type 3)
569    fn encode_uint128(n: u128, buffer: &mut Vec<u8>) {
570        buffer.push(0x10); // Type 0 << 5, size 16
571        buffer.push(0x03); // Extended type: 10 - 7 = 3
572        buffer.extend_from_slice(&n.to_be_bytes());
573    }
574
575    // Type 11: Array (with interning)
576    fn encode_array_interned(&mut self, a: &[DataValue]) {
577        let size = a.len();
578
579        // Control byte: type 0 << 5 | size bits
580        if size < 29 {
581            self.buffer.push(u8::try_from(size).unwrap());
582        } else if size < 29 + 256 {
583            self.buffer.push(29);
584            self.buffer.push(u8::try_from(size - 29).unwrap());
585        } else if size < 29 + 256 + 65536 {
586            self.buffer.push(30);
587            let adjusted = size - 29 - 256;
588            self.buffer
589                .extend_from_slice(&u16::try_from(adjusted).unwrap().to_be_bytes());
590        } else {
591            self.buffer.push(31);
592            let adjusted = size - 29 - 256 - 65536;
593            self.buffer
594                .extend_from_slice(&u32::try_from(adjusted).unwrap().to_be_bytes()[1..]);
595        }
596
597        // Extended type byte
598        self.buffer.push(0x04); // 11 - 7 = 4
599
600        // Recursively encode each element with interning
601        for value in a {
602            self.encode_value_interned(value);
603        }
604    }
605
606    // Type 11: Array (static version, no interning)
607    fn encode_array(a: &[DataValue], buffer: &mut Vec<u8>) {
608        // Extended type encoding:
609        // First byte: control byte with type 0 and size
610        // Second byte: raw extended type number (11 - 7 = 4)
611        let size = a.len();
612
613        // Control byte: type 0 << 5 | size bits
614        if size < 29 {
615            buffer.push(u8::try_from(size).unwrap());
616        } else if size < 29 + 256 {
617            buffer.push(29);
618            buffer.push(u8::try_from(size - 29).unwrap());
619        } else if size < 29 + 256 + 65536 {
620            buffer.push(30);
621            let adjusted = size - 29 - 256;
622            buffer.extend_from_slice(&u16::try_from(adjusted).unwrap().to_be_bytes());
623        } else {
624            buffer.push(31);
625            let adjusted = size - 29 - 256 - 65536;
626            buffer.extend_from_slice(&u32::try_from(adjusted).unwrap().to_be_bytes()[1..]);
627        }
628
629        // Extended type byte
630        buffer.push(0x04); // 11 - 7 = 4
631
632        for value in a {
633            Self::encode_to_buffer(value, buffer);
634        }
635    }
636
637    // Type 14: Bool (extended type 7)
638    fn encode_bool(b: bool, buffer: &mut Vec<u8>) {
639        if b {
640            buffer.push(0x01); // Type 0 << 5, size 1
641        } else {
642            buffer.push(0x00); // Type 0 << 5, size 0
643        }
644        buffer.push(0x07); // Extended type: 14 - 7 = 7
645    }
646
647    // Type 15: Float (IEEE 754, 32-bit) (extended type 8)
648    fn encode_float(f: f32, buffer: &mut Vec<u8>) {
649        buffer.push(0x04); // Type 0 << 5, size 4
650        buffer.push(0x08); // Extended type: 15 - 7 = 8
651        buffer.extend_from_slice(&f.to_be_bytes());
652    }
653
654    // Type 128: Timestamp (Matchy extension, extended type 121)
655    fn encode_timestamp(epoch: i64, buffer: &mut Vec<u8>) {
656        buffer.push(0x08); // Type 0 << 5, size 8
657        buffer.push(TIMESTAMP_EXTENDED_TYPE);
658        buffer.extend_from_slice(&epoch.to_be_bytes());
659    }
660
661    /// Encode control byte with size for standard types
662    fn encode_with_size(type_id: u8, size: usize, buffer: &mut Vec<u8>) {
663        let type_bits = type_id << 5;
664
665        if size < 29 {
666            buffer.push(type_bits | u8::try_from(size).unwrap());
667        } else if size < 29 + 256 {
668            buffer.push(type_bits | 29);
669            buffer.push(u8::try_from(size - 29).unwrap());
670        } else if size < 29 + 256 + 65536 {
671            buffer.push(type_bits | 30);
672            let adjusted = size - 29 - 256;
673            buffer.extend_from_slice(&u16::try_from(adjusted).unwrap().to_be_bytes());
674        } else {
675            buffer.push(type_bits | 31);
676            let adjusted = size - 29 - 256 - 65536;
677            buffer.extend_from_slice(&u32::try_from(adjusted).unwrap().to_be_bytes()[1..]);
678        }
679    }
680}
681
682impl Default for DataEncoder {
683    fn default() -> Self {
684        Self::new()
685    }
686}
687
/// Reader for an encoded data section.
///
/// Borrows the encoded bytes and decodes values on request. `base_offset` is
/// subtracted from every offset passed to `decode` before the buffer is
/// indexed.
pub struct DataDecoder<'buf> {
    /// Encoded data section bytes.
    buffer: &'buf [u8],
    /// Offset that corresponds to the first byte of `buffer`.
    base_offset: usize,
}
696
697impl<'a> DataDecoder<'a> {
698    /// Create a decoder for a data section
699    ///
700    /// # Arguments
701    /// * `buffer` - The encoded data buffer
702    /// * `base_offset` - Base offset for pointer calculations (0 for standalone data)
703    #[must_use]
704    pub fn new(buffer: &'a [u8], base_offset: usize) -> Self {
705        Self {
706            buffer,
707            base_offset,
708        }
709    }
710
711    /// Decode a value at the given offset
712    pub fn decode(&self, offset: u32) -> Result<DataValue, &'static str> {
713        let mut cursor = offset as usize;
714        if cursor < self.base_offset {
715            return Err("Offset before base");
716        }
717        cursor -= self.base_offset;
718        let value = self.decode_at(&mut cursor)?;
719        // Recursively resolve pointers in the returned value
720        self.resolve_pointers(value)
721    }
722
723    fn decode_at(&self, cursor: &mut usize) -> Result<DataValue, &'static str> {
724        if *cursor >= self.buffer.len() {
725            return Err("Cursor out of bounds");
726        }
727
728        let ctrl = self.buffer[*cursor];
729        *cursor += 1;
730
731        let type_id = ctrl >> 5;
732        let payload = ctrl & 0x1F;
733
734        match type_id {
735            0 => self.decode_extended(cursor, payload),
736            1 => self.decode_pointer(cursor, payload),
737            2 => self.decode_string(cursor, payload),
738            3 => self.decode_double(cursor),
739            4 => self.decode_bytes(cursor, payload),
740            5 => self.decode_uint16(cursor, payload),
741            6 => self.decode_uint32(cursor, payload),
742            7 => self.decode_map(cursor, payload),
743            _ => Err("Invalid type"),
744        }
745    }
746
747    fn decode_extended(
748        &self,
749        cursor: &mut usize,
750        size_from_ctrl: u8,
751    ) -> Result<DataValue, &'static str> {
752        if *cursor >= self.buffer.len() {
753            return Err("Extended type truncated");
754        }
755
756        // The next byte contains the raw extended type number
757        // Actual type = 7 + raw_ext_type (per libmaxminddb)
758        let raw_ext_type = self.buffer[*cursor];
759        let type_id = 7 + raw_ext_type;
760        *cursor += 1;
761
762        match type_id {
763            8 => self.decode_int32(cursor, size_from_ctrl), // Extended type 1
764            9 => self.decode_uint64(cursor, size_from_ctrl), // Extended type 2
765            10 => self.decode_uint128(cursor, size_from_ctrl), // Extended type 3
766            11 => self.decode_array(cursor, size_from_ctrl), // Extended type 4
767            14 => Ok(DataValue::Bool(size_from_ctrl != 0)), // Extended type 7
768            15 => self.decode_float(cursor, size_from_ctrl), // Extended type 8
769            128 => self.decode_timestamp(cursor, size_from_ctrl), // Matchy extension
770            _ => {
771                eprintln!(
772                    "Unknown extended type: raw_ext_type={}, type_id={}, size_from_ctrl={}, offset={}",
773                    raw_ext_type, type_id, size_from_ctrl, *cursor - 1
774                );
775                Err("Unknown extended type")
776            }
777        }
778    }
779
780    fn decode_pointer(&self, cursor: &mut usize, payload: u8) -> Result<DataValue, &'static str> {
781        let size_bits = (payload >> 3) & 0x3; // Extract bits 3-4
782        let offset = match size_bits {
783            0 => {
784                // 11 bits: 3 bits from payload + 8 bits from next byte
785                if *cursor >= self.buffer.len() {
786                    return Err("Pointer data truncated");
787                }
788                let low_3_bits = u32::from(payload & 0x7);
789                let next_byte = u32::from(self.buffer[*cursor]);
790                *cursor += 1;
791                (low_3_bits << 8) | next_byte
792            }
793            1 => {
794                // 19 bits: 3 bits from payload + 16 bits from next 2 bytes, offset by 2048
795                if *cursor + 1 >= self.buffer.len() {
796                    return Err("Pointer data truncated");
797                }
798                let low_3_bits = u32::from(payload & 0x7);
799                let b0 = u32::from(self.buffer[*cursor]);
800                let b1 = u32::from(self.buffer[*cursor + 1]);
801                *cursor += 2;
802                2048 + ((low_3_bits << 16) | (b0 << 8) | b1)
803            }
804            2 => {
805                // 27 bits: 3 bits from payload + 24 bits from next 3 bytes, offset by 526336
806                if *cursor + 2 >= self.buffer.len() {
807                    return Err("Pointer data truncated");
808                }
809                let low_3_bits = u32::from(payload & 0x7);
810                let b0 = u32::from(self.buffer[*cursor]);
811                let b1 = u32::from(self.buffer[*cursor + 1]);
812                let b2 = u32::from(self.buffer[*cursor + 2]);
813                *cursor += 3;
814                526336 + ((low_3_bits << 24) | (b0 << 16) | (b1 << 8) | b2)
815            }
816            3 => {
817                // 32 bits: payload bits ignored, full 32 bits from next 4 bytes
818                if *cursor + 3 >= self.buffer.len() {
819                    return Err("Pointer data truncated");
820                }
821                let mut bytes = [0u8; 4];
822                bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 4]);
823                *cursor += 4;
824                u32::from_be_bytes(bytes)
825            }
826            _ => return Err("Invalid pointer size"),
827        };
828
829        Ok(DataValue::Pointer(offset))
830    }
831
832    fn decode_string(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
833        let len = self.decode_size(cursor, size_bits)?;
834
835        if *cursor + len > self.buffer.len() {
836            return Err("String data out of bounds");
837        }
838
839        let s = std::str::from_utf8(&self.buffer[*cursor..*cursor + len])
840            .map_err(|_| "Invalid UTF-8")?;
841        *cursor += len;
842
843        Ok(DataValue::String(s.to_string()))
844    }
845
846    fn decode_double(&self, cursor: &mut usize) -> Result<DataValue, &'static str> {
847        if *cursor + 8 > self.buffer.len() {
848            return Err("Double data out of bounds");
849        }
850
851        let mut bytes = [0u8; 8];
852        bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 8]);
853        *cursor += 8;
854
855        Ok(DataValue::Double(f64::from_be_bytes(bytes)))
856    }
857
858    fn decode_bytes(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
859        let len = self.decode_size(cursor, size_bits)?;
860
861        if *cursor + len > self.buffer.len() {
862            return Err("Bytes data out of bounds");
863        }
864
865        let bytes = self.buffer[*cursor..*cursor + len].to_vec();
866        *cursor += len;
867
868        Ok(DataValue::Bytes(bytes))
869    }
870
871    fn decode_uint16(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
872        let size = self.decode_size(cursor, size_bits)?;
873
874        if size > 2 {
875            return Err("Uint16 size too large");
876        }
877
878        if *cursor + size > self.buffer.len() {
879            return Err("Uint16 data out of bounds");
880        }
881
882        // Read variable number of bytes and convert to u16
883        let mut value = 0u16;
884        for i in 0..size {
885            value = (value << 8) | u16::from(self.buffer[*cursor + i]);
886        }
887        *cursor += size;
888
889        Ok(DataValue::Uint16(value))
890    }
891
892    fn decode_uint32(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
893        let size = self.decode_size(cursor, size_bits)?;
894
895        if size > 4 {
896            return Err("Uint32 size too large");
897        }
898
899        if *cursor + size > self.buffer.len() {
900            return Err("Uint32 data out of bounds");
901        }
902
903        // Read variable number of bytes and convert to u32
904        let mut value = 0u32;
905        for i in 0..size {
906            value = (value << 8) | u32::from(self.buffer[*cursor + i]);
907        }
908        *cursor += size;
909
910        Ok(DataValue::Uint32(value))
911    }
912
913    fn decode_map(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
914        let count = self.decode_size(cursor, size_bits)?;
915        let mut map = HashMap::new();
916
917        for _ in 0..count {
918            // Decode key - can be String or Pointer (MMDB uses pointers for deduplication)
919            let key_value = self.decode_at(cursor)?;
920            let key = match key_value {
921                DataValue::String(s) => s,
922                DataValue::Pointer(offset) => {
923                    // Follow pointer to get the actual key string
924                    match self.decode(offset)? {
925                        DataValue::String(s) => s,
926                        _ => return Err("Pointer in map key must point to string"),
927                    }
928                }
929                _ => return Err("Map key must be string or pointer to string"),
930            };
931
932            let value = self.decode_at(cursor)?;
933            map.insert(key, value);
934        }
935
936        Ok(DataValue::Map(map))
937    }
938
939    fn decode_int32(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
940        let size = self.decode_size(cursor, size_bits)?;
941
942        if size > 4 {
943            return Err("Int32 size too large");
944        }
945
946        if *cursor + size > self.buffer.len() {
947            return Err("Int32 data out of bounds");
948        }
949
950        // Read variable number of bytes and convert to i32 with sign extension
951        let mut value = 0i32;
952        if size > 0 {
953            // Check if the high bit is set (negative number)
954            let is_negative = (self.buffer[*cursor] & 0x80) != 0;
955
956            if is_negative {
957                // Start with all 1s for sign extension
958                value = -1;
959            }
960
961            for i in 0..size {
962                value = (value << 8) | i32::from(self.buffer[*cursor + i]);
963            }
964        }
965        *cursor += size;
966
967        Ok(DataValue::Int32(value))
968    }
969
970    fn decode_uint64(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
971        let size = self.decode_size(cursor, size_bits)?;
972
973        if size > 8 {
974            return Err("Uint64 size too large");
975        }
976
977        if *cursor + size > self.buffer.len() {
978            return Err("Uint64 data out of bounds");
979        }
980
981        // Read variable number of bytes and convert to u64
982        let mut value = 0u64;
983        for i in 0..size {
984            value = (value << 8) | u64::from(self.buffer[*cursor + i]);
985        }
986        *cursor += size;
987
988        Ok(DataValue::Uint64(value))
989    }
990
991    fn decode_uint128(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
992        let size = self.decode_size(cursor, size_bits)?;
993
994        if size > 16 {
995            return Err("Uint128 size too large");
996        }
997
998        if *cursor + size > self.buffer.len() {
999            return Err("Uint128 data out of bounds");
1000        }
1001
1002        // Read variable number of bytes and convert to u128
1003        let mut value = 0u128;
1004        for i in 0..size {
1005            value = (value << 8) | u128::from(self.buffer[*cursor + i]);
1006        }
1007        *cursor += size;
1008
1009        Ok(DataValue::Uint128(value))
1010    }
1011
1012    fn decode_array(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
1013        let count = self.decode_size(cursor, size_bits)?;
1014        let mut array = Vec::with_capacity(count);
1015
1016        for _ in 0..count {
1017            array.push(self.decode_at(cursor)?);
1018        }
1019
1020        Ok(DataValue::Array(array))
1021    }
1022
1023    fn decode_float(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
1024        // Float should always be 4 bytes
1025        if size_bits != 4 {
1026            return Err("Float must be 4 bytes");
1027        }
1028
1029        if *cursor + 4 > self.buffer.len() {
1030            return Err("Float data out of bounds");
1031        }
1032
1033        let mut bytes = [0u8; 4];
1034        bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 4]);
1035        *cursor += 4;
1036
1037        Ok(DataValue::Float(f32::from_be_bytes(bytes)))
1038    }
1039
1040    fn decode_timestamp(
1041        &self,
1042        cursor: &mut usize,
1043        size_bits: u8,
1044    ) -> Result<DataValue, &'static str> {
1045        if size_bits != 8 {
1046            return Err("Timestamp must be 8 bytes");
1047        }
1048
1049        if *cursor + 8 > self.buffer.len() {
1050            return Err("Timestamp data out of bounds");
1051        }
1052
1053        let mut bytes = [0u8; 8];
1054        bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 8]);
1055        *cursor += 8;
1056
1057        Ok(DataValue::Timestamp(i64::from_be_bytes(bytes)))
1058    }
1059
1060    fn decode_size(&self, cursor: &mut usize, size_bits: u8) -> Result<usize, &'static str> {
1061        match size_bits {
1062            0..=28 => Ok(size_bits as usize),
1063            29 => {
1064                if *cursor >= self.buffer.len() {
1065                    return Err("Size byte out of bounds");
1066                }
1067                let size = self.buffer[*cursor] as usize;
1068                *cursor += 1;
1069                Ok(29 + size)
1070            }
1071            30 => {
1072                if *cursor + 2 > self.buffer.len() {
1073                    return Err("Size bytes out of bounds");
1074                }
1075                let mut bytes = [0u8; 2];
1076                bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 2]);
1077                *cursor += 2;
1078                Ok(29 + 256 + u16::from_be_bytes(bytes) as usize)
1079            }
1080            31 => {
1081                if *cursor + 3 > self.buffer.len() {
1082                    return Err("Size bytes out of bounds");
1083                }
1084                let b0 = self.buffer[*cursor] as usize;
1085                let b1 = self.buffer[*cursor + 1] as usize;
1086                let b2 = self.buffer[*cursor + 2] as usize;
1087                *cursor += 3;
1088                Ok(29 + 256 + 65536 + ((b0 << 16) | (b1 << 8) | b2))
1089            }
1090            _ => Err("Invalid size encoding"),
1091        }
1092    }
1093
1094    /// Recursively resolve all pointers in a decoded value
1095    fn resolve_pointers(&self, value: DataValue) -> Result<DataValue, &'static str> {
1096        match value {
1097            DataValue::Pointer(offset) => {
1098                // Follow the pointer and recursively resolve
1099                let mut cursor = offset as usize;
1100                if cursor < self.base_offset {
1101                    return Err("Pointer offset before base");
1102                }
1103                cursor -= self.base_offset;
1104                let pointed_value = self.decode_at(&mut cursor)?;
1105                self.resolve_pointers(pointed_value)
1106            }
1107            DataValue::Map(entries) => {
1108                // Recursively resolve pointers in map values
1109                let mut resolved_map = HashMap::new();
1110                for (key, val) in entries {
1111                    resolved_map.insert(key, self.resolve_pointers(val)?);
1112                }
1113                Ok(DataValue::Map(resolved_map))
1114            }
1115            DataValue::Array(items) => {
1116                // Recursively resolve pointers in array elements
1117                let mut resolved_array = Vec::new();
1118                for item in items {
1119                    resolved_array.push(self.resolve_pointers(item)?);
1120                }
1121                Ok(DataValue::Array(resolved_array))
1122            }
1123            // All other types have no pointers to resolve
1124            other => Ok(other),
1125        }
1126    }
1127}
1128
1129#[cfg(test)]
1130mod tests {
1131    use super::*;
1132
1133    #[test]
1134    fn test_encode_decode_all_types() {
1135        let mut encoder = DataEncoder::new();
1136
1137        // Test each type
1138        let string_val = DataValue::String("hello".to_string());
1139        let uint16_val = DataValue::Uint16(12345);
1140        let uint32_val = DataValue::Uint32(0xDEADBEEF);
1141        let uint64_val = DataValue::Uint64(0x123456789ABCDEF0);
1142        let uint128_val = DataValue::Uint128(0x0123456789ABCDEF0123456789ABCDEF);
1143        let int32_val = DataValue::Int32(-42);
1144        let double_val = DataValue::Double(std::f64::consts::PI);
1145        let float_val = DataValue::Float(std::f32::consts::E);
1146        let bool_val = DataValue::Bool(true);
1147        let bytes_val = DataValue::Bytes(vec![0xDE, 0xAD, 0xBE, 0xEF]);
1148
1149        let offsets = [
1150            encoder.encode(&string_val),
1151            encoder.encode(&uint16_val),
1152            encoder.encode(&uint32_val),
1153            encoder.encode(&uint64_val),
1154            encoder.encode(&uint128_val),
1155            encoder.encode(&int32_val),
1156            encoder.encode(&double_val),
1157            encoder.encode(&float_val),
1158            encoder.encode(&bool_val),
1159            encoder.encode(&bytes_val),
1160        ];
1161
1162        let bytes = encoder.into_bytes();
1163        let decoder = DataDecoder::new(&bytes, 0);
1164
1165        let values = vec![
1166            string_val,
1167            uint16_val,
1168            uint32_val,
1169            uint64_val,
1170            uint128_val,
1171            int32_val,
1172            double_val,
1173            float_val,
1174            bool_val,
1175            bytes_val,
1176        ];
1177
1178        for (offset, expected) in offsets.iter().zip(values.iter()) {
1179            let decoded = decoder.decode(*offset).unwrap();
1180            assert_eq!(&decoded, expected);
1181        }
1182    }
1183
1184    #[test]
1185    fn test_encode_decode_map() {
1186        let mut encoder = DataEncoder::new();
1187        let mut map = HashMap::new();
1188        map.insert("country".to_string(), DataValue::String("US".to_string()));
1189        map.insert("asn".to_string(), DataValue::Uint32(13335));
1190        map.insert("score".to_string(), DataValue::Double(0.95));
1191
1192        let value = DataValue::Map(map.clone());
1193        let offset = encoder.encode(&value);
1194
1195        let bytes = encoder.into_bytes();
1196        let decoder = DataDecoder::new(&bytes, 0);
1197        let decoded = decoder.decode(offset).unwrap();
1198
1199        assert_eq!(decoded, value);
1200    }
1201
1202    #[test]
1203    fn test_encode_decode_array() {
1204        let mut encoder = DataEncoder::new();
1205        let value = DataValue::Array(vec![
1206            DataValue::String("tag1".to_string()),
1207            DataValue::String("tag2".to_string()),
1208            DataValue::Uint32(123),
1209            DataValue::Bool(false),
1210        ]);
1211        let offset = encoder.encode(&value);
1212
1213        let bytes = encoder.into_bytes();
1214        let decoder = DataDecoder::new(&bytes, 0);
1215        let decoded = decoder.decode(offset).unwrap();
1216
1217        assert_eq!(decoded, value);
1218    }
1219
1220    #[test]
1221    fn test_deduplication() {
1222        let mut encoder = DataEncoder::new();
1223
1224        // Encode same value multiple times
1225        let value = DataValue::String("test".to_string());
1226        let offset1 = encoder.encode(&value);
1227        let offset2 = encoder.encode(&value);
1228        let offset3 = encoder.encode(&value);
1229
1230        // Should get same offset (deduplicated)
1231        assert_eq!(offset1, offset2);
1232        assert_eq!(offset2, offset3);
1233
1234        // Different value gets different offset
1235        let value2 = DataValue::String("different".to_string());
1236        let offset4 = encoder.encode(&value2);
1237        assert_ne!(offset1, offset4);
1238    }
1239
1240    #[test]
1241    fn test_complex_nested_structure() {
1242        let mut encoder = DataEncoder::new();
1243
1244        // Build threat intelligence data structure
1245        let mut threat_data = HashMap::new();
1246        threat_data.insert(
1247            "threat_level".to_string(),
1248            DataValue::String("high".to_string()),
1249        );
1250        threat_data.insert(
1251            "category".to_string(),
1252            DataValue::String("malware".to_string()),
1253        );
1254        threat_data.insert("confidence".to_string(), DataValue::Float(0.98));
1255        threat_data.insert("first_seen".to_string(), DataValue::Uint64(1704067200));
1256
1257        let mut indicators = HashMap::new();
1258        indicators.insert("ip_count".to_string(), DataValue::Uint32(42));
1259        indicators.insert("domain_count".to_string(), DataValue::Uint32(15));
1260
1261        threat_data.insert("indicators".to_string(), DataValue::Map(indicators));
1262        threat_data.insert(
1263            "tags".to_string(),
1264            DataValue::Array(vec![
1265                DataValue::String("botnet".to_string()),
1266                DataValue::String("c2".to_string()),
1267            ]),
1268        );
1269        threat_data.insert("active".to_string(), DataValue::Bool(true));
1270
1271        let value = DataValue::Map(threat_data);
1272        let offset = encoder.encode(&value);
1273
1274        let bytes = encoder.into_bytes();
1275        let decoder = DataDecoder::new(&bytes, 0);
1276        let decoded = decoder.decode(offset).unwrap();
1277
1278        assert_eq!(decoded, value);
1279    }
1280
1281    #[test]
1282    fn test_large_strings() {
1283        let mut encoder = DataEncoder::new();
1284
1285        // Test string size encodings
1286        let short = "x".repeat(28); // < 29
1287        let medium = "x".repeat(100); // 29..285
1288        let long = "x".repeat(1000); // > 285
1289
1290        let offset1 = encoder.encode(&DataValue::String(short.clone()));
1291        let offset2 = encoder.encode(&DataValue::String(medium.clone()));
1292        let offset3 = encoder.encode(&DataValue::String(long.clone()));
1293
1294        let bytes = encoder.into_bytes();
1295        let decoder = DataDecoder::new(&bytes, 0);
1296
1297        assert_eq!(decoder.decode(offset1).unwrap(), DataValue::String(short));
1298        assert_eq!(decoder.decode(offset2).unwrap(), DataValue::String(medium));
1299        assert_eq!(decoder.decode(offset3).unwrap(), DataValue::String(long));
1300    }
1301
1302    #[test]
1303    fn test_string_interning() {
1304        // Test that repeated strings within structures are interned
1305        let mut encoder = DataEncoder::new();
1306
1307        // Create multiple maps with repeated string values
1308        let mut map1 = HashMap::new();
1309        map1.insert(
1310            "threat_level".to_string(),
1311            DataValue::String("high".to_string()),
1312        );
1313        map1.insert(
1314            "category".to_string(),
1315            DataValue::String("malware".to_string()),
1316        );
1317        map1.insert("score".to_string(), DataValue::Uint32(95));
1318
1319        let mut map2 = HashMap::new();
1320        map2.insert(
1321            "threat_level".to_string(),
1322            DataValue::String("high".to_string()),
1323        ); // Repeated
1324        map2.insert(
1325            "category".to_string(),
1326            DataValue::String("phishing".to_string()),
1327        );
1328        map2.insert("score".to_string(), DataValue::Uint32(88));
1329
1330        let mut map3 = HashMap::new();
1331        map3.insert(
1332            "threat_level".to_string(),
1333            DataValue::String("high".to_string()),
1334        ); // Repeated
1335        map3.insert(
1336            "category".to_string(),
1337            DataValue::String("malware".to_string()),
1338        ); // Repeated
1339        map3.insert("score".to_string(), DataValue::Uint32(92));
1340
1341        // Encode all three maps
1342        let offset1 = encoder.encode(&DataValue::Map(map1.clone()));
1343        let offset2 = encoder.encode(&DataValue::Map(map2.clone()));
1344        let offset3 = encoder.encode(&DataValue::Map(map3.clone()));
1345
1346        let bytes_with_interning = encoder.into_bytes();
1347
1348        // Now encode WITHOUT interning to compare size
1349        let mut encoder_no_intern = DataEncoder::new_without_interning();
1350        encoder_no_intern.encode(&DataValue::Map(map1.clone()));
1351        encoder_no_intern.encode(&DataValue::Map(map2.clone()));
1352        encoder_no_intern.encode(&DataValue::Map(map3.clone()));
1353        let bytes_no_interning = encoder_no_intern.into_bytes();
1354
1355        // Interned version should be smaller
1356        println!("With interning: {} bytes", bytes_with_interning.len());
1357        println!("Without interning: {} bytes", bytes_no_interning.len());
1358        println!(
1359            "Savings: {} bytes ({:.1}%)",
1360            bytes_no_interning.len() - bytes_with_interning.len(),
1361            100.0 * (bytes_no_interning.len() - bytes_with_interning.len()) as f64
1362                / bytes_no_interning.len() as f64
1363        );
1364        assert!(bytes_with_interning.len() < bytes_no_interning.len());
1365
1366        // Verify decoding still works correctly
1367        let decoder = DataDecoder::new(&bytes_with_interning, 0);
1368        let decoded1 = decoder.decode(offset1).unwrap();
1369        let decoded2 = decoder.decode(offset2).unwrap();
1370        let decoded3 = decoder.decode(offset3).unwrap();
1371
1372        assert_eq!(decoded1, DataValue::Map(map1));
1373        assert_eq!(decoded2, DataValue::Map(map2));
1374        assert_eq!(decoded3, DataValue::Map(map3));
1375    }
1376
1377    #[test]
1378    fn test_string_interning_in_arrays() {
1379        // Test interning within arrays
1380        let mut encoder = DataEncoder::new();
1381
1382        let array = DataValue::Array(vec![
1383            DataValue::String("botnet".to_string()),
1384            DataValue::String("c2".to_string()),
1385            DataValue::String("botnet".to_string()), // Repeated
1386            DataValue::String("malware".to_string()),
1387            DataValue::String("c2".to_string()), // Repeated
1388        ]);
1389
1390        let offset = encoder.encode(&array);
1391        let bytes = encoder.into_bytes();
1392
1393        // Decode and verify
1394        let decoder = DataDecoder::new(&bytes, 0);
1395        let decoded = decoder.decode(offset).unwrap();
1396        assert_eq!(decoded, array);
1397    }
1398
1399    #[test]
1400    fn test_pointer_encoding() {
1401        // Test pointer resolution with actual data that pointers reference
1402        let mut encoder = DataEncoder::new();
1403
1404        // First encode some actual data that we'll point to
1405        let target_data = DataValue::String("shared_value".to_string());
1406        let target_offset = encoder.encode(&target_data);
1407
1408        // Now create a map that uses a pointer to reference that data (simulating deduplication)
1409        // In MMDB format, pointers are typically used within maps for deduplicated keys/values
1410        let mut map = HashMap::new();
1411        map.insert(
1412            "direct".to_string(),
1413            DataValue::String("direct_value".to_string()),
1414        );
1415        // Manually insert pointer (in real MMDB, encoder would do this for deduplication)
1416        map.insert("ptr_ref".to_string(), DataValue::Pointer(target_offset));
1417
1418        let map_offset = encoder.encode(&DataValue::Map(map));
1419
1420        let bytes = encoder.into_bytes();
1421        let decoder = DataDecoder::new(&bytes, 0);
1422
1423        // Decode the map - pointers should be automatically resolved
1424        let decoded = decoder.decode(map_offset).unwrap();
1425
1426        if let DataValue::Map(decoded_map) = decoded {
1427            // The pointer should have been resolved to the actual string value
1428            assert_eq!(
1429                decoded_map.get("direct"),
1430                Some(&DataValue::String("direct_value".to_string()))
1431            );
1432            assert_eq!(
1433                decoded_map.get("ptr_ref"),
1434                Some(&DataValue::String("shared_value".to_string()))
1435            );
1436        } else {
1437            panic!("Expected Map, got {decoded:?}");
1438        }
1439    }
1440
1441    #[test]
1442    fn test_large_negative_integer_rejected() {
1443        let json = format!("{}", i64::MIN);
1444        let result: Result<DataValue, _> = serde_json::from_str(&json);
1445        assert!(result.is_err());
1446        let err = result.unwrap_err().to_string();
1447        assert!(err.contains("outside the supported signed integer range"));
1448    }
1449
1450    #[test]
1451    fn test_i32_min_accepted() {
1452        let json = format!("{}", i32::MIN);
1453        let result: Result<DataValue, _> = serde_json::from_str(&json);
1454        assert!(result.is_ok());
1455        assert_eq!(result.unwrap(), DataValue::Int32(i32::MIN));
1456    }
1457
1458    #[test]
1459    fn test_timestamp_binary_roundtrip() {
1460        let mut encoder = DataEncoder::new();
1461        let epoch = 1727894671i64; // 2024-10-02T18:44:31Z
1462        let value = DataValue::Timestamp(epoch);
1463        let offset = encoder.encode(&value);
1464
1465        let bytes = encoder.into_bytes();
1466        let decoder = DataDecoder::new(&bytes, 0);
1467        let decoded = decoder.decode(offset).unwrap();
1468
1469        assert_eq!(decoded, DataValue::Timestamp(epoch));
1470    }
1471
1472    #[test]
1473    fn test_timestamp_json_serialize() {
1474        let value = DataValue::Timestamp(1727894671);
1475        let json = serde_json::to_string(&value).unwrap();
1476        assert_eq!(json, "\"2024-10-02T18:44:31Z\"");
1477    }
1478
1479    #[test]
1480    fn test_timestamp_json_deserialize() {
1481        let json = "\"2024-10-02T18:44:31Z\"";
1482        let value: DataValue = serde_json::from_str(json).unwrap();
1483        assert_eq!(value, DataValue::Timestamp(1727894671));
1484    }
1485
1486    #[test]
1487    fn test_timestamp_with_fractional_seconds() {
1488        let json = "\"2024-10-02T18:44:31.123456Z\"";
1489        let value: DataValue = serde_json::from_str(json).unwrap();
1490        if let DataValue::Timestamp(epoch) = value {
1491            assert_eq!(epoch, 1727894671);
1492        } else {
1493            panic!("Expected Timestamp, got {value:?}");
1494        }
1495    }
1496
1497    #[test]
1498    fn test_non_timestamp_string_stays_string() {
1499        let json = "\"hello world\"";
1500        let value: DataValue = serde_json::from_str(json).unwrap();
1501        assert_eq!(value, DataValue::String("hello world".to_string()));
1502    }
1503
1504    #[test]
1505    fn test_timestamp_negative_epoch() {
1506        let mut encoder = DataEncoder::new();
1507        let epoch = -86400i64; // 1969-12-31
1508        let value = DataValue::Timestamp(epoch);
1509        let offset = encoder.encode(&value);
1510
1511        let bytes = encoder.into_bytes();
1512        let decoder = DataDecoder::new(&bytes, 0);
1513        let decoded = decoder.decode(offset).unwrap();
1514
1515        assert_eq!(decoded, DataValue::Timestamp(epoch));
1516    }
1517
1518    #[test]
1519    fn test_timestamp_in_map() {
1520        let mut encoder = DataEncoder::new();
1521        let mut map = HashMap::new();
1522        map.insert("first_seen".to_string(), DataValue::Timestamp(1727894671));
1523        map.insert("last_seen".to_string(), DataValue::Timestamp(1727981071));
1524        map.insert("name".to_string(), DataValue::String("test".to_string()));
1525
1526        let offset = encoder.encode(&DataValue::Map(map.clone()));
1527
1528        let bytes = encoder.into_bytes();
1529        let decoder = DataDecoder::new(&bytes, 0);
1530        let decoded = decoder.decode(offset).unwrap();
1531
1532        if let DataValue::Map(decoded_map) = decoded {
1533            assert_eq!(
1534                decoded_map.get("first_seen"),
1535                Some(&DataValue::Timestamp(1727894671))
1536            );
1537            assert_eq!(
1538                decoded_map.get("last_seen"),
1539                Some(&DataValue::Timestamp(1727981071))
1540            );
1541            assert_eq!(
1542                decoded_map.get("name"),
1543                Some(&DataValue::String("test".to_string()))
1544            );
1545        } else {
1546            panic!("Expected Map, got {decoded:?}");
1547        }
1548    }
1549}