1use chrono::{DateTime, TimeZone, Utc};
31use std::collections::HashMap;
32use std::hash::{Hash, Hasher};
33
34const TIMESTAMP_EXTENDED_TYPE: u8 = 121; fn try_parse_iso8601(s: &str) -> Option<i64> {
38 DateTime::parse_from_rfc3339(s)
39 .ok()
40 .map(|dt| dt.timestamp())
41}
42
43fn format_iso8601(epoch: i64) -> String {
44 Utc.timestamp_opt(epoch, 0)
45 .single()
46 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true))
47 .unwrap_or_else(|| format!("{epoch}"))
48}
49
50mod validation;
51pub use validation::{
52 validate_data_section, validate_data_value_pointers, validate_data_value_utf8,
53 validate_value_strings_utf8, DataFormatStats, DataFormatValidationResult,
54 PointerValidationError, PointerValidationResult, PointerValidationStats, MAX_POINTER_DEPTH,
55 MAX_TOTAL_DEPTH,
56};
57
/// A single value that can be stored in (or read from) the data section.
///
/// Each variant corresponds to one wire type handled by [`DataEncoder`] and
/// [`DataDecoder`].
#[derive(Debug, Clone, PartialEq)]
pub enum DataValue {
    /// Offset of another value in the data section (wire type 1).
    ///
    /// Produced by the decoder and by string interning; it cannot be
    /// serialized through serde.
    #[allow(dead_code)]
    Pointer(u32),
    /// UTF-8 string (wire type 2).
    String(String),
    /// 64-bit IEEE 754 float, stored big-endian (wire type 3).
    Double(f64),
    /// Raw byte string (wire type 4).
    Bytes(Vec<u8>),
    /// Unsigned 16-bit integer (wire type 5).
    Uint16(u16),
    /// Unsigned 32-bit integer (wire type 6).
    Uint32(u32),
    /// String-keyed map of nested values (wire type 7).
    Map(HashMap<String, Self>),
    /// Signed 32-bit integer (extended wire type 8).
    Int32(i32),
    /// Unsigned 64-bit integer (extended wire type 9).
    Uint64(u64),
    /// Unsigned 128-bit integer (extended wire type 10).
    Uint128(u128),
    /// Ordered list of nested values (extended wire type 11).
    Array(Vec<Self>),
    /// Boolean (extended wire type 14); the value lives in the control byte.
    Bool(bool),
    /// 32-bit IEEE 754 float, stored big-endian (extended wire type 15).
    Float(f32),
    /// Unix timestamp in seconds (extended wire type 128); rendered as an
    /// RFC 3339 string when serialized through serde.
    Timestamp(i64),
}
104
105impl serde::Serialize for DataValue {
107 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
108 where
109 S: serde::Serializer,
110 {
111 match self {
112 Self::Pointer(_) => Err(serde::ser::Error::custom(
113 "Pointer is an internal type and cannot be serialized to JSON",
114 )),
115 Self::String(s) => serializer.serialize_str(s),
116 Self::Double(d) => serializer.serialize_f64(*d),
117 Self::Bytes(b) => serializer.serialize_bytes(b),
118 Self::Uint16(n) => serializer.serialize_u16(*n),
119 Self::Uint32(n) => serializer.serialize_u32(*n),
120 Self::Map(m) => m.serialize(serializer),
121 Self::Int32(n) => serializer.serialize_i32(*n),
122 Self::Uint64(n) => serializer.serialize_u64(*n),
123 Self::Uint128(n) => serializer.serialize_u128(*n),
124 Self::Array(a) => a.serialize(serializer),
125 Self::Bool(b) => serializer.serialize_bool(*b),
126 Self::Float(f) => serializer.serialize_f32(*f),
127 Self::Timestamp(epoch) => serializer.serialize_str(&format_iso8601(*epoch)),
128 }
129 }
130}
131
132impl<'de> serde::Deserialize<'de> for DataValue {
134 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
135 where
136 D: serde::Deserializer<'de>,
137 {
138 struct DataValueVisitor;
139
140 impl<'de> serde::de::Visitor<'de> for DataValueVisitor {
141 type Value = DataValue;
142
143 fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
144 formatter.write_str("a valid MMDB data value")
145 }
146
147 fn visit_bool<E>(self, v: bool) -> Result<DataValue, E> {
148 Ok(DataValue::Bool(v))
149 }
150
151 fn visit_i32<E>(self, v: i32) -> Result<DataValue, E> {
152 Ok(DataValue::Int32(v))
153 }
154
155 fn visit_i64<E>(self, v: i64) -> Result<DataValue, E>
156 where
157 E: serde::de::Error,
158 {
159 if v >= 0 {
161 if v <= i64::from(u16::MAX) {
162 Ok(DataValue::Uint16(u16::try_from(v).unwrap()))
163 } else if v <= i64::from(u32::MAX) {
164 Ok(DataValue::Uint32(u32::try_from(v).unwrap()))
165 } else {
166 Ok(DataValue::Uint64(u64::try_from(v).unwrap()))
167 }
168 } else if v >= i64::from(i32::MIN) {
169 Ok(DataValue::Int32(i32::try_from(v).unwrap()))
170 } else {
171 Err(serde::de::Error::custom(format!(
174 "value {v} is outside the supported signed integer range \
175 ({} to {}). MMDB format only supports Int32. \
176 Consider using a string or unsigned integer instead.",
177 i32::MIN,
178 i32::MAX
179 )))
180 }
181 }
182
183 fn visit_u64<E>(self, v: u64) -> Result<DataValue, E> {
184 if v <= u64::from(u16::MAX) {
186 Ok(DataValue::Uint16(u16::try_from(v).unwrap()))
187 } else if v <= u64::from(u32::MAX) {
188 Ok(DataValue::Uint32(u32::try_from(v).unwrap()))
189 } else {
190 Ok(DataValue::Uint64(v))
191 }
192 }
193
194 fn visit_f32<E>(self, v: f32) -> Result<DataValue, E> {
195 Ok(DataValue::Float(v))
196 }
197
198 fn visit_f64<E>(self, v: f64) -> Result<DataValue, E> {
199 Ok(DataValue::Double(v))
200 }
201
202 fn visit_str<E>(self, v: &str) -> Result<DataValue, E> {
203 if let Some(epoch) = try_parse_iso8601(v) {
204 return Ok(DataValue::Timestamp(epoch));
205 }
206 Ok(DataValue::String(v.to_string()))
207 }
208
209 fn visit_string<E>(self, v: String) -> Result<DataValue, E> {
210 if let Some(epoch) = try_parse_iso8601(&v) {
211 return Ok(DataValue::Timestamp(epoch));
212 }
213 Ok(DataValue::String(v))
214 }
215
216 fn visit_bytes<E>(self, v: &[u8]) -> Result<DataValue, E> {
217 Ok(DataValue::Bytes(v.to_vec()))
218 }
219
220 fn visit_byte_buf<E>(self, v: Vec<u8>) -> Result<DataValue, E> {
221 Ok(DataValue::Bytes(v))
222 }
223
224 fn visit_seq<A>(self, mut seq: A) -> Result<DataValue, A::Error>
225 where
226 A: serde::de::SeqAccess<'de>,
227 {
228 let mut array = Vec::new();
229 while let Some(value) = seq.next_element()? {
230 array.push(value);
231 }
232 Ok(DataValue::Array(array))
233 }
234
235 fn visit_map<A>(self, mut map: A) -> Result<DataValue, A::Error>
236 where
237 A: serde::de::MapAccess<'de>,
238 {
239 let mut hash_map = HashMap::new();
240 while let Some((key, value)) = map.next_entry()? {
241 hash_map.insert(key, value);
242 }
243 Ok(DataValue::Map(hash_map))
244 }
245 }
246
247 deserializer.deserialize_any(DataValueVisitor)
248 }
249}
250
251impl Hash for DataValue {
253 fn hash<H: Hasher>(&self, state: &mut H) {
254 std::mem::discriminant(self).hash(state);
256
257 match self {
258 Self::Pointer(v) | Self::Uint32(v) => v.hash(state),
259 Self::String(v) => v.hash(state),
260 Self::Double(v) => {
261 v.to_bits().hash(state);
263 }
264 Self::Bytes(v) => v.hash(state),
265 Self::Uint16(v) => v.hash(state),
266 Self::Map(m) => {
267 let mut keys: Vec<&String> = m.keys().collect();
269 keys.sort_unstable();
270 keys.len().hash(state);
271 for key in keys {
272 key.hash(state);
273 m[key].hash(state);
274 }
275 }
276 Self::Int32(v) => v.hash(state),
277 Self::Uint64(v) => v.hash(state),
278 Self::Uint128(v) => v.hash(state),
279 Self::Array(v) => {
280 v.len().hash(state);
281 for item in v {
282 item.hash(state);
283 }
284 }
285 Self::Bool(v) => v.hash(state),
286 Self::Float(v) => {
287 v.to_bits().hash(state);
289 }
290 Self::Timestamp(v) => v.hash(state),
291 }
292 }
293}
294
/// Builds a data section by appending encoded [`DataValue`]s to a byte buffer.
pub struct DataEncoder {
    /// Encoded output accumulated so far.
    buffer: Vec<u8>,
    /// Maps a value's non-interned encoding to the offset at which that value
    /// was first written, so repeated values are stored once.
    dedup_map: HashMap<Vec<u8>, u32>,
    /// Maps a string to the offset at which it was first written; later
    /// occurrences are emitted as pointers when interning is enabled.
    string_cache: HashMap<String, u32>,
    /// Whether strings and map keys are interned through `string_cache`.
    intern_strings: bool,
}
310
311impl DataEncoder {
312 #[must_use]
314 pub fn new() -> Self {
315 Self {
316 buffer: Vec::new(),
317 dedup_map: HashMap::new(),
318 string_cache: HashMap::new(),
319 intern_strings: true,
320 }
321 }
322
323 #[must_use]
325 pub fn new_without_interning() -> Self {
326 Self {
327 buffer: Vec::new(),
328 dedup_map: HashMap::new(),
329 string_cache: HashMap::new(),
330 intern_strings: false,
331 }
332 }
333
334 pub fn encode(&mut self, value: &DataValue) -> u32 {
340 let saved_intern = self.intern_strings;
345 self.intern_strings = false;
346
347 let mut temp = Vec::new();
348 Self::encode_to_buffer(value, &mut temp);
349
350 self.intern_strings = saved_intern;
352
353 if let Some(&offset) = self.dedup_map.get(&temp) {
355 return offset;
356 }
357
358 let offset = u32::try_from(self.buffer.len()).expect("Data section exceeds u32::MAX bytes");
360 self.encode_value_interned(value);
361 self.dedup_map.insert(temp, offset);
362 offset
363 }
364
    /// Consumes the encoder and returns the encoded data section bytes.
    #[must_use]
    pub fn into_bytes(self) -> Vec<u8> {
        self.buffer
    }
370
    /// Returns the number of bytes encoded so far.
    #[must_use]
    pub fn size(&self) -> usize {
        self.buffer.len()
    }
376
377 fn encode_value_interned(&mut self, value: &DataValue) {
381 match value {
382 DataValue::String(s) if self.intern_strings => {
383 if let Some(&existing_offset) = self.string_cache.get(s) {
385 Self::encode_pointer(existing_offset, &mut self.buffer);
387 } else {
388 let offset = u32::try_from(self.buffer.len())
390 .expect("Data section exceeds u32::MAX bytes");
391 Self::encode_string(s, &mut self.buffer);
392 self.string_cache.insert(s.clone(), offset);
393 }
394 }
395 DataValue::Map(m) => self.encode_map_interned(m),
396 DataValue::Array(a) => self.encode_array_interned(a),
397 _ => Self::encode_to_buffer(value, &mut self.buffer),
399 }
400 }
401
402 fn encode_to_buffer(value: &DataValue, buffer: &mut Vec<u8>) {
404 match value {
405 DataValue::Pointer(offset) => Self::encode_pointer(*offset, buffer),
406 DataValue::String(s) => Self::encode_string(s, buffer),
407 DataValue::Double(d) => Self::encode_double(*d, buffer),
408 DataValue::Bytes(b) => Self::encode_bytes(b, buffer),
409 DataValue::Uint16(n) => Self::encode_uint16(*n, buffer),
410 DataValue::Uint32(n) => Self::encode_uint32(*n, buffer),
411 DataValue::Map(m) => Self::encode_map(m, buffer),
412 DataValue::Int32(n) => Self::encode_int32(*n, buffer),
413 DataValue::Uint64(n) => Self::encode_uint64(*n, buffer),
414 DataValue::Uint128(n) => Self::encode_uint128(*n, buffer),
415 DataValue::Array(a) => Self::encode_array(a, buffer),
416 DataValue::Bool(b) => Self::encode_bool(*b, buffer),
417 DataValue::Float(f) => Self::encode_float(*f, buffer),
418 DataValue::Timestamp(t) => Self::encode_timestamp(*t, buffer),
419 }
420 }
421
422 fn encode_pointer(offset: u32, buffer: &mut Vec<u8>) {
424 let size = if offset < 2048 {
425 0 } else if offset < 2048 + 524288 {
427 1 } else if offset < 2048 + 524288 + 134217728 {
429 2 } else {
431 3 };
433
434 match size {
435 0 => {
436 let high_3_bits = ((offset >> 8) & 0x7) as u8;
439 let low_8_bits = (offset & 0xFF) as u8;
440 let ctrl = 0x20 | high_3_bits; buffer.push(ctrl);
442 buffer.push(low_8_bits);
443 }
444 1 => {
445 let adjusted = offset - 2048;
448 let high_3_bits = ((adjusted >> 16) & 0x7) as u8;
449 let mid_8_bits = ((adjusted >> 8) & 0xFF) as u8;
450 let low_8_bits = (adjusted & 0xFF) as u8;
451 let ctrl = 0x20 | (1 << 3) | high_3_bits; buffer.push(ctrl);
453 buffer.push(mid_8_bits);
454 buffer.push(low_8_bits);
455 }
456 2 => {
457 let adjusted = offset - 526336;
460 let high_3_bits = ((adjusted >> 24) & 0x7) as u8;
461 let b0 = ((adjusted >> 16) & 0xFF) as u8;
462 let b1 = ((adjusted >> 8) & 0xFF) as u8;
463 let b2 = (adjusted & 0xFF) as u8;
464 let ctrl = 0x20 | (2 << 3) | high_3_bits; buffer.push(ctrl);
466 buffer.push(b0);
467 buffer.push(b1);
468 buffer.push(b2);
469 }
470 _ => {
471 let ctrl = 0x20 | (3 << 3); buffer.push(ctrl);
474 buffer.extend_from_slice(&offset.to_be_bytes());
475 }
476 }
477 }
478
479 fn encode_string(s: &str, buffer: &mut Vec<u8>) {
481 let bytes = s.as_bytes();
482 Self::encode_with_size(2, bytes.len(), buffer);
483 buffer.extend_from_slice(bytes);
484 }
485
486 fn encode_double(d: f64, buffer: &mut Vec<u8>) {
488 buffer.push(0x68); buffer.extend_from_slice(&d.to_be_bytes());
490 }
491
492 fn encode_bytes(b: &[u8], buffer: &mut Vec<u8>) {
494 Self::encode_with_size(4, b.len(), buffer);
495 buffer.extend_from_slice(b);
496 }
497
498 fn encode_uint16(n: u16, buffer: &mut Vec<u8>) {
500 buffer.push(0xA2); buffer.extend_from_slice(&n.to_be_bytes());
502 }
503
504 fn encode_uint32(n: u32, buffer: &mut Vec<u8>) {
506 buffer.push(0xC4); buffer.extend_from_slice(&n.to_be_bytes());
508 }
509
510 fn encode_map_interned(&mut self, m: &HashMap<String, DataValue>) {
512 Self::encode_with_size(7, m.len(), &mut self.buffer);
513
514 let mut pairs: Vec<_> = m.iter().collect();
516 pairs.sort_by_key(|(k, _)| *k);
517
518 for (key, value) in pairs {
519 if self.intern_strings {
521 if let Some(&existing_offset) = self.string_cache.get(key) {
522 Self::encode_pointer(existing_offset, &mut self.buffer);
523 } else {
524 let offset = u32::try_from(self.buffer.len())
525 .expect("Data section exceeds u32::MAX bytes");
526 Self::encode_string(key, &mut self.buffer);
527 self.string_cache.insert(key.clone(), offset);
528 }
529 } else {
530 Self::encode_string(key, &mut self.buffer);
531 }
532
533 self.encode_value_interned(value);
535 }
536 }
537
538 fn encode_map(m: &HashMap<String, DataValue>, buffer: &mut Vec<u8>) {
540 Self::encode_with_size(7, m.len(), buffer);
541
542 let mut pairs: Vec<_> = m.iter().collect();
544 pairs.sort_by_key(|(k, _)| *k);
545
546 for (key, value) in pairs {
547 Self::encode_string(key, buffer);
548 Self::encode_to_buffer(value, buffer);
549 }
550 }
551
552 fn encode_int32(n: i32, buffer: &mut Vec<u8>) {
556 buffer.push(0x04); buffer.push(0x01); buffer.extend_from_slice(&n.to_be_bytes());
559 }
560
561 fn encode_uint64(n: u64, buffer: &mut Vec<u8>) {
563 buffer.push(0x08); buffer.push(0x02); buffer.extend_from_slice(&n.to_be_bytes());
566 }
567
568 fn encode_uint128(n: u128, buffer: &mut Vec<u8>) {
570 buffer.push(0x10); buffer.push(0x03); buffer.extend_from_slice(&n.to_be_bytes());
573 }
574
575 fn encode_array_interned(&mut self, a: &[DataValue]) {
577 let size = a.len();
578
579 if size < 29 {
581 self.buffer.push(u8::try_from(size).unwrap());
582 } else if size < 29 + 256 {
583 self.buffer.push(29);
584 self.buffer.push(u8::try_from(size - 29).unwrap());
585 } else if size < 29 + 256 + 65536 {
586 self.buffer.push(30);
587 let adjusted = size - 29 - 256;
588 self.buffer
589 .extend_from_slice(&u16::try_from(adjusted).unwrap().to_be_bytes());
590 } else {
591 self.buffer.push(31);
592 let adjusted = size - 29 - 256 - 65536;
593 self.buffer
594 .extend_from_slice(&u32::try_from(adjusted).unwrap().to_be_bytes()[1..]);
595 }
596
597 self.buffer.push(0x04); for value in a {
602 self.encode_value_interned(value);
603 }
604 }
605
606 fn encode_array(a: &[DataValue], buffer: &mut Vec<u8>) {
608 let size = a.len();
612
613 if size < 29 {
615 buffer.push(u8::try_from(size).unwrap());
616 } else if size < 29 + 256 {
617 buffer.push(29);
618 buffer.push(u8::try_from(size - 29).unwrap());
619 } else if size < 29 + 256 + 65536 {
620 buffer.push(30);
621 let adjusted = size - 29 - 256;
622 buffer.extend_from_slice(&u16::try_from(adjusted).unwrap().to_be_bytes());
623 } else {
624 buffer.push(31);
625 let adjusted = size - 29 - 256 - 65536;
626 buffer.extend_from_slice(&u32::try_from(adjusted).unwrap().to_be_bytes()[1..]);
627 }
628
629 buffer.push(0x04); for value in a {
633 Self::encode_to_buffer(value, buffer);
634 }
635 }
636
637 fn encode_bool(b: bool, buffer: &mut Vec<u8>) {
639 if b {
640 buffer.push(0x01); } else {
642 buffer.push(0x00); }
644 buffer.push(0x07); }
646
647 fn encode_float(f: f32, buffer: &mut Vec<u8>) {
649 buffer.push(0x04); buffer.push(0x08); buffer.extend_from_slice(&f.to_be_bytes());
652 }
653
654 fn encode_timestamp(epoch: i64, buffer: &mut Vec<u8>) {
656 buffer.push(0x08); buffer.push(TIMESTAMP_EXTENDED_TYPE);
658 buffer.extend_from_slice(&epoch.to_be_bytes());
659 }
660
661 fn encode_with_size(type_id: u8, size: usize, buffer: &mut Vec<u8>) {
663 let type_bits = type_id << 5;
664
665 if size < 29 {
666 buffer.push(type_bits | u8::try_from(size).unwrap());
667 } else if size < 29 + 256 {
668 buffer.push(type_bits | 29);
669 buffer.push(u8::try_from(size - 29).unwrap());
670 } else if size < 29 + 256 + 65536 {
671 buffer.push(type_bits | 30);
672 let adjusted = size - 29 - 256;
673 buffer.extend_from_slice(&u16::try_from(adjusted).unwrap().to_be_bytes());
674 } else {
675 buffer.push(type_bits | 31);
676 let adjusted = size - 29 - 256 - 65536;
677 buffer.extend_from_slice(&u32::try_from(adjusted).unwrap().to_be_bytes()[1..]);
678 }
679 }
680}
681
682impl Default for DataEncoder {
683 fn default() -> Self {
684 Self::new()
685 }
686}
687
/// Reads [`DataValue`]s out of an encoded data section.
pub struct DataDecoder<'a> {
    /// The encoded bytes being decoded.
    buffer: &'a [u8],
    /// Value subtracted from incoming offsets (and pointer targets) to obtain
    /// an index into `buffer`.
    base_offset: usize,
}
696
697impl<'a> DataDecoder<'a> {
    /// Creates a decoder over `buffer`.
    ///
    /// `base_offset` is subtracted from every offset passed to `decode` (and
    /// from pointer targets) before indexing into `buffer`.
    #[must_use]
    pub fn new(buffer: &'a [u8], base_offset: usize) -> Self {
        Self {
            buffer,
            base_offset,
        }
    }
710
711 pub fn decode(&self, offset: u32) -> Result<DataValue, &'static str> {
713 let mut cursor = offset as usize;
714 if cursor < self.base_offset {
715 return Err("Offset before base");
716 }
717 cursor -= self.base_offset;
718 let value = self.decode_at(&mut cursor)?;
719 self.resolve_pointers(value)
721 }
722
723 fn decode_at(&self, cursor: &mut usize) -> Result<DataValue, &'static str> {
724 if *cursor >= self.buffer.len() {
725 return Err("Cursor out of bounds");
726 }
727
728 let ctrl = self.buffer[*cursor];
729 *cursor += 1;
730
731 let type_id = ctrl >> 5;
732 let payload = ctrl & 0x1F;
733
734 match type_id {
735 0 => self.decode_extended(cursor, payload),
736 1 => self.decode_pointer(cursor, payload),
737 2 => self.decode_string(cursor, payload),
738 3 => self.decode_double(cursor),
739 4 => self.decode_bytes(cursor, payload),
740 5 => self.decode_uint16(cursor, payload),
741 6 => self.decode_uint32(cursor, payload),
742 7 => self.decode_map(cursor, payload),
743 _ => Err("Invalid type"),
744 }
745 }
746
747 fn decode_extended(
748 &self,
749 cursor: &mut usize,
750 size_from_ctrl: u8,
751 ) -> Result<DataValue, &'static str> {
752 if *cursor >= self.buffer.len() {
753 return Err("Extended type truncated");
754 }
755
756 let raw_ext_type = self.buffer[*cursor];
759 let type_id = 7 + raw_ext_type;
760 *cursor += 1;
761
762 match type_id {
763 8 => self.decode_int32(cursor, size_from_ctrl), 9 => self.decode_uint64(cursor, size_from_ctrl), 10 => self.decode_uint128(cursor, size_from_ctrl), 11 => self.decode_array(cursor, size_from_ctrl), 14 => Ok(DataValue::Bool(size_from_ctrl != 0)), 15 => self.decode_float(cursor, size_from_ctrl), 128 => self.decode_timestamp(cursor, size_from_ctrl), _ => {
771 eprintln!(
772 "Unknown extended type: raw_ext_type={}, type_id={}, size_from_ctrl={}, offset={}",
773 raw_ext_type, type_id, size_from_ctrl, *cursor - 1
774 );
775 Err("Unknown extended type")
776 }
777 }
778 }
779
780 fn decode_pointer(&self, cursor: &mut usize, payload: u8) -> Result<DataValue, &'static str> {
781 let size_bits = (payload >> 3) & 0x3; let offset = match size_bits {
783 0 => {
784 if *cursor >= self.buffer.len() {
786 return Err("Pointer data truncated");
787 }
788 let low_3_bits = u32::from(payload & 0x7);
789 let next_byte = u32::from(self.buffer[*cursor]);
790 *cursor += 1;
791 (low_3_bits << 8) | next_byte
792 }
793 1 => {
794 if *cursor + 1 >= self.buffer.len() {
796 return Err("Pointer data truncated");
797 }
798 let low_3_bits = u32::from(payload & 0x7);
799 let b0 = u32::from(self.buffer[*cursor]);
800 let b1 = u32::from(self.buffer[*cursor + 1]);
801 *cursor += 2;
802 2048 + ((low_3_bits << 16) | (b0 << 8) | b1)
803 }
804 2 => {
805 if *cursor + 2 >= self.buffer.len() {
807 return Err("Pointer data truncated");
808 }
809 let low_3_bits = u32::from(payload & 0x7);
810 let b0 = u32::from(self.buffer[*cursor]);
811 let b1 = u32::from(self.buffer[*cursor + 1]);
812 let b2 = u32::from(self.buffer[*cursor + 2]);
813 *cursor += 3;
814 526336 + ((low_3_bits << 24) | (b0 << 16) | (b1 << 8) | b2)
815 }
816 3 => {
817 if *cursor + 3 >= self.buffer.len() {
819 return Err("Pointer data truncated");
820 }
821 let mut bytes = [0u8; 4];
822 bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 4]);
823 *cursor += 4;
824 u32::from_be_bytes(bytes)
825 }
826 _ => return Err("Invalid pointer size"),
827 };
828
829 Ok(DataValue::Pointer(offset))
830 }
831
832 fn decode_string(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
833 let len = self.decode_size(cursor, size_bits)?;
834
835 if *cursor + len > self.buffer.len() {
836 return Err("String data out of bounds");
837 }
838
839 let s = std::str::from_utf8(&self.buffer[*cursor..*cursor + len])
840 .map_err(|_| "Invalid UTF-8")?;
841 *cursor += len;
842
843 Ok(DataValue::String(s.to_string()))
844 }
845
846 fn decode_double(&self, cursor: &mut usize) -> Result<DataValue, &'static str> {
847 if *cursor + 8 > self.buffer.len() {
848 return Err("Double data out of bounds");
849 }
850
851 let mut bytes = [0u8; 8];
852 bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 8]);
853 *cursor += 8;
854
855 Ok(DataValue::Double(f64::from_be_bytes(bytes)))
856 }
857
858 fn decode_bytes(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
859 let len = self.decode_size(cursor, size_bits)?;
860
861 if *cursor + len > self.buffer.len() {
862 return Err("Bytes data out of bounds");
863 }
864
865 let bytes = self.buffer[*cursor..*cursor + len].to_vec();
866 *cursor += len;
867
868 Ok(DataValue::Bytes(bytes))
869 }
870
871 fn decode_uint16(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
872 let size = self.decode_size(cursor, size_bits)?;
873
874 if size > 2 {
875 return Err("Uint16 size too large");
876 }
877
878 if *cursor + size > self.buffer.len() {
879 return Err("Uint16 data out of bounds");
880 }
881
882 let mut value = 0u16;
884 for i in 0..size {
885 value = (value << 8) | u16::from(self.buffer[*cursor + i]);
886 }
887 *cursor += size;
888
889 Ok(DataValue::Uint16(value))
890 }
891
892 fn decode_uint32(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
893 let size = self.decode_size(cursor, size_bits)?;
894
895 if size > 4 {
896 return Err("Uint32 size too large");
897 }
898
899 if *cursor + size > self.buffer.len() {
900 return Err("Uint32 data out of bounds");
901 }
902
903 let mut value = 0u32;
905 for i in 0..size {
906 value = (value << 8) | u32::from(self.buffer[*cursor + i]);
907 }
908 *cursor += size;
909
910 Ok(DataValue::Uint32(value))
911 }
912
913 fn decode_map(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
914 let count = self.decode_size(cursor, size_bits)?;
915 let mut map = HashMap::new();
916
917 for _ in 0..count {
918 let key_value = self.decode_at(cursor)?;
920 let key = match key_value {
921 DataValue::String(s) => s,
922 DataValue::Pointer(offset) => {
923 match self.decode(offset)? {
925 DataValue::String(s) => s,
926 _ => return Err("Pointer in map key must point to string"),
927 }
928 }
929 _ => return Err("Map key must be string or pointer to string"),
930 };
931
932 let value = self.decode_at(cursor)?;
933 map.insert(key, value);
934 }
935
936 Ok(DataValue::Map(map))
937 }
938
939 fn decode_int32(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
940 let size = self.decode_size(cursor, size_bits)?;
941
942 if size > 4 {
943 return Err("Int32 size too large");
944 }
945
946 if *cursor + size > self.buffer.len() {
947 return Err("Int32 data out of bounds");
948 }
949
950 let mut value = 0i32;
952 if size > 0 {
953 let is_negative = (self.buffer[*cursor] & 0x80) != 0;
955
956 if is_negative {
957 value = -1;
959 }
960
961 for i in 0..size {
962 value = (value << 8) | i32::from(self.buffer[*cursor + i]);
963 }
964 }
965 *cursor += size;
966
967 Ok(DataValue::Int32(value))
968 }
969
970 fn decode_uint64(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
971 let size = self.decode_size(cursor, size_bits)?;
972
973 if size > 8 {
974 return Err("Uint64 size too large");
975 }
976
977 if *cursor + size > self.buffer.len() {
978 return Err("Uint64 data out of bounds");
979 }
980
981 let mut value = 0u64;
983 for i in 0..size {
984 value = (value << 8) | u64::from(self.buffer[*cursor + i]);
985 }
986 *cursor += size;
987
988 Ok(DataValue::Uint64(value))
989 }
990
991 fn decode_uint128(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
992 let size = self.decode_size(cursor, size_bits)?;
993
994 if size > 16 {
995 return Err("Uint128 size too large");
996 }
997
998 if *cursor + size > self.buffer.len() {
999 return Err("Uint128 data out of bounds");
1000 }
1001
1002 let mut value = 0u128;
1004 for i in 0..size {
1005 value = (value << 8) | u128::from(self.buffer[*cursor + i]);
1006 }
1007 *cursor += size;
1008
1009 Ok(DataValue::Uint128(value))
1010 }
1011
1012 fn decode_array(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
1013 let count = self.decode_size(cursor, size_bits)?;
1014 let mut array = Vec::with_capacity(count);
1015
1016 for _ in 0..count {
1017 array.push(self.decode_at(cursor)?);
1018 }
1019
1020 Ok(DataValue::Array(array))
1021 }
1022
1023 fn decode_float(&self, cursor: &mut usize, size_bits: u8) -> Result<DataValue, &'static str> {
1024 if size_bits != 4 {
1026 return Err("Float must be 4 bytes");
1027 }
1028
1029 if *cursor + 4 > self.buffer.len() {
1030 return Err("Float data out of bounds");
1031 }
1032
1033 let mut bytes = [0u8; 4];
1034 bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 4]);
1035 *cursor += 4;
1036
1037 Ok(DataValue::Float(f32::from_be_bytes(bytes)))
1038 }
1039
1040 fn decode_timestamp(
1041 &self,
1042 cursor: &mut usize,
1043 size_bits: u8,
1044 ) -> Result<DataValue, &'static str> {
1045 if size_bits != 8 {
1046 return Err("Timestamp must be 8 bytes");
1047 }
1048
1049 if *cursor + 8 > self.buffer.len() {
1050 return Err("Timestamp data out of bounds");
1051 }
1052
1053 let mut bytes = [0u8; 8];
1054 bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 8]);
1055 *cursor += 8;
1056
1057 Ok(DataValue::Timestamp(i64::from_be_bytes(bytes)))
1058 }
1059
1060 fn decode_size(&self, cursor: &mut usize, size_bits: u8) -> Result<usize, &'static str> {
1061 match size_bits {
1062 0..=28 => Ok(size_bits as usize),
1063 29 => {
1064 if *cursor >= self.buffer.len() {
1065 return Err("Size byte out of bounds");
1066 }
1067 let size = self.buffer[*cursor] as usize;
1068 *cursor += 1;
1069 Ok(29 + size)
1070 }
1071 30 => {
1072 if *cursor + 2 > self.buffer.len() {
1073 return Err("Size bytes out of bounds");
1074 }
1075 let mut bytes = [0u8; 2];
1076 bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 2]);
1077 *cursor += 2;
1078 Ok(29 + 256 + u16::from_be_bytes(bytes) as usize)
1079 }
1080 31 => {
1081 if *cursor + 3 > self.buffer.len() {
1082 return Err("Size bytes out of bounds");
1083 }
1084 let b0 = self.buffer[*cursor] as usize;
1085 let b1 = self.buffer[*cursor + 1] as usize;
1086 let b2 = self.buffer[*cursor + 2] as usize;
1087 *cursor += 3;
1088 Ok(29 + 256 + 65536 + ((b0 << 16) | (b1 << 8) | b2))
1089 }
1090 _ => Err("Invalid size encoding"),
1091 }
1092 }
1093
1094 fn resolve_pointers(&self, value: DataValue) -> Result<DataValue, &'static str> {
1096 match value {
1097 DataValue::Pointer(offset) => {
1098 let mut cursor = offset as usize;
1100 if cursor < self.base_offset {
1101 return Err("Pointer offset before base");
1102 }
1103 cursor -= self.base_offset;
1104 let pointed_value = self.decode_at(&mut cursor)?;
1105 self.resolve_pointers(pointed_value)
1106 }
1107 DataValue::Map(entries) => {
1108 let mut resolved_map = HashMap::new();
1110 for (key, val) in entries {
1111 resolved_map.insert(key, self.resolve_pointers(val)?);
1112 }
1113 Ok(DataValue::Map(resolved_map))
1114 }
1115 DataValue::Array(items) => {
1116 let mut resolved_array = Vec::new();
1118 for item in items {
1119 resolved_array.push(self.resolve_pointers(item)?);
1120 }
1121 Ok(DataValue::Array(resolved_array))
1122 }
1123 other => Ok(other),
1125 }
1126 }
1127}
1128
1129#[cfg(test)]
1130mod tests {
1131 use super::*;
1132
1133 #[test]
1134 fn test_encode_decode_all_types() {
1135 let mut encoder = DataEncoder::new();
1136
1137 let string_val = DataValue::String("hello".to_string());
1139 let uint16_val = DataValue::Uint16(12345);
1140 let uint32_val = DataValue::Uint32(0xDEADBEEF);
1141 let uint64_val = DataValue::Uint64(0x123456789ABCDEF0);
1142 let uint128_val = DataValue::Uint128(0x0123456789ABCDEF0123456789ABCDEF);
1143 let int32_val = DataValue::Int32(-42);
1144 let double_val = DataValue::Double(std::f64::consts::PI);
1145 let float_val = DataValue::Float(std::f32::consts::E);
1146 let bool_val = DataValue::Bool(true);
1147 let bytes_val = DataValue::Bytes(vec![0xDE, 0xAD, 0xBE, 0xEF]);
1148
1149 let offsets = [
1150 encoder.encode(&string_val),
1151 encoder.encode(&uint16_val),
1152 encoder.encode(&uint32_val),
1153 encoder.encode(&uint64_val),
1154 encoder.encode(&uint128_val),
1155 encoder.encode(&int32_val),
1156 encoder.encode(&double_val),
1157 encoder.encode(&float_val),
1158 encoder.encode(&bool_val),
1159 encoder.encode(&bytes_val),
1160 ];
1161
1162 let bytes = encoder.into_bytes();
1163 let decoder = DataDecoder::new(&bytes, 0);
1164
1165 let values = vec![
1166 string_val,
1167 uint16_val,
1168 uint32_val,
1169 uint64_val,
1170 uint128_val,
1171 int32_val,
1172 double_val,
1173 float_val,
1174 bool_val,
1175 bytes_val,
1176 ];
1177
1178 for (offset, expected) in offsets.iter().zip(values.iter()) {
1179 let decoded = decoder.decode(*offset).unwrap();
1180 assert_eq!(&decoded, expected);
1181 }
1182 }
1183
1184 #[test]
1185 fn test_encode_decode_map() {
1186 let mut encoder = DataEncoder::new();
1187 let mut map = HashMap::new();
1188 map.insert("country".to_string(), DataValue::String("US".to_string()));
1189 map.insert("asn".to_string(), DataValue::Uint32(13335));
1190 map.insert("score".to_string(), DataValue::Double(0.95));
1191
1192 let value = DataValue::Map(map.clone());
1193 let offset = encoder.encode(&value);
1194
1195 let bytes = encoder.into_bytes();
1196 let decoder = DataDecoder::new(&bytes, 0);
1197 let decoded = decoder.decode(offset).unwrap();
1198
1199 assert_eq!(decoded, value);
1200 }
1201
1202 #[test]
1203 fn test_encode_decode_array() {
1204 let mut encoder = DataEncoder::new();
1205 let value = DataValue::Array(vec![
1206 DataValue::String("tag1".to_string()),
1207 DataValue::String("tag2".to_string()),
1208 DataValue::Uint32(123),
1209 DataValue::Bool(false),
1210 ]);
1211 let offset = encoder.encode(&value);
1212
1213 let bytes = encoder.into_bytes();
1214 let decoder = DataDecoder::new(&bytes, 0);
1215 let decoded = decoder.decode(offset).unwrap();
1216
1217 assert_eq!(decoded, value);
1218 }
1219
1220 #[test]
1221 fn test_deduplication() {
1222 let mut encoder = DataEncoder::new();
1223
1224 let value = DataValue::String("test".to_string());
1226 let offset1 = encoder.encode(&value);
1227 let offset2 = encoder.encode(&value);
1228 let offset3 = encoder.encode(&value);
1229
1230 assert_eq!(offset1, offset2);
1232 assert_eq!(offset2, offset3);
1233
1234 let value2 = DataValue::String("different".to_string());
1236 let offset4 = encoder.encode(&value2);
1237 assert_ne!(offset1, offset4);
1238 }
1239
1240 #[test]
1241 fn test_complex_nested_structure() {
1242 let mut encoder = DataEncoder::new();
1243
1244 let mut threat_data = HashMap::new();
1246 threat_data.insert(
1247 "threat_level".to_string(),
1248 DataValue::String("high".to_string()),
1249 );
1250 threat_data.insert(
1251 "category".to_string(),
1252 DataValue::String("malware".to_string()),
1253 );
1254 threat_data.insert("confidence".to_string(), DataValue::Float(0.98));
1255 threat_data.insert("first_seen".to_string(), DataValue::Uint64(1704067200));
1256
1257 let mut indicators = HashMap::new();
1258 indicators.insert("ip_count".to_string(), DataValue::Uint32(42));
1259 indicators.insert("domain_count".to_string(), DataValue::Uint32(15));
1260
1261 threat_data.insert("indicators".to_string(), DataValue::Map(indicators));
1262 threat_data.insert(
1263 "tags".to_string(),
1264 DataValue::Array(vec![
1265 DataValue::String("botnet".to_string()),
1266 DataValue::String("c2".to_string()),
1267 ]),
1268 );
1269 threat_data.insert("active".to_string(), DataValue::Bool(true));
1270
1271 let value = DataValue::Map(threat_data);
1272 let offset = encoder.encode(&value);
1273
1274 let bytes = encoder.into_bytes();
1275 let decoder = DataDecoder::new(&bytes, 0);
1276 let decoded = decoder.decode(offset).unwrap();
1277
1278 assert_eq!(decoded, value);
1279 }
1280
1281 #[test]
1282 fn test_large_strings() {
1283 let mut encoder = DataEncoder::new();
1284
1285 let short = "x".repeat(28); let medium = "x".repeat(100); let long = "x".repeat(1000); let offset1 = encoder.encode(&DataValue::String(short.clone()));
1291 let offset2 = encoder.encode(&DataValue::String(medium.clone()));
1292 let offset3 = encoder.encode(&DataValue::String(long.clone()));
1293
1294 let bytes = encoder.into_bytes();
1295 let decoder = DataDecoder::new(&bytes, 0);
1296
1297 assert_eq!(decoder.decode(offset1).unwrap(), DataValue::String(short));
1298 assert_eq!(decoder.decode(offset2).unwrap(), DataValue::String(medium));
1299 assert_eq!(decoder.decode(offset3).unwrap(), DataValue::String(long));
1300 }
1301
1302 #[test]
1303 fn test_string_interning() {
1304 let mut encoder = DataEncoder::new();
1306
1307 let mut map1 = HashMap::new();
1309 map1.insert(
1310 "threat_level".to_string(),
1311 DataValue::String("high".to_string()),
1312 );
1313 map1.insert(
1314 "category".to_string(),
1315 DataValue::String("malware".to_string()),
1316 );
1317 map1.insert("score".to_string(), DataValue::Uint32(95));
1318
1319 let mut map2 = HashMap::new();
1320 map2.insert(
1321 "threat_level".to_string(),
1322 DataValue::String("high".to_string()),
1323 ); map2.insert(
1325 "category".to_string(),
1326 DataValue::String("phishing".to_string()),
1327 );
1328 map2.insert("score".to_string(), DataValue::Uint32(88));
1329
1330 let mut map3 = HashMap::new();
1331 map3.insert(
1332 "threat_level".to_string(),
1333 DataValue::String("high".to_string()),
1334 ); map3.insert(
1336 "category".to_string(),
1337 DataValue::String("malware".to_string()),
1338 ); map3.insert("score".to_string(), DataValue::Uint32(92));
1340
1341 let offset1 = encoder.encode(&DataValue::Map(map1.clone()));
1343 let offset2 = encoder.encode(&DataValue::Map(map2.clone()));
1344 let offset3 = encoder.encode(&DataValue::Map(map3.clone()));
1345
1346 let bytes_with_interning = encoder.into_bytes();
1347
1348 let mut encoder_no_intern = DataEncoder::new_without_interning();
1350 encoder_no_intern.encode(&DataValue::Map(map1.clone()));
1351 encoder_no_intern.encode(&DataValue::Map(map2.clone()));
1352 encoder_no_intern.encode(&DataValue::Map(map3.clone()));
1353 let bytes_no_interning = encoder_no_intern.into_bytes();
1354
1355 println!("With interning: {} bytes", bytes_with_interning.len());
1357 println!("Without interning: {} bytes", bytes_no_interning.len());
1358 println!(
1359 "Savings: {} bytes ({:.1}%)",
1360 bytes_no_interning.len() - bytes_with_interning.len(),
1361 100.0 * (bytes_no_interning.len() - bytes_with_interning.len()) as f64
1362 / bytes_no_interning.len() as f64
1363 );
1364 assert!(bytes_with_interning.len() < bytes_no_interning.len());
1365
1366 let decoder = DataDecoder::new(&bytes_with_interning, 0);
1368 let decoded1 = decoder.decode(offset1).unwrap();
1369 let decoded2 = decoder.decode(offset2).unwrap();
1370 let decoded3 = decoder.decode(offset3).unwrap();
1371
1372 assert_eq!(decoded1, DataValue::Map(map1));
1373 assert_eq!(decoded2, DataValue::Map(map2));
1374 assert_eq!(decoded3, DataValue::Map(map3));
1375 }
1376
1377 #[test]
1378 fn test_string_interning_in_arrays() {
1379 let mut encoder = DataEncoder::new();
1381
1382 let array = DataValue::Array(vec![
1383 DataValue::String("botnet".to_string()),
1384 DataValue::String("c2".to_string()),
1385 DataValue::String("botnet".to_string()), DataValue::String("malware".to_string()),
1387 DataValue::String("c2".to_string()), ]);
1389
1390 let offset = encoder.encode(&array);
1391 let bytes = encoder.into_bytes();
1392
1393 let decoder = DataDecoder::new(&bytes, 0);
1395 let decoded = decoder.decode(offset).unwrap();
1396 assert_eq!(decoded, array);
1397 }
1398
1399 #[test]
1400 fn test_pointer_encoding() {
1401 let mut encoder = DataEncoder::new();
1403
1404 let target_data = DataValue::String("shared_value".to_string());
1406 let target_offset = encoder.encode(&target_data);
1407
1408 let mut map = HashMap::new();
1411 map.insert(
1412 "direct".to_string(),
1413 DataValue::String("direct_value".to_string()),
1414 );
1415 map.insert("ptr_ref".to_string(), DataValue::Pointer(target_offset));
1417
1418 let map_offset = encoder.encode(&DataValue::Map(map));
1419
1420 let bytes = encoder.into_bytes();
1421 let decoder = DataDecoder::new(&bytes, 0);
1422
1423 let decoded = decoder.decode(map_offset).unwrap();
1425
1426 if let DataValue::Map(decoded_map) = decoded {
1427 assert_eq!(
1429 decoded_map.get("direct"),
1430 Some(&DataValue::String("direct_value".to_string()))
1431 );
1432 assert_eq!(
1433 decoded_map.get("ptr_ref"),
1434 Some(&DataValue::String("shared_value".to_string()))
1435 );
1436 } else {
1437 panic!("Expected Map, got {decoded:?}");
1438 }
1439 }
1440
1441 #[test]
1442 fn test_large_negative_integer_rejected() {
1443 let json = format!("{}", i64::MIN);
1444 let result: Result<DataValue, _> = serde_json::from_str(&json);
1445 assert!(result.is_err());
1446 let err = result.unwrap_err().to_string();
1447 assert!(err.contains("outside the supported signed integer range"));
1448 }
1449
1450 #[test]
1451 fn test_i32_min_accepted() {
1452 let json = format!("{}", i32::MIN);
1453 let result: Result<DataValue, _> = serde_json::from_str(&json);
1454 assert!(result.is_ok());
1455 assert_eq!(result.unwrap(), DataValue::Int32(i32::MIN));
1456 }
1457
1458 #[test]
1459 fn test_timestamp_binary_roundtrip() {
1460 let mut encoder = DataEncoder::new();
1461 let epoch = 1727894671i64; let value = DataValue::Timestamp(epoch);
1463 let offset = encoder.encode(&value);
1464
1465 let bytes = encoder.into_bytes();
1466 let decoder = DataDecoder::new(&bytes, 0);
1467 let decoded = decoder.decode(offset).unwrap();
1468
1469 assert_eq!(decoded, DataValue::Timestamp(epoch));
1470 }
1471
1472 #[test]
1473 fn test_timestamp_json_serialize() {
1474 let value = DataValue::Timestamp(1727894671);
1475 let json = serde_json::to_string(&value).unwrap();
1476 assert_eq!(json, "\"2024-10-02T18:44:31Z\"");
1477 }
1478
1479 #[test]
1480 fn test_timestamp_json_deserialize() {
1481 let json = "\"2024-10-02T18:44:31Z\"";
1482 let value: DataValue = serde_json::from_str(json).unwrap();
1483 assert_eq!(value, DataValue::Timestamp(1727894671));
1484 }
1485
1486 #[test]
1487 fn test_timestamp_with_fractional_seconds() {
1488 let json = "\"2024-10-02T18:44:31.123456Z\"";
1489 let value: DataValue = serde_json::from_str(json).unwrap();
1490 if let DataValue::Timestamp(epoch) = value {
1491 assert_eq!(epoch, 1727894671);
1492 } else {
1493 panic!("Expected Timestamp, got {value:?}");
1494 }
1495 }
1496
1497 #[test]
1498 fn test_non_timestamp_string_stays_string() {
1499 let json = "\"hello world\"";
1500 let value: DataValue = serde_json::from_str(json).unwrap();
1501 assert_eq!(value, DataValue::String("hello world".to_string()));
1502 }
1503
1504 #[test]
1505 fn test_timestamp_negative_epoch() {
1506 let mut encoder = DataEncoder::new();
1507 let epoch = -86400i64; let value = DataValue::Timestamp(epoch);
1509 let offset = encoder.encode(&value);
1510
1511 let bytes = encoder.into_bytes();
1512 let decoder = DataDecoder::new(&bytes, 0);
1513 let decoded = decoder.decode(offset).unwrap();
1514
1515 assert_eq!(decoded, DataValue::Timestamp(epoch));
1516 }
1517
1518 #[test]
1519 fn test_timestamp_in_map() {
1520 let mut encoder = DataEncoder::new();
1521 let mut map = HashMap::new();
1522 map.insert("first_seen".to_string(), DataValue::Timestamp(1727894671));
1523 map.insert("last_seen".to_string(), DataValue::Timestamp(1727981071));
1524 map.insert("name".to_string(), DataValue::String("test".to_string()));
1525
1526 let offset = encoder.encode(&DataValue::Map(map.clone()));
1527
1528 let bytes = encoder.into_bytes();
1529 let decoder = DataDecoder::new(&bytes, 0);
1530 let decoded = decoder.decode(offset).unwrap();
1531
1532 if let DataValue::Map(decoded_map) = decoded {
1533 assert_eq!(
1534 decoded_map.get("first_seen"),
1535 Some(&DataValue::Timestamp(1727894671))
1536 );
1537 assert_eq!(
1538 decoded_map.get("last_seen"),
1539 Some(&DataValue::Timestamp(1727981071))
1540 );
1541 assert_eq!(
1542 decoded_map.get("name"),
1543 Some(&DataValue::String("test".to_string()))
1544 );
1545 } else {
1546 panic!("Expected Map, got {decoded:?}");
1547 }
1548 }
1549}