Skip to main content

st/
mem8_binary.rs

// MEM8 Binary Format - "The REAL wave-based memory!" 🌊
// Proper binary .m8 format with wave interference and temporal encoding
// "No more JSON pretenders!" - Hue
4
5use anyhow::{Context, Result};
6use std::collections::HashMap;
7use std::fs::{File, OpenOptions};
8use std::io::{Read, Seek, SeekFrom, Write};
9use std::path::{Path, PathBuf};
10use std::time::{SystemTime, UNIX_EPOCH};
11
/// Four-byte signature stored in the `magic` field of every `.m8` header.
const M8_MAGIC: &[u8; 4] = b"MEM8";

/// Format revision written into the header of newly created files.
const M8_VERSION: u8 = 1;

/// Size in bytes of one on-disk block (4 KiB).
const BLOCK_SIZE: usize = 4 * 1024;
20
21/// Header for .m8 files
22#[repr(C, packed)]
23pub struct M8Header {
24    magic: [u8; 4],
25    version: u8,
26    flags: u8,
27    block_count: u32,
28    identity_freq: f64,
29    temporal_phase: f64,
30    crc32: u32,
31}
32
33impl M8Header {
34    fn to_bytes(&self) -> Vec<u8> {
35        let mut bytes = Vec::with_capacity(std::mem::size_of::<Self>());
36        bytes.extend_from_slice(&self.magic);
37        bytes.push(self.version);
38        bytes.push(self.flags);
39        bytes.extend_from_slice(&self.block_count.to_le_bytes());
40        bytes.extend_from_slice(&self.identity_freq.to_le_bytes());
41        bytes.extend_from_slice(&self.temporal_phase.to_le_bytes());
42        bytes.extend_from_slice(&self.crc32.to_le_bytes());
43        bytes
44    }
45
46    fn from_bytes(bytes: &[u8]) -> Result<Self> {
47        if bytes.len() < std::mem::size_of::<Self>() {
48            anyhow::bail!("Invalid header size");
49        }
50
51        let mut cursor = 0;
52        let mut magic = [0u8; 4];
53        magic.copy_from_slice(&bytes[cursor..cursor + 4]);
54        cursor += 4;
55
56        let version = bytes[cursor];
57        cursor += 1;
58
59        let flags = bytes[cursor];
60        cursor += 1;
61
62        let block_count = u32::from_le_bytes(bytes[cursor..cursor + 4].try_into()?);
63        cursor += 4;
64
65        let identity_freq = f64::from_le_bytes(bytes[cursor..cursor + 8].try_into()?);
66        cursor += 8;
67
68        let temporal_phase = f64::from_le_bytes(bytes[cursor..cursor + 8].try_into()?);
69        cursor += 8;
70
71        let crc32 = u32::from_le_bytes(bytes[cursor..cursor + 4].try_into()?);
72
73        Ok(Self {
74            magic,
75            version,
76            flags,
77            block_count,
78            identity_freq,
79            temporal_phase,
80            crc32,
81        })
82    }
83}
84
/// A single memory record as stored in a `.m8` file.
///
/// On disk the fixed-width fields come first, followed by the content bytes;
/// the whole record is zero-padded to `BLOCK_SIZE`.
#[repr(C)]
#[derive(Clone, Debug)]
pub struct M8Block {
    /// Sequential block index (even = consciousness, odd = context).
    pub index: u64,

    /// 16-byte wave signature of the content.
    pub wave_signature: [u8; 16],

    /// Creation time in microseconds since the Unix epoch.
    pub timestamp: u64,

    /// Importance in `0.0..=1.0`, scaled to the full `u16` range.
    pub importance: u16,

    /// Token ID from the dynamic tokenizer.
    pub token_id: u16,

    /// Hash of the preceding block, used for chaining.
    pub prev_hash: [u8; 32],

    /// Length of `content` in bytes.
    pub content_len: u32,

    /// Payload bytes (variable length; padded to `BLOCK_SIZE` on disk).
    pub content: Vec<u8>,
}
113
/// Bidirectional string <-> token-ID table for dynamic tokenization.
///
/// A fixed set of reserved IDs is registered at construction time; every
/// other string receives the next free ID, starting at `0x100`.
pub struct TokenMap {
    /// String to token ID
    str_to_token: HashMap<String, u16>,

    /// Token ID to string
    token_to_str: HashMap<u16, String>,

    /// Lowest ID that may still be free; allocation scans upward from here.
    next_id: u16,

    /// Reserved tokens (0x80 = node_modules, etc). Their entries are also
    /// present in both lookup maps.
    reserved: HashMap<u16, String>,
}

impl Default for TokenMap {
    fn default() -> Self {
        Self::new()
    }
}

impl TokenMap {
    /// Creates a map pre-populated with the reserved tokens.
    pub fn new() -> Self {
        const RESERVED_TOKENS: [(u16, &str); 11] = [
            (0x80, "node_modules"),
            (0x81, ".git"),
            (0x82, "target"),
            (0x83, "dist"),
            (0x84, "build"),
            (0x90, ".rs"),
            (0x91, ".py"),
            (0x92, ".js"),
            (0x93, ".ts"),
            (0xFFFE, "Claude"),
            (0xFFFF, "Hue"),
        ];

        let mut reserved = HashMap::with_capacity(RESERVED_TOKENS.len());
        let mut str_to_token = HashMap::with_capacity(RESERVED_TOKENS.len());
        let mut token_to_str = HashMap::with_capacity(RESERVED_TOKENS.len());

        for &(id, name) in RESERVED_TOKENS.iter() {
            reserved.insert(id, name.to_string());
            str_to_token.insert(name.to_string(), id);
            token_to_str.insert(id, name.to_string());
        }

        Self {
            str_to_token,
            token_to_str,
            next_id: 0x100, // Start after the low reserved range
            reserved,
        }
    }

    /// Get or create the token for `s`.
    ///
    /// # Panics
    ///
    /// Panics when `s` is new and every ID from `next_id` upward is already
    /// taken. Use [`TokenMap::try_get_token`] to handle that case without
    /// panicking.
    pub fn get_token(&mut self, s: &str) -> u16 {
        self.try_get_token(s)
            .expect("token ID space exhausted: no free u16 token ID left")
    }

    /// Get or create the token for `s`, returning `None` when `s` is new and
    /// no free ID remains.
    ///
    /// IDs that are already mapped (including the reserved `0xFFFE`/`0xFFFF`
    /// entries) are skipped, so an existing mapping is never overwritten and
    /// the counter never wraps around.
    pub fn try_get_token(&mut self, s: &str) -> Option<u16> {
        if let Some(&token) = self.str_to_token.get(s) {
            return Some(token);
        }

        // Scan upward for the first unused ID; `checked_add` ends the search
        // (with `None`) once `u16::MAX` itself is occupied.
        let mut candidate = self.next_id;
        while self.token_to_str.contains_key(&candidate) {
            candidate = candidate.checked_add(1)?;
        }

        // Saturating: if `u16::MAX` was just handed out, the next call finds
        // it occupied and reports exhaustion instead of wrapping to 0.
        self.next_id = candidate.saturating_add(1);

        self.str_to_token.insert(s.to_string(), candidate);
        self.token_to_str.insert(candidate, s.to_string());

        Some(candidate)
    }

    /// Decode token to string; `None` when the ID is unknown.
    pub fn decode_token(&self, token: u16) -> Option<&str> {
        self.token_to_str.get(&token).map(String::as_str)
    }
}
186
187/// MEM8 Binary File - append-only wave-based memory
188pub struct M8BinaryFile {
189    path: PathBuf,
190    file: File,
191    header: M8Header,
192    tokens: TokenMap,
193
194    /// Current position for backwards reading
195    read_position: u64,
196
197    /// Cache of important blocks for quick access
198    importance_index: Vec<(u64, f32)>, // (block_offset, importance)
199}
200
201impl M8BinaryFile {
202    /// Create new .m8 file
203    pub fn create(path: impl AsRef<Path>) -> Result<Self> {
204        let path = path.as_ref().to_path_buf();
205
206        let mut file = OpenOptions::new()
207            .create(true)
208            .write(true)
209            .read(true)
210            .truncate(true)
211            .open(&path)
212            .context("Failed to create .m8 file")?;
213
214        let header = M8Header {
215            magic: *M8_MAGIC,
216            version: M8_VERSION,
217            flags: 0,
218            block_count: 0,
219            identity_freq: 440.0, // A440 Hz by default
220            temporal_phase: 0.0,
221            crc32: 0,
222        };
223
224        // Write header
225        file.write_all(&header.to_bytes())?;
226
227        Ok(Self {
228            path,
229            file,
230            header,
231            tokens: TokenMap::new(),
232            read_position: std::mem::size_of::<M8Header>() as u64,
233            importance_index: Vec::new(),
234        })
235    }
236
237    /// Open existing .m8 file
238    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
239        let path = path.as_ref().to_path_buf();
240
241        let mut file = OpenOptions::new()
242            .read(true)
243            .write(true)
244            .open(&path)
245            .context("Failed to open .m8 file")?;
246
247        // Read header
248        let mut header_bytes = vec![0u8; std::mem::size_of::<M8Header>()];
249        file.read_exact(&mut header_bytes)?;
250
251        let header = M8Header::from_bytes(&header_bytes)?;
252
253        // Verify magic
254        if header.magic != *M8_MAGIC {
255            anyhow::bail!("Invalid .m8 file (bad magic)");
256        }
257
258        // Seek to end for backwards reading
259        let file_size = file.seek(SeekFrom::End(0))?;
260
261        let mut m8 = Self {
262            path,
263            file,
264            header,
265            tokens: TokenMap::new(),
266            read_position: file_size,
267            importance_index: Vec::new(),
268        };
269
270        // Build importance index
271        m8.build_importance_index()?;
272
273        Ok(m8)
274    }
275
276    /// Append memory block (never modifies existing blocks)
277    pub fn append_block(&mut self, content: &[u8], importance: f32) -> Result<()> {
278        let timestamp = SystemTime::now().duration_since(UNIX_EPOCH)?.as_micros() as u64;
279
280        let wave_sig = self.generate_wave_signature(content);
281
282        let block = M8Block {
283            index: self.header.block_count as u64,
284            wave_signature: wave_sig,
285            timestamp,
286            importance: (importance * 65535.0) as u16,
287            token_id: 0,        // TODO: tokenize content
288            prev_hash: [0; 32], // TODO: compute hash
289            content_len: content.len() as u32,
290            content: content.to_vec(),
291        };
292
293        // Seek to end
294        self.file.seek(SeekFrom::End(0))?;
295
296        // Write block
297        self.write_block(&block)?;
298
299        // Update header
300        self.header.block_count += 1;
301        self.update_header()?;
302
303        // Update importance index
304        let offset = self.file.stream_position()?;
305        self.importance_index.push((offset, importance));
306
307        Ok(())
308    }
309
310    /// Read backwards from end (most recent first)
311    pub fn read_backwards(&mut self) -> Result<Option<M8Block>> {
312        if self.read_position <= std::mem::size_of::<M8Header>() as u64 {
313            return Ok(None);
314        }
315
316        // Seek to previous block
317        self.read_position -= BLOCK_SIZE as u64;
318        self.file.seek(SeekFrom::Start(self.read_position))?;
319
320        // Read block
321        self.read_block()
322    }
323
324    /// Read blocks by importance (user keywords boost importance)
325    pub fn read_by_importance(&mut self, keywords: &[String]) -> Result<Vec<M8Block>> {
326        let mut blocks = Vec::new();
327
328        // Sort by importance
329        self.importance_index
330            .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
331
332        // Extract offsets to avoid holding any borrow of self while reading blocks
333        let offsets: Vec<u64> = self.importance_index.iter().map(|(off, _)| *off).collect();
334
335        for offset in offsets {
336            self.file.seek(SeekFrom::Start(offset))?;
337
338            if let Some(block) = self.read_block()? {
339                // Check if block contains keywords
340                let content_str = String::from_utf8_lossy(&block.content);
341                let has_keyword = keywords.iter().any(|kw| content_str.contains(kw));
342
343                if has_keyword || blocks.len() < 10 {
344                    blocks.push(block);
345                }
346
347                if blocks.len() >= 20 {
348                    break;
349                }
350            }
351        }
352
353        Ok(blocks)
354    }
355
356    /// Generate wave signature from content
357    fn generate_wave_signature(&self, content: &[u8]) -> [u8; 16] {
358        use sha2::{Digest, Sha256};
359
360        let mut hasher = Sha256::new();
361        hasher.update(content);
362        hasher.update(self.header.identity_freq.to_le_bytes());
363
364        let hash = hasher.finalize();
365        let mut signature = [0u8; 16];
366        signature.copy_from_slice(&hash[..16]);
367
368        signature
369    }
370
371    /// Write block to file
372    fn write_block(&mut self, block: &M8Block) -> Result<()> {
373        // Create fixed-size block buffer
374        let mut buffer = vec![0u8; BLOCK_SIZE];
375        let mut cursor = 0;
376
377        // Write block fields
378        buffer[cursor..cursor + 8].copy_from_slice(&block.index.to_le_bytes());
379        cursor += 8;
380
381        buffer[cursor..cursor + 16].copy_from_slice(&block.wave_signature);
382        cursor += 16;
383
384        buffer[cursor..cursor + 8].copy_from_slice(&block.timestamp.to_le_bytes());
385        cursor += 8;
386
387        buffer[cursor..cursor + 2].copy_from_slice(&block.importance.to_le_bytes());
388        cursor += 2;
389
390        buffer[cursor..cursor + 2].copy_from_slice(&block.token_id.to_le_bytes());
391        cursor += 2;
392
393        buffer[cursor..cursor + 32].copy_from_slice(&block.prev_hash);
394        cursor += 32;
395
396        buffer[cursor..cursor + 4].copy_from_slice(&block.content_len.to_le_bytes());
397        cursor += 4;
398
399        // Copy content (up to remaining space)
400        let content_space = BLOCK_SIZE - cursor;
401        let content_to_copy = block.content.len().min(content_space);
402        buffer[cursor..cursor + content_to_copy].copy_from_slice(&block.content[..content_to_copy]);
403
404        self.file.write_all(&buffer)?;
405
406        Ok(())
407    }
408
409    /// Read block from file
410    fn read_block(&mut self) -> Result<Option<M8Block>> {
411        let mut buffer = vec![0u8; BLOCK_SIZE];
412
413        match self.file.read_exact(&mut buffer) {
414            Ok(_) => {}
415            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => return Ok(None),
416            Err(e) => return Err(e.into()),
417        }
418
419        let mut cursor = 0;
420
421        let index = u64::from_le_bytes(buffer[cursor..cursor + 8].try_into()?);
422        cursor += 8;
423
424        let mut wave_signature = [0u8; 16];
425        wave_signature.copy_from_slice(&buffer[cursor..cursor + 16]);
426        cursor += 16;
427
428        let timestamp = u64::from_le_bytes(buffer[cursor..cursor + 8].try_into()?);
429        cursor += 8;
430
431        let importance = u16::from_le_bytes(buffer[cursor..cursor + 2].try_into()?);
432        cursor += 2;
433
434        let token_id = u16::from_le_bytes(buffer[cursor..cursor + 2].try_into()?);
435        cursor += 2;
436
437        let mut prev_hash = [0u8; 32];
438        prev_hash.copy_from_slice(&buffer[cursor..cursor + 32]);
439        cursor += 32;
440
441        let content_len = u32::from_le_bytes(buffer[cursor..cursor + 4].try_into()?);
442        cursor += 4;
443
444        let content = buffer[cursor..cursor + content_len as usize].to_vec();
445
446        Ok(Some(M8Block {
447            index,
448            wave_signature,
449            timestamp,
450            importance,
451            token_id,
452            prev_hash,
453            content_len,
454            content,
455        }))
456    }
457
458    /// Update header in file
459    fn update_header(&mut self) -> Result<()> {
460        self.file.seek(SeekFrom::Start(0))?;
461        self.file.write_all(&self.header.to_bytes())?;
462        self.file.flush()?;
463        Ok(())
464    }
465
466    /// Build importance index from file
467    fn build_importance_index(&mut self) -> Result<()> {
468        self.file
469            .seek(SeekFrom::Start(std::mem::size_of::<M8Header>() as u64))?;
470
471        loop {
472            let offset = self.file.stream_position()?;
473
474            match self.read_block()? {
475                Some(block) => {
476                    let importance = block.importance as f32 / 65535.0;
477                    self.importance_index.push((offset, importance));
478                }
479                None => break,
480            }
481        }
482
483        Ok(())
484    }
485}
486
487/// Convert existing JSON .m8 files to proper binary format
488pub fn convert_json_to_binary(json_path: &Path, binary_path: &Path) -> Result<()> {
489    use std::fs;
490
491    // Read and decompress if needed
492    let data = fs::read(json_path)?;
493
494    let json_str = if data.starts_with(b"\x78\x9c") || data.starts_with(b"\x78\xda") {
495        // zlib compressed
496        use flate2::read::ZlibDecoder;
497        let mut decoder = ZlibDecoder::new(&data[..]);
498        let mut decompressed = String::new();
499        decoder.read_to_string(&mut decompressed)?;
500        decompressed
501    } else {
502        String::from_utf8(data)?
503    };
504
505    // Parse JSON
506    let contexts: serde_json::Value = serde_json::from_str(&json_str)?;
507
508    // Create binary .m8 file
509    let mut m8_file = M8BinaryFile::create(binary_path)?;
510
511    // Convert contexts to blocks
512    if let Some(contexts_array) = contexts.get("contexts").and_then(|c| c.as_array()) {
513        for context in contexts_array {
514            let content = serde_json::to_vec(context)?;
515            let importance = context.get("score").and_then(|s| s.as_f64()).unwrap_or(0.5) as f32;
516
517            m8_file.append_block(&content, importance)?;
518        }
519    }
520
521    Ok(())
522}
523
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Writes three blocks, reopens the file, and checks both read paths.
    #[test]
    fn test_m8_binary_format() {
        let tmp = tempdir().unwrap();
        let m8_path = tmp.path().join("test.m8");

        // Write phase: create the file and append three memories.
        let mut writer = M8BinaryFile::create(&m8_path).unwrap();
        writer.append_block(b"First memory", 0.8).unwrap();
        writer.append_block(b"Second memory", 0.5).unwrap();
        writer.append_block(b"Important memory", 1.0).unwrap();

        // Read phase: a fresh handle starts its backwards cursor at the end.
        let mut reader = M8BinaryFile::open(&m8_path).unwrap();

        // The most recently appended block comes back first.
        let newest = reader.read_backwards().unwrap().unwrap();
        assert_eq!(newest.content.as_slice(), &b"Important memory"[..]);

        // Importance-ordered read with a keyword filter yields something.
        let keywords = ["Important".to_string()];
        let ranked = reader.read_by_importance(&keywords).unwrap();
        assert!(!ranked.is_empty());
    }

    /// Covers reserved lookups, dynamic allocation, and decoding.
    #[test]
    fn test_tokenization() {
        let mut map = TokenMap::new();

        // Reserved strings resolve to their fixed IDs.
        assert_eq!(map.get_token("node_modules"), 0x80);
        assert_eq!(map.get_token(".rs"), 0x90);
        assert_eq!(map.get_token("Claude"), 0xFFFE);

        // Unknown strings get increasing IDs from 0x100 upward.
        let first = map.get_token("custom_string");
        let second = map.get_token("another_string");
        assert!(first >= 0x100);
        assert!(second > first);

        // Both reserved and dynamic IDs decode back to their strings.
        assert_eq!(map.decode_token(0x80), Some("node_modules"));
        assert_eq!(map.decode_token(first), Some("custom_string"));
    }
}