Skip to main content

st/mem8/
format.rs

1//! .m8 file format implementation with Markqant compression
2//! Achieves 100:1 semantic-preserving compression for AI-native storage
3
4use crate::mem8::wave::MemoryWave;
5use anyhow::{anyhow, Result};
6use std::collections::HashMap;
7use std::io::Write;
8
/// Signature bytes written at the very start of every .m8 file:
/// ASCII `M8` followed by `0x02` and `0x09`.
const M8_MAGIC: &[u8] = &[b'M', b'8', 0x02, 0x09];
11
/// Section types in .m8 files.
///
/// The discriminant is the tag byte that the writer emits (via `as u8`) in front of
/// each section's size field, so the numeric values are part of the on-disk format
/// and must not be renumbered.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum SectionType {
    /// Markqant-compressed text (token table followed by the encoded text).
    MarkqantText = 0x09,
    /// Quantum directory section.
    QuantumDirectory = 0x0A,
    /// Packed 32-byte compressed wave records.
    WaveMemory = 0x0B,
    /// Metadata section.
    Metadata = 0x0C,
    /// Index section.
    Index = 0x0D,
}
22
/// .m8 file header.
///
/// The fields are serialized little-endian, in declaration order, directly after the
/// magic bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct M8Header {
    /// File format version
    pub version: u16,
    /// Number of sections
    pub section_count: u16,
    /// Total file size in bytes (excluding CRC)
    pub file_size: u64,
    /// Creation timestamp (the writer stores seconds since the Unix epoch)
    pub timestamp: u64,
}
35
36/// .m8 file section
37#[derive(Debug)]
38pub struct M8Section {
39    /// Section type
40    pub section_type: SectionType,
41    /// Section size in bytes
42    pub size: u32,
43    /// Section data
44    pub data: Vec<u8>,
45}
46
/// Wave memory compressed to 32 bytes.
///
/// The field widths add up to exactly 32 bytes (8 + 1 + 2 + 1 + 1 + 1 + 2 + 8 + 8),
/// which is the size of the serialized record.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompressedWave {
    /// Unique identifier (8 bytes).
    pub id: u64,
    /// Logarithmically quantized amplitude (1 byte).
    pub amplitude: u8,
    /// Frequency in Hz, stored as a whole number (2 bytes).
    pub frequency: u16,
    /// Phase stored as `(radians / PI + 1) * 127.5`, so -PI maps to 0 and +PI to 255
    /// (1 byte).
    pub phase: u8,
    /// Emotional valence * 127 (1 byte, signed).
    pub valence: i8,
    /// Emotional arousal * 255 (1 byte).
    pub arousal: u8,
    /// Decay constant in whole seconds; `u16::MAX` means "no decay constant" (2 bytes).
    pub decay_tau: u16,
    /// Wave timestamp in whole seconds, filled in by `from_wave` (8 bytes).
    pub timestamp: u64,
    /// Interference pattern hash; `from_wave` leaves it at 0 (8 bytes).
    pub interference: u64,
}
60
61impl CompressedWave {
62    /// Compress a MemoryWave to 32 bytes
63    pub fn from_wave(wave: &MemoryWave, id: u64) -> Self {
64        Self {
65            id,
66            amplitude: quantize_amplitude(wave.amplitude),
67            frequency: wave.frequency as u16,
68            phase: ((wave.phase / std::f32::consts::PI + 1.0) * 127.5) as u8,
69            valence: (wave.valence * 127.0) as i8,
70            arousal: (wave.arousal * 255.0) as u8,
71            decay_tau: wave
72                .decay_tau
73                .map(|d| d.as_secs() as u16)
74                .unwrap_or(u16::MAX),
75            timestamp: wave.created_at.elapsed().as_secs(),
76            interference: 0, // Calculated separately
77        }
78    }
79
80    /// Decompress to MemoryWave
81    pub fn to_wave(&self) -> MemoryWave {
82        let mut wave = MemoryWave::new(self.frequency as f32, dequantize_amplitude(self.amplitude));
83
84        wave.phase = (self.phase as f32 / 127.5 - 1.0) * std::f32::consts::PI;
85        wave.valence = self.valence as f32 / 127.0;
86        wave.arousal = self.arousal as f32 / 255.0;
87        wave.decay_tau = if self.decay_tau == u16::MAX {
88            None
89        } else {
90            Some(std::time::Duration::from_secs(self.decay_tau as u64))
91        };
92
93        wave
94    }
95
96    /// Serialize to bytes
97    pub fn to_bytes(&self) -> [u8; 32] {
98        let mut bytes = [0u8; 32];
99        bytes[0..8].copy_from_slice(&self.id.to_le_bytes());
100        bytes[8] = self.amplitude;
101        bytes[9..11].copy_from_slice(&self.frequency.to_le_bytes());
102        bytes[11] = self.phase;
103        bytes[12] = self.valence as u8;
104        bytes[13] = self.arousal;
105        bytes[14..16].copy_from_slice(&self.decay_tau.to_le_bytes());
106        bytes[16..24].copy_from_slice(&self.timestamp.to_le_bytes());
107        bytes[24..32].copy_from_slice(&self.interference.to_le_bytes());
108        bytes
109    }
110
111    /// Deserialize from bytes
112    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
113        if bytes.len() != 32 {
114            return Err(anyhow!("CompressedWave must be exactly 32 bytes"));
115        }
116
117        Ok(Self {
118            id: u64::from_le_bytes(bytes[0..8].try_into()?),
119            amplitude: bytes[8],
120            frequency: u16::from_le_bytes(bytes[9..11].try_into()?),
121            phase: bytes[11],
122            valence: bytes[12] as i8,
123            arousal: bytes[13],
124            decay_tau: u16::from_le_bytes(bytes[14..16].try_into()?),
125            timestamp: u64::from_le_bytes(bytes[16..24].try_into()?),
126            interference: u64::from_le_bytes(bytes[24..32].try_into()?),
127        })
128    }
129}
130
/// Logarithmic amplitude quantization.
///
/// Maps an amplitude onto one byte with 32 steps per octave, anchored so that an
/// amplitude of 1.0 becomes 255. Level 0 is reserved for silence: zero, negative, NaN
/// and amplitudes too small to reach level 1 all map to it. Amplitudes above 1.0
/// saturate at 255.
///
/// The previous mapping (`32 * log2(amplitude)` clamped to `0..=255`) sent every
/// amplitude in `(0, 1]` to level 0, which decodes back to 0.0.
fn quantize_amplitude(amplitude: f32) -> u8 {
    if amplitude.is_nan() || amplitude <= 0.0 {
        return 0;
    }

    // Round to the nearest step rather than truncating so the round-trip error is
    // symmetric; the clamp makes the final cast exact.
    let level = 255.0 + 32.0 * amplitude.log2();
    level.round().clamp(0.0, 255.0) as u8
}

/// Inverse logarithmic quantization.
///
/// Level 0 decodes to silence (0.0); any other level `q` decodes to
/// `2^((q - 255) / 32)`, so level 255 is exactly 1.0.
fn dequantize_amplitude(quantized: u8) -> f32 {
    if quantized == 0 {
        0.0
    } else {
        2.0_f32.powf((f32::from(quantized) - 255.0) / 32.0)
    }
}
148
/// Markqant v2.0 rotating token system.
///
/// Holds the bidirectional mapping between text patterns and the single-byte tokens
/// that replace them, together with the pattern statistics gathered so far.
#[derive(Debug, Clone)]
pub struct MarkqantEncoder {
    /// Token assignments (pattern -> token)
    tokens: HashMap<String, u8>,
    /// Reverse mapping (token -> pattern)
    patterns: HashMap<u8, String>,
    /// Pattern frequencies
    frequencies: HashMap<String, usize>,
    /// Next available token
    next_token: u8,
}
160
161impl Default for MarkqantEncoder {
162    fn default() -> Self {
163        Self::new()
164    }
165}
166
167impl MarkqantEncoder {
168    pub fn new() -> Self {
169        Self {
170            tokens: HashMap::new(),
171            patterns: HashMap::new(),
172            frequencies: HashMap::new(),
173            next_token: 0x80, // Start at 128
174        }
175    }
176
177    /// Analyze text and build token assignments
178    pub fn analyze(&mut self, text: &str) {
179        // Find all repeated substrings
180        let words: Vec<&str> = text.split_whitespace().collect();
181
182        // Count frequencies
183        for window_size in 1..=5 {
184            for i in 0..words.len().saturating_sub(window_size - 1) {
185                let pattern = words[i..i + window_size].join(" ");
186                *self.frequencies.entry(pattern).or_insert(0) += 1;
187            }
188        }
189
190        // Score patterns by (length - 1) * (frequency - 1)
191        let mut scored_patterns: Vec<_> = self
192            .frequencies
193            .iter()
194            .filter(|(_, &freq)| freq >= 2)
195            .map(|(pattern, &freq)| {
196                let score = (pattern.len() - 1) * (freq - 1);
197                (pattern.clone(), score)
198            })
199            .collect();
200
201        scored_patterns.sort_by_key(|(_, score)| std::cmp::Reverse(*score));
202
203        // Assign tokens to top patterns
204        for (pattern, _) in scored_patterns.iter().take(128) {
205            if self.next_token < 255 {
206                self.tokens.insert(pattern.clone(), self.next_token);
207                self.patterns.insert(self.next_token, pattern.clone());
208                self.next_token = self.next_token.saturating_add(1);
209            }
210        }
211    }
212
213    /// Encode text using assigned tokens
214    pub fn encode(&self, text: &str) -> Vec<u8> {
215        let mut result = Vec::new();
216        let mut remaining = text;
217
218        while !remaining.is_empty() {
219            let mut found = false;
220
221            // Try to match longest pattern first
222            for len in (1..=remaining.len()).rev() {
223                if let Some(&token) = self.tokens.get(&remaining[..len]) {
224                    result.push(token);
225                    remaining = &remaining[len..];
226                    found = true;
227                    break;
228                }
229            }
230
231            if !found {
232                // No pattern match, encode as raw byte
233                result.extend_from_slice(remaining.chars().next().unwrap().to_string().as_bytes());
234                remaining = &remaining[remaining.chars().next().unwrap().len_utf8()..];
235            }
236        }
237
238        result
239    }
240
241    /// Decode tokens back to text
242    pub fn decode(&self, data: &[u8]) -> Result<String> {
243        let mut result = String::new();
244        let mut i = 0;
245
246        while i < data.len() {
247            if data[i] >= 0x80 {
248                // Token
249                if let Some(pattern) = self.patterns.get(&data[i]) {
250                    result.push_str(pattern);
251                } else {
252                    return Err(anyhow!("Unknown token: 0x{:02x}", data[i]));
253                }
254                i += 1;
255            } else {
256                // Raw UTF-8
257                let ch = data[i] as char;
258                result.push(ch);
259                i += 1;
260            }
261        }
262
263        Ok(result)
264    }
265}
266
267/// .m8 file writer
268pub struct M8Writer<W: Write> {
269    writer: W,
270    sections: Vec<M8Section>,
271}
272
273impl<W: Write> M8Writer<W> {
274    pub fn new(writer: W) -> Self {
275        Self {
276            writer,
277            sections: Vec::new(),
278        }
279    }
280
281    /// Add a wave memory section
282    pub fn add_wave_memory(&mut self, waves: &[CompressedWave]) -> Result<()> {
283        let mut data = Vec::with_capacity(waves.len() * 32);
284
285        for wave in waves {
286            data.extend_from_slice(&wave.to_bytes());
287        }
288
289        self.sections.push(M8Section {
290            section_type: SectionType::WaveMemory,
291            size: data.len() as u32,
292            data,
293        });
294
295        Ok(())
296    }
297
298    /// Add a Markqant-compressed text section
299    pub fn add_markqant_text(&mut self, text: &str) -> Result<()> {
300        let mut encoder = MarkqantEncoder::new();
301        encoder.analyze(text);
302        let encoded = encoder.encode(text);
303
304        // Store token table followed by encoded data
305        let mut data = Vec::new();
306
307        // Token table header
308        data.extend_from_slice(&(encoder.patterns.len() as u16).to_le_bytes());
309
310        // Token definitions
311        for (token, pattern) in &encoder.patterns {
312            data.push(*token);
313            data.extend_from_slice(&(pattern.len() as u16).to_le_bytes());
314            data.extend_from_slice(pattern.as_bytes());
315        }
316
317        // Encoded text
318        data.extend_from_slice(&(encoded.len() as u32).to_le_bytes());
319        data.extend_from_slice(&encoded);
320
321        self.sections.push(M8Section {
322            section_type: SectionType::MarkqantText,
323            size: data.len() as u32,
324            data,
325        });
326
327        Ok(())
328    }
329
330    /// Write the complete .m8 file
331    pub fn finish(mut self) -> Result<()> {
332        // Write magic bytes
333        self.writer.write_all(M8_MAGIC)?;
334
335        // Calculate total size
336        let header_size = 16; // Magic + header fields
337        let section_headers_size = self.sections.len() * 8; // Type + size per section
338        let data_size: usize = self.sections.iter().map(|s| s.data.len()).sum();
339        let total_size = header_size + section_headers_size + data_size + 4; // +4 for CRC
340
341        // Write header
342        let header = M8Header {
343            version: 1,
344            section_count: self.sections.len() as u16,
345            file_size: total_size as u64,
346            timestamp: std::time::SystemTime::now()
347                .duration_since(std::time::UNIX_EPOCH)?
348                .as_secs(),
349        };
350
351        self.writer.write_all(&header.version.to_le_bytes())?;
352        self.writer.write_all(&header.section_count.to_le_bytes())?;
353        self.writer.write_all(&header.file_size.to_le_bytes())?;
354        self.writer.write_all(&header.timestamp.to_le_bytes())?;
355
356        // Write sections
357        for section in &self.sections {
358            self.writer.write_all(&[section.section_type as u8])?;
359            self.writer.write_all(&section.size.to_le_bytes())?;
360            self.writer.write_all(&section.data)?;
361        }
362
363        // Calculate and write CRC32
364        let crc = 0u32; // TODO: Implement actual CRC32
365        self.writer.write_all(&crc.to_le_bytes())?;
366
367        Ok(())
368    }
369}
370
371/// Example of .m8 format usage
372pub fn create_example_m8() -> Result<Vec<u8>> {
373    let mut buffer = Vec::new();
374    let mut writer = M8Writer::new(&mut buffer);
375
376    // Add some wave memories
377    let waves = vec![
378        CompressedWave::from_wave(&MemoryWave::new(440.0, 0.8), 1),
379        CompressedWave::from_wave(&MemoryWave::new(880.0, 0.6), 2),
380    ];
381    writer.add_wave_memory(&waves)?;
382
383    // Add some text
384    writer.add_markqant_text("The user is cooking in the kitchen at 6PM")?;
385
386    writer.finish()?;
387    Ok(buffer)
388}
389
#[cfg(test)]
mod tests {
    use super::*;

    /// A wave survives compression with frequency and valence close to the originals.
    #[test]
    fn test_wave_compression() {
        let mut original = MemoryWave::new(440.0, 0.8);
        original.valence = 0.7;
        original.arousal = 0.4;

        let packed = CompressedWave::from_wave(&original, 12345);
        let record = packed.to_bytes();
        assert_eq!(record.len(), 32);

        let restored = packed.to_wave();
        let frequency_error = (restored.frequency - 440.0).abs();
        let valence_error = (restored.valence - 0.7).abs();
        assert!(frequency_error < 1.0);
        assert!(valence_error < 0.01);
    }

    /// Text with a repeated word round-trips and comes out shorter than the input.
    #[test]
    fn test_markqant_encoding() {
        let text = "the cat in the hat sat on the mat";

        let mut encoder = MarkqantEncoder::new();
        encoder.analyze(text);

        let packed = encoder.encode(text);
        let unpacked = encoder.decode(&packed).unwrap();

        assert_eq!(unpacked, text);
        assert!(packed.len() < text.len()); // Should compress
    }

    /// The example file starts with the magic bytes and is not trivially small.
    #[test]
    fn test_m8_creation() {
        let file_bytes = create_example_m8().unwrap();
        assert!(file_bytes.starts_with(M8_MAGIC));
        assert!(file_bytes.len() > 100); // Should have some content
    }
}
427}