use anyhow::{Context, Result};
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
/// Four-byte magic written at the very start of the header by `create` and
/// compared against the file's first four bytes by `open`.
const M8_MAGIC: &[u8; 4] = b"MEM8";
/// Format version byte that `create` stores in the header.
const M8_VERSION: u8 = 1;
/// Size in bytes of every serialized block; `write_block` always emits a
/// buffer of exactly this length and `read_block` always reads one.
const BLOCK_SIZE: usize = 4096;
/// File header stored at offset 0 of an `.m8` file.
///
/// The struct is `packed`, so `size_of::<M8Header>()` is the plain sum of the
/// field sizes (4 + 1 + 1 + 4 + 8 + 8 + 4 = 30 bytes). The rest of this file
/// uses that value as the on-disk header length, so the field list and the
/// `packed` attribute must stay in sync with `to_bytes` / `from_bytes`.
#[repr(C, packed)]
pub struct M8Header {
// Format magic; `open` rejects files whose value differs from `M8_MAGIC`.
magic: [u8; 4],
// Format version; `create` writes `M8_VERSION`.
version: u8,
// Flag bits; `create` writes 0 and nothing visible here interprets them.
flags: u8,
// Number of blocks appended so far; bumped by `append_block`.
block_count: u32,
// Mixed into every wave signature; `create` initialises it to 440.0.
identity_freq: f64,
// `create` writes 0.0; not otherwise read in the code visible here.
temporal_phase: f64,
// `create` writes 0; no code visible here computes or verifies a checksum.
crc32: u32,
}
impl M8Header {
    /// Serializes the header field by field, in declaration order, using
    /// little-endian encoding for every multi-byte value.
    fn to_bytes(&self) -> Vec<u8> {
        let mut out = Vec::with_capacity(std::mem::size_of::<Self>());
        out.extend_from_slice(&self.magic);
        out.extend_from_slice(&[self.version, self.flags]);
        out.extend_from_slice(&self.block_count.to_le_bytes());
        out.extend_from_slice(&self.identity_freq.to_le_bytes());
        out.extend_from_slice(&self.temporal_phase.to_le_bytes());
        out.extend_from_slice(&self.crc32.to_le_bytes());
        out
    }

    /// Parses a header from the front of `bytes`.
    ///
    /// Only the length is checked here; the magic and version are returned
    /// as-is for the caller to validate.
    ///
    /// # Errors
    ///
    /// Fails with "Invalid header size" when `bytes` is shorter than the
    /// packed size of the header.
    fn from_bytes(bytes: &[u8]) -> Result<Self> {
        if bytes.len() < std::mem::size_of::<Self>() {
            anyhow::bail!("Invalid header size");
        }

        // Peel the fields off the front one at a time; the length check above
        // guarantees every split is in bounds.
        let (magic_raw, rest) = bytes.split_at(4);
        let (version_and_flags, rest) = rest.split_at(2);
        let (block_count_raw, rest) = rest.split_at(4);
        let (identity_raw, rest) = rest.split_at(8);
        let (phase_raw, rest) = rest.split_at(8);
        let crc_raw = &rest[..4];

        let magic: [u8; 4] = magic_raw.try_into()?;
        let version = version_and_flags[0];
        let flags = version_and_flags[1];
        let block_count = u32::from_le_bytes(block_count_raw.try_into()?);
        let identity_freq = f64::from_le_bytes(identity_raw.try_into()?);
        let temporal_phase = f64::from_le_bytes(phase_raw.try_into()?);
        let crc32 = u32::from_le_bytes(crc_raw.try_into()?);

        Ok(Self {
            magic,
            version,
            flags,
            block_count,
            identity_freq,
            temporal_phase,
            crc32,
        })
    }
}
/// In-memory form of one block of an `.m8` file.
///
/// Blocks are not written by copying this struct; `write_block` and
/// `read_block` serialize the fields one by one in the order declared here.
#[repr(C)]
#[derive(Debug, Clone)]
pub struct M8Block {
// Sequence number; `append_block` uses the header's block count at append time.
pub index: u64,
// First 16 bytes of SHA-256 over the content followed by the header's
// `identity_freq` (little-endian bytes).
pub wave_signature: [u8; 16],
// Microseconds since the Unix epoch at append time.
pub timestamp: u64,
// Importance scaled by 65535 and cast to u16.
pub importance: u16,
// `append_block` always writes 0 here.
pub token_id: u16,
// `append_block` always writes 32 zero bytes here.
pub prev_hash: [u8; 32],
// Content length in bytes as recorded in the block.
pub content_len: u32,
// Raw content bytes.
pub content: Vec<u8>,
}
/// Bidirectional mapping between strings and compact 16-bit token IDs.
///
/// A fixed set of well-known strings is pre-registered under reserved IDs
/// (`0x80..=0x84`, `0x90..=0x93`, `0xFFFE`, `0xFFFF`). Any other string is
/// assigned an ID on first use, starting at `0x100`.
pub struct TokenMap {
    /// Forward lookup: string -> token ID.
    str_to_token: HashMap<String, u16>,
    /// Reverse lookup: token ID -> string.
    token_to_str: HashMap<u16, String>,
    /// Next candidate ID for a dynamically assigned token.
    next_id: u16,
    /// The pre-registered tokens installed by `new`, keyed by ID.
    reserved: HashMap<u16, String>,
}

impl Default for TokenMap {
    fn default() -> Self {
        Self::new()
    }
}

impl TokenMap {
    /// Creates a map that already contains the reserved tokens.
    pub fn new() -> Self {
        const RESERVED: [(u16, &str); 11] = [
            (0x80, "node_modules"),
            (0x81, ".git"),
            (0x82, "target"),
            (0x83, "dist"),
            (0x84, "build"),
            (0x90, ".rs"),
            (0x91, ".py"),
            (0x92, ".js"),
            (0x93, ".ts"),
            (0xFFFE, "Claude"),
            (0xFFFF, "Hue"),
        ];

        let reserved: HashMap<u16, String> = RESERVED
            .iter()
            .map(|&(id, text)| (id, text.to_string()))
            .collect();
        let str_to_token = RESERVED
            .iter()
            .map(|&(id, text)| (text.to_string(), id))
            .collect();
        let token_to_str = reserved.clone();

        Self {
            str_to_token,
            token_to_str,
            next_id: 0x100,
            reserved,
        }
    }

    /// Returns the token ID for `s`, assigning a fresh one on first use.
    ///
    /// Fresh IDs are handed out in increasing order from `0x100`. IDs that are
    /// already taken (including the reserved `0xFFFE` / `0xFFFF`) are skipped,
    /// so an existing mapping is never overwritten; once the counter passes
    /// `0xFFFF` it wraps and continues with the unused low IDs.
    ///
    /// # Panics
    ///
    /// Panics if all 65536 IDs are already in use.
    pub fn get_token(&mut self, s: &str) -> u16 {
        if let Some(&token) = self.str_to_token.get(s) {
            return token;
        }

        // Guarantees the search below terminates: at least one ID is free.
        assert!(
            self.token_to_str.len() <= usize::from(u16::MAX),
            "TokenMap exhausted: all 65536 token IDs are in use"
        );
        while self.token_to_str.contains_key(&self.next_id) {
            self.next_id = self.next_id.wrapping_add(1);
        }

        let token = self.next_id;
        self.next_id = self.next_id.wrapping_add(1);
        self.str_to_token.insert(s.to_string(), token);
        self.token_to_str.insert(token, s.to_string());
        token
    }

    /// Returns the string registered under `token`, if any.
    pub fn decode_token(&self, token: u16) -> Option<&str> {
        self.token_to_str.get(&token).map(String::as_str)
    }
}
/// Handle to an `.m8` file on disk: one header followed by fixed-size blocks.
pub struct M8BinaryFile {
// Path the file was created at or opened from.
path: PathBuf,
// Underlying file, opened for both reading and writing.
file: File,
// In-memory copy of the header; rewritten to disk after every append.
header: M8Header,
// Token table; initialised with `TokenMap::new()` by `create` and `open`.
tokens: TokenMap,
// Byte offset from which `read_backwards` steps back one block at a time.
read_position: u64,
// One (byte offset, importance) pair per known block.
importance_index: Vec<(u64, f32)>, }
impl M8BinaryFile {
    /// Byte length of the serialized header that starts every file.
    const HEADER_SIZE: u64 = std::mem::size_of::<M8Header>() as u64;
    /// Bytes of fixed metadata stored ahead of the content in each block:
    /// index (8) + wave signature (16) + timestamp (8) + importance (2) +
    /// token id (2) + previous hash (32) + content length (4).
    const BLOCK_META_SIZE: usize = 8 + 16 + 8 + 2 + 2 + 32 + 4;
    /// Largest content payload that fits in a single block.
    const MAX_CONTENT_LEN: usize = BLOCK_SIZE - Self::BLOCK_META_SIZE;

    /// Creates (or truncates) the file at `path` and writes a fresh header.
    ///
    /// The backward-read cursor starts right after the header, so
    /// `read_backwards` yields nothing on a handle obtained from `create`.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be created or the header cannot be written.
    pub fn create(path: impl AsRef<Path>) -> Result<Self> {
        let path = path.as_ref().to_path_buf();
        let mut file = OpenOptions::new()
            .create(true)
            .write(true)
            .read(true)
            .truncate(true)
            .open(&path)
            .context("Failed to create .m8 file")?;
        let header = M8Header {
            magic: *M8_MAGIC,
            version: M8_VERSION,
            flags: 0,
            block_count: 0,
            identity_freq: 440.0,
            temporal_phase: 0.0,
            crc32: 0,
        };
        file.write_all(&header.to_bytes())?;
        Ok(Self {
            path,
            file,
            header,
            tokens: TokenMap::new(),
            read_position: Self::HEADER_SIZE,
            importance_index: Vec::new(),
        })
    }

    /// Opens an existing file, validates its magic and indexes its blocks.
    ///
    /// The backward-read cursor is placed at the end of the last complete
    /// block, so the first `read_backwards` call returns the newest block.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be opened, is shorter than a header, or does
    /// not start with the expected magic.
    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
        let path = path.as_ref().to_path_buf();
        let mut file = OpenOptions::new()
            .read(true)
            .write(true)
            .open(&path)
            .context("Failed to open .m8 file")?;
        let mut header_bytes = vec![0u8; std::mem::size_of::<M8Header>()];
        file.read_exact(&mut header_bytes)?;
        let header = M8Header::from_bytes(&header_bytes)?;
        if header.magic != *M8_MAGIC {
            anyhow::bail!("Invalid .m8 file (bad magic)");
        }

        // Ignore a trailing partial block (the index scan skips it as well),
        // so the cursor always sits on a block boundary.
        let file_size = file.seek(SeekFrom::End(0))?;
        let whole_blocks = file_size.saturating_sub(Self::HEADER_SIZE) / BLOCK_SIZE as u64;
        let read_position = Self::HEADER_SIZE + whole_blocks * BLOCK_SIZE as u64;

        let mut m8 = Self {
            path,
            file,
            header,
            tokens: TokenMap::new(),
            read_position,
            importance_index: Vec::new(),
        };
        m8.build_importance_index()?;
        Ok(m8)
    }

    /// Appends one block holding `content` to the end of the file.
    ///
    /// `importance` is scaled by 65535 and stored as a `u16`; the float-to-int
    /// cast saturates, so values outside `0.0..=1.0` are clamped and NaN
    /// becomes 0. Content longer than one block's payload capacity is
    /// truncated to fit. The wave signature is still computed over the
    /// complete `content` as passed in.
    ///
    /// # Errors
    ///
    /// Fails if the system clock is before the Unix epoch or on any I/O error.
    pub fn append_block(&mut self, content: &[u8], importance: f32) -> Result<()> {
        let timestamp = SystemTime::now().duration_since(UNIX_EPOCH)?.as_micros() as u64;
        let wave_sig = self.generate_wave_signature(content);
        let quantized = (importance * 65535.0) as u16;
        let stored = &content[..content.len().min(Self::MAX_CONTENT_LEN)];
        let block = M8Block {
            index: self.header.block_count as u64,
            wave_signature: wave_sig,
            timestamp,
            importance: quantized,
            token_id: 0,
            prev_hash: [0; 32],
            // Bounded by MAX_CONTENT_LEN, so the cast cannot truncate.
            content_len: stored.len() as u32,
            content: stored.to_vec(),
        };

        // Capture the block's own offset before anything else moves the file
        // cursor; rewriting the header below seeks back to the start.
        let offset = self.file.seek(SeekFrom::End(0))?;
        self.write_block(&block)?;
        self.header.block_count += 1;
        self.update_header()?;

        // Index the value that was actually persisted so the in-memory index
        // matches what a later `open` rebuilds from disk.
        self.importance_index
            .push((offset, f32::from(quantized) / 65535.0));
        Ok(())
    }

    /// Steps the read cursor back by one block and returns that block.
    ///
    /// Returns `Ok(None)` once no complete block remains between the header
    /// and the cursor.
    pub fn read_backwards(&mut self) -> Result<Option<M8Block>> {
        let block_len = BLOCK_SIZE as u64;
        if self.read_position < Self::HEADER_SIZE + block_len {
            return Ok(None);
        }
        self.read_position -= block_len;
        self.file.seek(SeekFrom::Start(self.read_position))?;
        self.read_block()
    }

    /// Reads blocks in descending order of importance.
    ///
    /// The first ten blocks visited are always kept; after that a block is
    /// kept only if its content (decoded lossily as UTF-8) contains at least
    /// one of `keywords`. Reading stops once twenty blocks are collected.
    /// The importance index is left sorted as a side effect.
    pub fn read_by_importance(&mut self, keywords: &[String]) -> Result<Vec<M8Block>> {
        let mut blocks = Vec::new();
        // `total_cmp` gives a total order, so a NaN cannot cause a panic.
        self.importance_index.sort_by(|a, b| b.1.total_cmp(&a.1));
        let offsets: Vec<u64> = self.importance_index.iter().map(|&(off, _)| off).collect();
        for offset in offsets {
            self.file.seek(SeekFrom::Start(offset))?;
            let Some(block) = self.read_block()? else {
                continue;
            };
            let content_str = String::from_utf8_lossy(&block.content);
            let has_keyword = keywords.iter().any(|kw| content_str.contains(kw.as_str()));
            if has_keyword || blocks.len() < 10 {
                blocks.push(block);
            }
            if blocks.len() >= 20 {
                break;
            }
        }
        Ok(blocks)
    }

    /// Derives a 16-byte signature: the first half of SHA-256 over `content`
    /// followed by the header's `identity_freq` in little-endian bytes.
    fn generate_wave_signature(&self, content: &[u8]) -> [u8; 16] {
        use sha2::{Digest, Sha256};
        let mut hasher = Sha256::new();
        hasher.update(content);
        hasher.update(self.header.identity_freq.to_le_bytes());
        let hash = hasher.finalize();
        let mut signature = [0u8; 16];
        signature.copy_from_slice(&hash[..16]);
        signature
    }

    /// Serializes `block` into one zero-padded `BLOCK_SIZE` buffer and writes
    /// it at the current file position.
    ///
    /// Content beyond the payload capacity is dropped, and the length field
    /// written to disk is the number of content bytes actually stored, so the
    /// block can always be read back.
    fn write_block(&mut self, block: &M8Block) -> Result<()> {
        let stored_len = block.content.len().min(Self::MAX_CONTENT_LEN);
        let mut buffer = vec![0u8; BLOCK_SIZE];
        let mut cursor = 0;
        buffer[cursor..cursor + 8].copy_from_slice(&block.index.to_le_bytes());
        cursor += 8;
        buffer[cursor..cursor + 16].copy_from_slice(&block.wave_signature);
        cursor += 16;
        buffer[cursor..cursor + 8].copy_from_slice(&block.timestamp.to_le_bytes());
        cursor += 8;
        buffer[cursor..cursor + 2].copy_from_slice(&block.importance.to_le_bytes());
        cursor += 2;
        buffer[cursor..cursor + 2].copy_from_slice(&block.token_id.to_le_bytes());
        cursor += 2;
        buffer[cursor..cursor + 32].copy_from_slice(&block.prev_hash);
        cursor += 32;
        // stored_len <= MAX_CONTENT_LEN < u32::MAX, so the cast is lossless.
        buffer[cursor..cursor + 4].copy_from_slice(&(stored_len as u32).to_le_bytes());
        cursor += 4;
        buffer[cursor..cursor + stored_len].copy_from_slice(&block.content[..stored_len]);
        self.file.write_all(&buffer)?;
        Ok(())
    }

    /// Reads one block from the current file position.
    ///
    /// Returns `Ok(None)` when fewer than `BLOCK_SIZE` bytes remain. A
    /// recorded content length larger than the payload area (corrupt data, or
    /// a block whose oversized content was truncated on write while its
    /// original length was recorded) is clamped to the payload area instead
    /// of panicking; the returned `content_len` always equals `content.len()`.
    fn read_block(&mut self) -> Result<Option<M8Block>> {
        let mut buffer = vec![0u8; BLOCK_SIZE];
        match self.file.read_exact(&mut buffer) {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => return Ok(None),
            Err(e) => return Err(e.into()),
        }
        let mut cursor = 0;
        let index = u64::from_le_bytes(buffer[cursor..cursor + 8].try_into()?);
        cursor += 8;
        let mut wave_signature = [0u8; 16];
        wave_signature.copy_from_slice(&buffer[cursor..cursor + 16]);
        cursor += 16;
        let timestamp = u64::from_le_bytes(buffer[cursor..cursor + 8].try_into()?);
        cursor += 8;
        let importance = u16::from_le_bytes(buffer[cursor..cursor + 2].try_into()?);
        cursor += 2;
        let token_id = u16::from_le_bytes(buffer[cursor..cursor + 2].try_into()?);
        cursor += 2;
        let mut prev_hash = [0u8; 32];
        prev_hash.copy_from_slice(&buffer[cursor..cursor + 32]);
        cursor += 32;
        let recorded_len = u32::from_le_bytes(buffer[cursor..cursor + 4].try_into()?) as usize;
        cursor += 4;
        // Never trust the on-disk length further than the buffer extends.
        let stored_len = recorded_len.min(BLOCK_SIZE - cursor);
        let content = buffer[cursor..cursor + stored_len].to_vec();
        Ok(Some(M8Block {
            index,
            wave_signature,
            timestamp,
            importance,
            token_id,
            prev_hash,
            content_len: stored_len as u32,
            content,
        }))
    }

    /// Rewrites the header at the start of the file and flushes.
    ///
    /// Leaves the file cursor just past the header.
    fn update_header(&mut self) -> Result<()> {
        self.file.seek(SeekFrom::Start(0))?;
        self.file.write_all(&self.header.to_bytes())?;
        self.file.flush()?;
        Ok(())
    }

    /// Scans every complete block after the header and records its byte
    /// offset together with its importance rescaled to `0.0..=1.0`.
    fn build_importance_index(&mut self) -> Result<()> {
        self.file.seek(SeekFrom::Start(Self::HEADER_SIZE))?;
        loop {
            let offset = self.file.stream_position()?;
            match self.read_block()? {
                Some(block) => {
                    let importance = block.importance as f32 / 65535.0;
                    self.importance_index.push((offset, importance));
                }
                None => break,
            }
        }
        Ok(())
    }
}
pub fn convert_json_to_binary(json_path: &Path, binary_path: &Path) -> Result<()> {
use std::fs;
let data = fs::read(json_path)?;
let json_str = if data.starts_with(b"\x78\x9c") || data.starts_with(b"\x78\xda") {
use flate2::read::ZlibDecoder;
let mut decoder = ZlibDecoder::new(&data[..]);
let mut decompressed = String::new();
decoder.read_to_string(&mut decompressed)?;
decompressed
} else {
String::from_utf8(data)?
};
let contexts: serde_json::Value = serde_json::from_str(&json_str)?;
let mut m8_file = M8BinaryFile::create(binary_path)?;
if let Some(contexts_array) = contexts.get("contexts").and_then(|c| c.as_array()) {
for context in contexts_array {
let content = serde_json::to_vec(context)?;
let importance = context.get("score").and_then(|s| s.as_f64()).unwrap_or(0.5) as f32;
m8_file.append_block(&content, importance)?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Writes three blocks, reopens the file, and checks that the newest
    /// block is read first and that an importance query returns something.
    #[test]
    fn test_m8_binary_format() {
        let scratch = tempdir().unwrap();
        let m8_path = scratch.path().join("test.m8");

        let mut writer = M8BinaryFile::create(&m8_path).unwrap();
        writer.append_block(b"First memory", 0.8).unwrap();
        writer.append_block(b"Second memory", 0.5).unwrap();
        writer.append_block(b"Important memory", 1.0).unwrap();

        let mut reader = M8BinaryFile::open(&m8_path).unwrap();
        let newest = reader.read_backwards().unwrap().unwrap();
        assert_eq!(&newest.content, b"Important memory");

        let wanted = ["Important".to_string()];
        let ranked = reader.read_by_importance(&wanted).unwrap();
        assert!(!ranked.is_empty());
    }

    /// Reserved strings keep their fixed IDs; new strings get increasing IDs
    /// from 0x100 and decode back to the original text.
    #[test]
    fn test_tokenization() {
        let mut map = TokenMap::new();

        assert_eq!(map.get_token("node_modules"), 0x80);
        assert_eq!(map.get_token(".rs"), 0x90);
        assert_eq!(map.get_token("Claude"), 0xFFFE);

        let custom = map.get_token("custom_string");
        let another = map.get_token("another_string");
        assert!(custom >= 0x100);
        assert!(another > custom);

        assert_eq!(map.decode_token(0x80), Some("node_modules"));
        assert_eq!(map.decode_token(custom), Some("custom_string"));
    }
}