use stowken::{
storage::{FilesystemBackend, MemoryBackend},
types::{Conversation, Message, MessageContent, StowkenConfig},
Stowken,
};
use tempfile::TempDir;
/// Builds a two-message [`Conversation`] fixture: a `system` message carrying
/// `sys` followed by a `user` message carrying `user`, both supplied as
/// already-tokenized content.
///
/// The conversation is labelled with application `substring-test`, model
/// `gpt-4` and tokenizer `cl100k_base`, and carries no metadata.
fn make_conv(id: &str, sys: Vec<u32>, user: Vec<u32>) -> Conversation {
    // Both messages have the same shape; only the role and the tokens differ.
    let token_message = |role: &str, tokens: Vec<u32>| Message {
        role: String::from(role),
        content: MessageContent::Tokens(tokens),
        name: None,
        tool_call_id: None,
    };
    let messages = vec![token_message("system", sys), token_message("user", user)];

    Conversation {
        id: Some(String::from(id)),
        application: Some(String::from("substring-test")),
        model: String::from("gpt-4"),
        tokenizer: String::from("cl100k_base"),
        messages,
        metadata: None,
    }
}
/// Generates a deterministic sequence of `length` tokens that is the identity
/// ramp `0, 1, 2, …` with up to three seed-dependent positions overwritten by
/// values in `90_000..95_000`.
///
/// Sequences built from different seeds therefore share long identical runs
/// and differ only in a handful of places.
///
/// The overwritten positions are `seed`, `seed * 7` and `seed * 13`, each
/// reduced modulo `length`. They are written in that order, so when two of
/// them coincide the later write wins. All arithmetic on `seed` wraps on
/// overflow, which keeps debug and release builds in agreement for large
/// seeds.
///
/// Returns an empty vector when `length` is zero.
fn make_drift_system(seed: u32, length: usize) -> Vec<u32> {
    // Token ids are `u32`, so callers are expected to pass a length that fits;
    // a wider value would be truncated by this cast.
    let len = length as u32;
    let mut tokens: Vec<u32> = (0..len).collect();
    if len == 0 {
        // Nothing to perturb, and the `% len` below would panic on zero.
        return tokens;
    }

    let positions = [
        (seed % len) as usize,
        (seed.wrapping_mul(7) % len) as usize,
        (seed.wrapping_mul(13) % len) as usize,
    ];
    for (i, &p) in positions.iter().enumerate() {
        let drift = seed.wrapping_mul(17).wrapping_add(i as u32 * 11) % 5_000;
        tokens[p] = 90_000 + drift;
    }
    tokens
}
/// Stores 30 conversations whose 200-token system prompts differ in only a few
/// positions, runs `train_substrings(30, 5)`, and checks that:
///
/// * the registry is empty before training,
/// * at least one substring is promoted,
/// * the registry size equals the number of promoted entries, and
/// * every promoted entry reports `length >= 16` and
///   `source_occurrences >= 5`.
#[tokio::test]
async fn discovery_promotes_frequent_windows() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    for seed in 0..30u32 {
        let id = format!("seed-{seed}");
        let conv = make_conv(&id, make_drift_system(seed, 200), vec![1000 + seed]);
        vault.store(conv).await.unwrap();
    }

    // Storing alone must not register anything.
    assert_eq!(vault.list_substrings().len(), 0);

    let promoted = vault.train_substrings(30, 5).await.unwrap();
    assert!(
        !promoted.is_empty(),
        "discovery should promote at least one substring on a high-overlap corpus"
    );
    assert_eq!(vault.list_substrings().len(), promoted.len());

    for entry in &promoted {
        assert!(entry.length >= 16, "promoted substring must clear MIN_LENGTH");
        assert!(entry.source_occurrences >= 5, "must clear min_occurrences");
    }
}
/// Trains on 30 drifting 300-token system prompts, then stores a prompt built
/// from an unseen seed and checks that it is retrieved token-for-token.
///
/// NOTE(review): despite the name, nothing here inspects which frame type was
/// written; the test only shows the round trip is lossless once substrings
/// have been promoted.
#[tokio::test]
async fn substring_frame_used_after_training() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    for seed in 0..30u32 {
        let id = format!("seed-{seed}");
        let conv = make_conv(&id, make_drift_system(seed, 300), vec![5000 + seed]);
        vault.store(conv).await.unwrap();
    }

    let promoted = vault.train_substrings(30, 5).await.unwrap();
    assert!(!promoted.is_empty(), "training produced no substrings");

    let novel_id = "post-train";
    let expected_sys = make_drift_system(999, 300);
    let user_tokens = vec![9_999_001, 9_999_002];
    let novel_conv = make_conv(novel_id, expected_sys.clone(), user_tokens);
    vault.store(novel_conv).await.unwrap();

    let retrieved = vault.retrieve(novel_id).await.unwrap();
    let sys_seg = retrieved
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(sys_seg.tokens, expected_sys);
}
/// Trains on 20 drifting 250-token system prompts, stores one more prompt
/// (seed 42) and checks the retrieved system-prompt tokens equal what was
/// stored.
#[tokio::test]
async fn substring_round_trip_is_exact() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    for seed in 0..20u32 {
        let id = format!("seed-{seed}");
        let conv = make_conv(&id, make_drift_system(seed, 250), vec![100 + seed]);
        vault.store(conv).await.unwrap();
    }
    vault.train_substrings(20, 5).await.unwrap();

    let expected = make_drift_system(42, 250);
    let conv = make_conv("rt", expected.clone(), vec![99_999]);
    vault.store(conv).await.unwrap();

    let retrieved = vault.retrieve("rt").await.unwrap();
    let sys_seg = retrieved
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(sys_seg.tokens, expected);
}
/// Without any training, a stored 200-token system prompt is retrieved
/// unchanged and the substring registry stays empty.
///
/// NOTE(review): the name refers to `0x05` frames — presumably the tag used
/// for substring-encoded frames — but the test does not look at frame bytes;
/// confirm against the codec.
#[tokio::test]
async fn no_substrings_means_no_0x05_frames() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    let expected = (0..200).collect::<Vec<u32>>();
    let conv = make_conv("only", expected.clone(), vec![1, 2, 3]);
    vault.store(conv).await.unwrap();

    let retrieved = vault.retrieve("only").await.unwrap();
    let sys_seg = retrieved
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(sys_seg.tokens, expected);
    assert_eq!(vault.list_substrings().len(), 0);
}
/// Opens a filesystem-backed vault in a temporary directory, trains on 30
/// drifting prompts, stores one post-training conversation, and drops the
/// vault. A second vault opened on the same directory and database path must
/// list the same number of substrings and return the post-training system
/// prompt unchanged.
#[tokio::test]
async fn substring_persistence_across_vault_reopen() {
    // `dir` must outlive both sessions; dropping it removes the directory.
    let dir = TempDir::new().unwrap();
    let data_path = dir.path().to_path_buf();
    let db_str = data_path
        .join("metadata.db")
        .to_str()
        .unwrap()
        .to_owned();
    let expected = make_drift_system(777, 250);

    // First session: seed, train, and store one conversation after training.
    let promoted_count = {
        let backend = FilesystemBackend::new(&data_path).await.unwrap();
        let vault = Stowken::open(backend, StowkenConfig::default(), &db_str)
            .await
            .unwrap();

        for seed in 0..30u32 {
            let id = format!("seed-{seed}");
            let conv = make_conv(&id, make_drift_system(seed, 250), vec![1000 + seed]);
            vault.store(conv).await.unwrap();
        }

        let promoted = vault.train_substrings(30, 5).await.unwrap();
        let promoted_count = promoted.len();
        assert!(promoted_count > 0);

        let conv = make_conv("post-train", expected.clone(), vec![88_888]);
        vault.store(conv).await.unwrap();
        promoted_count
    };

    // Second session: everything must be readable from persisted state only.
    {
        let backend = FilesystemBackend::new(&data_path).await.unwrap();
        let vault = Stowken::open(backend, StowkenConfig::default(), &db_str)
            .await
            .unwrap();

        assert_eq!(
            vault.list_substrings().len(),
            promoted_count,
            "substring registry didn't persist across reopen"
        );

        let retrieved = vault.retrieve("post-train").await.unwrap();
        let sys_seg = retrieved
            .segments
            .iter()
            .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
            .unwrap();
        assert_eq!(sys_seg.tokens, expected, "0x05 frame failed to round-trip after reopen");
    }
}
/// Stores one conversation before training and one after, and checks that
/// both system prompts are retrieved unchanged once substrings have been
/// trained on 30 other drifting prompts.
#[tokio::test]
async fn pre_substring_segments_remain_readable_after_promotion() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    // Written before any training has happened.
    let pre = make_drift_system(1, 250);
    let pre_conv = make_conv("pre", pre.clone(), vec![1]);
    vault.store(pre_conv).await.unwrap();

    for seed in 100..130u32 {
        let id = format!("seed-{seed}");
        let conv = make_conv(&id, make_drift_system(seed, 250), vec![100 + seed]);
        vault.store(conv).await.unwrap();
    }
    vault.train_substrings(30, 5).await.unwrap();

    // Written after training.
    let post = make_drift_system(2, 250);
    let post_conv = make_conv("post", post.clone(), vec![2]);
    vault.store(post_conv).await.unwrap();

    let pre_back = vault.retrieve("pre").await.unwrap();
    let pre_sys = pre_back
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(pre_sys.tokens, pre);

    let post_back = vault.retrieve("post").await.unwrap();
    let post_sys = post_back
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(post_sys.tokens, post);
}
/// Training on a vault that holds no conversations succeeds, promotes nothing
/// and leaves the registry empty.
#[tokio::test]
async fn empty_corpus_train_returns_empty() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    let promoted = vault.train_substrings(50, 5).await.unwrap();

    assert!(promoted.is_empty());
    assert_eq!(vault.list_substrings().len(), 0);
}
/// Stores 30 drifting prompts, trains, then compacts. Checks that:
///
/// * training by itself leaves `storage_bytes` unchanged,
/// * compaction rewrites at least one segment and reports bytes saved,
/// * `storage_bytes` is lower after compaction than before training, and
/// * every original system prompt is still retrieved unchanged.
#[tokio::test]
async fn compact_rewrites_pre_train_segments() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    let pre_tokens: Vec<Vec<u32>> = (0..30u32)
        .map(|seed| make_drift_system(seed, 250))
        .collect();
    for (i, sys) in (0u32..).zip(&pre_tokens) {
        let id = format!("pre-{i}");
        let conv = make_conv(&id, sys.clone(), vec![100 + i]);
        vault.store(conv).await.unwrap();
    }

    let pre_compaction_storage = vault.stats().await.unwrap().storage_bytes;

    vault.train_substrings(30, 5).await.unwrap();
    let after_training = vault.stats().await.unwrap().storage_bytes;
    assert_eq!(after_training, pre_compaction_storage, "training shouldn't rewrite existing segments");

    let report = vault.compact_substrings().await.unwrap();
    assert!(report.segments_rewritten > 0, "expected at least one rewrite on a drift corpus");
    assert!(report.bytes_saved > 0);

    let post_storage = vault.stats().await.unwrap().storage_bytes;
    assert!(
        post_storage < pre_compaction_storage,
        "compaction should reduce storage; before={pre_compaction_storage} after={post_storage}"
    );

    // Compaction must be lossless for every rewritten conversation.
    for (i, original) in pre_tokens.iter().enumerate() {
        let id = format!("pre-{i}");
        let retrieved = vault.retrieve(&id).await.unwrap();
        let sys_seg = retrieved
            .segments
            .iter()
            .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
            .unwrap();
        assert_eq!(&sys_seg.tokens, original, "compaction corrupted segment {i}");
    }
}
/// With compression on and `near_dedup_threshold` set to `0.80`, stores a
/// canonical prompt plus a two-token variant of it, 30 drifting prompts, and
/// (after training) one more drifting prompt. Compaction must then report at
/// least one skipped segment, and both the variant and the post-training
/// prompt must still be retrieved unchanged.
///
/// NOTE(review): the name suggests the variant is stored as a delta frame and
/// the post-training prompt as a substring frame; the test does not verify the
/// frame kinds directly.
#[tokio::test]
async fn compact_skips_delta_and_substring_frames() {
    let backend = MemoryBackend::new();
    let config = StowkenConfig {
        enable_compression: true,
        near_dedup_threshold: Some(0.80),
    };
    let vault = Stowken::new(backend, config).await.unwrap();

    // A canonical prompt and a copy that differs at two positions.
    let canon: Vec<u32> = (0..200u32).collect();
    let mut variant = canon.clone();
    variant[10] = 99_999;
    variant[100] = 88_888;
    vault.store(make_conv("canon", canon, vec![1])).await.unwrap();
    vault.store(make_conv("variant", variant.clone(), vec![2])).await.unwrap();

    for seed in 0..30u32 {
        let id = format!("drift-{seed}");
        let conv = make_conv(&id, make_drift_system(seed, 250), vec![100 + seed]);
        vault.store(conv).await.unwrap();
    }
    vault.train_substrings(30, 5).await.unwrap();

    let substr_expected = make_drift_system(999, 250);
    let substr_conv = make_conv("substr", substr_expected.clone(), vec![3]);
    vault.store(substr_conv).await.unwrap();

    let report = vault.compact_substrings().await.unwrap();
    assert!(report.segments_skipped >= 1, "expected at least one skip");

    let variant_back = vault.retrieve("variant").await.unwrap();
    let variant_sys = variant_back
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(variant_sys.tokens, variant);

    let substr_back = vault.retrieve("substr").await.unwrap();
    let substr_sys = substr_back
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(substr_sys.tokens, substr_expected);
}
/// Trains on 30 drifting prompts and runs garbage collection straight away,
/// with no compaction and no post-training stores. The GC report must show the
/// whole registry dropped, and the registry must be empty afterwards.
///
/// NOTE(review): presumably nothing references the promoted substrings at this
/// point because existing segments are not rewritten by training — confirm.
#[tokio::test]
async fn gc_drops_unreferenced_substrings() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    for seed in 0..30u32 {
        let id = format!("seed-{seed}");
        let conv = make_conv(&id, make_drift_system(seed, 250), vec![seed]);
        vault.store(conv).await.unwrap();
    }
    vault.train_substrings(30, 5).await.unwrap();

    let before = vault.list_substrings().len() as u64;
    assert!(before > 0);

    let report = vault.gc_substrings().await.unwrap();
    assert_eq!(report.registry_size_before, before);
    assert_eq!(report.registry_size_after, 0);
    assert_eq!(report.substrings_dropped, before);
    assert_eq!(vault.list_substrings().len(), 0);
}
/// Trains and compacts before running garbage collection, then checks that
/// the GC report's `registry_size_after` is non-zero and that its
/// `registry_size_before` matches the registry size observed just before GC.
#[tokio::test]
async fn gc_after_compaction_keeps_referenced_substrings() {
    let backend = MemoryBackend::new();
    let vault = Stowken::new(backend, StowkenConfig::default())
        .await
        .unwrap();

    for seed in 0..30u32 {
        let id = format!("seed-{seed}");
        let conv = make_conv(&id, make_drift_system(seed, 250), vec![seed]);
        vault.store(conv).await.unwrap();
    }
    vault.train_substrings(30, 5).await.unwrap();
    vault.compact_substrings().await.unwrap();

    let registry_len = vault.list_substrings().len() as u64;
    let report = vault.gc_substrings().await.unwrap();

    assert!(report.registry_size_after > 0, "compacted segments should keep at least some substrings alive");
    assert_eq!(
        report.registry_size_before,
        registry_len,
        "registry size before should match list_substrings"
    );
}
/// Runs train, compact and GC on a filesystem-backed vault, drops it, reopens
/// a vault on the same directory and database path, and checks that the
/// system prompt of `seed-7` is still retrieved unchanged.
#[tokio::test]
async fn gc_persistence_across_reopen() {
    use std::path::PathBuf;
    use stowken::storage::FilesystemBackend;

    // `dir` must outlive both sessions; dropping it removes the directory.
    let dir = TempDir::new().unwrap();
    let data_path: PathBuf = dir.path().to_path_buf();
    let db_str = data_path
        .join("metadata.db")
        .to_str()
        .unwrap()
        .to_owned();

    // First session: populate, train, compact, and garbage-collect.
    {
        let backend = FilesystemBackend::new(&data_path).await.unwrap();
        let vault = Stowken::open(backend, StowkenConfig::default(), &db_str)
            .await
            .unwrap();

        for seed in 0..30u32 {
            let id = format!("seed-{seed}");
            let conv = make_conv(&id, make_drift_system(seed, 250), vec![seed]);
            vault.store(conv).await.unwrap();
        }
        vault.train_substrings(30, 5).await.unwrap();
        vault.compact_substrings().await.unwrap();
        let _ = vault.gc_substrings().await.unwrap();
    }

    // Second session: read back through a freshly opened vault.
    let backend = FilesystemBackend::new(&data_path).await.unwrap();
    let reopened = Stowken::open(backend, StowkenConfig::default(), &db_str)
        .await
        .unwrap();

    let retrieved = reopened.retrieve("seed-7").await.unwrap();
    let sys_seg = retrieved
        .segments
        .iter()
        .find(|seg| matches!(seg.segment_type, stowken::types::SegmentType::SystemPrompt))
        .unwrap();
    assert_eq!(sys_seg.tokens, make_drift_system(7, 250));
}