use std::cell::RefCell;
use std::collections::HashMap;
use std::io;
use super::sha256_hex_bytes;
use serde::{Deserialize, Serialize};
use super::rat::RatDafsa;
pub const JSON_SCHEMA_DOC: &str = include_str!("schemas/blocks_schema.txt");
const MANIFEST_FORMAT: &str = "tilezz-rat-dafsa-blocks";
const MANIFEST_VERSION: u32 = 1;
const BLOCK_MAGIC: &[u8; 4] = b"TRB1";
const SCALAR_TAG: &str = "i8";
const BLOCK_FORMAT_TAG: &str = "tilezz-rat-block";
const BLOCK_FORMAT_VERSION: u32 = 1;
pub const DEFAULT_TARGET_BLOCK_BYTES: u32 = 1 << 22;
const STATE_RECORD_BYTES: usize = 16;
const EDGE_RECORD_BYTES: usize = 8;
const BLOCK_HEADER_BYTES: usize = 4 + 4 + 4 + 4;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlockManifest {
pub format: String,
pub version: u32,
pub scalar: String,
pub block_format: String,
pub block_version: u32,
pub target_block_bytes: u32,
pub n_states: u32,
pub n_edges: u64,
pub n_sequences: u64,
pub max_indexed_length: u32,
pub root: RootState,
pub blocks: Vec<BlockEntry>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub block_base_url: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RootState {
pub count: u64,
pub is_accept: bool,
pub edges: Vec<RootEdge>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RootEdge {
pub label: i8,
pub target: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlockEntry {
pub first_state: u32,
pub sha256: String,
pub size: u32,
}
impl BlockManifest {
pub fn block_filename(&self, entry: &BlockEntry) -> String {
format!("blocks/{}.bin", entry.sha256)
}
pub fn block_url(&self, entry: &BlockEntry) -> String {
match self.block_base_url.as_deref() {
Some(base) => format!("{}{}.bin", base, entry.sha256),
None => self.block_filename(entry),
}
}
pub fn block_index_for_state(&self, state_id: u32) -> Option<usize> {
if state_id == 0 || state_id >= self.n_states {
return None;
}
let pos = self.blocks.partition_point(|b| b.first_state <= state_id);
if pos == 0 {
None
} else {
Some(pos - 1)
}
}
pub fn block_state_range(&self, block_index: usize) -> Option<(u32, u32)> {
let entry = self.blocks.get(block_index)?;
let end = self
.blocks
.get(block_index + 1)
.map(|next| next.first_state)
.unwrap_or(self.n_states);
Some((entry.first_state, end))
}
}
#[derive(Debug, Clone)]
struct Block {
first_state_id: u32,
n_states: u32,
states: Vec<StateRec>,
edges: Vec<EdgeRec>,
}
#[derive(Debug, Clone, Copy)]
struct StateRec {
edges_offset: u32,
count: u64,
is_accept: bool,
}
#[derive(Debug, Clone, Copy)]
struct EdgeRec {
label: i8,
target: u32,
}
impl Block {
fn from_gz_bytes(bytes: &[u8]) -> io::Result<Self> {
let mut buf = Vec::new();
let mut dec = flate2::read::GzDecoder::new(bytes);
io::Read::read_to_end(&mut dec, &mut buf)?;
Self::from_bytes(&buf)
}
fn from_bytes(buf: &[u8]) -> io::Result<Self> {
if buf.len() < BLOCK_HEADER_BYTES {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"block: truncated header",
));
}
let magic = &buf[..4];
if magic != BLOCK_MAGIC {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"block: bad magic (expected {:?}, got {:?})",
std::str::from_utf8(BLOCK_MAGIC).unwrap(),
String::from_utf8_lossy(magic)
),
));
}
let first_state_id = read_u32(buf, 4);
let n_states = read_u32(buf, 8);
let n_edges = read_u32(buf, 12);
let states_start = BLOCK_HEADER_BYTES;
let states_end = states_start + n_states as usize * STATE_RECORD_BYTES;
let edges_start = states_end;
let edges_end = edges_start + n_edges as usize * EDGE_RECORD_BYTES;
if buf.len() < edges_end {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"block: truncated body",
));
}
let mut states = Vec::with_capacity(n_states as usize);
for i in 0..n_states as usize {
let o = states_start + i * STATE_RECORD_BYTES;
let edges_offset = read_u32(buf, o);
let count = read_u64(buf, o + 4);
let is_accept = buf[o + 12] != 0;
states.push(StateRec {
edges_offset,
count,
is_accept,
});
}
let mut edges = Vec::with_capacity(n_edges as usize);
for i in 0..n_edges as usize {
let o = edges_start + i * EDGE_RECORD_BYTES;
let label = buf[o] as i8;
let target = read_u32(buf, o + 4);
edges.push(EdgeRec { label, target });
}
Ok(Block {
first_state_id,
n_states,
states,
edges,
})
}
fn edges_for(&self, local_index: usize) -> &[EdgeRec] {
let start = self.states[local_index].edges_offset as usize;
let end = if local_index + 1 < self.n_states as usize {
self.states[local_index + 1].edges_offset as usize
} else {
self.edges.len()
};
&self.edges[start..end]
}
}
fn read_u32(buf: &[u8], off: usize) -> u32 {
u32::from_le_bytes([buf[off], buf[off + 1], buf[off + 2], buf[off + 3]])
}
fn read_u64(buf: &[u8], off: usize) -> u64 {
u64::from_le_bytes([
buf[off],
buf[off + 1],
buf[off + 2],
buf[off + 3],
buf[off + 4],
buf[off + 5],
buf[off + 6],
buf[off + 7],
])
}
fn write_u32(buf: &mut Vec<u8>, v: u32) {
buf.extend_from_slice(&v.to_le_bytes());
}
fn write_u64(buf: &mut Vec<u8>, v: u64) {
buf.extend_from_slice(&v.to_le_bytes());
}
pub struct LazyRatDafsa<F>
where
F: Fn(u32) -> io::Result<Vec<u8>>,
{
manifest: BlockManifest,
fetch_block: F,
cache: RefCell<HashMap<u32, Block>>,
}
impl<F> LazyRatDafsa<F>
where
F: Fn(u32) -> io::Result<Vec<u8>>,
{
pub fn open(manifest_json: &str, fetch_block: F) -> io::Result<Self> {
let manifest: BlockManifest = serde_json::from_str(manifest_json)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
if manifest.format != MANIFEST_FORMAT {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsa: bad manifest format (expected '{}', got '{}')",
MANIFEST_FORMAT, manifest.format
),
));
}
if manifest.version != MANIFEST_VERSION {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsa: unsupported manifest version (expected {}, got {})",
MANIFEST_VERSION, manifest.version
),
));
}
if manifest.scalar != SCALAR_TAG {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsa: scalar mismatch (file is '{}', expected '{}')",
manifest.scalar, SCALAR_TAG
),
));
}
if manifest.block_format != BLOCK_FORMAT_TAG {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsa: bad block format tag (expected '{}', got '{}')",
BLOCK_FORMAT_TAG, manifest.block_format
),
));
}
if manifest.block_version != BLOCK_FORMAT_VERSION {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsa: unsupported block format version (expected {}, got {})",
BLOCK_FORMAT_VERSION, manifest.block_version
),
));
}
validate_block_index(&manifest)?;
Ok(Self {
manifest,
fetch_block,
cache: RefCell::new(HashMap::new()),
})
}
pub fn len(&self) -> u64 {
self.manifest.n_sequences
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn contains<S: AsRef<[i8]>>(&self, rat: S) -> bool {
let prefixed = prefix(rat.as_ref());
self.walk(&prefixed)
.ok()
.is_some_and(|(rec, _)| rec.is_accept)
}
pub fn index_of<S: AsRef<[i8]>>(&self, rat: S) -> Option<u64> {
let prefixed = prefix(rat.as_ref());
let mut state = 0u32;
let mut rank: u64 = 0;
for &symbol in &prefixed {
let (rec, edges) = self.lookup_state(state).ok()?;
if rec.is_accept {
rank = rank.checked_add(1)?;
}
let mut found = None;
for e in edges {
match e.label.cmp(&symbol) {
std::cmp::Ordering::Less => {
let (sub_rec, _) = self.lookup_state(e.target).ok()?;
rank = rank.checked_add(sub_rec.count)?;
}
std::cmp::Ordering::Equal => {
found = Some(e.target);
break;
}
std::cmp::Ordering::Greater => return None,
}
}
state = found?;
}
let (rec, _) = self.lookup_state(state).ok()?;
if !rec.is_accept {
return None;
}
Some(rank)
}
pub fn get(&self, i: u64) -> Option<Vec<i8>> {
if i >= self.len() {
return None;
}
let mut out: Vec<i8> = Vec::new();
let mut state = 0u32;
let mut remaining = i;
loop {
let (rec, edges) = self.lookup_state(state).ok()?;
if rec.is_accept {
if remaining == 0 {
if !out.is_empty() {
out.remove(0);
}
return Some(out);
}
remaining -= 1;
}
let mut descended = false;
for e in edges {
let (sub_rec, _) = self.lookup_state(e.target).ok()?;
let n = sub_rec.count;
if remaining < n {
out.push(e.label);
state = e.target;
descended = true;
break;
}
remaining -= n;
}
if !descended {
return None;
}
}
}
pub fn to_rat_dafsa(&self) -> io::Result<RatDafsa> {
let mut rats: Vec<Vec<i8>> = Vec::with_capacity(self.len() as usize);
let mut current: Vec<i8> = Vec::new();
let mut stack: Vec<(Vec<EdgeRec>, usize)> = Vec::new();
let (root_rec, root_edges) = self.lookup_state(0u32)?;
if root_rec.is_accept {
}
stack.push((root_edges, 0));
while let Some(frame) = stack.last_mut() {
if frame.1 < frame.0.len() {
let edge = frame.0[frame.1];
frame.1 += 1;
current.push(edge.label);
let (rec, child_edges) = self.lookup_state(edge.target)?;
if rec.is_accept {
let mut rat = current.clone();
rat.remove(0);
rats.push(rat);
}
stack.push((child_edges, 0));
} else {
stack.pop();
current.pop();
}
}
debug_assert_eq!(
rats.len() as u64,
self.len(),
"to_rat_dafsa: DFS yielded {} rats but manifest claims {}",
rats.len(),
self.len()
);
Ok(RatDafsa::from_rats(rats))
}
fn walk(&self, prefixed: &[i8]) -> Result<(StateRec, Vec<EdgeRec>), ()> {
let mut state = 0u32;
for &symbol in prefixed {
let (_, edges) = self.lookup_state(state).map_err(|_| ())?;
let mut next: Option<u32> = None;
for e in &edges {
match e.label.cmp(&symbol) {
std::cmp::Ordering::Less => continue,
std::cmp::Ordering::Equal => {
next = Some(e.target);
break;
}
std::cmp::Ordering::Greater => return Err(()),
}
}
state = next.ok_or(())?;
}
self.lookup_state(state).map_err(|_| ())
}
fn lookup_state(&self, state_id: u32) -> io::Result<(StateRec, Vec<EdgeRec>)> {
if state_id == 0 {
return Ok((
root_as_state_rec(&self.manifest.root),
root_edges(&self.manifest.root),
));
}
let block_index = self
.manifest
.block_index_for_state(state_id)
.ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidData,
format!(
"state_id {state_id} not within asset (n_states = {})",
self.manifest.n_states
),
)
})?;
let entry = &self.manifest.blocks[block_index];
let local = (state_id - entry.first_state) as usize;
self.ensure_block_loaded(block_index)?;
let cache = self.cache.borrow();
let block = cache
.get(&(block_index as u32))
.expect("ensure_block_loaded just populated it");
let rec = block.states[local];
let edges = block.edges_for(local).to_vec();
Ok((rec, edges))
}
fn ensure_block_loaded(&self, block_index: usize) -> io::Result<()> {
let key = block_index as u32;
if self.cache.borrow().contains_key(&key) {
return Ok(());
}
let entry = &self.manifest.blocks[block_index];
let bytes = (self.fetch_block)(block_index as u32)?;
verify_block_hash(&bytes, &entry.sha256, block_index)?;
let block = Block::from_gz_bytes(&bytes)?;
if block.first_state_id != entry.first_state {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"block {block_index} declares first_state_id {} but manifest says {}",
block.first_state_id, entry.first_state
),
));
}
self.cache.borrow_mut().insert(key, block);
Ok(())
}
}
pub struct LazyRatDafsaAsync {
manifest: BlockManifest,
cache: RefCell<HashMap<u32, Block>>,
}
impl LazyRatDafsaAsync {
pub fn open(manifest_json: &str) -> io::Result<Self> {
let manifest: BlockManifest = serde_json::from_str(manifest_json)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
if manifest.format != MANIFEST_FORMAT {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsaAsync: bad manifest format (expected '{}', got '{}')",
MANIFEST_FORMAT, manifest.format
),
));
}
if manifest.version != MANIFEST_VERSION {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsaAsync: unsupported manifest version (expected {}, got {})",
MANIFEST_VERSION, manifest.version
),
));
}
if manifest.scalar != SCALAR_TAG {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsaAsync: scalar mismatch (file is '{}', expected '{}')",
manifest.scalar, SCALAR_TAG
),
));
}
if manifest.block_format != BLOCK_FORMAT_TAG {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsaAsync: bad block format tag (expected '{}', got '{}')",
BLOCK_FORMAT_TAG, manifest.block_format
),
));
}
if manifest.block_version != BLOCK_FORMAT_VERSION {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"LazyRatDafsaAsync: unsupported block format version (expected {}, got {})",
BLOCK_FORMAT_VERSION, manifest.block_version
),
));
}
validate_block_index(&manifest)?;
Ok(Self {
manifest,
cache: RefCell::new(HashMap::new()),
})
}
pub fn manifest(&self) -> &BlockManifest {
&self.manifest
}
pub fn len(&self) -> u64 {
self.manifest.n_sequences
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub async fn contains<S, F, Fut>(&self, rat: S, fetch: &F) -> bool
where
S: AsRef<[i8]>,
F: Fn(u32) -> Fut,
Fut: core::future::Future<Output = io::Result<Vec<u8>>>,
{
let prefixed = prefix(rat.as_ref());
match self.walk(&prefixed, fetch).await {
Ok((rec, _)) => rec.is_accept,
Err(_) => false,
}
}
pub async fn index_of<S, F, Fut>(&self, rat: S, fetch: &F) -> Option<u64>
where
S: AsRef<[i8]>,
F: Fn(u32) -> Fut,
Fut: core::future::Future<Output = io::Result<Vec<u8>>>,
{
let rat_slice = rat.as_ref();
if rat_slice.len() as u32 > self.manifest.max_indexed_length {
return None;
}
let prefixed = prefix(rat_slice);
let mut state = 0u32;
let mut rank: u64 = 0;
for &symbol in &prefixed {
let (rec, edges) = self.lookup_state(state, fetch).await.ok()?;
if rec.is_accept {
rank = rank.checked_add(1)?;
}
let mut next: Option<u32> = None;
for e in edges {
match e.label.cmp(&symbol) {
std::cmp::Ordering::Less => {
let (sub_rec, _) = self.lookup_state(e.target, fetch).await.ok()?;
rank = rank.checked_add(sub_rec.count)?;
}
std::cmp::Ordering::Equal => {
next = Some(e.target);
break;
}
std::cmp::Ordering::Greater => return None,
}
}
state = next?;
}
let (rec, _) = self.lookup_state(state, fetch).await.ok()?;
if !rec.is_accept {
return None;
}
Some(rank)
}
pub async fn get<F, Fut>(&self, i: u64, fetch: &F) -> Option<Vec<i8>>
where
F: Fn(u32) -> Fut,
Fut: core::future::Future<Output = io::Result<Vec<u8>>>,
{
if i >= self.len() {
return None;
}
let mut out: Vec<i8> = Vec::new();
let mut state = 0u32;
let mut remaining = i;
loop {
let (rec, edges) = self.lookup_state(state, fetch).await.ok()?;
if rec.is_accept {
if remaining == 0 {
if !out.is_empty() {
out.remove(0);
}
return Some(out);
}
remaining -= 1;
}
let mut descended = false;
for e in edges {
let (sub_rec, _) = self.lookup_state(e.target, fetch).await.ok()?;
let n = sub_rec.count;
if remaining < n {
out.push(e.label);
state = e.target;
descended = true;
break;
}
remaining -= n;
}
if !descended {
return None;
}
}
}
pub async fn prewarm_to_depth<F, Fut>(&self, max_depth: usize, fetch: &F) -> usize
where
F: Fn(u32) -> Fut,
Fut: core::future::Future<Output = io::Result<Vec<u8>>>,
{
let mut visited: std::collections::HashSet<u32> = std::collections::HashSet::new();
visited.insert(0);
let mut frontier: Vec<u32> = vec![0];
for depth in 0..=max_depth {
let mut next: Vec<u32> = Vec::new();
for &state in &frontier {
let Ok((_, edges)) = self.lookup_state(state, fetch).await else {
continue;
};
if depth < max_depth {
for e in edges {
if visited.insert(e.target) {
next.push(e.target);
}
}
}
}
if next.is_empty() {
break;
}
frontier = next;
}
self.cache.borrow().len()
}
async fn walk<F, Fut>(&self, prefixed: &[i8], fetch: &F) -> Result<(StateRec, Vec<EdgeRec>), ()>
where
F: Fn(u32) -> Fut,
Fut: core::future::Future<Output = io::Result<Vec<u8>>>,
{
let mut state = 0u32;
for &symbol in prefixed {
let (_, edges) = self.lookup_state(state, fetch).await.map_err(|_| ())?;
let mut next: Option<u32> = None;
for e in &edges {
match e.label.cmp(&symbol) {
std::cmp::Ordering::Less => continue,
std::cmp::Ordering::Equal => {
next = Some(e.target);
break;
}
std::cmp::Ordering::Greater => return Err(()),
}
}
state = next.ok_or(())?;
}
self.lookup_state(state, fetch).await.map_err(|_| ())
}
async fn lookup_state<F, Fut>(
&self,
state_id: u32,
fetch: &F,
) -> io::Result<(StateRec, Vec<EdgeRec>)>
where
F: Fn(u32) -> Fut,
Fut: core::future::Future<Output = io::Result<Vec<u8>>>,
{
if state_id == 0 {
return Ok((
root_as_state_rec(&self.manifest.root),
root_edges(&self.manifest.root),
));
}
let block_index = self
.manifest
.block_index_for_state(state_id)
.ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidData,
format!(
"state_id {state_id} not within asset (n_states = {})",
self.manifest.n_states
),
)
})?;
let entry = self.manifest.blocks[block_index].clone();
let local = (state_id - entry.first_state) as usize;
self.ensure_block_loaded(block_index, fetch).await?;
let cache = self.cache.borrow();
let block = cache
.get(&(block_index as u32))
.expect("ensure_block_loaded just populated it");
let rec = block.states[local];
let edges = block.edges_for(local).to_vec();
Ok((rec, edges))
}
async fn ensure_block_loaded<F, Fut>(&self, block_index: usize, fetch: &F) -> io::Result<()>
where
F: Fn(u32) -> Fut,
Fut: core::future::Future<Output = io::Result<Vec<u8>>>,
{
let key = block_index as u32;
if self.cache.borrow().contains_key(&key) {
return Ok(());
}
let entry = self.manifest.blocks[block_index].clone();
let bytes = fetch(block_index as u32).await?;
verify_block_hash(&bytes, &entry.sha256, block_index)?;
let block = Block::from_gz_bytes(&bytes)?;
if block.first_state_id != entry.first_state {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"block {block_index} declares first_state_id {} but manifest says {}",
block.first_state_id, entry.first_state
),
));
}
self.cache.borrow_mut().insert(key, block);
Ok(())
}
}
fn root_as_state_rec(root: &RootState) -> StateRec {
StateRec {
edges_offset: 0,
count: root.count,
is_accept: root.is_accept,
}
}
fn root_edges(root: &RootState) -> Vec<EdgeRec> {
root.edges
.iter()
.map(|e| EdgeRec {
label: e.label,
target: e.target,
})
.collect()
}
fn verify_block_hash(bytes: &[u8], want: &str, block_index: usize) -> io::Result<()> {
let got = sha256_hex_bytes(bytes);
if got != want {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("block {block_index} sha256 mismatch (got {got}, manifest wants {want})"),
));
}
Ok(())
}
fn validate_block_index(manifest: &BlockManifest) -> io::Result<()> {
if manifest.blocks.is_empty() {
if manifest.n_states > 1 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"manifest has n_states={} but no blocks (only root in manifest)",
manifest.n_states
),
));
}
return Ok(());
}
if manifest.blocks[0].first_state != 1 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"first block must start at state 1 (root is state 0); got {}",
manifest.blocks[0].first_state
),
));
}
for w in manifest.blocks.windows(2) {
if w[1].first_state <= w[0].first_state {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"block index not strictly ascending by first_state",
));
}
}
let last = &manifest.blocks[manifest.blocks.len() - 1];
if last.first_state >= manifest.n_states {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"last block first_state={} >= n_states={}",
last.first_state, manifest.n_states
),
));
}
Ok(())
}
impl RatDafsa {
pub fn write_blocks(&self, dir: &std::path::Path, target_block_bytes: u32) -> io::Result<()> {
assert!(target_block_bytes >= 1, "target_block_bytes must be >= 1");
let dafsa = self.inner();
let n_states_usize = dafsa.raw_n_states();
let n_edges_usize = dafsa.raw_n_edges();
if n_states_usize > u32::MAX as usize {
return Err(io::Error::other(format!(
"DAFSA has {n_states_usize} states, exceeding the u32 wire \
state-id capacity ({}); this perimeter is beyond the block \
format's range and would require a wider-id revision.",
u32::MAX
)));
}
let n_states = n_states_usize as u32; let n_edges = n_edges_usize as u64; let counts = dafsa.raw_counts();
let n_sequences = counts.first().copied().unwrap_or(0) as u64;
let edges_start = dafsa.raw_edges_start();
let labels = dafsa.raw_labels();
let targets = dafsa.raw_targets();
let is_accept = dafsa.raw_is_accept();
let root_edge_lo = edges_start[0] as usize;
let root_edge_hi = if n_states > 1 {
edges_start[1] as usize
} else {
n_edges_usize
};
let root_edges_list: Vec<RootEdge> = (root_edge_lo..root_edge_hi)
.map(|i| RootEdge {
label: labels[i],
target: targets[i],
})
.collect();
let max_indexed_length = root_edges_list
.iter()
.map(|e| e.label as u32)
.max()
.unwrap_or(0);
let root = RootState {
count: counts.first().copied().unwrap_or(0) as u64,
is_accept: *is_accept.first().unwrap_or(&false),
edges: root_edges_list,
};
let blocks_dir = dir.join("blocks");
std::fs::create_dir_all(&blocks_dir)?;
let mut block_index: Vec<BlockEntry> = Vec::new();
let mut buf_states: Vec<(u32, u64, bool)> = Vec::new();
let mut buf_edges: Vec<(i8, u32)> = Vec::new();
let mut block_first_state: u32 = 1;
let mut block_edge_lo: usize = root_edge_hi;
for state in 1..n_states {
let global = state as usize;
let next_edge_offset = if global + 1 == n_states as usize {
n_edges_usize
} else {
edges_start[global + 1] as usize
};
let state_edge_lo = edges_start[global] as usize;
let state_edge_count = next_edge_offset - state_edge_lo;
let edges_offset_local = (state_edge_lo - block_edge_lo) as u32;
let next_bytes = BLOCK_HEADER_BYTES
+ (buf_states.len() + 1) * STATE_RECORD_BYTES
+ (buf_edges.len() + state_edge_count) * EDGE_RECORD_BYTES;
if !buf_states.is_empty() && next_bytes as u32 > target_block_bytes {
flush_block(
&blocks_dir,
block_first_state,
&buf_states,
&buf_edges,
&mut block_index,
)?;
buf_states.clear();
buf_edges.clear();
block_first_state = state;
block_edge_lo = state_edge_lo;
}
let local_offset = if buf_states.is_empty() {
0
} else {
edges_offset_local
};
buf_states.push((local_offset, counts[global] as u64, is_accept[global]));
for i in state_edge_lo..next_edge_offset {
buf_edges.push((labels[i], targets[i]));
}
}
if !buf_states.is_empty() {
flush_block(
&blocks_dir,
block_first_state,
&buf_states,
&buf_edges,
&mut block_index,
)?;
}
let manifest = BlockManifest {
format: MANIFEST_FORMAT.to_string(),
version: MANIFEST_VERSION,
scalar: SCALAR_TAG.to_string(),
block_format: BLOCK_FORMAT_TAG.to_string(),
block_version: BLOCK_FORMAT_VERSION,
target_block_bytes,
n_states,
n_edges,
n_sequences,
max_indexed_length,
root,
blocks: block_index,
block_base_url: None,
};
let manifest_path = dir.join("block_index.json");
let manifest_file = std::fs::File::create(&manifest_path)?;
serde_json::to_writer_pretty(manifest_file, &manifest)
.map_err(|e| io::Error::other(format!("write manifest: {e}")))?;
Ok(())
}
}
fn flush_block(
blocks_dir: &std::path::Path,
first_state: u32,
states: &[(u32, u64, bool)],
edges: &[(i8, u32)],
index: &mut Vec<BlockEntry>,
) -> io::Result<()> {
let n_states = states.len() as u32;
let n_edges = edges.len() as u32;
let mut bytes = Vec::with_capacity(
BLOCK_HEADER_BYTES + states.len() * STATE_RECORD_BYTES + edges.len() * EDGE_RECORD_BYTES,
);
bytes.extend_from_slice(BLOCK_MAGIC);
write_u32(&mut bytes, first_state);
write_u32(&mut bytes, n_states);
write_u32(&mut bytes, n_edges);
for &(off, count, accept) in states {
write_u32(&mut bytes, off);
write_u64(&mut bytes, count);
bytes.push(accept as u8);
bytes.push(0);
bytes.push(0);
bytes.push(0);
}
for &(label, target) in edges {
bytes.push(label as u8);
bytes.push(0);
bytes.push(0);
bytes.push(0);
write_u32(&mut bytes, target);
}
let mut gz: Vec<u8> = Vec::new();
{
let mut enc = flate2::write::GzEncoder::new(&mut gz, flate2::Compression::default());
io::Write::write_all(&mut enc, &bytes)?;
enc.finish()?;
}
let sha256 = sha256_hex_bytes(&gz);
let path = blocks_dir.join(format!("{sha256}.bin"));
std::fs::write(&path, &gz)?;
index.push(BlockEntry {
first_state,
sha256,
size: gz.len() as u32,
});
Ok(())
}
fn prefix(rat: &[i8]) -> Vec<i8> {
assert!(
rat.len() <= i8::MAX as usize,
"LazyRatDafsa: rat length {} exceeds the single-byte length-prefix limit ({})",
rat.len(),
i8::MAX as usize
);
let mut v = Vec::with_capacity(rat.len() + 1);
v.push(rat.len() as i8);
v.extend_from_slice(rat);
v
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
#[test]
fn blocks_roundtrip_matches_ratdafsa() {
let input: Vec<Vec<i8>> = vec![
vec![-2],
vec![-1],
vec![0],
vec![1],
vec![2],
vec![-2, 1, 1],
vec![-1, 0, 1],
vec![0, 1, 2],
vec![1, 2, 3],
vec![1, 2, 3, 4],
vec![1, 2, 4],
vec![1, 3, 5],
vec![2, 2, 2],
];
let rd = RatDafsa::from_rats(input.iter().map(|v| v.as_slice()));
let dir = tempdir();
rd.write_blocks(&dir, 128).expect("write_blocks");
let manifest_path = dir.join("block_index.json");
assert!(manifest_path.exists(), "manifest must be created");
let manifest_text = fs::read_to_string(&manifest_path).unwrap();
let manifest: BlockManifest = serde_json::from_str(&manifest_text).unwrap();
assert!(!manifest.blocks.is_empty());
assert!(
manifest.blocks.len() >= 2,
"expected >=2 blocks for {} states, got {}",
manifest.n_states,
manifest.blocks.len()
);
let dir_for_cb = dir.clone();
let manifest_clone = manifest.clone();
let fetch = move |block_index: u32| -> io::Result<Vec<u8>> {
let entry = &manifest_clone.blocks[block_index as usize];
let name = manifest_clone.block_filename(entry);
fs::read(dir_for_cb.join(name))
};
let lazy = LazyRatDafsa::open(&manifest_text, fetch).expect("open");
assert_eq!(lazy.len(), rd.len() as u64);
for i in 0..rd.len() {
assert_eq!(lazy.get(i as u64), rd.get(i), "get({i}) mismatch");
}
assert_eq!(lazy.get(rd.len() as u64), None, "out-of-range");
for i in 0..rd.len() {
let rat = rd.get(i).unwrap();
assert!(lazy.contains(rat.as_slice()), "missing rat #{i}");
assert_eq!(lazy.index_of(rat.as_slice()), Some(i as u64));
}
assert!(!lazy.contains([42i8, 42, 42].as_slice()));
assert_eq!(lazy.index_of([42i8, 42, 42].as_slice()), None);
}
#[test]
fn cross_block_walks_are_correct() {
let input: Vec<Vec<i8>> = vec![
vec![-2, 1, 1],
vec![1, 2, 3, 4],
vec![1, 2, 4],
vec![0, 0, 0],
];
let rd = RatDafsa::from_rats(input.iter().map(|v| v.as_slice()));
let dir = tempdir();
rd.write_blocks(&dir, 1).expect("write_blocks");
let manifest_text = fs::read_to_string(dir.join("block_index.json")).unwrap();
let manifest: BlockManifest = serde_json::from_str(&manifest_text).unwrap();
assert_eq!(manifest.target_block_bytes, 1);
assert_eq!(manifest.blocks.len() as u32, manifest.n_states - 1);
let dir_for_cb = dir.clone();
let manifest_clone = manifest.clone();
let fetch = move |i: u32| {
let entry = &manifest_clone.blocks[i as usize];
fs::read(dir_for_cb.join(manifest_clone.block_filename(entry)))
};
let lazy = LazyRatDafsa::open(&manifest_text, fetch).expect("open");
for i in 0..rd.len() {
let rat = rd.get(i).unwrap();
assert!(lazy.contains(rat.as_slice()));
assert_eq!(lazy.get(i as u64), Some(rat.clone()));
assert_eq!(lazy.index_of(rat.as_slice()), Some(i as u64));
}
}
#[test]
fn fetch_callback_is_called_minimally() {
let input: Vec<Vec<i8>> = vec![
vec![1, 2, 3, 4],
vec![1, 2, 3, 5],
vec![1, 2, 4, 5],
vec![1, 3, 4, 5],
vec![2, 3, 4, 5],
vec![-1, 0, 1, 2],
vec![-2, -1, 0, 1],
vec![0, 0, 0, 0],
];
let rd = RatDafsa::from_rats(input.iter().map(|v| v.as_slice()));
let dir = tempdir();
rd.write_blocks(&dir, 48).expect("write_blocks");
let manifest_text = fs::read_to_string(dir.join("block_index.json")).unwrap();
let manifest: BlockManifest = serde_json::from_str(&manifest_text).unwrap();
let total_blocks = manifest.blocks.len();
assert!(total_blocks >= 2, "test needs at least 2 blocks");
let dir_for_cb = dir.clone();
let manifest_clone = manifest.clone();
let counter = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0));
let counter_for_cb = counter.clone();
let fetch = move |i: u32| {
counter_for_cb.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let entry = &manifest_clone.blocks[i as usize];
fs::read(dir_for_cb.join(manifest_clone.block_filename(entry)))
};
let lazy = LazyRatDafsa::open(&manifest_text, fetch).expect("open");
assert!(lazy.contains([1i8, 2, 3, 4].as_slice()));
let fetches = counter.load(std::sync::atomic::Ordering::Relaxed);
assert!(
fetches < total_blocks as usize,
"single query fetched all {total_blocks} blocks, no locality"
);
counter.store(0, std::sync::atomic::Ordering::Relaxed);
assert!(lazy.contains([1i8, 2, 3, 4].as_slice()));
assert_eq!(
counter.load(std::sync::atomic::Ordering::Relaxed),
0,
"warm repeat should hit cache only"
);
}
#[test]
fn to_rat_dafsa_round_trips() {
let input: Vec<Vec<i8>> = vec![
vec![-3, 2, 1],
vec![-1, 0, 1],
vec![0, 0, 0, 0],
vec![1, 2, 3, 4, 5],
vec![1, 2, 4, 5],
vec![1],
vec![2, 3],
vec![3, 3, 3],
vec![-2, -1, 0, 1],
];
let original = RatDafsa::from_rats(input.iter().map(|v| v.as_slice()));
let dir = tempdir();
original.write_blocks(&dir, 64).expect("write_blocks");
let manifest_text = fs::read_to_string(dir.join("block_index.json")).unwrap();
let manifest: BlockManifest = serde_json::from_str(&manifest_text).unwrap();
assert!(manifest.blocks.len() >= 2);
let dir_for_cb = dir.clone();
let manifest_clone = manifest.clone();
let fetch = move |i: u32| {
let entry = &manifest_clone.blocks[i as usize];
fs::read(dir_for_cb.join(manifest_clone.block_filename(entry)))
};
let lazy = LazyRatDafsa::open(&manifest_text, fetch).expect("open");
let restored = lazy.to_rat_dafsa().expect("to_rat_dafsa");
assert_eq!(restored.len(), original.len());
let original_seqs: Vec<Vec<i8>> = original.iter().collect();
let restored_seqs: Vec<Vec<i8>> = restored.iter().collect();
assert_eq!(original_seqs, restored_seqs, "iter() mismatch");
for i in 0..original.len() {
assert_eq!(original.get(i), restored.get(i), "get({i}) mismatch");
}
let mut a = Vec::new();
let mut b = Vec::new();
original.write_json_gz(&mut a).unwrap();
restored.write_json_gz(&mut b).unwrap();
let a2 = RatDafsa::read_json_gz(&a[..]).unwrap();
let b2 = RatDafsa::read_json_gz(&b[..]).unwrap();
assert_eq!(a2.iter().collect::<Vec<_>>(), b2.iter().collect::<Vec<_>>());
}
#[test]
fn manifest_validation() {
let valid = BlockManifest {
format: MANIFEST_FORMAT.to_string(),
version: MANIFEST_VERSION,
scalar: SCALAR_TAG.to_string(),
block_format: BLOCK_FORMAT_TAG.to_string(),
block_version: BLOCK_FORMAT_VERSION,
target_block_bytes: 1024,
n_states: 3,
n_edges: 2,
n_sequences: 1,
max_indexed_length: 1,
root: RootState {
count: 1,
is_accept: false,
edges: vec![RootEdge {
label: 1,
target: 1,
}],
},
blocks: vec![BlockEntry {
first_state: 1,
sha256: "0".repeat(64),
size: 0,
}],
block_base_url: None,
};
let unused = |_: u32| -> io::Result<Vec<u8>> {
Err(io::Error::other(
"fetch should never be called in this test",
))
};
let mut bad = valid.clone();
bad.format = "tilezz-something-else".to_string();
let json = serde_json::to_string(&bad).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_err());
let mut bad = valid.clone();
bad.version = 999;
let json = serde_json::to_string(&bad).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_err());
let mut bad = valid.clone();
bad.scalar = "u16".to_string();
let json = serde_json::to_string(&bad).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_err());
let mut bad = valid.clone();
bad.block_format = "something-else".to_string();
let json = serde_json::to_string(&bad).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_err());
let mut bad = valid.clone();
bad.blocks[0].first_state = 2;
let json = serde_json::to_string(&bad).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_err());
let mut bad = valid.clone();
bad.blocks.push(BlockEntry {
first_state: 1,
sha256: "1".repeat(64),
size: 0,
});
let json = serde_json::to_string(&bad).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_err());
let mut bad = valid.clone();
bad.blocks.clear();
let json = serde_json::to_string(&bad).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_err());
let json = serde_json::to_string(&valid).unwrap();
assert!(LazyRatDafsa::open(&json, unused).is_ok());
}
#[test]
fn upgrade_path_preserves_inner_block_hashes() {
let small_input: Vec<Vec<i8>> = vec![
vec![-2],
vec![-1],
vec![0],
vec![1],
vec![2],
vec![-2, 1, 1],
vec![-1, 0, 1],
vec![0, 1, 2],
vec![1, 2, 3],
vec![1, 2, 4],
vec![1, 3, 5],
vec![2, 2, 2],
];
let mut big_input = small_input.clone();
big_input.extend(vec![
vec![1, 2, 3, 4],
vec![1, 2, 4, 5],
vec![0, 1, 2, 3],
vec![1, 2, 3, 4, 5],
vec![0, 1, 2, 3, 4],
vec![1, 2, 3, 4, 5, 6],
]);
let small_rd = RatDafsa::from_rats(small_input.iter().map(|v| v.as_slice()));
let big_rd = RatDafsa::from_rats(big_input.iter().map(|v| v.as_slice()));
let small_dir = tempdir();
let big_dir = tempdir();
let target = 96u32;
small_rd.write_blocks(&small_dir, target).expect("small");
big_rd.write_blocks(&big_dir, target).expect("big");
let small_manifest: BlockManifest =
serde_json::from_str(&fs::read_to_string(small_dir.join("block_index.json")).unwrap())
.unwrap();
let big_manifest: BlockManifest =
serde_json::from_str(&fs::read_to_string(big_dir.join("block_index.json")).unwrap())
.unwrap();
assert!(
small_manifest.blocks.len() >= 2,
"test needs small asset to have >=2 blocks; got {}",
small_manifest.blocks.len()
);
assert!(big_manifest.n_states > small_manifest.n_states);
assert!(big_manifest.n_sequences > small_manifest.n_sequences);
let cutoff = small_manifest.blocks.len() - 1;
for (i, small_entry) in small_manifest.blocks.iter().take(cutoff).enumerate() {
let big_entry = &big_manifest.blocks[i];
assert_eq!(
big_entry.first_state, small_entry.first_state,
"block {i}: big.first_state {} != small.first_state {}",
big_entry.first_state, small_entry.first_state
);
assert_eq!(
big_entry.sha256, small_entry.sha256,
"block {i} (first_state={}): sha256 diverged after upgrade",
small_entry.first_state
);
let small_path = small_dir.join(small_manifest.block_filename(small_entry));
let big_path = big_dir.join(big_manifest.block_filename(big_entry));
let small_bytes = fs::read(&small_path).unwrap();
let big_bytes = fs::read(&big_path).unwrap();
assert_eq!(
small_bytes, big_bytes,
"block {i} (first_state={}): on-disk bytes diverged",
small_entry.first_state
);
}
}
#[test]
fn block_url_resolves_with_and_without_base() {
let entry = BlockEntry {
first_state: 1,
sha256: "abcdef0123456789".repeat(4),
size: 4096,
};
assert_eq!(entry.sha256.len(), 64);
let mut manifest = BlockManifest {
format: MANIFEST_FORMAT.to_string(),
version: MANIFEST_VERSION,
scalar: SCALAR_TAG.to_string(),
block_format: BLOCK_FORMAT_TAG.to_string(),
block_version: BLOCK_FORMAT_VERSION,
target_block_bytes: 1024,
n_states: 2,
n_edges: 0,
n_sequences: 0,
max_indexed_length: 0,
root: RootState {
count: 0,
is_accept: false,
edges: vec![],
},
blocks: vec![entry.clone()],
block_base_url: None,
};
assert_eq!(manifest.block_url(&entry), manifest.block_filename(&entry));
assert!(manifest.block_url(&entry).starts_with("blocks/"));
assert!(manifest.block_url(&entry).ends_with(".bin"));
manifest.block_base_url = Some(
"https://github.com/apirogov/tilezz/releases/download/data-zz12-n16-free-v1/"
.to_string(),
);
let url = manifest.block_url(&entry);
assert!(
url.starts_with("https://github.com/apirogov/tilezz/releases/download/"),
"got {url}"
);
assert!(url.ends_with(&format!("{}.bin", entry.sha256)), "got {url}");
assert!(
!url.contains("/blocks/"),
"url must skip the `blocks/` prefix when base set; got {url}"
);
assert_eq!(
manifest.block_filename(&entry),
format!("blocks/{}.bin", entry.sha256)
);
}
#[test]
fn block_base_url_serde_roundtrip() {
let make = |base: Option<&str>| BlockManifest {
format: MANIFEST_FORMAT.to_string(),
version: MANIFEST_VERSION,
scalar: SCALAR_TAG.to_string(),
block_format: BLOCK_FORMAT_TAG.to_string(),
block_version: BLOCK_FORMAT_VERSION,
target_block_bytes: 1024,
n_states: 2,
n_edges: 0,
n_sequences: 0,
max_indexed_length: 0,
root: RootState {
count: 0,
is_accept: false,
edges: vec![],
},
blocks: vec![BlockEntry {
first_state: 1,
sha256: "0".repeat(64),
size: 0,
}],
block_base_url: base.map(str::to_string),
};
let m = make(None);
let json = serde_json::to_string(&m).unwrap();
assert!(
!json.contains("block_base_url"),
"field should not serialise when None; got {json}"
);
let parsed: BlockManifest = serde_json::from_str(&json).unwrap();
assert!(parsed.block_base_url.is_none());
let m = make(Some("https://cdn.example.com/data/"));
let json = serde_json::to_string(&m).unwrap();
assert!(json.contains("\"block_base_url\":\"https://cdn.example.com/data/\""));
let parsed: BlockManifest = serde_json::from_str(&json).unwrap();
assert_eq!(
parsed.block_base_url.as_deref(),
Some("https://cdn.example.com/data/")
);
}
#[test]
fn manifest_n_sequences_beyond_u32() {
const BIG: u64 = 5_000_000_000; const BIG_EDGES: u64 = 6_000_000_000; let manifest = BlockManifest {
format: MANIFEST_FORMAT.to_string(),
version: MANIFEST_VERSION,
scalar: SCALAR_TAG.to_string(),
block_format: BLOCK_FORMAT_TAG.to_string(),
block_version: BLOCK_FORMAT_VERSION,
target_block_bytes: 1024,
n_states: 3,
n_edges: BIG_EDGES,
n_sequences: BIG,
max_indexed_length: 1,
root: RootState {
count: BIG,
is_accept: false,
edges: vec![RootEdge {
label: 1,
target: 1,
}],
},
blocks: vec![BlockEntry {
first_state: 1,
sha256: "0".repeat(64),
size: 0,
}],
block_base_url: None,
};
let json = serde_json::to_string(&manifest).unwrap();
let back: BlockManifest = serde_json::from_str(&json).unwrap();
assert_eq!(back.n_sequences, BIG);
assert_eq!(back.n_edges, BIG_EDGES);
let unused = |_: u32| -> io::Result<Vec<u8>> {
Err(io::Error::other("fetch should never be called"))
};
let lazy = LazyRatDafsa::open(&json, unused).expect("open");
assert_eq!(lazy.len(), BIG);
}
#[test]
fn index_of_rank_beyond_u32() {
const SIBLING: u64 = 5_000_000_000;
let mut body = Vec::new();
body.extend_from_slice(BLOCK_MAGIC);
write_u32(&mut body, 1); write_u32(&mut body, 3); write_u32(&mut body, 2); let push_state = |b: &mut Vec<u8>, off: u32, count: u64, acc: bool| {
write_u32(b, off);
write_u64(b, count);
b.push(acc as u8);
b.extend_from_slice(&[0u8; 3]);
};
push_state(&mut body, 0, SIBLING + 1, false); push_state(&mut body, 2, SIBLING, true); push_state(&mut body, 2, 1, true); let push_edge = |b: &mut Vec<u8>, label: i8, target: u32| {
b.push(label as u8);
b.extend_from_slice(&[0u8; 3]);
write_u32(b, target);
};
push_edge(&mut body, 5, 2); push_edge(&mut body, 7, 3);
let mut enc = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
std::io::Write::write_all(&mut enc, &body).unwrap();
let gz = enc.finish().unwrap();
let sha = sha256_hex_bytes(&gz);
let manifest = BlockManifest {
format: MANIFEST_FORMAT.to_string(),
version: MANIFEST_VERSION,
scalar: SCALAR_TAG.to_string(),
block_format: BLOCK_FORMAT_TAG.to_string(),
block_version: BLOCK_FORMAT_VERSION,
target_block_bytes: 1024,
n_states: 4,
n_edges: 2,
n_sequences: SIBLING + 1,
max_indexed_length: 8,
root: RootState {
count: SIBLING + 1,
is_accept: false,
edges: vec![RootEdge {
label: 1,
target: 1,
}],
},
blocks: vec![BlockEntry {
first_state: 1,
sha256: sha,
size: gz.len() as u32,
}],
block_base_url: None,
};
let json = serde_json::to_string(&manifest).unwrap();
let gz_for_cb = gz.clone();
let fetch = move |_: u32| -> io::Result<Vec<u8>> { Ok(gz_for_cb.clone()) };
let lazy = LazyRatDafsa::open(&json, fetch).expect("open");
assert_eq!(lazy.index_of([7i8].as_slice()), Some(SIBLING));
}
fn tempdir() -> std::path::PathBuf {
let base = std::env::temp_dir();
let nanos = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos();
let p = base.join(format!("tilezz-lazyratdafsa-test-{nanos}"));
fs::create_dir_all(&p).expect("tempdir mkdir");
p
}
}