/// Magic bytes expected at the very start of an index: ASCII `IX01`.
/// `Header::parse` compares the first four input bytes against this value.
pub const MAGIC: [u8; 4] = [0x49, 0x58, 0x30, 0x31];
/// Major format version. `Header::parse` rejects any other major version.
pub const VERSION_MAJOR: u16 = 1;
/// Minimum accepted minor format version. `Header::parse` rejects headers
/// whose minor version is lower than this; higher minor versions pass.
pub const VERSION_MINOR: u16 = 3;
/// Minimum number of bytes `Header::parse` requires before it reads anything.
pub const HEADER_SIZE: usize = 256;
/// Presumably the size in bytes of one trigram-table entry; not used in the
/// visible part of this file. TODO confirm against the table reader/writer.
pub const TRIGRAM_ENTRY_SIZE: usize = 20;
/// Presumably the block size used by the CDX block index; not used in the
/// visible part of this file. TODO confirm the unit.
pub const CDX_BLOCK_SIZE: usize = 1024;
/// Presumably the size in bytes of one file-table entry; not used in the
/// visible part of this file. TODO confirm against the table reader/writer.
pub const FILE_ENTRY_SIZE: usize = 48;
/// Magic bytes ASCII `IXDL`. Presumably identifies a delta file; the code that
/// consumes it is not visible here.
pub const DELTA_MAGIC: [u8; 4] = [0x49, 0x58, 0x44, 0x4C];
// NOTE(review): the three `DELTA_*` bytes below look like record-type tags for
// delta entries; their consumer is not visible here, so confirm before relying
// on that reading.
pub const DELTA_TOMBSTONE: u8 = 0x01;
pub const DELTA_FILE_ENTRY: u8 = 0x02;
pub const DELTA_TRIGRAM_ENTRY: u8 = 0x03;
/// Single-bit masks, each a distinct bit of a `u64`.
///
/// `Header::has_bloom` and `Header::has_cdx` test `Header::flags` against
/// `HAS_BLOOM_FILTERS` and `HAS_CDX_INDEX` respectively. The remaining masks
/// are not read in the visible part of this file; their meaning is presumably
/// what their names say (to be confirmed against the index writer).
pub mod flags {
/// Bit 0. Tested by `Header::has_bloom`.
pub const HAS_BLOOM_FILTERS: u64 = 1 << 0;
/// Bit 1.
pub const HAS_CONTENT_HASHES: u64 = 1 << 1;
/// Bit 2.
pub const POSTING_LISTS_COMPRESSED: u64 = 1 << 2;
/// Bit 3.
pub const POSTING_LISTS_CHECKSUMMED: u64 = 1 << 3;
/// Bit 4. Tested by `Header::has_cdx`.
pub const HAS_CDX_INDEX: u64 = 1 << 4;
}
/// Status of a file, stored as a single byte.
///
/// The discriminants are the on-disk byte values (`#[repr(u8)]`).
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileStatus {
    /// Encoded as `0x00`.
    Fresh = 0x00,
    /// Encoded as `0x01`; also the result of decoding any unrecognised byte.
    Stale = 0x01,
    /// Encoded as `0x02`.
    Deleted = 0x02,
}

impl FileStatus {
    /// Decodes a status byte.
    ///
    /// `0x00` yields [`FileStatus::Fresh`] and `0x02` yields
    /// [`FileStatus::Deleted`]. Every other value, including `0x01`, yields
    /// [`FileStatus::Stale`], so this function never fails.
    #[must_use]
    pub const fn from_u8(v: u8) -> Self {
        if v == 0x00 {
            return Self::Fresh;
        }
        if v == 0x02 {
            return Self::Deleted;
        }
        Self::Stale
    }
}
/// Decoded fixed-size index header.
///
/// Every field is read by `Header::parse` as a little-endian integer from the
/// byte offset noted on the field. Each `*_offset` / `*_size` pair is checked
/// against the file length by `Header::validate_bounds`; offsets are
/// presumably relative to the start of the index file (TODO confirm).
#[derive(Debug, Clone)]
pub struct Header {
/// `u16` at offset `0x04`.
pub version_major: u16,
/// `u16` at offset `0x06`.
pub version_minor: u16,
/// `u64` at offset `0x08`; tested against the masks in `flags`.
pub flags: u64,
/// `u64` at offset `0x10`. Presumably a creation timestamp; unit not visible here.
pub created_at: u64,
/// `u64` at offset `0x18`.
pub source_bytes_total: u64,
/// `u32` at offset `0x20`.
pub file_count: u32,
/// `u32` at offset `0x24`.
pub trigram_count: u32,
/// `u64` at offset `0x28`.
pub file_table_offset: u64,
/// `u64` at offset `0x30`.
pub file_table_size: u64,
/// `u64` at offset `0x38`.
pub trigram_table_offset: u64,
/// `u64` at offset `0x40`.
pub trigram_table_size: u64,
/// `u64` at offset `0x48`.
pub posting_data_offset: u64,
/// `u64` at offset `0x50`.
pub posting_data_size: u64,
/// `u64` at offset `0x58`.
pub bloom_offset: u64,
/// `u64` at offset `0x60`. Bounds are only validated when this is non-zero.
pub bloom_size: u64,
/// `u64` at offset `0x68`.
pub string_pool_offset: u64,
/// `u64` at offset `0x70`.
pub string_pool_size: u64,
/// `u64` at offset `0x78`.
pub name_index_offset: u64,
/// `u64` at offset `0x80`. Bounds are only validated when this is non-zero.
pub name_index_size: u64,
/// `u64` at offset `0x88`.
pub cdx_block_index_offset: u64,
/// `u64` at offset `0x90`. Bounds are only validated when this is non-zero.
pub cdx_block_index_size: u64,
}
impl Header {
pub fn parse(data: &[u8]) -> crate::error::Result<Self> {
if data.len() < HEADER_SIZE {
return Err(crate::error::Error::IndexTooSmall);
}
if data.get(0..4).ok_or(crate::error::Error::IndexTooSmall)? != MAGIC {
return Err(crate::error::Error::BadMagic);
}
let r = |off: usize| -> u64 {
data.get(off..off + 8)
.and_then(|s| s.try_into().ok())
.map_or(0, u64::from_le_bytes)
};
let r16 = |off: usize| -> u16 {
data.get(off..off + 2)
.and_then(|s| s.try_into().ok())
.map_or(0, u16::from_le_bytes)
};
let r32 = |off: usize| -> u32 {
data.get(off..off + 4)
.and_then(|s| s.try_into().ok())
.map_or(0, u32::from_le_bytes)
};
let major = r16(0x04);
let minor = r16(0x06);
if major != VERSION_MAJOR || minor < VERSION_MINOR {
return Err(crate::error::Error::UnsupportedVersion { major, minor });
}
let expected_crc = r32(0xF8);
let actual_crc = crc32c::crc32c(
data.get(0..0xF8)
.ok_or(crate::error::Error::IndexTooSmall)?,
);
if expected_crc != actual_crc {
return Err(crate::error::Error::HeaderCorrupted {
expected: expected_crc,
actual: actual_crc,
});
}
Ok(Self {
version_major: major,
version_minor: minor,
flags: r(0x08),
created_at: r(0x10),
source_bytes_total: r(0x18),
file_count: r32(0x20),
trigram_count: r32(0x24),
file_table_offset: r(0x28),
file_table_size: r(0x30),
trigram_table_offset: r(0x38),
trigram_table_size: r(0x40),
posting_data_offset: r(0x48),
posting_data_size: r(0x50),
bloom_offset: r(0x58),
bloom_size: r(0x60),
string_pool_offset: r(0x68),
string_pool_size: r(0x70),
name_index_offset: r(0x78),
name_index_size: r(0x80),
cdx_block_index_offset: r(0x88),
cdx_block_index_size: r(0x90),
})
}
pub fn validate_bounds(&self, file_len: u64) -> crate::error::Result<()> {
let check = |name: &'static str, off: u64, sz: u64| -> crate::error::Result<()> {
if off + sz > file_len {
Err(crate::error::Error::SectionOutOfBounds {
section: name,
offset: off,
size: sz,
file_len,
})
} else {
Ok(())
}
};
check("file_table", self.file_table_offset, self.file_table_size)?;
check(
"trigram_table",
self.trigram_table_offset,
self.trigram_table_size,
)?;
check(
"posting_data",
self.posting_data_offset,
self.posting_data_size,
)?;
if self.bloom_size > 0 {
check("bloom", self.bloom_offset, self.bloom_size)?;
}
check(
"string_pool",
self.string_pool_offset,
self.string_pool_size,
)?;
if self.name_index_size > 0 {
check("name_index", self.name_index_offset, self.name_index_size)?;
}
if self.cdx_block_index_size > 0 {
check(
"cdx_block_index",
self.cdx_block_index_offset,
self.cdx_block_index_size,
)?;
}
Ok(())
}
#[must_use]
pub const fn has_bloom(&self) -> bool {
self.flags & flags::HAS_BLOOM_FILTERS != 0
}
#[must_use]
pub const fn has_cdx(&self) -> bool {
self.flags & flags::HAS_CDX_INDEX != 0
}
}
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
/// Record describing a running process, stored as `beacon.json` by
/// `Beacon::write_to` and loaded by `Beacon::read_from`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Beacon {
/// Process id. `Beacon::new` stores the current process id, or `0` when it
/// does not fit in an `i32`.
pub pid: i32,
/// Root path handed to `Beacon::new`; `Beacon::is_live` requires it to exist.
pub root: PathBuf,
/// Seconds since the Unix epoch at the time `Beacon::new` ran.
pub start_time: u64,
/// Free-form status string; `Beacon::new` initialises it to `"idle"`.
pub status: String,
/// Seconds since the Unix epoch; `Beacon::new` initialises it to the same
/// value as `start_time`.
pub last_event_at: u64,
/// Optional socket path. Omitted from the serialized output when `None`
/// and defaulted to `None` when absent from the input.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub socket_path: Option<PathBuf>,
}
impl Beacon {
#[must_use]
pub fn new(root: &Path) -> Self {
let pid = i32::try_from(std::process::id()).unwrap_or(0);
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
Self {
pid,
root: root.to_path_buf(),
start_time: now,
status: "idle".to_string(),
last_event_at: now,
socket_path: None,
}
}
#[must_use]
pub fn is_live(&self) -> bool {
use nix::sys::signal::kill;
use nix::unistd::Pid;
if kill(Pid::from_raw(self.pid), None).is_err() {
return false;
}
let comm_path = format!("/proc/{}/comm", self.pid);
if let Ok(comm) = std::fs::read_to_string(&comm_path) {
let comm = comm.trim();
if comm != "ixd" {
return false;
}
} else {
return false;
}
self.root.exists()
}
pub fn write_to(&self, folder: &Path) -> crate::error::Result<()> {
let path = folder.join("beacon.json");
let f = std::fs::File::create(path)?;
serde_json::to_writer_pretty(f, self).map_err(std::io::Error::other)?;
Ok(())
}
pub fn read_from(folder: &Path) -> crate::error::Result<Self> {
let path = folder.join("beacon.json");
let f = std::fs::File::open(path)?;
let beacon = serde_json::from_reader(f).map_err(std::io::Error::other)?;
Ok(beacon)
}
}
/// Heuristically decides whether `data` looks like binary (non-text) content.
///
/// Only the first 512 bytes are sampled. A byte counts as text when it is a
/// tab, line feed, carriage return or printable ASCII (`0x20..=0x7E`), or when
/// it belongs to a multi-byte sequence accepted by `is_valid_utf8_sequence`.
/// Every other sampled byte counts as non-text. The input is reported as
/// binary when strictly more than 30% of the sampled bytes are non-text.
///
/// A multi-byte sequence that starts inside the sample but ends beyond it is
/// validated against the full input, so a character cut in half by the sample
/// boundary is not miscounted as non-text.
///
/// Empty input is never binary.
#[must_use]
pub fn is_binary(data: &[u8]) -> bool {
    // Number of leading bytes that are sampled.
    const SAMPLE_LEN: usize = 512;

    if data.is_empty() {
        return false;
    }
    let check_len = data.len().min(SAMPLE_LEN);

    let mut non_text = 0usize;
    let mut i = 0usize;
    while i < check_len {
        let Some(&b) = data.get(i) else { break };

        if matches!(b, 0x09 | 0x0A | 0x0D | 0x20..=0x7E) {
            i += 1;
            continue;
        }

        // Expected sequence length implied by a UTF-8 lead byte; 0 for bytes
        // that cannot start a sequence (controls, stray continuations,
        // 0xF8..=0xFF).
        let seq_len = match b {
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            _ => 0,
        };
        if seq_len > 0 {
            // Slice the full input, not just the sample, so a sequence that
            // straddles the sample boundary is still seen whole.
            if let Some(seq) = data.get(i..i + seq_len) {
                if is_valid_utf8_sequence(seq) {
                    i += seq_len;
                    continue;
                }
            }
        }

        non_text += 1;
        i += 1;
    }

    // Exact integer form of `non_text / check_len > 0.3`; with at most 512
    // sampled bytes this gives the same answer as a floating-point ratio
    // without any lossy casts.
    non_text * 10 > check_len * 3
}
/// Returns `true` when `seq` is exactly one well-formed multi-byte UTF-8
/// sequence (2, 3 or 4 bytes).
///
/// The lead byte must match the slice length, every trailing byte must be a
/// continuation byte (`0b10xx_xxxx`), and overlong encodings, UTF-16
/// surrogates (`U+D800..=U+DFFF`) and code points above `U+10FFFF` are
/// rejected. Single ASCII bytes, empty slices and slices longer than four
/// bytes are not multi-byte sequences and yield `false`.
#[inline]
fn is_valid_utf8_sequence(seq: &[u8]) -> bool {
    /// `true` for a UTF-8 continuation byte (`0x80..=0xBF`).
    const fn is_cont(b: u8) -> bool {
        b & 0xC0 == 0x80
    }

    match *seq {
        // 0xC0 and 0xC1 would be overlong encodings of ASCII.
        [0xC2..=0xDF, b1] => is_cont(b1),

        // 0xE0 needs a second byte >= 0xA0, otherwise the encoding is overlong.
        [0xE0, b1, b2] => (0xA0..=0xBF).contains(&b1) && is_cont(b2),
        // 0xED with a second byte >= 0xA0 would encode a surrogate.
        [0xED, b1, b2] => (0x80..=0x9F).contains(&b1) && is_cont(b2),
        [0xE1..=0xEC | 0xEE..=0xEF, b1, b2] => is_cont(b1) && is_cont(b2),

        // 0xF0 needs a second byte >= 0x90, otherwise the encoding is overlong.
        [0xF0, b1, b2, b3] => (0x90..=0xBF).contains(&b1) && is_cont(b2) && is_cont(b3),
        // 0xF4 with a second byte > 0x8F would exceed U+10FFFF.
        [0xF4, b1, b2, b3] => (0x80..=0x8F).contains(&b1) && is_cont(b2) && is_cont(b3),
        [0xF1..=0xF3, b1, b2, b3] => is_cont(b1) && is_cont(b2) && is_cont(b3),

        // Wrong length, or a lead byte that does not match the length.
        _ => false,
    }
}
// Unit tests for `is_binary` and `is_valid_utf8_sequence`.
#[cfg(test)]
mod tests {
use super::*;
// Empty input is never reported as binary.
#[test]
fn test_is_binary_empty() {
assert!(!is_binary(&[]));
}
// Plain printable ASCII ending in a newline is text.
#[test]
fn test_is_binary_pure_ascii() {
assert!(!is_binary(b"Hello, world! This is a normal text file.\n"));
}
// Eight bytes that are all NUL or low control characters: 100% non-text.
#[test]
fn test_is_binary_null_bytes() {
assert!(is_binary(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03]));
}
// "# " + three copies of the 4-byte sequence F0 9F 9A A8 + " ALERT".
// Valid multi-byte sequences must not count towards the non-text ratio.
#[test]
fn test_is_binary_emoji_heavy() {
let emoji: &[u8] = &[
0x23, 0x20, 0xF0, 0x9F, 0x9A, 0xA8, 0xF0, 0x9F, 0x9A, 0xA8, 0xF0, 0x9F, 0x9A, 0xA8,
0x20, 0x41, 0x4C, 0x45, 0x52, 0x54,
];
assert!(
!is_binary(emoji),
"emoji-heavy file should NOT be flagged as binary"
);
}
// Input made entirely of multi-byte UTF-8 characters is still text.
#[test]
fn test_is_binary_cjk() {
let cjk: &[u8] = "你好世界これはテストです한국어".as_bytes();
assert!(!is_binary(cjk), "CJK text should NOT be flagged as binary");
}
// ASCII source code with a single embedded emoji.
#[test]
fn test_is_binary_mixed_utf8_ascii() {
let mut data = Vec::new();
data.extend_from_slice(b"def hello():\n ");
data.extend_from_slice("print('🚀')".as_bytes());
data.extend_from_slice(b"\n return 42\n");
assert!(
!is_binary(&data),
"Python with emoji should NOT be flagged as binary"
);
}
// 512 bytes cycling through every value 0..=255 twice. The data is
// deterministic, not random, despite the assertion message.
#[test]
fn test_is_binary_truly_binary() {
let mut binary_data = vec![0u8; 512];
for (i, b) in binary_data.iter_mut().enumerate() {
*b = (i % 256) as u8;
}
assert!(
is_binary(&binary_data),
"random byte data should be flagged as binary"
);
}
// One- and two-byte text inputs.
#[test]
fn test_is_binary_short_data() {
assert!(!is_binary(b"hi"), "very short text should not be binary");
assert!(!is_binary(&[0x0A]), "single newline is not binary");
}
// The first three bytes of a 4-byte sequence, embedded between ASCII text.
// The incomplete sequence contributes a few non-text bytes, which stay below
// the threshold.
// NOTE(review): the buffer is only 23 bytes long, so this does not exercise a
// sequence cut off by the 512-byte sample limit that the name hints at.
#[test]
fn test_is_binary_utf8_truncated_at_boundary() {
let emoji: &[u8] = &[0xF0, 0x9F, 0x9A];
let mut data = Vec::new();
data.extend_from_slice(b"some text ");
data.extend_from_slice(emoji);
data.extend_from_slice(b" more text");
assert!(
!is_binary(&data),
"truncated UTF-8 at boundary should not flip to binary"
);
}
// 200 vertical tabs followed by 19 ASCII bytes: 0x0B is not in the set of
// bytes `is_binary` treats as text.
#[test]
fn test_is_binary_control_chars() {
let mut data = vec![0x0B; 200];
data.extend_from_slice(b"normal text padding");
assert!(
is_binary(&data),
"vertical tabs (0x0B) should be flagged as binary"
);
}
// A comment line holding 16 four-byte emoji, followed by ordinary code.
#[test]
fn test_is_binary_mixed_realistic_python() {
let mut emoji_line = Vec::new();
emoji_line.extend_from_slice(b"# ");
for _ in 0..16 {
emoji_line.extend_from_slice("🚨".as_bytes());
}
emoji_line.extend_from_slice(b" WARNING");
let mut data = Vec::new();
data.extend_from_slice(&emoji_line);
data.extend_from_slice(b"\n\ndef process(data):\n return data.strip()\n");
assert!(
!is_binary(&data),
"realistic Python file with emoji header should NOT be binary"
);
}
// Brackets the 30% threshold with two 100-byte buffers: about 29% of 0x01
// bytes must stay text, about 31% must be flagged.
// NOTE(review): despite the name, the exact 30% case is not asserted here.
#[test]
fn test_is_binary_exactly_30_percent() {
let mut data = Vec::new();
let total = 100;
let non_text_count = (total as f32 * 0.29) as usize;
for _ in 0..non_text_count {
data.push(0x01);
}
for _ in 0..(total - non_text_count) {
data.push(b'x');
}
assert!(!is_binary(&data), "29% non-text should NOT be flagged");
let mut data_over = Vec::new();
let non_text_over = (total as f32 * 0.31) as usize;
for _ in 0..non_text_over {
data_over.push(0x01);
}
for _ in 0..(total - non_text_over) {
data_over.push(b'x');
}
assert!(is_binary(&data_over), "31% non-text should be flagged");
}
// Direct checks of the sequence validator: well-formed 2-, 3- and 4-byte
// sequences are accepted; overlong forms, a surrogate, a value above
// U+10FFFF, a bad continuation byte, the empty slice and a lone 0xFF are
// rejected.
#[test]
fn test_is_valid_utf8_sequence() {
assert!(is_valid_utf8_sequence(&[0xC3, 0xA9]));
assert!(is_valid_utf8_sequence(&[0xE4, 0xBD, 0xA0]));
assert!(
is_valid_utf8_sequence(&[0xF0, 0x9F, 0x9A, 0xA8]),
"🚨 should be valid 4-byte UTF-8"
);
assert!(
!is_valid_utf8_sequence(&[0xC0, 0x80]),
"overlong 2-byte encoding (C0)"
);
assert!(
!is_valid_utf8_sequence(&[0xC1, 0x80]),
"overlong 2-byte encoding (C1)"
);
assert!(
!is_valid_utf8_sequence(&[0xE0, 0x80, 0x80]),
"overlong 3-byte encoding"
);
assert!(
!is_valid_utf8_sequence(&[0xF0, 0x80, 0x80, 0x80]),
"overlong 4-byte encoding"
);
assert!(
!is_valid_utf8_sequence(&[0xED, 0xA0, 0x80]),
"surrogate pair (ED A0)"
);
assert!(
!is_valid_utf8_sequence(&[0xF4, 0x90, 0x80, 0x80]),
"above U+10FFFF"
);
assert!(!is_valid_utf8_sequence(&[0xC2, 0x00]), "bad continuation");
assert!(!is_valid_utf8_sequence(&[]));
assert!(!is_valid_utf8_sequence(&[0xFF]));
}
// Every continuation byte 0x80..=0xBF once, with no lead byte in front,
// followed by six spaces: 64 of 70 bytes are non-text.
#[test]
fn test_is_binary_stray_continuation_bytes() {
let data = vec![
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D,
0x8E, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9,
0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, b' ', b' ', b' ', b' ', b' ', b' ',
];
assert!(
is_binary(&data),
"stray continuation bytes should be flagged as binary"
);
}
}