/// Magic bytes that open every index file: ASCII `IX01`.
pub const MAGIC: [u8; 4] = *b"IX01";

/// Major format version; `Header::parse` only accepts headers whose major version equals this.
pub const VERSION_MAJOR: u16 = 1;

/// Lowest minor format version `Header::parse` accepts.
pub const VERSION_MINOR: u16 = 3;

/// Size in bytes of the fixed header; shorter inputs are rejected by `Header::parse`.
pub const HEADER_SIZE: usize = 0x100;

/// NOTE(review): presumably the on-disk size in bytes of one trigram table entry — confirm
/// against the index writer.
pub const TRIGRAM_ENTRY_SIZE: usize = 20;
/// NOTE(review): presumably the block size used by the CDX block index — the unit is not
/// visible in this file, confirm against the index writer.
pub const CDX_BLOCK_SIZE: usize = 1024;

/// NOTE(review): presumably the on-disk size in bytes of one file table entry — confirm
/// against the index writer.
pub const FILE_ENTRY_SIZE: usize = 48;
26
/// Bit masks for the `flags` word of the index header.
///
/// Each constant occupies a distinct bit, so masks can be OR-ed together and tested with `&`.
pub mod flags {
    /// Bit 0 — tested by `Header::has_bloom`.
    pub const HAS_BLOOM_FILTERS: u64 = 0x01;
    /// Bit 1. NOTE(review): presumably marks that per-file content hashes are stored — confirm.
    pub const HAS_CONTENT_HASHES: u64 = 0x02;
    /// Bit 2. NOTE(review): presumably marks compressed posting lists — confirm.
    pub const POSTING_LISTS_COMPRESSED: u64 = 0x04;
    /// Bit 3. NOTE(review): presumably marks checksummed posting lists — confirm.
    pub const POSTING_LISTS_CHECKSUMMED: u64 = 0x08;
    /// Bit 4 — tested by `Header::has_cdx`.
    pub const HAS_CDX_INDEX: u64 = 0x10;
}
40
/// Status of a file entry, stored as a single byte.
///
/// NOTE(review): the exact meaning of each state is defined by the indexer, which is not
/// visible in this file; only the byte values are established here.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileStatus {
    /// Encoded as `0x00`.
    Fresh = 0x00,
    /// Encoded as `0x01`; also the decoding of every unrecognised byte.
    Stale = 0x01,
    /// Encoded as `0x02`.
    Deleted = 0x02,
}

impl FileStatus {
    /// Decodes a raw status byte.
    ///
    /// `0x00` yields [`FileStatus::Fresh`] and `0x02` yields [`FileStatus::Deleted`]; every
    /// other value, including `0x01`, yields [`FileStatus::Stale`], so decoding never fails.
    #[must_use]
    pub const fn from_u8(v: u8) -> Self {
        if v == 0x00 {
            return Self::Fresh;
        }
        if v == 0x02 {
            return Self::Deleted;
        }
        Self::Stale
    }
}
66
67#[derive(Debug, Clone)]
72pub struct Header {
73 pub version_major: u16,
75 pub version_minor: u16,
77 pub flags: u64,
79 pub created_at: u64,
81 pub source_bytes_total: u64,
83 pub file_count: u32,
85 pub trigram_count: u32,
87 pub file_table_offset: u64,
89 pub file_table_size: u64,
91 pub trigram_table_offset: u64,
93 pub trigram_table_size: u64,
95 pub posting_data_offset: u64,
97 pub posting_data_size: u64,
99 pub bloom_offset: u64,
101 pub bloom_size: u64,
103 pub string_pool_offset: u64,
105 pub string_pool_size: u64,
107 pub name_index_offset: u64,
109 pub name_index_size: u64,
111 pub cdx_block_index_offset: u64,
113 pub cdx_block_index_size: u64,
115}
116
117impl Header {
118 pub fn parse(data: &[u8]) -> crate::error::Result<Self> {
125 if data.len() < HEADER_SIZE {
126 return Err(crate::error::Error::IndexTooSmall);
127 }
128 if data.get(0..4).ok_or(crate::error::Error::IndexTooSmall)? != MAGIC {
129 return Err(crate::error::Error::BadMagic);
130 }
131
132 let r = |off: usize| -> u64 {
133 data.get(off..off + 8)
134 .and_then(|s| s.try_into().ok())
135 .map_or(0, u64::from_le_bytes)
136 };
137 let r16 = |off: usize| -> u16 {
138 data.get(off..off + 2)
139 .and_then(|s| s.try_into().ok())
140 .map_or(0, u16::from_le_bytes)
141 };
142 let r32 = |off: usize| -> u32 {
143 data.get(off..off + 4)
144 .and_then(|s| s.try_into().ok())
145 .map_or(0, u32::from_le_bytes)
146 };
147
148 let major = r16(0x04);
149 let minor = r16(0x06);
150 if major != VERSION_MAJOR || minor < VERSION_MINOR {
151 return Err(crate::error::Error::UnsupportedVersion { major, minor });
152 }
153
154 let expected_crc = r32(0xF8);
156 let actual_crc = crc32c::crc32c(
157 data.get(0..0xF8)
158 .ok_or(crate::error::Error::IndexTooSmall)?,
159 );
160 if expected_crc != actual_crc {
161 return Err(crate::error::Error::HeaderCorrupted {
162 expected: expected_crc,
163 actual: actual_crc,
164 });
165 }
166
167 Ok(Self {
168 version_major: major,
169 version_minor: minor,
170 flags: r(0x08),
171 created_at: r(0x10),
172 source_bytes_total: r(0x18),
173 file_count: r32(0x20),
174 trigram_count: r32(0x24),
175 file_table_offset: r(0x28),
176 file_table_size: r(0x30),
177 trigram_table_offset: r(0x38),
178 trigram_table_size: r(0x40),
179 posting_data_offset: r(0x48),
180 posting_data_size: r(0x50),
181 bloom_offset: r(0x58),
182 bloom_size: r(0x60),
183 string_pool_offset: r(0x68),
184 string_pool_size: r(0x70),
185 name_index_offset: r(0x78),
186 name_index_size: r(0x80),
187 cdx_block_index_offset: r(0x88),
188 cdx_block_index_size: r(0x90),
189 })
190 }
191
192 pub fn validate_bounds(&self, file_len: u64) -> crate::error::Result<()> {
198 let check = |name: &'static str, off: u64, sz: u64| -> crate::error::Result<()> {
199 if off + sz > file_len {
200 Err(crate::error::Error::SectionOutOfBounds {
201 section: name,
202 offset: off,
203 size: sz,
204 file_len,
205 })
206 } else {
207 Ok(())
208 }
209 };
210 check("file_table", self.file_table_offset, self.file_table_size)?;
211 check(
212 "trigram_table",
213 self.trigram_table_offset,
214 self.trigram_table_size,
215 )?;
216 check(
217 "posting_data",
218 self.posting_data_offset,
219 self.posting_data_size,
220 )?;
221 if self.bloom_size > 0 {
222 check("bloom", self.bloom_offset, self.bloom_size)?;
223 }
224 check(
225 "string_pool",
226 self.string_pool_offset,
227 self.string_pool_size,
228 )?;
229 if self.name_index_size > 0 {
230 check("name_index", self.name_index_offset, self.name_index_size)?;
231 }
232 if self.cdx_block_index_size > 0 {
233 check(
234 "cdx_block_index",
235 self.cdx_block_index_offset,
236 self.cdx_block_index_size,
237 )?;
238 }
239 Ok(())
240 }
241
242 #[must_use]
244 pub const fn has_bloom(&self) -> bool {
245 self.flags & flags::HAS_BLOOM_FILTERS != 0
246 }
247
248 #[must_use]
250 pub const fn has_cdx(&self) -> bool {
251 self.flags & flags::HAS_CDX_INDEX != 0
252 }
253}
254
255use serde::{Deserialize, Serialize};
256use std::path::{Path, PathBuf};
257use std::time::{SystemTime, UNIX_EPOCH};
258
259#[derive(Debug, Serialize, Deserialize, Clone)]
262pub struct Beacon {
263 pub pid: i32,
265 pub root: PathBuf,
267 pub start_time: u64,
269 pub status: String,
271 pub last_event_at: u64,
273}
274
275impl Beacon {
276 #[must_use]
278 pub fn new(root: &Path) -> Self {
279 let pid = i32::try_from(std::process::id()).unwrap_or(0);
280 let now = SystemTime::now()
281 .duration_since(UNIX_EPOCH)
282 .unwrap_or_default()
283 .as_secs();
284
285 Self {
286 pid,
287 root: root.to_path_buf(),
288 start_time: now,
289 status: "idle".to_string(),
290 last_event_at: now,
291 }
292 }
293
294 #[must_use]
299 pub fn is_live(&self) -> bool {
300 use nix::sys::signal::kill;
301 use nix::unistd::Pid;
302
303 if kill(Pid::from_raw(self.pid), None).is_err() {
304 return false;
305 }
306
307 let comm_path = format!("/proc/{}/comm", self.pid);
308 if let Ok(comm) = std::fs::read_to_string(&comm_path) {
309 let comm = comm.trim();
310 if comm != "ixd" {
311 return false;
312 }
313 } else {
314 return false;
315 }
316
317 self.root.exists()
318 }
319
320 pub fn write_to(&self, folder: &Path) -> crate::error::Result<()> {
326 let path = folder.join("beacon.json");
327 let f = std::fs::File::create(path)?;
328 serde_json::to_writer_pretty(f, self).map_err(std::io::Error::other)?;
329 Ok(())
330 }
331
332 pub fn read_from(folder: &Path) -> crate::error::Result<Self> {
338 let path = folder.join("beacon.json");
339 let f = std::fs::File::open(path)?;
340 let beacon = serde_json::from_reader(f).map_err(std::io::Error::other)?;
341 Ok(beacon)
342 }
343}
344
345#[must_use]
351#[allow(clippy::cast_precision_loss, clippy::as_conversions)]
352pub fn is_binary(data: &[u8]) -> bool {
353 if data.is_empty() {
354 return false;
355 }
356 let check_len = data.len().min(512);
357 let slice = data.get(..check_len).unwrap_or(&[]);
358
359 let mut non_text = 0usize;
360 let mut i = 0;
361 while i < slice.len() {
362 let b = slice[i];
363 if matches!(b, 0x09 | 0x0A | 0x0D | 0x20..=0x7E) {
364 } else if b & 0xC0 == 0xC0 {
366 let seq_len = if b & 0xE0 == 0xC0 {
368 2
369 } else if b & 0xF0 == 0xE0 {
370 3
371 } else if b & 0xF8 == 0xF0 {
372 4
373 } else {
374 0
375 };
376
377 if seq_len > 0 && i + seq_len <= slice.len() {
378 let seq = &slice[i..i + seq_len];
379 if is_valid_utf8_sequence(seq) {
380 i += seq_len;
381 continue;
382 }
383 }
384 non_text += 1;
385 } else if b & 0xC0 == 0x80 {
386 non_text += 1;
388 } else {
389 non_text += 1;
391 }
392 i += 1;
393 }
394
395 (non_text as f32 / check_len as f32) > 0.3
396}
397
/// Returns `true` when `seq` is exactly one well-formed multi-byte UTF-8 sequence
/// (2, 3 or 4 bytes).
///
/// The lead byte must announce the same length as `seq`, so a 3-byte lead in a 2-byte
/// slice is rejected. Overlong encodings (`C0`/`C1`, `E0 80..9F`, `F0 80..8F`), UTF-16
/// surrogates (`ED A0..BF`) and code points above U+10FFFF (`F4 90..`, `F5..`) are
/// rejected. Single ASCII bytes and empty input are not multi-byte sequences and return
/// `false`.
#[inline]
fn is_valid_utf8_sequence(seq: &[u8]) -> bool {
    // A continuation byte has the bit pattern 10xx_xxxx.
    const fn is_continuation(b: u8) -> bool {
        b & 0xC0 == 0x80
    }

    // One arm per row of the well-formed UTF-8 byte sequence table. The second byte's
    // range is narrowed exactly where overlongs, surrogates or out-of-range code points
    // would otherwise slip through.
    match *seq {
        [0xC2..=0xDF, b1] => is_continuation(b1),
        [0xE0, 0xA0..=0xBF, b2] | [0xED, 0x80..=0x9F, b2] => is_continuation(b2),
        [0xE1..=0xEC | 0xEE..=0xEF, b1, b2] => is_continuation(b1) && is_continuation(b2),
        [0xF0, 0x90..=0xBF, b2, b3] | [0xF4, 0x80..=0x8F, b2, b3] => {
            is_continuation(b2) && is_continuation(b3)
        }
        [0xF1..=0xF3, b1, b2, b3] => {
            is_continuation(b1) && is_continuation(b2) && is_continuation(b3)
        }
        _ => false,
    }
}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `total` bytes: `non_text` copies of the control byte `0x01` followed by
    /// ASCII `x` padding.
    fn sample_with_non_text(non_text: usize, total: usize) -> Vec<u8> {
        let mut data = vec![0x01u8; non_text];
        data.resize(total, b'x');
        data
    }

    #[test]
    fn test_is_binary_empty() {
        assert!(!is_binary(&[]));
    }

    #[test]
    fn test_is_binary_pure_ascii() {
        assert!(!is_binary(b"Hello, world! This is a normal text file.\n"));
    }

    #[test]
    fn test_is_binary_null_bytes() {
        assert!(is_binary(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn test_is_binary_emoji_heavy() {
        let emoji = "# 🚨🚨🚨 ALERT".as_bytes();
        assert!(
            !is_binary(emoji),
            "emoji-heavy file should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_cjk() {
        let cjk = "你好世界これはテストです한국어".as_bytes();
        assert!(!is_binary(cjk), "CJK text should NOT be flagged as binary");
    }

    #[test]
    fn test_is_binary_mixed_utf8_ascii() {
        let data = "def hello():\n print('🚀')\n return 42\n".as_bytes();
        assert!(
            !is_binary(data),
            "Python with emoji should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_truly_binary() {
        // Every byte value 0x00..=0xFF, twice over.
        let binary_data: Vec<u8> = (0..=u8::MAX).cycle().take(512).collect();
        assert!(
            is_binary(&binary_data),
            "random byte data should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_short_data() {
        assert!(!is_binary(b"hi"), "very short text should not be binary");
        assert!(!is_binary(&[0x0A]), "single newline is not binary");
    }

    #[test]
    fn test_is_binary_utf8_truncated_at_boundary() {
        // A 4-byte emoji with its last byte missing, embedded in ASCII text.
        let data = b"some text \xF0\x9F\x9A more text";
        assert!(
            !is_binary(data),
            "truncated UTF-8 at boundary should not flip to binary"
        );
    }

    #[test]
    fn test_is_binary_control_chars() {
        let mut data = vec![0x0Bu8; 200];
        data.extend_from_slice(b"normal text padding");
        assert!(
            is_binary(&data),
            "vertical tabs (0x0B) should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_mixed_realistic_python() {
        let data = format!(
            "# {} WARNING\n\ndef process(data):\n return data.strip()\n",
            "🚨".repeat(16)
        );
        assert!(
            !is_binary(data.as_bytes()),
            "realistic Python file with emoji header should NOT be binary"
        );
    }

    #[test]
    fn test_is_binary_exactly_30_percent() {
        // Integer counts keep the fixtures independent of float rounding.
        assert!(
            !is_binary(&sample_with_non_text(29, 100)),
            "29% non-text should NOT be flagged"
        );
        assert!(
            !is_binary(&sample_with_non_text(30, 100)),
            "exactly 30% non-text should NOT be flagged (threshold is strict)"
        );
        assert!(
            is_binary(&sample_with_non_text(31, 100)),
            "31% non-text should be flagged"
        );
    }

    #[test]
    fn test_is_valid_utf8_sequence() {
        let accepted: [(&[u8], &str); 3] = [
            (&[0xC3, 0xA9], "é should be valid 2-byte UTF-8"),
            (&[0xE4, 0xBD, 0xA0], "你 should be valid 3-byte UTF-8"),
            (&[0xF0, 0x9F, 0x9A, 0xA8], "🚨 should be valid 4-byte UTF-8"),
        ];
        for &(seq, why) in &accepted {
            assert!(is_valid_utf8_sequence(seq), "{}", why);
        }

        let rejected: [(&[u8], &str); 9] = [
            (&[0xC0, 0x80], "overlong 2-byte encoding (C0)"),
            (&[0xC1, 0x80], "overlong 2-byte encoding (C1)"),
            (&[0xE0, 0x80, 0x80], "overlong 3-byte encoding"),
            (&[0xF0, 0x80, 0x80, 0x80], "overlong 4-byte encoding"),
            (&[0xED, 0xA0, 0x80], "surrogate pair (ED A0)"),
            (&[0xF4, 0x90, 0x80, 0x80], "above U+10FFFF"),
            (&[0xC2, 0x00], "bad continuation"),
            (&[], "empty input"),
            (&[0xFF], "lone invalid byte"),
        ];
        for &(seq, why) in &rejected {
            assert!(!is_valid_utf8_sequence(seq), "{}", why);
        }
    }

    #[test]
    fn test_is_binary_stray_continuation_bytes() {
        // All 64 continuation bytes with no lead byte, then six spaces.
        let mut data: Vec<u8> = (0x80..=0xBFu8).collect();
        data.extend_from_slice(b"      ");
        assert!(
            is_binary(&data),
            "stray continuation bytes should be flagged as binary"
        );
    }
}