/// Magic bytes expected at the very start of an index file (ASCII `IX01`).
///
/// [`Header::parse`] rejects any input whose first four bytes differ.
pub const MAGIC: [u8; 4] = *b"IX01";

/// Major format version; [`Header::parse`] requires an exact match.
pub const VERSION_MAJOR: u16 = 1;

/// Minimum minor format version; [`Header::parse`] rejects anything lower.
pub const VERSION_MINOR: u16 = 3;

/// Number of bytes [`Header::parse`] needs to be present before it decodes anything.
pub const HEADER_SIZE: usize = 0x100;

/// Size constant for a trigram table entry, in bytes going by the name.
/// NOTE(review): not referenced in this part of the file; confirm against the table reader.
pub const TRIGRAM_ENTRY_SIZE: usize = 20;
/// Block size constant for the CDX index.
/// NOTE(review): unit and exact meaning are not visible here; confirm against the CDX code.
pub const CDX_BLOCK_SIZE: usize = 1024;

/// Size constant for a file table entry, in bytes going by the name.
/// NOTE(review): not referenced in this part of the file; confirm against the table reader.
pub const FILE_ENTRY_SIZE: usize = 48;
26
/// Bit masks for the 64-bit `flags` word stored in the index header.
pub mod flags {
    /// Bit 0: tested by `Header::has_bloom`.
    pub const HAS_BLOOM_FILTERS: u64 = 0x01;
    /// Bit 1. NOTE(review): no reader in this part of the file; meaning taken from the name only.
    pub const HAS_CONTENT_HASHES: u64 = 0x02;
    /// Bit 2. NOTE(review): no reader in this part of the file; meaning taken from the name only.
    pub const POSTING_LISTS_COMPRESSED: u64 = 0x04;
    /// Bit 3. NOTE(review): no reader in this part of the file; meaning taken from the name only.
    pub const POSTING_LISTS_CHECKSUMMED: u64 = 0x08;
    /// Bit 4: tested by `Header::has_cdx`.
    pub const HAS_CDX_INDEX: u64 = 0x10;
}
40
/// Status of a file, stored as a single byte.
///
/// NOTE(review): what makes a file fresh, stale or deleted is decided elsewhere;
/// only the byte encoding is defined here.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileStatus {
    /// Encoded as `0x00`.
    Fresh = 0x00,
    /// Encoded as `0x01`; also the decoding of every unrecognised byte.
    Stale = 0x01,
    /// Encoded as `0x02`.
    Deleted = 0x02,
}

impl FileStatus {
    /// Decodes a status byte.
    ///
    /// Total by design: `0x00` is [`FileStatus::Fresh`], `0x02` is
    /// [`FileStatus::Deleted`], and every other value (including `0x01`)
    /// is [`FileStatus::Stale`].
    #[must_use]
    pub const fn from_u8(v: u8) -> Self {
        if v == (Self::Fresh as u8) {
            Self::Fresh
        } else if v == (Self::Deleted as u8) {
            Self::Deleted
        } else {
            Self::Stale
        }
    }
}
66
67#[derive(Debug, Clone)]
72pub struct Header {
73 pub version_major: u16,
75 pub version_minor: u16,
77 pub flags: u64,
79 pub created_at: u64,
81 pub source_bytes_total: u64,
83 pub file_count: u32,
85 pub trigram_count: u32,
87 pub file_table_offset: u64,
89 pub file_table_size: u64,
91 pub trigram_table_offset: u64,
93 pub trigram_table_size: u64,
95 pub posting_data_offset: u64,
97 pub posting_data_size: u64,
99 pub bloom_offset: u64,
101 pub bloom_size: u64,
103 pub string_pool_offset: u64,
105 pub string_pool_size: u64,
107 pub name_index_offset: u64,
109 pub name_index_size: u64,
111 pub cdx_block_index_offset: u64,
113 pub cdx_block_index_size: u64,
115}
116
117impl Header {
118 pub fn parse(data: &[u8]) -> crate::error::Result<Self> {
125 if data.len() < HEADER_SIZE {
126 return Err(crate::error::Error::IndexTooSmall);
127 }
128 if data.get(0..4).ok_or(crate::error::Error::IndexTooSmall)? != MAGIC {
129 return Err(crate::error::Error::BadMagic);
130 }
131
132 let r = |off: usize| -> u64 {
133 data.get(off..off + 8)
134 .and_then(|s| s.try_into().ok())
135 .map_or(0, u64::from_le_bytes)
136 };
137 let r16 = |off: usize| -> u16 {
138 data.get(off..off + 2)
139 .and_then(|s| s.try_into().ok())
140 .map_or(0, u16::from_le_bytes)
141 };
142 let r32 = |off: usize| -> u32 {
143 data.get(off..off + 4)
144 .and_then(|s| s.try_into().ok())
145 .map_or(0, u32::from_le_bytes)
146 };
147
148 let major = r16(0x04);
149 let minor = r16(0x06);
150 if major != VERSION_MAJOR || minor < VERSION_MINOR {
151 return Err(crate::error::Error::UnsupportedVersion { major, minor });
152 }
153
154 let expected_crc = r32(0xF8);
156 let actual_crc = crc32c::crc32c(
157 data.get(0..0xF8)
158 .ok_or(crate::error::Error::IndexTooSmall)?,
159 );
160 if expected_crc != actual_crc {
161 return Err(crate::error::Error::HeaderCorrupted {
162 expected: expected_crc,
163 actual: actual_crc,
164 });
165 }
166
167 Ok(Self {
168 version_major: major,
169 version_minor: minor,
170 flags: r(0x08),
171 created_at: r(0x10),
172 source_bytes_total: r(0x18),
173 file_count: r32(0x20),
174 trigram_count: r32(0x24),
175 file_table_offset: r(0x28),
176 file_table_size: r(0x30),
177 trigram_table_offset: r(0x38),
178 trigram_table_size: r(0x40),
179 posting_data_offset: r(0x48),
180 posting_data_size: r(0x50),
181 bloom_offset: r(0x58),
182 bloom_size: r(0x60),
183 string_pool_offset: r(0x68),
184 string_pool_size: r(0x70),
185 name_index_offset: r(0x78),
186 name_index_size: r(0x80),
187 cdx_block_index_offset: r(0x88),
188 cdx_block_index_size: r(0x90),
189 })
190 }
191
192 pub fn validate_bounds(&self, file_len: u64) -> crate::error::Result<()> {
198 let check = |name: &'static str, off: u64, sz: u64| -> crate::error::Result<()> {
199 if off + sz > file_len {
200 Err(crate::error::Error::SectionOutOfBounds {
201 section: name,
202 offset: off,
203 size: sz,
204 file_len,
205 })
206 } else {
207 Ok(())
208 }
209 };
210 check("file_table", self.file_table_offset, self.file_table_size)?;
211 check(
212 "trigram_table",
213 self.trigram_table_offset,
214 self.trigram_table_size,
215 )?;
216 check(
217 "posting_data",
218 self.posting_data_offset,
219 self.posting_data_size,
220 )?;
221 if self.bloom_size > 0 {
222 check("bloom", self.bloom_offset, self.bloom_size)?;
223 }
224 check(
225 "string_pool",
226 self.string_pool_offset,
227 self.string_pool_size,
228 )?;
229 if self.name_index_size > 0 {
230 check("name_index", self.name_index_offset, self.name_index_size)?;
231 }
232 if self.cdx_block_index_size > 0 {
233 check(
234 "cdx_block_index",
235 self.cdx_block_index_offset,
236 self.cdx_block_index_size,
237 )?;
238 }
239 Ok(())
240 }
241
242 #[must_use]
244 pub const fn has_bloom(&self) -> bool {
245 self.flags & flags::HAS_BLOOM_FILTERS != 0
246 }
247
248 #[must_use]
250 pub const fn has_cdx(&self) -> bool {
251 self.flags & flags::HAS_CDX_INDEX != 0
252 }
253}
254
255use serde::{Deserialize, Serialize};
256use std::path::{Path, PathBuf};
257use std::time::{SystemTime, UNIX_EPOCH};
258
/// Process marker that is serialized to / deserialized from `beacon.json`
/// (see `Beacon::write_to` and `Beacon::read_from`).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Beacon {
    /// Process id recorded by `Beacon::new` (`0` if the id does not fit in an
    /// `i32`); probed by `Beacon::is_live`.
    pub pid: i32,
    /// Path handed to `Beacon::new`; `Beacon::is_live` requires it to exist.
    pub root: PathBuf,
    /// Seconds since the Unix epoch at the time `Beacon::new` ran.
    pub start_time: u64,
    /// Free-form status string; `Beacon::new` sets it to `"idle"`.
    /// NOTE(review): the other values in use are not defined in this part of the file.
    pub status: String,
    /// Initialised by `Beacon::new` to the same timestamp as `start_time`;
    /// presumably advanced on later events (confirm against the updater).
    pub last_event_at: u64,
    /// `None` on construction. A missing key deserializes to `None`, and `None`
    /// is left out of the serialized output.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub socket_path: Option<PathBuf>,
}
277
278impl Beacon {
279 #[must_use]
281 pub fn new(root: &Path) -> Self {
282 let pid = i32::try_from(std::process::id()).unwrap_or(0);
283 let now = SystemTime::now()
284 .duration_since(UNIX_EPOCH)
285 .unwrap_or_default()
286 .as_secs();
287
288 Self {
289 pid,
290 root: root.to_path_buf(),
291 start_time: now,
292 status: "idle".to_string(),
293 last_event_at: now,
294 socket_path: None,
295 }
296 }
297
298 #[must_use]
303 pub fn is_live(&self) -> bool {
304 use nix::sys::signal::kill;
305 use nix::unistd::Pid;
306
307 if kill(Pid::from_raw(self.pid), None).is_err() {
308 return false;
309 }
310
311 let comm_path = format!("/proc/{}/comm", self.pid);
312 if let Ok(comm) = std::fs::read_to_string(&comm_path) {
313 let comm = comm.trim();
314 if comm != "ixd" {
315 return false;
316 }
317 } else {
318 return false;
319 }
320
321 self.root.exists()
322 }
323
324 pub fn write_to(&self, folder: &Path) -> crate::error::Result<()> {
330 let path = folder.join("beacon.json");
331 let f = std::fs::File::create(path)?;
332 serde_json::to_writer_pretty(f, self).map_err(std::io::Error::other)?;
333 Ok(())
334 }
335
336 pub fn read_from(folder: &Path) -> crate::error::Result<Self> {
342 let path = folder.join("beacon.json");
343 let f = std::fs::File::open(path)?;
344 let beacon = serde_json::from_reader(f).map_err(std::io::Error::other)?;
345 Ok(beacon)
346 }
347}
348
349#[must_use]
355#[allow(clippy::cast_precision_loss, clippy::as_conversions)]
356pub fn is_binary(data: &[u8]) -> bool {
357 if data.is_empty() {
358 return false;
359 }
360 let check_len = data.len().min(512);
361 let slice = data.get(..check_len).unwrap_or(&[]);
362
363 let mut non_text = 0usize;
364 let mut i = 0;
365 while i < slice.len() {
366 let b = slice[i];
367 if matches!(b, 0x09 | 0x0A | 0x0D | 0x20..=0x7E) {
368 } else if b & 0xC0 == 0xC0 {
370 let seq_len = if b & 0xE0 == 0xC0 {
372 2
373 } else if b & 0xF0 == 0xE0 {
374 3
375 } else if b & 0xF8 == 0xF0 {
376 4
377 } else {
378 0
379 };
380
381 if seq_len > 0 && i + seq_len <= slice.len() {
382 let seq = &slice[i..i + seq_len];
383 if is_valid_utf8_sequence(seq) {
384 i += seq_len;
385 continue;
386 }
387 }
388 non_text += 1;
389 } else if b & 0xC0 == 0x80 {
390 non_text += 1;
392 } else {
393 non_text += 1;
395 }
396 i += 1;
397 }
398
399 (non_text as f32 / check_len as f32) > 0.3
400}
401
/// Returns `true` when `seq` is exactly one well-formed multi-byte UTF-8
/// sequence (2, 3 or 4 bytes).
///
/// The patterns spell out the well-formed byte ranges directly, which rules out
/// overlong encodings (`C0`/`C1`, `E0 80..9F`, `F0 80..8F`), UTF-16 surrogates
/// (`ED A0..BF`), code points above U+10FFFF (`F4 90..`, `F5..`) and any lead
/// byte that does not match the length of `seq`. Single ASCII bytes and the
/// empty slice are rejected.
#[inline]
fn is_valid_utf8_sequence(seq: &[u8]) -> bool {
    matches!(
        seq,
        [0xC2..=0xDF, 0x80..=0xBF]
            | [0xE0, 0xA0..=0xBF, 0x80..=0xBF]
            | [0xE1..=0xEC | 0xEE..=0xEF, 0x80..=0xBF, 0x80..=0xBF]
            | [0xED, 0x80..=0x9F, 0x80..=0xBF]
            | [0xF0, 0x90..=0xBF, 0x80..=0xBF, 0x80..=0xBF]
            | [0xF1..=0xF3, 0x80..=0xBF, 0x80..=0xBF, 0x80..=0xBF]
            | [0xF4, 0x80..=0x8F, 0x80..=0xBF, 0x80..=0xBF]
    )
}
437
#[cfg(test)]
mod tests {
    //! Unit tests for `is_binary` and `is_valid_utf8_sequence`.

    use super::*;

    /// Builds `total` bytes: `non_text` control bytes (`0x01`) followed by ASCII filler.
    fn control_then_ascii(non_text: usize, total: usize) -> Vec<u8> {
        let mut data = vec![0x01u8; non_text];
        data.resize(total, b'x');
        data
    }

    #[test]
    fn test_is_binary_empty() {
        assert!(!is_binary(&[]));
    }

    #[test]
    fn test_is_binary_pure_ascii() {
        assert!(!is_binary(b"Hello, world! This is a normal text file.\n"));
    }

    #[test]
    fn test_is_binary_null_bytes() {
        assert!(is_binary(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn test_is_binary_emoji_heavy() {
        // "# " + three U+1F6A8 + " ALERT"
        let emoji: &[u8] = &[
            0x23, 0x20, 0xF0, 0x9F, 0x9A, 0xA8, 0xF0, 0x9F, 0x9A, 0xA8, 0xF0, 0x9F, 0x9A, 0xA8,
            0x20, 0x41, 0x4C, 0x45, 0x52, 0x54,
        ];
        assert!(
            !is_binary(emoji),
            "emoji-heavy file should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_cjk() {
        let cjk: &[u8] = "你好世界これはテストです한국어".as_bytes();
        assert!(!is_binary(cjk), "CJK text should NOT be flagged as binary");
    }

    #[test]
    fn test_is_binary_mixed_utf8_ascii() {
        let mut data = Vec::new();
        data.extend_from_slice(b"def hello():\n ");
        data.extend_from_slice("print('🚀')".as_bytes());
        data.extend_from_slice(b"\n return 42\n");
        assert!(
            !is_binary(&data),
            "Python with emoji should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_truly_binary() {
        // Two full passes over every byte value: 0x00..=0xFF, 0x00..=0xFF.
        let binary_data: Vec<u8> = (0..=u8::MAX).cycle().take(512).collect();
        assert!(
            is_binary(&binary_data),
            "random byte data should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_short_data() {
        assert!(!is_binary(b"hi"), "very short text should not be binary");
        assert!(!is_binary(&[0x0A]), "single newline is not binary");
    }

    #[test]
    fn test_is_binary_utf8_truncated_at_boundary() {
        // A 4-byte emoji with its last byte missing, embedded in ASCII text:
        // the three stray bytes count as non-text but stay far below the limit.
        let emoji: &[u8] = &[0xF0, 0x9F, 0x9A];
        let mut data = Vec::new();
        data.extend_from_slice(b"some text ");
        data.extend_from_slice(emoji);
        data.extend_from_slice(b" more text");
        assert!(
            !is_binary(&data),
            "truncated UTF-8 at boundary should not flip to binary"
        );
    }

    #[test]
    fn test_is_binary_control_chars() {
        let mut data = vec![0x0B; 200];
        data.extend_from_slice(b"normal text padding");
        assert!(
            is_binary(&data),
            "vertical tabs (0x0B) should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_mixed_realistic_python() {
        let mut data = Vec::new();
        data.extend_from_slice(b"# ");
        for _ in 0..16 {
            data.extend_from_slice("🚨".as_bytes());
        }
        data.extend_from_slice(b" WARNING");
        data.extend_from_slice(b"\n\ndef process(data):\n return data.strip()\n");
        assert!(
            !is_binary(&data),
            "realistic Python file with emoji header should NOT be binary"
        );
    }

    #[test]
    fn test_is_binary_exactly_30_percent() {
        // Integer counts instead of `(100.0 * 0.29) as usize`, whose result
        // depends on float rounding. The limit is strict: exactly 30% is text.
        assert!(
            !is_binary(&control_then_ascii(29, 100)),
            "29% non-text should NOT be flagged"
        );
        assert!(
            !is_binary(&control_then_ascii(30, 100)),
            "exactly 30% non-text should NOT be flagged"
        );
        assert!(
            is_binary(&control_then_ascii(31, 100)),
            "31% non-text should be flagged"
        );
    }

    #[test]
    fn test_is_valid_utf8_sequence() {
        assert!(is_valid_utf8_sequence(&[0xC3, 0xA9]));
        assert!(is_valid_utf8_sequence(&[0xE4, 0xBD, 0xA0]));
        assert!(
            is_valid_utf8_sequence(&[0xF0, 0x9F, 0x9A, 0xA8]),
            "🚨 should be valid 4-byte UTF-8"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xC0, 0x80]),
            "overlong 2-byte encoding (C0)"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xC1, 0x80]),
            "overlong 2-byte encoding (C1)"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xE0, 0x80, 0x80]),
            "overlong 3-byte encoding"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xF0, 0x80, 0x80, 0x80]),
            "overlong 4-byte encoding"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xED, 0xA0, 0x80]),
            "surrogate pair (ED A0)"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xF4, 0x90, 0x80, 0x80]),
            "above U+10FFFF"
        );
        assert!(!is_valid_utf8_sequence(&[0xC2, 0x00]), "bad continuation");
        assert!(!is_valid_utf8_sequence(&[]));
        assert!(!is_valid_utf8_sequence(&[0xFF]));
    }

    #[test]
    fn test_is_binary_stray_continuation_bytes() {
        // Every continuation byte 0x80..=0xBF once, followed by six spaces.
        let mut data: Vec<u8> = (0x80..=0xBFu8).collect();
        data.extend_from_slice(b"      ");
        assert!(
            is_binary(&data),
            "stray continuation bytes should be flagged as binary"
        );
    }
}