ix/
format.rs

//! Index file format constants and header parsing.
//!
//! All integers are little-endian. All offsets are absolute from the start of the file.
//! Sections are aligned to 8-byte boundaries.
5
/// File signature found in the first four bytes of every ix index: ASCII `IX01`.
pub const MAGIC: [u8; 4] = *b"IX01";

/// Major version of the on-disk format written by this library.
pub const VERSION_MAJOR: u16 = 1;

/// Lowest minor version an index file may declare and still be readable.
pub const VERSION_MINOR: u16 = 3;

/// Byte length of the fixed header that opens every index file (256 bytes).
pub const HEADER_SIZE: usize = 0x100;

/// Bytes occupied on disk by one trigram-table entry: a `u32` key followed by
/// a 16-byte payload.
/// Legacy constant — unused when CDX index is present.
pub const TRIGRAM_ENTRY_SIZE: usize = 4 + 16;

/// Upper bound on the number of trigram entries in a single CDX compressed block.
pub const CDX_BLOCK_SIZE: usize = 1024;

/// Bytes occupied on disk by one file-table entry.
pub const FILE_ENTRY_SIZE: usize = 48;
26
/// Feature bits that may be set in the [`Header::flags`] field.
///
/// Each constant is a single-bit mask; test a flag with `flags & MASK != 0`.
pub mod flags {
    /// Bit 0: the index contains per-trigram bloom filters.
    pub const HAS_BLOOM_FILTERS: u64 = 0x01;
    /// Bit 1: the file table stores a content hash for each file.
    pub const HAS_CONTENT_HASHES: u64 = 0x02;
    /// Bit 2: posting-list data is ZSTD-compressed.
    pub const POSTING_LISTS_COMPRESSED: u64 = 0x04;
    /// Bit 3: every posting-list chunk carries an `XXHash64` checksum.
    pub const POSTING_LISTS_CHECKSUMMED: u64 = 0x08;
    /// Bit 4: the trigram table uses CDX (Concentrated Delta X) compression.
    pub const HAS_CDX_INDEX: u64 = 0x10;
}
40
/// Freshness state of a file tracked by the index.
///
/// The discriminants are the values stored on disk as a single `u8`.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileStatus {
    /// The file is present on disk and matches what was indexed.
    Fresh = 0x00,
    /// The file changed after it was last indexed.
    Stale = 0x01,
    /// The file no longer exists on the file system.
    Deleted = 0x02,
}

impl FileStatus {
    /// Convert the on-disk `u8` tag into a [`FileStatus`].
    ///
    /// `0x00` maps to [`FileStatus::Fresh`] and `0x02` to
    /// [`FileStatus::Deleted`]; every other value — `0x01` as well as any
    /// unrecognised tag — maps to [`FileStatus::Stale`].
    #[must_use]
    pub const fn from_u8(v: u8) -> Self {
        if v == 0x00 {
            return Self::Fresh;
        }
        if v == 0x02 {
            return Self::Deleted;
        }
        // 0x01 and anything unrecognised: treat the file as stale.
        Self::Stale
    }
}
66
67/// Parsed contents of the fixed 256-byte index header.
68///
69/// All integer fields are stored little-endian in the file.
70/// Offsets are absolute byte offsets from the start of the file.
71#[derive(Debug, Clone)]
72pub struct Header {
73    /// Format major version (must equal [`VERSION_MAJOR`]).
74    pub version_major: u16,
75    /// Format minor version (must be ≥ [`VERSION_MINOR`]).
76    pub version_minor: u16,
77    /// Bit-field of feature flags (see [`flags`]).
78    pub flags: u64,
79    /// Unix timestamp (seconds) when the index was created.
80    pub created_at: u64,
81    /// Sum of byte-sizes of all source files when indexed.
82    pub source_bytes_total: u64,
83    /// Number of file entries in the file table.
84    pub file_count: u32,
85    /// Number of trigram entries in the trigram table.
86    pub trigram_count: u32,
87    /// Byte offset to the file table section.
88    pub file_table_offset: u64,
89    /// Byte length of the file table section.
90    pub file_table_size: u64,
91    /// Byte offset to the trigram lookup table.
92    pub trigram_table_offset: u64,
93    /// Byte length of the trigram lookup table.
94    pub trigram_table_size: u64,
95    /// Byte offset to the posting-list data blob.
96    pub posting_data_offset: u64,
97    /// Byte length of the posting-list data blob.
98    pub posting_data_size: u64,
99    /// Byte offset to the bloom-filter section (0 if absent).
100    pub bloom_offset: u64,
101    /// Byte length of the bloom-filter section (0 if absent).
102    pub bloom_size: u64,
103    /// Byte offset to the string pool section.
104    pub string_pool_offset: u64,
105    /// Byte length of the string pool section.
106    pub string_pool_size: u64,
107    /// Byte offset to the file-name index section (0 if absent).
108    pub name_index_offset: u64,
109    /// Byte length of the file-name index section (0 if absent).
110    pub name_index_size: u64,
111    /// Byte offset to the CDX block index (0 if absent).
112    pub cdx_block_index_offset: u64,
113    /// Byte length of the CDX block index (0 if absent).
114    pub cdx_block_index_size: u64,
115}
116
117impl Header {
118    /// Parse header from the first 256 bytes of an index file.
119    ///
120    /// # Errors
121    ///
122    /// Returns an error if the data is too small, has a bad magic number,
123    /// unsupported version, or corrupted CRC.
124    pub fn parse(data: &[u8]) -> crate::error::Result<Self> {
125        if data.len() < HEADER_SIZE {
126            return Err(crate::error::Error::IndexTooSmall);
127        }
128        if data.get(0..4).ok_or(crate::error::Error::IndexTooSmall)? != MAGIC {
129            return Err(crate::error::Error::BadMagic);
130        }
131
132        let r = |off: usize| -> u64 {
133            data.get(off..off + 8)
134                .and_then(|s| s.try_into().ok())
135                .map_or(0, u64::from_le_bytes)
136        };
137        let r16 = |off: usize| -> u16 {
138            data.get(off..off + 2)
139                .and_then(|s| s.try_into().ok())
140                .map_or(0, u16::from_le_bytes)
141        };
142        let r32 = |off: usize| -> u32 {
143            data.get(off..off + 4)
144                .and_then(|s| s.try_into().ok())
145                .map_or(0, u32::from_le_bytes)
146        };
147
148        let major = r16(0x04);
149        let minor = r16(0x06);
150        if major != VERSION_MAJOR || minor < VERSION_MINOR {
151            return Err(crate::error::Error::UnsupportedVersion { major, minor });
152        }
153
154        // Validate CRC32C of header (bytes 0x00..0xF8)
155        let expected_crc = r32(0xF8);
156        let actual_crc = crc32c::crc32c(
157            data.get(0..0xF8)
158                .ok_or(crate::error::Error::IndexTooSmall)?,
159        );
160        if expected_crc != actual_crc {
161            return Err(crate::error::Error::HeaderCorrupted {
162                expected: expected_crc,
163                actual: actual_crc,
164            });
165        }
166
167        Ok(Self {
168            version_major: major,
169            version_minor: minor,
170            flags: r(0x08),
171            created_at: r(0x10),
172            source_bytes_total: r(0x18),
173            file_count: r32(0x20),
174            trigram_count: r32(0x24),
175            file_table_offset: r(0x28),
176            file_table_size: r(0x30),
177            trigram_table_offset: r(0x38),
178            trigram_table_size: r(0x40),
179            posting_data_offset: r(0x48),
180            posting_data_size: r(0x50),
181            bloom_offset: r(0x58),
182            bloom_size: r(0x60),
183            string_pool_offset: r(0x68),
184            string_pool_size: r(0x70),
185            name_index_offset: r(0x78),
186            name_index_size: r(0x80),
187            cdx_block_index_offset: r(0x88),
188            cdx_block_index_size: r(0x90),
189        })
190    }
191
192    /// Validate all section offsets fit within the file.
193    ///
194    /// # Errors
195    ///
196    /// Returns an error if any section extends beyond the file length.
197    pub fn validate_bounds(&self, file_len: u64) -> crate::error::Result<()> {
198        let check = |name: &'static str, off: u64, sz: u64| -> crate::error::Result<()> {
199            if off + sz > file_len {
200                Err(crate::error::Error::SectionOutOfBounds {
201                    section: name,
202                    offset: off,
203                    size: sz,
204                    file_len,
205                })
206            } else {
207                Ok(())
208            }
209        };
210        check("file_table", self.file_table_offset, self.file_table_size)?;
211        check(
212            "trigram_table",
213            self.trigram_table_offset,
214            self.trigram_table_size,
215        )?;
216        check(
217            "posting_data",
218            self.posting_data_offset,
219            self.posting_data_size,
220        )?;
221        if self.bloom_size > 0 {
222            check("bloom", self.bloom_offset, self.bloom_size)?;
223        }
224        check(
225            "string_pool",
226            self.string_pool_offset,
227            self.string_pool_size,
228        )?;
229        if self.name_index_size > 0 {
230            check("name_index", self.name_index_offset, self.name_index_size)?;
231        }
232        if self.cdx_block_index_size > 0 {
233            check(
234                "cdx_block_index",
235                self.cdx_block_index_offset,
236                self.cdx_block_index_size,
237            )?;
238        }
239        Ok(())
240    }
241
242    /// Returns `true` when the index includes per-trigram bloom filters.
243    #[must_use]
244    pub const fn has_bloom(&self) -> bool {
245        self.flags & flags::HAS_BLOOM_FILTERS != 0
246    }
247
248    /// Returns `true` when the trigram table uses CDX compression.
249    #[must_use]
250    pub const fn has_cdx(&self) -> bool {
251        self.flags & flags::HAS_CDX_INDEX != 0
252    }
253}
254
255use serde::{Deserialize, Serialize};
256use std::path::{Path, PathBuf};
257use std::time::{SystemTime, UNIX_EPOCH};
258
259/// A heartbeat file written by the `ixd` daemon so other processes can
260/// detect a running watcher and query its status.
261#[derive(Debug, Serialize, Deserialize, Clone)]
262pub struct Beacon {
263    /// PID of the `ixd` daemon process.
264    pub pid: i32,
265    /// Canonical root directory being watched.
266    pub root: PathBuf,
267    /// Unix timestamp (seconds) when the daemon started.
268    pub start_time: u64,
269    /// Human-readable status (e.g. `"idle"`, `"indexing"`).
270    pub status: String,
271    /// Unix timestamp (seconds) of the last filesystem event.
272    pub last_event_at: u64,
273}
274
275impl Beacon {
276    /// Create a new beacon for the current process, anchored at the given root.
277    #[must_use]
278    pub fn new(root: &Path) -> Self {
279        let pid = i32::try_from(std::process::id()).unwrap_or(0);
280        let now = SystemTime::now()
281            .duration_since(UNIX_EPOCH)
282            .unwrap_or_default()
283            .as_secs();
284
285        Self {
286            pid,
287            root: root.to_path_buf(),
288            start_time: now,
289            status: "idle".to_string(),
290            last_event_at: now,
291        }
292    }
293
294    /// Check whether the daemon described by this beacon is still running.
295    ///
296    /// Verifies the recorded PID still exists, belongs to an `ixd` binary,
297    /// and the watched root directory is still accessible.
298    #[must_use]
299    pub fn is_live(&self) -> bool {
300        use nix::sys::signal::kill;
301        use nix::unistd::Pid;
302
303        if kill(Pid::from_raw(self.pid), None).is_err() {
304            return false;
305        }
306
307        let comm_path = format!("/proc/{}/comm", self.pid);
308        if let Ok(comm) = std::fs::read_to_string(&comm_path) {
309            let comm = comm.trim();
310            if comm != "ixd" {
311                return false;
312            }
313        } else {
314            return false;
315        }
316
317        self.root.exists()
318    }
319
320    /// Write the beacon to `beacon.json` in the given folder.
321    ///
322    /// # Errors
323    ///
324    /// Returns an error if the file cannot be created or serialization fails.
325    pub fn write_to(&self, folder: &Path) -> crate::error::Result<()> {
326        let path = folder.join("beacon.json");
327        let f = std::fs::File::create(path)?;
328        serde_json::to_writer_pretty(f, self).map_err(std::io::Error::other)?;
329        Ok(())
330    }
331
332    /// Read a beacon from `beacon.json` in the given folder.
333    ///
334    /// # Errors
335    ///
336    /// Returns an error if the file cannot be opened or deserialization fails.
337    pub fn read_from(folder: &Path) -> crate::error::Result<Self> {
338        let path = folder.join("beacon.json");
339        let f = std::fs::File::open(path)?;
340        let beacon = serde_json::from_reader(f).map_err(std::io::Error::other)?;
341        Ok(beacon)
342    }
343}
344
/// Centralized binary file detection.
///
/// Uses a heuristic based on the ratio of non-text bytes in the first 512 bytes:
/// the data is reported as binary when strictly more than 30% of the inspected
/// window is non-text. Empty input is never binary.
///
/// Text bytes are tab, LF, CR and printable ASCII (`0x20..=0x7E`).
/// Valid UTF-8 multi-byte sequences (2-4 bytes) are counted as text, not binary,
/// so files containing emoji or CJK characters are not falsely flagged. A
/// sequence that starts inside the window but finishes beyond it is validated
/// against the full input, so a character cut in half by the 512-byte limit is
/// not miscounted. Every other byte — other control characters, `0x7F`, stray
/// continuation bytes, and invalid or incomplete lead bytes — counts as one
/// non-text byte.
#[must_use]
pub fn is_binary(data: &[u8]) -> bool {
    /// Number of leading bytes inspected.
    const WINDOW: usize = 512;

    let window_len = data.len().min(WINDOW);
    if window_len == 0 {
        return false;
    }

    let mut non_text = 0usize;
    let mut i = 0usize;
    while i < window_len {
        let Some(&byte) = data.get(i) else { break };

        // Expected sequence length when `byte` looks like a UTF-8 lead byte;
        // 0 for every other non-text byte.
        let seq_len = match byte {
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => {
                // ASCII printable / common whitespace — text.
                i += 1;
                continue;
            }
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            // Control chars, 0x7F, stray continuation bytes, 0xF8..=0xFF.
            _ => 0,
        };

        // Slice from `data`, not from the window, so a character straddling
        // the window edge can still be recognised as valid text.
        let is_text = seq_len > 0
            && data
                .get(i..i + seq_len)
                .is_some_and(is_valid_utf8_sequence);

        if is_text {
            i += seq_len;
        } else {
            non_text += 1;
            i += 1;
        }
    }

    // Exact integer form of `non_text / window_len > 0.3`.
    non_text * 10 > window_len * 3
}

/// Returns `true` when `seq` is exactly one valid multi-byte (2-4 byte) UTF-8
/// encoded character.
///
/// Validation is delegated to [`std::str::from_utf8`], so overlong encodings,
/// surrogate code points, values above U+10FFFF, bad continuation bytes and
/// lead bytes that do not match the slice length are all rejected. Empty
/// slices, single bytes, and slices holding more than one character return
/// `false`.
#[inline]
fn is_valid_utf8_sequence(seq: &[u8]) -> bool {
    match std::str::from_utf8(seq) {
        Ok(text) => seq.len() > 1 && text.chars().count() == 1,
        Err(_) => false,
    }
}
433
/// Unit tests for [`is_binary`] and its UTF-8 sequence helper.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build `total` bytes: `non_text` control bytes (`0x01`) followed by ASCII filler.
    fn sample_with_non_text(non_text: usize, total: usize) -> Vec<u8> {
        let mut data = vec![0x01u8; non_text];
        data.resize(total, b'x');
        data
    }

    #[test]
    fn test_is_binary_empty() {
        assert!(!is_binary(&[]));
    }

    #[test]
    fn test_is_binary_pure_ascii() {
        assert!(!is_binary(b"Hello, world! This is a normal text file.\n"));
    }

    #[test]
    fn test_is_binary_null_bytes() {
        assert!(is_binary(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn test_is_binary_emoji_heavy() {
        // "# " + three 🚨 + " ALERT"
        let emoji = "# 🚨🚨🚨 ALERT".as_bytes();
        assert!(
            !is_binary(emoji),
            "emoji-heavy file should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_cjk() {
        let cjk: &[u8] = "你好世界これはテストです한국어".as_bytes();
        assert!(!is_binary(cjk), "CJK text should NOT be flagged as binary");
    }

    #[test]
    fn test_is_binary_mixed_utf8_ascii() {
        let data = "def hello():\n    print('🚀')\n    return 42\n".as_bytes();
        assert!(
            !is_binary(data),
            "Python with emoji should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_truly_binary() {
        // Every byte value 0..=255, repeated to fill the 512-byte window.
        let binary_data: Vec<u8> = (0..=u8::MAX).cycle().take(512).collect();
        assert!(
            is_binary(&binary_data),
            "random byte data should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_short_data() {
        assert!(!is_binary(b"hi"), "very short text should not be binary");
        assert!(!is_binary(&[0x0A]), "single newline is not binary");
    }

    #[test]
    fn test_is_binary_utf8_truncated_at_boundary() {
        // A 4-byte emoji missing its final byte, embedded in ASCII text.
        let data: &[u8] = b"some text \xF0\x9F\x9A more text";
        assert!(
            !is_binary(data),
            "truncated UTF-8 at boundary should not flip to binary"
        );
    }

    #[test]
    fn test_is_binary_control_chars() {
        let mut data = vec![0x0B; 200];
        data.extend_from_slice(b"normal text padding");
        assert!(
            is_binary(&data),
            "vertical tabs (0x0B) should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_mixed_realistic_python() {
        let source = format!(
            "# {} WARNING\n\ndef process(data):\n    return data.strip()\n",
            "🚨".repeat(16)
        );
        assert!(
            !is_binary(source.as_bytes()),
            "realistic Python file with emoji header should NOT be binary"
        );
    }

    #[test]
    fn test_is_binary_exactly_30_percent() {
        let total = 100;
        assert!(
            !is_binary(&sample_with_non_text(29, total)),
            "29% non-text should NOT be flagged"
        );
        // The threshold is strict: exactly 30% is still treated as text.
        assert!(
            !is_binary(&sample_with_non_text(30, total)),
            "exactly 30% non-text should NOT be flagged"
        );
        assert!(
            is_binary(&sample_with_non_text(31, total)),
            "31% non-text should be flagged"
        );
    }

    #[test]
    fn test_is_valid_utf8_sequence() {
        assert!(is_valid_utf8_sequence(&[0xC3, 0xA9]));
        assert!(is_valid_utf8_sequence(&[0xE4, 0xBD, 0xA0]));
        assert!(
            is_valid_utf8_sequence(&[0xF0, 0x9F, 0x9A, 0xA8]),
            "🚨 should be valid 4-byte UTF-8"
        );

        let rejected: [(&[u8], &str); 9] = [
            (&[0xC0, 0x80], "overlong 2-byte encoding (C0)"),
            (&[0xC1, 0x80], "overlong 2-byte encoding (C1)"),
            (&[0xE0, 0x80, 0x80], "overlong 3-byte encoding"),
            (&[0xF0, 0x80, 0x80, 0x80], "overlong 4-byte encoding"),
            (&[0xED, 0xA0, 0x80], "surrogate pair (ED A0)"),
            (&[0xF4, 0x90, 0x80, 0x80], "above U+10FFFF"),
            (&[0xC2, 0x00], "bad continuation"),
            (&[], "empty slice"),
            (&[0xFF], "single invalid byte"),
        ];
        for (seq, why) in rejected {
            assert!(!is_valid_utf8_sequence(seq), "{why}");
        }
    }

    #[test]
    fn test_is_binary_stray_continuation_bytes() {
        // All 64 continuation bytes (0x80..=0xBF) followed by six spaces.
        let data: Vec<u8> = (0x80..=0xBF_u8)
            .chain(std::iter::repeat(b' ').take(6))
            .collect();
        assert!(
            is_binary(&data),
            "stray continuation bytes should be flagged as binary"
        );
    }
}
ix/format.rs

ix/
format.rs