Skip to main content

ix/
format.rs

1//! Index file format constants and header parsing.
2//!
3//! All integers little-endian. All offsets absolute from file start.
4//! Sections aligned to 8-byte boundaries.
5
/// Magic bytes that open every ix index file (ASCII `IX01`).
pub const MAGIC: [u8; 4] = *b"IX01";

/// Major version of the on-disk format this library writes.
pub const VERSION_MAJOR: u16 = 1;

/// Lowest minor format version accepted when reading an index file.
pub const VERSION_MINOR: u16 = 3;

/// Size in bytes of the fixed header at the start of every index file.
pub const HEADER_SIZE: usize = 256;

/// On-disk size of one trigram-table entry: a `u32` key plus a 16-byte payload.
///
/// Legacy constant — unused when a CDX index is present.
pub const TRIGRAM_ENTRY_SIZE: usize = 4 + 16;

/// Upper limit on the number of trigram entries in one CDX compressed block.
pub const CDX_BLOCK_SIZE: usize = 1024;

/// On-disk size in bytes of one file-table entry in the index.
pub const FILE_ENTRY_SIZE: usize = 48;
26
/// Feature-flag bit masks carried in the [`Header::flags`] field.
pub mod flags {
    /// Bit 0: the index contains per-trigram bloom filters.
    pub const HAS_BLOOM_FILTERS: u64 = 0x01;
    /// Bit 1: per-file content hashes are stored in the file table.
    pub const HAS_CONTENT_HASHES: u64 = 0x02;
    /// Bit 2: posting-list data is ZSTD-compressed.
    pub const POSTING_LISTS_COMPRESSED: u64 = 0x04;
    /// Bit 3: each posting-list chunk carries an `XXHash64` checksum.
    pub const POSTING_LISTS_CHECKSUMMED: u64 = 0x08;
    /// Bit 4: the trigram table uses CDX (Concentrated Delta X) compression.
    pub const HAS_CDX_INDEX: u64 = 0x10;
}
40
/// Sync state of a file tracked by the index: current, out-of-date, or deleted.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileStatus {
    /// The file exists on disk and matches what the index holds.
    Fresh = 0,
    /// The file was modified after it was last indexed.
    Stale = 1,
    /// The file no longer exists on the file system.
    Deleted = 2,
}

impl FileStatus {
    /// Convert the on-disk `u8` encoding into a [`FileStatus`].
    ///
    /// Every byte other than the codes for `Fresh` and `Deleted` — including
    /// values this version does not recognise — maps to [`FileStatus::Stale`].
    #[must_use]
    pub const fn from_u8(v: u8) -> Self {
        if v == Self::Fresh as u8 {
            Self::Fresh
        } else if v == Self::Deleted as u8 {
            Self::Deleted
        } else {
            // Unrecognised codes are treated as stale.
            Self::Stale
        }
    }
}
66
/// In-memory form of the fixed 256-byte header that opens an index file.
///
/// On disk every integer is little-endian, and every offset is an absolute
/// byte position measured from the start of the file.
#[derive(Clone, Debug)]
pub struct Header {
    /// Major format version (must equal [`VERSION_MAJOR`]).
    pub version_major: u16,
    /// Minor format version (must be ≥ [`VERSION_MINOR`]).
    pub version_minor: u16,
    /// Feature-flag bit field (see [`flags`]).
    pub flags: u64,
    /// Creation time of the index as a Unix timestamp in seconds.
    pub created_at: u64,
    /// Combined byte size of all source files at indexing time.
    pub source_bytes_total: u64,
    /// How many entries the file table holds.
    pub file_count: u32,
    /// How many entries the trigram table holds.
    pub trigram_count: u32,
    /// Where the file table section starts.
    pub file_table_offset: u64,
    /// How many bytes the file table section occupies.
    pub file_table_size: u64,
    /// Where the trigram lookup table starts.
    pub trigram_table_offset: u64,
    /// How many bytes the trigram lookup table occupies.
    pub trigram_table_size: u64,
    /// Where the posting-list data blob starts.
    pub posting_data_offset: u64,
    /// How many bytes the posting-list data blob occupies.
    pub posting_data_size: u64,
    /// Where the bloom-filter section starts (0 if absent).
    pub bloom_offset: u64,
    /// How many bytes the bloom-filter section occupies (0 if absent).
    pub bloom_size: u64,
    /// Where the string pool section starts.
    pub string_pool_offset: u64,
    /// How many bytes the string pool section occupies.
    pub string_pool_size: u64,
    /// Where the file-name index section starts (0 if absent).
    pub name_index_offset: u64,
    /// How many bytes the file-name index section occupies (0 if absent).
    pub name_index_size: u64,
    /// Where the CDX block index starts (0 if absent).
    pub cdx_block_index_offset: u64,
    /// How many bytes the CDX block index occupies (0 if absent).
    pub cdx_block_index_size: u64,
}
116
117impl Header {
118    /// Parse header from the first 256 bytes of an index file.
119    ///
120    /// # Errors
121    ///
122    /// Returns an error if the data is too small, has a bad magic number,
123    /// unsupported version, or corrupted CRC.
124    pub fn parse(data: &[u8]) -> crate::error::Result<Self> {
125        if data.len() < HEADER_SIZE {
126            return Err(crate::error::Error::IndexTooSmall);
127        }
128        if data.get(0..4).ok_or(crate::error::Error::IndexTooSmall)? != MAGIC {
129            return Err(crate::error::Error::BadMagic);
130        }
131
132        let r = |off: usize| -> u64 {
133            data.get(off..off + 8)
134                .and_then(|s| s.try_into().ok())
135                .map_or(0, u64::from_le_bytes)
136        };
137        let r16 = |off: usize| -> u16 {
138            data.get(off..off + 2)
139                .and_then(|s| s.try_into().ok())
140                .map_or(0, u16::from_le_bytes)
141        };
142        let r32 = |off: usize| -> u32 {
143            data.get(off..off + 4)
144                .and_then(|s| s.try_into().ok())
145                .map_or(0, u32::from_le_bytes)
146        };
147
148        let major = r16(0x04);
149        let minor = r16(0x06);
150        if major != VERSION_MAJOR || minor < VERSION_MINOR {
151            return Err(crate::error::Error::UnsupportedVersion { major, minor });
152        }
153
154        // Validate CRC32C of header (bytes 0x00..0xF8)
155        let expected_crc = r32(0xF8);
156        let actual_crc = crc32c::crc32c(
157            data.get(0..0xF8)
158                .ok_or(crate::error::Error::IndexTooSmall)?,
159        );
160        if expected_crc != actual_crc {
161            return Err(crate::error::Error::HeaderCorrupted {
162                expected: expected_crc,
163                actual: actual_crc,
164            });
165        }
166
167        Ok(Self {
168            version_major: major,
169            version_minor: minor,
170            flags: r(0x08),
171            created_at: r(0x10),
172            source_bytes_total: r(0x18),
173            file_count: r32(0x20),
174            trigram_count: r32(0x24),
175            file_table_offset: r(0x28),
176            file_table_size: r(0x30),
177            trigram_table_offset: r(0x38),
178            trigram_table_size: r(0x40),
179            posting_data_offset: r(0x48),
180            posting_data_size: r(0x50),
181            bloom_offset: r(0x58),
182            bloom_size: r(0x60),
183            string_pool_offset: r(0x68),
184            string_pool_size: r(0x70),
185            name_index_offset: r(0x78),
186            name_index_size: r(0x80),
187            cdx_block_index_offset: r(0x88),
188            cdx_block_index_size: r(0x90),
189        })
190    }
191
192    /// Validate all section offsets fit within the file.
193    ///
194    /// # Errors
195    ///
196    /// Returns an error if any section extends beyond the file length.
197    pub fn validate_bounds(&self, file_len: u64) -> crate::error::Result<()> {
198        let check = |name: &'static str, off: u64, sz: u64| -> crate::error::Result<()> {
199            if off + sz > file_len {
200                Err(crate::error::Error::SectionOutOfBounds {
201                    section: name,
202                    offset: off,
203                    size: sz,
204                    file_len,
205                })
206            } else {
207                Ok(())
208            }
209        };
210        check("file_table", self.file_table_offset, self.file_table_size)?;
211        check(
212            "trigram_table",
213            self.trigram_table_offset,
214            self.trigram_table_size,
215        )?;
216        check(
217            "posting_data",
218            self.posting_data_offset,
219            self.posting_data_size,
220        )?;
221        if self.bloom_size > 0 {
222            check("bloom", self.bloom_offset, self.bloom_size)?;
223        }
224        check(
225            "string_pool",
226            self.string_pool_offset,
227            self.string_pool_size,
228        )?;
229        if self.name_index_size > 0 {
230            check("name_index", self.name_index_offset, self.name_index_size)?;
231        }
232        if self.cdx_block_index_size > 0 {
233            check(
234                "cdx_block_index",
235                self.cdx_block_index_offset,
236                self.cdx_block_index_size,
237            )?;
238        }
239        Ok(())
240    }
241
242    /// Returns `true` when the index includes per-trigram bloom filters.
243    #[must_use]
244    pub const fn has_bloom(&self) -> bool {
245        self.flags & flags::HAS_BLOOM_FILTERS != 0
246    }
247
248    /// Returns `true` when the trigram table uses CDX compression.
249    #[must_use]
250    pub const fn has_cdx(&self) -> bool {
251        self.flags & flags::HAS_CDX_INDEX != 0
252    }
253}
254
255use serde::{Deserialize, Serialize};
256use std::path::{Path, PathBuf};
257use std::time::{SystemTime, UNIX_EPOCH};
258
259/// A heartbeat file written by the `ixd` daemon so other processes can
260/// detect a running watcher and query its status.
261#[derive(Debug, Serialize, Deserialize, Clone)]
262pub struct Beacon {
263    /// PID of the `ixd` daemon process.
264    pub pid: i32,
265    /// Canonical root directory being watched.
266    pub root: PathBuf,
267    /// Unix timestamp (seconds) when the daemon started.
268    pub start_time: u64,
269    /// Human-readable status (e.g. `"idle"`, `"indexing"`).
270    pub status: String,
271    /// Unix timestamp (seconds) of the last filesystem event.
272    pub last_event_at: u64,
273    /// Path to the Unix domain socket for real-time notifications.
274    #[serde(default, skip_serializing_if = "Option::is_none")]
275    pub socket_path: Option<PathBuf>,
276}
277
278impl Beacon {
279    /// Create a new beacon for the current process, anchored at the given root.
280    #[must_use]
281    pub fn new(root: &Path) -> Self {
282        let pid = i32::try_from(std::process::id()).unwrap_or(0);
283        let now = SystemTime::now()
284            .duration_since(UNIX_EPOCH)
285            .unwrap_or_default()
286            .as_secs();
287
288        Self {
289            pid,
290            root: root.to_path_buf(),
291            start_time: now,
292            status: "idle".to_string(),
293            last_event_at: now,
294            socket_path: None,
295        }
296    }
297
298    /// Check whether the daemon described by this beacon is still running.
299    ///
300    /// Verifies the recorded PID still exists, belongs to an `ixd` binary,
301    /// and the watched root directory is still accessible.
302    #[must_use]
303    pub fn is_live(&self) -> bool {
304        use nix::sys::signal::kill;
305        use nix::unistd::Pid;
306
307        if kill(Pid::from_raw(self.pid), None).is_err() {
308            return false;
309        }
310
311        let comm_path = format!("/proc/{}/comm", self.pid);
312        if let Ok(comm) = std::fs::read_to_string(&comm_path) {
313            let comm = comm.trim();
314            if comm != "ixd" {
315                return false;
316            }
317        } else {
318            return false;
319        }
320
321        self.root.exists()
322    }
323
324    /// Write the beacon to `beacon.json` in the given folder.
325    ///
326    /// # Errors
327    ///
328    /// Returns an error if the file cannot be created or serialization fails.
329    pub fn write_to(&self, folder: &Path) -> crate::error::Result<()> {
330        let path = folder.join("beacon.json");
331        let f = std::fs::File::create(path)?;
332        serde_json::to_writer_pretty(f, self).map_err(std::io::Error::other)?;
333        Ok(())
334    }
335
336    /// Read a beacon from `beacon.json` in the given folder.
337    ///
338    /// # Errors
339    ///
340    /// Returns an error if the file cannot be opened or deserialization fails.
341    pub fn read_from(folder: &Path) -> crate::error::Result<Self> {
342        let path = folder.join("beacon.json");
343        let f = std::fs::File::open(path)?;
344        let beacon = serde_json::from_reader(f).map_err(std::io::Error::other)?;
345        Ok(beacon)
346    }
347}
348
349/// Centralized binary file detection.
350///
351/// Uses a heuristic based on the ratio of non-text bytes in the first 512 bytes.
352/// Valid UTF-8 multi-byte sequences (2-4 bytes) are counted as text, not binary,
353/// so files containing emoji or CJK characters are not falsely flagged.
354#[must_use]
355#[allow(clippy::cast_precision_loss, clippy::as_conversions)]
356pub fn is_binary(data: &[u8]) -> bool {
357    if data.is_empty() {
358        return false;
359    }
360    let check_len = data.len().min(512);
361    let slice = data.get(..check_len).unwrap_or(&[]);
362
363    let mut non_text = 0usize;
364    let mut i = 0;
365    while i < slice.len() {
366        let b = slice[i];
367        if matches!(b, 0x09 | 0x0A | 0x0D | 0x20..=0x7E) {
368            // ASCII printable/control — text
369        } else if b & 0xC0 == 0xC0 {
370            // Potential UTF-8 lead byte: decode the sequence
371            let seq_len = if b & 0xE0 == 0xC0 {
372                2
373            } else if b & 0xF0 == 0xE0 {
374                3
375            } else if b & 0xF8 == 0xF0 {
376                4
377            } else {
378                0
379            };
380
381            if seq_len > 0 && i + seq_len <= slice.len() {
382                let seq = &slice[i..i + seq_len];
383                if is_valid_utf8_sequence(seq) {
384                    i += seq_len;
385                    continue;
386                }
387            }
388            non_text += 1;
389        } else if b & 0xC0 == 0x80 {
390            // Stray continuation byte — likely binary
391            non_text += 1;
392        } else {
393            // 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F — control chars → binary
394            non_text += 1;
395        }
396        i += 1;
397    }
398
399    (non_text as f32 / check_len as f32) > 0.3
400}
401
/// Check whether `seq` is exactly one well-formed UTF-8 multi-byte sequence.
///
/// Only 2-, 3- and 4-byte encodings are accepted, and the lead byte must be
/// the one that announces a sequence of exactly `seq.len()` bytes. Overlong
/// encodings, UTF-16 surrogates (`ED A0..BF ..`) and code points above
/// U+10FFFF are rejected. Any other length, including a single ASCII byte,
/// yields `false`.
#[inline]
fn is_valid_utf8_sequence(seq: &[u8]) -> bool {
    // `true` for a UTF-8 continuation byte (`10xxxxxx`).
    const fn is_cont(b: u8) -> bool {
        b & 0xC0 == 0x80
    }

    // Each arm pins the lead byte to the sequence length, so a slice such as
    // `[0xE4, 0xBD]` (3-byte lead, 2 bytes long) cannot pass as a 2-byte
    // sequence.
    match *seq {
        // 2 bytes: C0 and C1 leads could only produce overlong encodings.
        [0xC2..=0xDF, b1] => is_cont(b1),
        // 3 bytes, lead E0: second byte below A0 would be overlong.
        [0xE0, 0xA0..=0xBF, b2] => is_cont(b2),
        // 3 bytes, lead ED: second byte above 9F would encode a surrogate.
        [0xED, 0x80..=0x9F, b2] => is_cont(b2),
        [0xE1..=0xEC | 0xEE..=0xEF, b1, b2] => is_cont(b1) && is_cont(b2),
        // 4 bytes, lead F0: second byte below 90 would be overlong.
        [0xF0, 0x90..=0xBF, b2, b3] => is_cont(b2) && is_cont(b3),
        // 4 bytes, lead F4: second byte above 8F would exceed U+10FFFF.
        [0xF4, 0x80..=0x8F, b2, b3] => is_cont(b2) && is_cont(b3),
        [0xF1..=0xF3, b1, b2, b3] => is_cont(b1) && is_cont(b2) && is_cont(b3),
        _ => false,
    }
}
437
#[cfg(test)]
mod tests {
    use super::*;

    /// Build `total` bytes: `non_text` copies of 0x01 followed by `b'x'` filler.
    fn sample(non_text: usize, total: usize) -> Vec<u8> {
        let mut data = vec![0x01u8; non_text];
        data.resize(total, b'x');
        data
    }

    #[test]
    fn test_is_binary_empty() {
        assert!(!is_binary(&[]));
    }

    #[test]
    fn test_is_binary_pure_ascii() {
        assert!(!is_binary(b"Hello, world! This is a normal text file.\n"));
    }

    #[test]
    fn test_is_binary_null_bytes() {
        assert!(is_binary(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn test_is_binary_emoji_heavy() {
        // "# " + three 4-byte emoji + " ALERT"
        let emoji: &[u8] = &[
            0x23, 0x20, 0xF0, 0x9F, 0x9A, 0xA8, 0xF0, 0x9F, 0x9A, 0xA8, 0xF0, 0x9F, 0x9A, 0xA8,
            0x20, 0x41, 0x4C, 0x45, 0x52, 0x54,
        ];
        assert!(
            !is_binary(emoji),
            "emoji-heavy file should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_cjk() {
        let cjk: &[u8] = "你好世界これはテストです한국어".as_bytes();
        assert!(!is_binary(cjk), "CJK text should NOT be flagged as binary");
    }

    #[test]
    fn test_is_binary_mixed_utf8_ascii() {
        let mut data = Vec::new();
        data.extend_from_slice(b"def hello():\n    ");
        data.extend_from_slice("print('🚀')".as_bytes());
        data.extend_from_slice(b"\n    return 42\n");
        assert!(
            !is_binary(&data),
            "Python with emoji should NOT be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_truly_binary() {
        // 512 bytes cycling through every byte value twice.
        let mut binary_data = vec![0u8; 512];
        for (i, b) in binary_data.iter_mut().enumerate() {
            *b = (i % 256) as u8;
        }
        assert!(
            is_binary(&binary_data),
            "bytes cycling through all 256 values should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_short_data() {
        assert!(!is_binary(b"hi"), "very short text should not be binary");
        assert!(!is_binary(&[0x0A]), "single newline is not binary");
    }

    #[test]
    fn test_is_binary_utf8_truncated_at_boundary() {
        // First three bytes of a 4-byte emoji, followed by ordinary text.
        let emoji: &[u8] = &[0xF0, 0x9F, 0x9A];
        let mut data = Vec::new();
        data.extend_from_slice(b"some text ");
        data.extend_from_slice(emoji);
        data.extend_from_slice(b" more text");
        assert!(
            !is_binary(&data),
            "truncated UTF-8 at boundary should not flip to binary"
        );
    }

    #[test]
    fn test_is_binary_control_chars() {
        let mut data = vec![0x0B; 200];
        data.extend_from_slice(b"normal text padding");
        assert!(
            is_binary(&data),
            "vertical tabs (0x0B) should be flagged as binary"
        );
    }

    #[test]
    fn test_is_binary_mixed_realistic_python() {
        let mut emoji_line = Vec::new();
        emoji_line.extend_from_slice(b"# ");
        for _ in 0..16 {
            emoji_line.extend_from_slice("🚨".as_bytes());
        }
        emoji_line.extend_from_slice(b" WARNING");
        let mut data = Vec::new();
        data.extend_from_slice(&emoji_line);
        data.extend_from_slice(b"\n\ndef process(data):\n    return data.strip()\n");
        assert!(
            !is_binary(&data),
            "realistic Python file with emoji header should NOT be binary"
        );
    }

    #[test]
    fn test_is_binary_exactly_30_percent() {
        // Integer counts keep the inputs exact; the threshold is a strict
        // "more than 30%", so exactly 30% must still be treated as text.
        assert!(
            !is_binary(&sample(29, 100)),
            "29% non-text should NOT be flagged"
        );
        assert!(
            !is_binary(&sample(30, 100)),
            "exactly 30% non-text should NOT be flagged"
        );
        assert!(is_binary(&sample(31, 100)), "31% non-text should be flagged");
    }

    #[test]
    fn test_is_valid_utf8_sequence() {
        assert!(is_valid_utf8_sequence(&[0xC3, 0xA9]));
        assert!(is_valid_utf8_sequence(&[0xE4, 0xBD, 0xA0]));
        assert!(
            is_valid_utf8_sequence(&[0xF0, 0x9F, 0x9A, 0xA8]),
            "🚨 should be valid 4-byte UTF-8"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xC0, 0x80]),
            "overlong 2-byte encoding (C0)"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xC1, 0x80]),
            "overlong 2-byte encoding (C1)"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xE0, 0x80, 0x80]),
            "overlong 3-byte encoding"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xF0, 0x80, 0x80, 0x80]),
            "overlong 4-byte encoding"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xED, 0xA0, 0x80]),
            "surrogate pair (ED A0)"
        );
        assert!(
            !is_valid_utf8_sequence(&[0xF4, 0x90, 0x80, 0x80]),
            "above U+10FFFF"
        );
        assert!(!is_valid_utf8_sequence(&[0xC2, 0x00]), "bad continuation");
        assert!(!is_valid_utf8_sequence(&[]));
        assert!(!is_valid_utf8_sequence(&[0xFF]));
    }

    #[test]
    fn test_is_binary_stray_continuation_bytes() {
        // Every continuation byte 0x80..=0xBF once, then six spaces.
        let mut data: Vec<u8> = (0x80u8..=0xBF).collect();
        data.extend_from_slice(&[b' '; 6]);
        assert!(
            is_binary(&data),
            "stray continuation bytes should be flagged as binary"
        );
    }
}
619}