Skip to main content

pcap_toolkit/sort/
index.rs

1//! Packet index types for two-pass chronological sorting.
2//!
3//! [`PacketIndex`] serialises to exactly 20 bytes — a minimal record that lets the
4//! second pass seek directly to each packet without buffering payloads.
5//!
6//! [`IndexStore`] abstracts over in-memory and on-disk storage so the first
7//! pass can switch modes transparently.
8
9use std::fs::File;
10use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
11use std::path::{Path, PathBuf};
12
13use crate::error::SortError;
14
15// ── PacketIndex ──────────────────────────────────────────────────────────────
16
/// Compact per-packet record produced by the first pass.
///
/// Serialised layout (little-endian, see [`PacketIndex::to_bytes`]):
///
/// | Bytes    | Field          | Description                                |
/// |----------|----------------|--------------------------------------------|
/// | `0..8`   | `timestamp_ns` | Nanoseconds since the Unix epoch           |
/// | `8..16`  | `byte_offset`  | File position of the 16-byte record header |
/// | `16..20` | `caplen`       | Captured packet length (payload only)      |
///
/// Serialised size: **20 bytes** per packet. The in-memory struct may be
/// larger because of alignment padding.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PacketIndex {
    /// Packet timestamp, in nanoseconds since the Unix epoch.
    pub timestamp_ns: u64,
    /// Position in the source file of this packet's 16-byte PCAP record header.
    pub byte_offset: u64,
    /// Number of captured payload bytes (the record header is not counted).
    pub caplen: u32,
}

impl PacketIndex {
    /// Length in bytes of the serialised form.
    pub(crate) const SIZE: usize = 20;

    /// Encode the record as 20 bytes, every field little-endian, in
    /// declaration order.
    pub fn to_bytes(self) -> [u8; Self::SIZE] {
        let mut out = [0u8; Self::SIZE];
        {
            // Carve the output into the three field-sized windows.
            let (ts_part, tail) = out.split_at_mut(8);
            let (offset_part, caplen_part) = tail.split_at_mut(8);
            ts_part.copy_from_slice(&self.timestamp_ns.to_le_bytes());
            offset_part.copy_from_slice(&self.byte_offset.to_le_bytes());
            caplen_part.copy_from_slice(&self.caplen.to_le_bytes());
        }
        out
    }

    /// Decode a record from the 20-byte form produced by
    /// [`PacketIndex::to_bytes`].
    pub fn from_bytes(b: &[u8; Self::SIZE]) -> Self {
        let mut ts_raw = [0u8; 8];
        let mut offset_raw = [0u8; 8];
        let mut caplen_raw = [0u8; 4];
        ts_raw.copy_from_slice(&b[..8]);
        offset_raw.copy_from_slice(&b[8..16]);
        caplen_raw.copy_from_slice(&b[16..]);

        Self {
            timestamp_ns: u64::from_le_bytes(ts_raw),
            byte_offset: u64::from_le_bytes(offset_raw),
            caplen: u32::from_le_bytes(caplen_raw),
        }
    }
}
57
58// ── FilePacketIndex ──────────────────────────────────────────────────────────
59
60/// A [`PacketIndex`] entry annotated with the index of its source file.
61///
62/// Used in the multi-file merge path so the second pass can seek into the
63/// correct input file for each packet.
64#[derive(Debug, Clone, Copy)]
65pub struct FilePacketIndex {
66    pub entry: PacketIndex,
67    /// Index into the `inputs` slice passed to [`crate::sort::sort_files`].
68    pub file_id: usize,
69}
70
71// ── IndexStore ───────────────────────────────────────────────────────────────
72
73/// Storage backend for the first-pass packet index.
74pub enum IndexStore {
75    /// In-memory vector — fast path for captures that fit comfortably in RAM.
76    Memory(Vec<PacketIndex>),
77    /// On-disk sidecar file — for TB-scale inputs where the index itself may be
78    /// large (≈ 20 MB per 1 M packets).
79    Disk {
80        writer: BufWriter<File>,
81        path: PathBuf,
82        count: u64,
83    },
84}
85
86impl IndexStore {
87    /// Create an in-memory store.
88    pub fn memory() -> Self {
89        IndexStore::Memory(Vec::new())
90    }
91
92    /// Create an on-disk store backed by a file at `sidecar_path`.
93    ///
94    /// # Errors
95    /// Returns [`SortError::Io`] if the sidecar file cannot be created.
96    pub fn disk(sidecar_path: &Path) -> Result<Self, SortError> {
97        let file = File::create(sidecar_path)?;
98        Ok(IndexStore::Disk {
99            writer: BufWriter::with_capacity(64 * 1024, file),
100            path: sidecar_path.to_owned(),
101            count: 0,
102        })
103    }
104
105    /// Append one [`PacketIndex`] record.
106    ///
107    /// # Errors
108    /// Returns [`SortError::Io`] on write failure (disk mode only).
109    pub fn push(&mut self, entry: PacketIndex) -> Result<(), SortError> {
110        match self {
111            IndexStore::Memory(v) => {
112                v.push(entry);
113                Ok(())
114            }
115            IndexStore::Disk { writer, count, .. } => {
116                writer.write_all(&entry.to_bytes())?;
117                *count += 1;
118                Ok(())
119            }
120        }
121    }
122
123    /// Return the path of the on-disk sidecar file, if any.
124    pub fn sidecar_path(&self) -> Option<&Path> {
125        match self {
126            IndexStore::Disk { path, .. } => Some(path),
127            IndexStore::Memory(_) => None,
128        }
129    }
130
131    /// Flush any buffered writes, sort all entries by `timestamp_ns`, and
132    /// return the result as an owned `Vec<PacketIndex>`.
133    ///
134    /// The on-disk sidecar file is **not** deleted here; the caller is
135    /// responsible for cleanup.
136    ///
137    /// # Errors
138    /// Returns [`SortError::Io`] on flush or read failure.
139    pub fn into_sorted(self) -> Result<Vec<PacketIndex>, SortError> {
140        match self {
141            IndexStore::Memory(mut v) => {
142                v.sort_unstable_by_key(|e| e.timestamp_ns);
143                Ok(v)
144            }
145            IndexStore::Disk {
146                mut writer,
147                path,
148                count,
149            } => {
150                writer.flush()?;
151                drop(writer);
152                let mut entries = read_disk_index(&path, count)?;
153                entries.sort_unstable_by_key(|e| e.timestamp_ns);
154                Ok(entries)
155            }
156        }
157    }
158}
159
160/// Read all [`PacketIndex`] records back from a sidecar file.
161fn read_disk_index(path: &Path, expected: u64) -> Result<Vec<PacketIndex>, SortError> {
162    let file = File::open(path)?;
163    let mut reader = BufReader::new(file);
164    let mut entries = Vec::with_capacity(expected as usize);
165    let mut buf = [0u8; PacketIndex::SIZE];
166    loop {
167        match reader.read_exact(&mut buf) {
168            Ok(()) => entries.push(PacketIndex::from_bytes(&buf)),
169            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
170            Err(e) => return Err(SortError::Io(e)),
171        }
172    }
173    Ok(entries)
174}
175
176// ── Helpers used by the second pass ─────────────────────────────────────────
177
/// Derive the `.idx` sidecar path by appending `.idx` to the input's file name.
///
/// The file name is extended as an `OsString`, so names that are not valid
/// UTF-8 are preserved byte-for-byte rather than being lossily converted.
/// If `input` has no file name component, the result is `input` with a
/// `.idx` component pushed onto it.
///
/// # Examples
/// ```
/// assert_eq!(
///     pcap_toolkit::sort::index::sidecar_path(std::path::Path::new("/tmp/traffic.pcap")),
///     std::path::PathBuf::from("/tmp/traffic.pcap.idx"),
/// );
/// ```
pub fn sidecar_path(input: &Path) -> PathBuf {
    let mut name = input.file_name().unwrap_or_default().to_os_string();
    name.push(".idx");
    input.with_file_name(name)
}
195
196/// Seek `file` to `offset`, read the 16-byte PCAP record header to recover
197/// `origlen`, then read `caplen` bytes of packet data.
198///
199/// Returns `(origlen, packet_data)`.
200///
201/// # Errors
202/// Returns [`SortError::Io`] on seek or read failure.
203pub fn read_packet_at(
204    file: &mut File,
205    offset: u64,
206    caplen: u32,
207    big_endian: bool,
208) -> Result<(u32, Vec<u8>), SortError> {
209    file.seek(SeekFrom::Start(offset))?;
210    let mut hdr = [0u8; 16];
211    file.read_exact(&mut hdr)?;
212    let origlen = if big_endian {
213        u32::from_be_bytes(hdr[12..16].try_into().unwrap())
214    } else {
215        u32::from_le_bytes(hdr[12..16].try_into().unwrap())
216    };
217    let mut data = vec![0u8; caplen as usize];
218    file.read_exact(&mut data)?;
219    Ok((origlen, data))
220}
221
#[cfg(test)]
mod tests {
    use super::*;

    /// A record must survive a serialise/deserialise cycle unchanged.
    #[test]
    fn test_packet_index_roundtrip() {
        let idx = PacketIndex {
            timestamp_ns: 1_700_000_000_123_456_789,
            byte_offset: 4096,
            caplen: 1500,
        };
        let bytes = idx.to_bytes();
        assert_eq!(bytes.len(), 20);
        assert_eq!(PacketIndex::from_bytes(&bytes), idx);
    }

    /// The in-memory backend returns entries in ascending timestamp order.
    #[test]
    fn test_memory_store_sorts_by_timestamp() {
        let mut store = IndexStore::memory();
        for (ts, off) in [(3000u64, 300u64), (1000, 100), (2000, 200)] {
            store
                .push(PacketIndex {
                    timestamp_ns: ts,
                    byte_offset: off,
                    caplen: 60,
                })
                .unwrap();
        }
        let sorted = store.into_sorted().unwrap();
        assert_eq!(
            sorted.iter().map(|e| e.timestamp_ns).collect::<Vec<_>>(),
            [1000, 2000, 3000]
        );
    }

    /// The on-disk backend writes, reads back and sorts entries.
    #[test]
    fn test_disk_store_roundtrip() {
        // Include the process id in the name so concurrent test runs that
        // share a temp directory do not overwrite each other's sidecar file.
        let path = std::env::temp_dir().join(format!(
            "pcap_toolkit_test_index_{}.idx",
            std::process::id()
        ));
        let mut store = IndexStore::disk(&path).unwrap();
        for (ts, off) in [(300u64, 30u64), (100, 10), (200, 20)] {
            store
                .push(PacketIndex {
                    timestamp_ns: ts,
                    byte_offset: off,
                    caplen: 42,
                })
                .unwrap();
        }
        let sorted = store.into_sorted();
        // Clean up before asserting so a failing assertion does not leak the file.
        let _ = std::fs::remove_file(&path);
        let sorted = sorted.unwrap();
        assert_eq!(
            sorted.iter().map(|e| e.timestamp_ns).collect::<Vec<_>>(),
            [100, 200, 300]
        );
    }

    /// The sidecar path is the input path with `.idx` appended to the file name.
    #[test]
    fn test_sidecar_path() {
        assert_eq!(
            sidecar_path(Path::new("/tmp/traffic.pcap")),
            PathBuf::from("/tmp/traffic.pcap.idx"),
        );
    }
}