Skip to main content

keyhog_scanner/engine/
segment_attribution.rs

1//! Attribute coalesced scanner matches back to logical input segments.
2//!
3//! This mirrors the general `match.map_offsets_to_segments` primitive in
4//! Vyre while keeping Keyhog publishable until the matching primitive crate
5//! version used by this repository is available on crates.io.
6
7use thiserror::Error;
8
9/// A logical input range inside one coalesced scanner buffer.
10#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
11pub struct Segment {
12    /// Caller-defined stable identifier for the logical input.
13    pub id: u32,
14    /// Inclusive global byte offset where the segment starts.
15    pub start: u32,
16    /// Segment length in bytes.
17    pub len: u32,
18}
19
20impl Segment {
21    /// Create a segment descriptor.
22    #[must_use]
23    pub const fn new(id: u32, start: u32, len: u32) -> Self {
24        Self { id, start, len }
25    }
26
27    fn checked_end(self, segment_index: usize) -> Result<u32, SegmentAttributionError> {
28        self.start
29            .checked_add(self.len)
30            .ok_or(SegmentAttributionError::SegmentEndOverflow {
31                segment_index,
32                start: self.start,
33                len: self.len,
34            })
35    }
36}
37
38/// A scanner match using global byte offsets in the coalesced buffer.
39#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
40pub struct GlobalMatch {
41    /// Pattern identifier emitted by the scanner.
42    pub pattern_id: u32,
43    /// Inclusive global match start.
44    pub start: u32,
45    /// Exclusive global match end.
46    pub end: u32,
47}
48
49impl GlobalMatch {
50    /// Create a global scanner match.
51    #[must_use]
52    pub const fn new(pattern_id: u32, start: u32, end: u32) -> Self {
53        Self {
54            pattern_id,
55            start,
56            end,
57        }
58    }
59}
60
61/// A match rewritten into segment-local byte offsets.
62#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
63pub struct AttributedMatch {
64    /// Identifier copied from the containing segment.
65    pub segment_id: u32,
66    /// Identifier copied from the original match.
67    pub pattern_id: u32,
68    /// Inclusive byte offset inside the containing segment.
69    pub local_start: u32,
70    /// Exclusive byte offset inside the containing segment.
71    pub local_end: u32,
72}
73
74impl AttributedMatch {
75    /// Create a segment-local match.
76    #[must_use]
77    pub const fn new(segment_id: u32, pattern_id: u32, local_start: u32, local_end: u32) -> Self {
78        Self {
79            segment_id,
80            pattern_id,
81            local_start,
82            local_end,
83        }
84    }
85}
86
87/// Validation error returned while attributing matches to segments.
88#[derive(Debug, Clone, Copy, PartialEq, Eq, Error)]
89pub enum SegmentAttributionError {
90    /// A segment's `start + len` exceeded `u32::MAX`.
91    #[error("segment {segment_index} overflows: start {start} + len {len}")]
92    SegmentEndOverflow {
93        /// Index of the overflowing segment.
94        segment_index: usize,
95        /// Segment start offset.
96        start: u32,
97        /// Segment length in bytes.
98        len: u32,
99    },
100    /// Segments must be sorted by start offset.
101    #[error("segment {segment_index} starts at {start} before previous start {previous_start}")]
102    SegmentsNotSorted {
103        /// Index of the segment that breaks ordering.
104        segment_index: usize,
105        /// Previous segment start offset.
106        previous_start: u32,
107        /// Current segment start offset.
108        start: u32,
109    },
110    /// Segment byte ranges may touch but must not overlap.
111    #[error(
112        "segment {segment_index} starts at {start} before previous segment {previous_index} ends at {previous_end}"
113    )]
114    SegmentsOverlap {
115        /// Index of the previous segment.
116        previous_index: usize,
117        /// Index of the overlapping segment.
118        segment_index: usize,
119        /// Exclusive end offset of the previous segment.
120        previous_end: u32,
121        /// Start offset of the current segment.
122        start: u32,
123    },
124    /// Match ranges must be non-empty half-open byte ranges.
125    #[error("match {match_index} has invalid range [{start}, {end})")]
126    InvalidMatchRange {
127        /// Index of the invalid match.
128        match_index: usize,
129        /// Inclusive match start offset.
130        start: u32,
131        /// Exclusive match end offset.
132        end: u32,
133    },
134}
135
136#[derive(Debug, Clone, Copy)]
137struct NormalizedSegment {
138    id: u32,
139    start: u32,
140    end: u32,
141}
142
143/// Map global byte-range matches back to their containing segments.
144///
145/// Matches that land in gaps or cross a segment boundary are omitted. Output
146/// order follows the caller-provided match order.
147pub fn map_offsets_to_segments(
148    segments: &[Segment],
149    matches: &[GlobalMatch],
150) -> Result<Vec<AttributedMatch>, SegmentAttributionError> {
151    let normalized = validate_segments(segments)?;
152    let mut ordered_match_indices: Vec<usize> = (0..matches.len()).collect();
153    ordered_match_indices.sort_by_key(|&index| {
154        let item = matches[index];
155        (item.start, item.end, item.pattern_id, index)
156    });
157
158    let mut segment_cursor = 0usize;
159    let mut mapped_by_input_order = vec![None; matches.len()];
160
161    for match_index in ordered_match_indices {
162        let item = matches[match_index];
163        if item.end <= item.start {
164            return Err(SegmentAttributionError::InvalidMatchRange {
165                match_index,
166                start: item.start,
167                end: item.end,
168            });
169        }
170
171        while segment_cursor < normalized.len() && normalized[segment_cursor].end <= item.start {
172            segment_cursor += 1;
173        }
174
175        let Some(segment) = normalized.get(segment_cursor).copied() else {
176            continue;
177        };
178
179        if segment.start <= item.start && item.end <= segment.end {
180            mapped_by_input_order[match_index] = Some(AttributedMatch::new(
181                segment.id,
182                item.pattern_id,
183                item.start - segment.start,
184                item.end - segment.start,
185            ));
186        }
187    }
188
189    Ok(mapped_by_input_order.into_iter().flatten().collect())
190}
191
192fn validate_segments(
193    segments: &[Segment],
194) -> Result<Vec<NormalizedSegment>, SegmentAttributionError> {
195    let mut normalized = Vec::with_capacity(segments.len());
196    let mut previous: Option<(usize, u32, u32)> = None;
197
198    for (segment_index, segment) in segments.iter().copied().enumerate() {
199        let end = segment.checked_end(segment_index)?;
200
201        if let Some((previous_index, previous_start, previous_end)) = previous {
202            if segment.start < previous_start {
203                return Err(SegmentAttributionError::SegmentsNotSorted {
204                    segment_index,
205                    previous_start,
206                    start: segment.start,
207                });
208            }
209            if segment.start < previous_end {
210                return Err(SegmentAttributionError::SegmentsOverlap {
211                    previous_index,
212                    segment_index,
213                    previous_end,
214                    start: segment.start,
215                });
216            }
217        }
218
219        normalized.push(NormalizedSegment {
220            id: segment.id,
221            start: segment.start,
222            end,
223        });
224        previous = Some((segment_index, segment.start, end));
225    }
226
227    Ok(normalized)
228}