Skip to main content

mlt_core/frames/v01/id/
optimizer.rs

1use crate::MltError;
2use crate::v01::{
3    DataProfile, EncodedId, IdEncoder, IdValues, IdWidth, IntEncoder, LogicalEncoder,
4    PhysicalEncoder,
5};
6
7/// A pre-computed set of [`IntEncoder`] candidates derived from a representative
8/// sample of tiles.
9///
10/// Building a profile once from sample tiles avoids re-running
11/// [`DataProfile::prune_candidates`] on every subsequent tile; the profile's
12/// candidate list is used directly in the competition step instead.
13///
14/// [`IdWidth`] is not stored because it is always re-derived from the actual
15/// data of the tile being encoded.
16///
17/// Profiles from multiple samples are combined with [`IdProfile::merge`], which
18/// takes the union of both candidate sets.
19#[derive(Debug, Clone, PartialEq)]
20pub struct IdProfile {
21    /// Encoder candidates to use during competition.
22    ///
23    /// An empty list causes the caller to fall back to automatic optimization.
24    candidates: Vec<IntEncoder>,
25}
26
27impl IdProfile {
28    #[doc(hidden)]
29    #[must_use]
30    pub fn new(candidates: Vec<IntEncoder>) -> Self {
31        Self { candidates }
32    }
33
34    /// Build a profile from a sample of decoded IDs.
35    #[must_use]
36    pub fn from_sample(decoded: &IdValues) -> Self {
37        let ids = &decoded.0;
38        let Ok((_, _, id_width)) = single_pass_statistics(ids) else {
39            return Self {
40                candidates: vec![IntEncoder::varint()],
41            };
42        };
43        Self {
44            candidates: pruned_candidates(ids, id_width),
45        }
46    }
47
48    /// Merge two profiles by taking the union of their candidate sets.
49    ///
50    /// Encoders already present in `self` are not duplicated.
51    #[must_use]
52    pub fn merge(mut self, other: &Self) -> Self {
53        for &enc in &other.candidates {
54            if !self.candidates.contains(&enc) {
55                self.candidates.push(enc);
56            }
57        }
58        self
59    }
60}
61
62/// Analyze `decoded` and return a near-optimal [`IdEncoder`].
63///
64/// Fast paths (short sequences, sequential, constant) are checked first.
65/// Otherwise, the full pruning + competition pipeline runs.
66fn optimize(decoded: &IdValues) -> IdEncoder {
67    let ids = &decoded.0;
68    let (is_sequential, is_constant, id_width) = match single_pass_statistics(ids) {
69        Ok(stats) => stats,
70        Err(default_enc) => return default_enc,
71    };
72
73    if ids.len() <= 2 {
74        return IdEncoder::new(LogicalEncoder::None, id_width);
75    }
76
77    if is_sequential && ids.len() > 4 {
78        return IdEncoder::new(LogicalEncoder::DeltaRle, id_width);
79    }
80
81    if is_constant {
82        return IdEncoder::new(LogicalEncoder::Rle, id_width);
83    }
84
85    let candidates = pruned_candidates(ids, id_width);
86    let logical = compete_with(ids, id_width, &candidates);
87    IdEncoder::new(logical, id_width)
88}
89
90/// Apply a profile to `decoded`, re-deriving [`IdWidth`] from the tile's data.
91///
92/// The same fast paths as [`optimize`] are applied first. For the general case,
93/// competition is run over the profile's pre-computed candidate list rather
94/// than re-running the full pruning analysis.
95fn apply_profile(decoded: &IdValues, profile: &IdProfile) -> IdEncoder {
96    let ids = &decoded.0;
97    let (is_sequential, is_constant, id_width) = match single_pass_statistics(ids) {
98        Ok(stats) => stats,
99        Err(default_enc) => return default_enc,
100    };
101
102    if ids.len() <= 2 {
103        return IdEncoder::new(LogicalEncoder::None, id_width);
104    }
105
106    if is_sequential && ids.len() > 4 {
107        return IdEncoder::new(LogicalEncoder::DeltaRle, id_width);
108    }
109
110    if is_constant {
111        return IdEncoder::new(LogicalEncoder::Rle, id_width);
112    }
113
114    let logical = compete_with(ids, id_width, &profile.candidates);
115    IdEncoder::new(logical, id_width)
116}
117
118/// Collect `is_sequential`, `is_constant`, and [`IdWidth`] in a single pass.
119///
120/// Returns `Err(default_encoder)` for the empty or all-null case so callers
121/// can return early.
122fn single_pass_statistics(ids: &[Option<u64>]) -> Result<(bool, bool, IdWidth), IdEncoder> {
123    let mut has_nulls = false;
124    let mut is_sequential = true;
125    let mut is_constant = true;
126
127    let mut ids_iter = ids.iter();
128    let first_non_null = loop {
129        match ids_iter.next() {
130            Some(Some(id)) => break *id,
131            Some(None) => has_nulls = true,
132            None => return Err(IdEncoder::new(LogicalEncoder::None, IdWidth::Id32)),
133        }
134    };
135
136    let mut max_value = first_non_null;
137    let mut prev_non_null = first_non_null;
138
139    for &id in ids_iter {
140        match id {
141            None => has_nulls = true,
142            Some(v) => {
143                max_value = max_value.max(v);
144                if v != prev_non_null.wrapping_add(1) {
145                    is_sequential = false;
146                }
147                if v != first_non_null {
148                    is_constant = false;
149                }
150                prev_non_null = v;
151            }
152        }
153    }
154
155    Ok((
156        is_sequential,
157        is_constant,
158        deduce_width(has_nulls, max_value),
159    ))
160}
161
162/// Determine the narrowest correct [`IdWidth`] for the given data.
163#[inline]
164fn deduce_width(has_nulls: bool, max_value: u64) -> IdWidth {
165    let fits_u32 = u32::try_from(max_value).is_ok();
166    match (has_nulls, fits_u32) {
167        (false, true) => IdWidth::Id32,
168        (true, true) => IdWidth::OptId32,
169        (false, false) => IdWidth::Id64,
170        (true, false) => IdWidth::OptId64,
171    }
172}
173
174/// Run [`DataProfile::prune_candidates`] and filter the result to VarInt-only.
175///
176/// This is the analysis half of automatic optimization; the competition half
177/// is [`compete_with`]. Splitting them lets [`IdProfile`] cache the result.
178fn pruned_candidates(ids: &[Option<u64>], id_width: IdWidth) -> Vec<IntEncoder> {
179    match id_width {
180        IdWidth::Id32 | IdWidth::OptId32 => {
181            #[expect(
182                clippy::cast_possible_truncation,
183                reason = "width was deduced as ≤ u32::MAX so truncation is safe"
184            )]
185            let vals: Vec<u32> = ids.iter().flatten().map(|&v| v as u32).collect();
186            filter_varint(&DataProfile::prune_candidates::<i32>(&vals))
187        }
188        IdWidth::Id64 | IdWidth::OptId64 => {
189            let vals: Vec<u64> = ids.iter().flatten().copied().collect();
190            filter_varint(&DataProfile::prune_candidates::<i64>(&vals))
191        }
192    }
193}
194
195/// Run trial encodings over `candidates` and return the [`LogicalEncoder`] that
196/// produces the smallest output for `ids`.
197fn compete_with(
198    ids: &[Option<u64>],
199    id_width: IdWidth,
200    candidates: &[IntEncoder],
201) -> LogicalEncoder {
202    let candidates = if candidates.is_empty() {
203        &[IntEncoder::varint()][..]
204    } else {
205        candidates
206    };
207
208    match id_width {
209        IdWidth::Id32 | IdWidth::OptId32 => {
210            #[expect(
211                clippy::cast_possible_truncation,
212                reason = "width was deduced as ≤ u32::MAX so truncation is safe"
213            )]
214            let vals: Vec<u32> = ids.iter().flatten().map(|&v| v as u32).collect();
215            DataProfile::compete_u32(candidates, &vals).logical
216        }
217        IdWidth::Id64 | IdWidth::OptId64 => {
218            let vals: Vec<u64> = ids.iter().flatten().copied().collect();
219            DataProfile::compete_u64(candidates, &vals).logical
220        }
221    }
222}
223
224/// Retain only candidates whose physical encoder is [`PhysicalEncoder::VarInt`],
225/// falling back to a single plain `VarInt` if the result would be empty.
226fn filter_varint(candidates: &[IntEncoder]) -> Vec<IntEncoder> {
227    let filtered: Vec<IntEncoder> = candidates
228        .iter()
229        .copied()
230        .filter(|enc| enc.physical == PhysicalEncoder::VarInt)
231        .collect();
232    if filtered.is_empty() {
233        vec![IntEncoder::varint()]
234    } else {
235        filtered
236    }
237}
238
239impl IdValues {
240    /// Encode this ID column using the given encoder, consuming `self`.
241    /// Returns `None` if the ID list is empty.
242    pub fn encode(self, encoder: IdEncoder) -> Result<Option<EncodedId>, MltError> {
243        if self.0.is_empty() {
244            Ok(None)
245        } else {
246            Ok(Some(EncodedId::encode(&self, encoder)?))
247        }
248    }
249
250    /// Encode using the profile to select the best encoder.
251    pub fn encode_with_profile(
252        &self,
253        profile: &IdProfile,
254    ) -> Result<(Option<EncodedId>, Option<IdEncoder>), MltError> {
255        if self.0.is_empty() {
256            return Ok((None, None));
257        }
258        let enc = apply_profile(self, profile);
259        let encoded = EncodedId::encode(self, enc)?;
260        Ok((Some(encoded), Some(enc)))
261    }
262
263    /// Automatically select the best encoder and encode, consuming `self`.
264    pub fn encode_auto(self) -> Result<(Option<EncodedId>, Option<IdEncoder>), MltError> {
265        if self.0.is_empty() {
266            return Ok((None, None));
267        }
268        let enc = optimize(&self);
269        let encoded = EncodedId::encode(&self, enc)?;
270        Ok((Some(encoded), Some(enc)))
271    }
272}