// mlt_core/frames/v01/property/model.rs

use std::borrow::Cow;

use enum_dispatch::enum_dispatch;

use crate::EncDec;
use crate::analyse::{Analyze, StatType};
use crate::v01::{EncodedStream, FsstStrEncoder, IntEncoder, RawStream, StreamMeta};

/// Owned name string (Stage 4/5).
///
/// A thin newtype over [`String`] used as the `name` field of the wire-ready
/// `Encoded*` column types in this module. The wrapped value is public.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EncodedName(pub String);

13/// Property representation, either raw (borrowed from bytes) or parsed.
14pub type Property<'a> = EncDec<RawProperty<'a>, ParsedProperty<'a>>;
15
/// Coarse category of a property column.
///
/// The variants mirror the groups used by the property enums in this module
/// (`Bool`, the integer widths, the float widths, `Str`, `SharedDict`).
///
/// Being a field-less enum, it derives the same cheap value-type traits as
/// the other plain enums in this file (e.g. [`PresenceStream`]), so it can be
/// copied, compared, and printed in diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PropertyKind {
    /// Boolean column.
    Bool,
    /// Integer column (presumably any of the `i8`..`u64` widths; confirm at use sites).
    Integer,
    /// Floating-point column (presumably `f32` or `f64`; confirm at use sites).
    Float,
    /// String column.
    String,
    /// Shared-dictionary column with string sub-properties.
    SharedDict,
}

24/// Raw scalar column (bool, integer, or float) as read directly from the tile.
25#[derive(Debug, Clone, PartialEq)]
26pub struct RawScalar<'a> {
27    pub name: &'a str,
28    pub presence: RawPresence<'a>,
29    pub data: RawStream<'a>,
30}
31
32/// Wire-ready encoded scalar column (owns its byte buffers).
33#[derive(Debug, Clone, PartialEq)]
34pub struct EncodedScalar {
35    pub name: EncodedName,
36    pub presence: EncodedPresence,
37    pub data: EncodedStream,
38}
39
40/// Raw encoding payload for a string column (plain, dictionary, or FSST variants).
41///
42/// `RawStream` order matches the encoder: see `StringEncoder.encode()`.
43#[derive(Debug, Clone, PartialEq)]
44pub enum RawStringsEncoding<'a> {
45    /// Plain: length stream + data stream
46    Plain(RawPlainData<'a>),
47    /// Dictionary: lengths + offsets + dictionary data
48    Dictionary {
49        plain_data: RawPlainData<'a>,
50        offsets: RawStream<'a>,
51    },
52    /// FSST plain (4 streams): symbol lengths, symbol table, value lengths, compressed corpus. No offsets.
53    FsstPlain(RawFsstData<'a>),
54    /// FSST dictionary (5 streams): symbol lengths, symbol table, value lengths, compressed corpus, offsets.
55    FsstDictionary {
56        fsst_data: RawFsstData<'a>,
57        offsets: RawStream<'a>,
58    },
59}
60
61/// Wire-ready encoded strings encoding (owns byte buffers).
62#[derive(Debug, Clone, PartialEq)]
63pub enum EncodedStringsEncoding {
64    Plain(EncodedPlainData),
65    Dictionary {
66        plain_data: EncodedPlainData,
67        offsets: EncodedStream,
68    },
69    FsstPlain(EncodedFsstData),
70    FsstDictionary {
71        fsst_data: EncodedFsstData,
72        offsets: EncodedStream,
73    },
74}
75
76/// Raw string column as read directly from the tile.
77#[derive(Debug, Clone, PartialEq)]
78pub struct RawStrings<'a> {
79    pub name: &'a str,
80    pub presence: RawPresence<'a>,
81    pub encoding: RawStringsEncoding<'a>,
82}
83
84/// Wire-ready encoded string column (owns its byte buffers).
85#[derive(Debug, Clone, PartialEq)]
86pub struct EncodedStrings {
87    pub name: EncodedName,
88    pub presence: EncodedPresence,
89    pub encoding: EncodedStringsEncoding,
90}
91
92/// Raw encoding payload for a `SharedDict` column.
93///
94/// Unlike [`RawStringsEncoding`], shared dictionaries do NOT have their own offset stream.
95/// Instead, each child column has its own offset stream that references the shared dictionary.
96/// This is why only `Plain` and `FsstPlain` variants exist here.
97#[derive(Debug, Clone, PartialEq)]
98pub enum RawSharedDictEncoding<'a> {
99    /// Plain shared dict (2 streams): lengths + data.
100    Plain(RawPlainData<'a>),
101    /// FSST plain shared dict (4 streams): symbol lengths, symbol table, lengths, corpus.
102    FsstPlain(RawFsstData<'a>),
103}
104
105/// Wire-ready encoded shared dict encoding (owns byte buffers).
106#[derive(Debug, Clone, PartialEq)]
107pub enum EncodedSharedDictEncoding {
108    Plain(EncodedPlainData),
109    FsstPlain(EncodedFsstData),
110}
111
112/// Raw shared-dictionary column as read directly from the tile.
113#[derive(Debug, Clone, PartialEq)]
114pub struct RawSharedDict<'a> {
115    pub name: &'a str,
116    pub encoding: RawSharedDictEncoding<'a>,
117    pub children: Vec<RawSharedDictItem<'a>>,
118}
119
120/// Wire-ready encoded shared-dictionary column (owns its byte buffers).
121#[derive(Debug, Clone, PartialEq)]
122pub struct EncodedSharedDict {
123    pub name: EncodedName,
124    pub encoding: EncodedSharedDictEncoding,
125    pub children: Vec<EncodedSharedDictItem>,
126}
127
128/// Raw property data as read directly from the tile.
129#[derive(Debug, PartialEq, Clone)]
130pub enum RawProperty<'a> {
131    Bool(RawScalar<'a>),
132    I8(RawScalar<'a>),
133    U8(RawScalar<'a>),
134    I32(RawScalar<'a>),
135    U32(RawScalar<'a>),
136    I64(RawScalar<'a>),
137    U64(RawScalar<'a>),
138    F32(RawScalar<'a>),
139    F64(RawScalar<'a>),
140    Str(RawStrings<'a>),
141    SharedDict(RawSharedDict<'a>),
142}
143
144/// Wire-ready encoded property data (owns its byte buffers).
145#[derive(Debug, Clone, PartialEq)]
146pub enum EncodedProperty {
147    Bool(EncodedScalar),
148    I8(EncodedScalar),
149    U8(EncodedScalar),
150    I32(EncodedScalar),
151    U32(EncodedScalar),
152    I64(EncodedScalar),
153    U64(EncodedScalar),
154    F32(EncodedScalar),
155    F64(EncodedScalar),
156    Str(EncodedStrings),
157    SharedDict(EncodedSharedDict),
158}
159
160/// Parsed property values in a typed enum form.
161#[derive(Clone, PartialEq, strum::IntoStaticStr)]
162#[strum(serialize_all = "snake_case")]
163#[enum_dispatch(Analyze)]
164pub enum ParsedProperty<'a> {
165    Bool(ParsedScalar<'a, bool>),
166    I8(ParsedScalar<'a, i8>),
167    U8(ParsedScalar<'a, u8>),
168    I32(ParsedScalar<'a, i32>),
169    U32(ParsedScalar<'a, u32>),
170    I64(ParsedScalar<'a, i64>),
171    U64(ParsedScalar<'a, u64>),
172    F32(ParsedScalar<'a, f32>),
173    F64(ParsedScalar<'a, f64>),
174    Str(ParsedStrings<'a>),
175    SharedDict(ParsedSharedDict<'a>),
176}
177
178/// Staged property column (encode-side, fully owned).
179///
180/// Unlike `ParsedProperty` (decode-side, potentially borrowed), all string names
181/// and corpus data are owned `String`s.  No lifetime parameter needed.
182///
183/// The `Encoded` variant holds wire-ready data after the `Staged*` → `Encoded*`
184/// encoding step has been applied. This allows `StagedLayer01` to hold a mix of
185/// staged and encoded properties before serialization.
186#[derive(Debug, Clone, PartialEq, strum::IntoStaticStr)]
187#[strum(serialize_all = "snake_case")]
188pub enum StagedProperty {
189    Bool(StagedScalar<bool>),
190    I8(StagedScalar<i8>),
191    U8(StagedScalar<u8>),
192    I32(StagedScalar<i32>),
193    U32(StagedScalar<u32>),
194    I64(StagedScalar<i64>),
195    U64(StagedScalar<u64>),
196    F32(StagedScalar<f32>),
197    F64(StagedScalar<f64>),
198    Str(StagedStrings),
199    SharedDict(StagedSharedDict),
200}
201
/// Parsed scalar column: a borrowed name plus one optional value per entry.
///
/// `T` is the element type (the instantiations used by [`ParsedProperty`] are
/// `bool`, the integer widths, `f32` and `f64`).
///
/// `Debug` is not derived here (NOTE(review): presumably implemented by hand
/// elsewhere; confirm before adding a derive).
#[derive(Clone, PartialEq)]
#[cfg_attr(all(not(test), feature = "arbitrary"), derive(arbitrary::Arbitrary))]
pub struct ParsedScalar<'a, T: Copy + PartialEq> {
    /// Column name, borrowed for `'a`.
    pub name: &'a str,
    /// Column values in order; `None` marks a missing (null) value.
    pub values: Vec<Option<T>>,
}

/// A single sub-property within a shared dictionary parsed value.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(all(not(test), feature = "arbitrary"), derive(arbitrary::Arbitrary))]
pub struct ParsedSharedDictItem<'a> {
    /// The suffix name of this sub-property (appended to parent struct name).
    pub suffix: &'a str,
    /// Per-feature `(start, end)` byte offsets into the parsed shared corpus.
    ///
    /// - Non-negative pairs indicate a present string stored as
    ///   `shared_dict.corpus()[start..end]`.
    /// - `(-1, -1)` indicates NULL.
    /// - Equal `start` and `end` indicate an empty string.
    pub ranges: Vec<(i32, i32)>,
}

/// Parsed string values for a single property.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedStrings<'a> {
    /// Property name, borrowed for `'a`.
    pub name: &'a str,
    /// Per-feature cumulative end offsets into `data`.
    ///
    /// Non-negative values indicate a present string and store its exclusive
    /// end offset in `data`.
    ///
    /// Negative values indicate NULL and encode the current offset as `-end-1`,
    /// which is equivalent to `!end` in two's-complement form,
    /// so the next item can still recover its start offset without scanning back
    /// to the previous non-null value. This allows even the first item to be NULL.
    ///
    /// In other words, if `lengths == [5, 5, -6, 8]`, then the strings are:
    /// ```ignore
    /// data[0..5], // 0th string
    /// data[5..5], // 1st string is empty
    /// NULL,       // 2nd string, offset stays 5 because -6 == -5-1
    /// data[5..8], // 3rd string
    /// ```
    pub lengths: Vec<i32>,
    /// Concatenated string bytes that `lengths` indexes into; borrowed or owned.
    pub data: Cow<'a, str>,
}

245/// TODO: `ParsedSharedDict` should be able to have unparsed child items
246pub type SharedDictItem<'a> = EncDec<RawSharedDictItem<'a>, ParsedSharedDictItem<'a>>;
247
248/// Parsed shared dictionary payload shared by one or more child string properties.
249#[derive(Debug, Clone, PartialEq, Eq)]
250pub struct ParsedSharedDict<'a> {
251    pub prefix: &'a str,
252    pub data: Cow<'a, str>,
253    pub items: Vec<ParsedSharedDictItem<'a>>,
254}
255
/// Parsed presence information: an optional vector of per-entry flags.
///
/// The default value wraps `None`.
/// NOTE(review): presumably `None` means no presence stream was attached and
/// `true` marks a present value; confirm against the decoder.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(all(not(test), feature = "arbitrary"), derive(arbitrary::Arbitrary))]
pub struct ParsedPresence(pub Option<Vec<bool>>);

260#[derive(Debug, Clone, Copy, PartialEq, Eq, strum::EnumIter)]
261#[cfg_attr(all(not(test), feature = "arbitrary"), derive(arbitrary::Arbitrary))]
262pub enum PresenceStream {
263    /// Attaches a nullability stream
264    Present,
265    /// If there are nulls, drop them
266    Absent,
267}
268
269/// A single child field within a `SharedDict` raw column
270#[derive(Clone, Debug, PartialEq)]
271pub struct RawSharedDictItem<'a> {
272    pub name: &'a str,
273    pub presence: RawPresence<'a>,
274    pub data: RawStream<'a>,
275}
276
277/// Wire-ready encoded shared dict child column (owns its byte buffers).
278#[derive(Clone, Debug, PartialEq)]
279pub struct EncodedSharedDictItem {
280    pub name: EncodedName,
281    pub presence: EncodedPresence,
282    pub data: EncodedStream,
283}
284
285/// Raw plain data (length stream + data stream) borrowed from input bytes.
286#[derive(Debug, Clone, PartialEq)]
287pub struct RawPlainData<'a> {
288    pub lengths: RawStream<'a>,
289    pub data: RawStream<'a>,
290}
291
292/// Wire-ready encoded plain data (owns its byte buffers).
293#[derive(Debug, Clone, PartialEq)]
294pub struct EncodedPlainData {
295    pub lengths: EncodedStream,
296    pub data: EncodedStream,
297}
298
299/// Raw FSST-compressed data (4 streams) borrowed from input bytes.
300#[derive(Debug, Clone, PartialEq)]
301pub struct RawFsstData<'a> {
302    pub symbol_lengths: RawStream<'a>,
303    pub symbol_table: RawStream<'a>,
304    pub lengths: RawStream<'a>,
305    pub corpus: RawStream<'a>,
306}
307
308/// Wire-ready encoded FSST data (owns its byte buffers).
309#[derive(Debug, Clone, PartialEq)]
310pub struct EncodedFsstData {
311    pub symbol_lengths: EncodedStream,
312    pub symbol_table: EncodedStream,
313    pub lengths: EncodedStream,
314    pub corpus: EncodedStream,
315}
316
317/// Raw presence/nullability stream borrowed from input bytes.
318#[derive(Debug, Clone, PartialEq, Default)]
319pub struct RawPresence<'a>(pub Option<RawStream<'a>>);
320
321/// Wire-ready encoded presence/nullability stream (owns its byte buffers).
322#[derive(Debug, Clone, PartialEq, Default)]
323pub struct EncodedPresence(pub Option<EncodedStream>);
324
325/// Instruction for how to encode a single parsed property when batch-encoding a
326/// `Vec<ParsedProperty>`.
327#[derive(Debug, Clone, PartialEq, Eq)]
328pub enum PropertyEncoder {
329    /// How to encode a scalar property
330    Scalar(ScalarEncoder),
331    /// How to encode a shared dictionary property (multiple string sub-properties)
332    SharedDict(SharedDictEncoder),
333}
334
335/// How to encode properties
336#[derive(Debug, Clone, Copy, PartialEq, Eq)]
337#[cfg_attr(all(not(test), feature = "arbitrary"), derive(arbitrary::Arbitrary))]
338pub struct ScalarEncoder {
339    pub presence: PresenceStream,
340    pub value: ScalarValueEncoder,
341}
342
343/// How to encode scalar property values.
344#[derive(Debug, Clone, Copy, PartialEq, Eq, strum::IntoStaticStr)]
345#[strum(serialize_all = "snake_case")]
346#[cfg_attr(all(not(test), feature = "arbitrary"), derive(arbitrary::Arbitrary))]
347pub enum ScalarValueEncoder {
348    Int(IntEncoder),
349    String(StrEncoder),
350    Float,
351    Bool,
352}
353
354/// Encoder for an individual sub-property within a shared dictionary.
355#[derive(Debug, Clone, PartialEq, Eq)]
356pub struct SharedDictItemEncoder {
357    /// If a stream for optional values should be attached.
358    pub presence: PresenceStream,
359    /// Encoder used for the offset-index stream of this child.
360    pub offsets: IntEncoder,
361}
362
363/// Encoder for a shared dictionary property with multiple string sub-properties.
364#[derive(Debug, Clone, PartialEq, Eq)]
365pub struct SharedDictEncoder {
366    /// Encoder for the shared dictionary strings (plain vs FSST).
367    pub dict_encoder: StrEncoder,
368    /// Encoders for individual sub-properties.
369    pub items: Vec<SharedDictItemEncoder>,
370}
371
372#[derive(Debug, Eq, PartialEq, Clone, Copy)]
373#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
374#[cfg_attr(all(not(test), feature = "arbitrary"), derive(arbitrary::Arbitrary))]
375pub enum StrEncoder {
376    Plain { string_lengths: IntEncoder },
377    Fsst(FsstStrEncoder),
378}
379
// ── Staged* types (encode-side, fully owned) ─────────────────────────────────

/// Owned scalar column prepared for encoding (bool, integer, or float).
///
/// Owned counterpart of [`ParsedScalar`]: the name is a `String` instead of a
/// borrowed `&str`.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
pub struct StagedScalar<T: Copy + PartialEq> {
    /// Owned column name.
    pub name: String,
    /// Column values in order; `None` marks a missing (null) value.
    pub values: Vec<Option<T>>,
}

/// Owned string column prepared for encoding.
///
/// Owned counterpart of [`ParsedStrings`]: `name` and `data` are `String`s.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StagedStrings {
    /// Owned column name.
    pub name: String,
    /// Per-feature cumulative end offsets into `data` (same encoding as [`ParsedStrings::lengths`]).
    pub lengths: Vec<i32>,
    /// Concatenated string bytes that `lengths` indexes into.
    pub data: String,
}

/// A single child within a staged shared-dictionary column.
///
/// Owned counterpart of [`ParsedSharedDictItem`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StagedSharedDictItem {
    /// Owned suffix name of this child.
    pub suffix: String,
    /// Per-feature `(start, end)` byte offsets into the shared corpus.
    pub ranges: Vec<(i32, i32)>,
}

407/// Owned shared-dictionary column prepared for encoding.
408#[derive(Debug, Clone, PartialEq, Eq)]
409pub struct StagedSharedDict {
410    pub prefix: String,
411    pub data: String,
412    pub items: Vec<StagedSharedDictItem>,
413}