open_vector_tile/open/
column_cache.rs

1use crate::{
2    delta_decode_array, delta_encode_array, unweave_and_delta_decode_3d_array,
3    unweave_and_delta_decode_array, weave_and_delta_encode_3d_array, weave_and_delta_encode_array,
4    BBoxQuantization, CustomOrdWrapper, Point, Point3D, VectorPoints, VectorPoints3D,
5};
6use alloc::{collections::BTreeMap, string::String, vec::Vec};
7use core::cell::RefCell;
8use pbf::{ProtoRead, ProtoWrite, Protobuf};
9use s2json::BBOX;
10
11/// Column Types take up 3 bits.
12/// ColumnNames define various common data structures to be stored in a column fashion
13#[derive(Debug, Default, Clone, PartialEq, PartialOrd, Eq, Ord)]
14pub enum OColumnName {
15    /// stores string values
16    #[default]
17    String = 0,
18    /// Note: IDs are stored in unsigned
19    /// Number types are sorted prior to storing
20    Unsigned = 1,
21    /// Number types are sorted prior to storing
22    Signed = 2,
23    /// Floating precision helps ensure only 32 bit cost
24    /// Number types are sorted prior to storing
25    Float = 3,
26    /// worst case, no compression
27    /// Number types are sorted prior to storing
28    Double = 4,
29    /// points is an array of { x: number, y: number }
30    /// points also stores lines.
31    /// if a line is stored, note that it has an acompanying offset and potentially mValues
32    /// Polygons are stored as a collection of lines.
33    /// The points feature type that has more than one will be stored here as well.
34    Points = 5,
35    /// points3D is an array of { x: number, y: number, z: number }
36    /// points3D also stores lines.
37    /// if a line is stored, note that it has an acompanying offset and potentially mValues
38    /// Polygons are stored as a collection of lines.
39    /// The points 3D feature type that has more than one will be stored here as well.
40    Points3D = 6,
41    /// store M-Value, Shape, and Value encodings
42    /// store geometry shapes.
43    /// store geometry indices.
44    Indices = 7,
45    /// Shapes describe how to rebuild objects
46    Shapes = 8,
47    /// BBox - specially compressed to reduce byte cost. each value is only 3 bytes worst case
48    /// BBox3D - specially compressed to reduce byte cost. each value is only 3 bytes worst case.
49    /// The z values are stored as floats and cost 4 bytes.
50    BBox = 9,
51}
52impl From<u8> for OColumnName {
53    fn from(value: u8) -> Self {
54        match value {
55            0 => OColumnName::String,
56            1 => OColumnName::Unsigned,
57            2 => OColumnName::Signed,
58            3 => OColumnName::Float,
59            4 => OColumnName::Double,
60            5 => OColumnName::Points,
61            6 => OColumnName::Points3D,
62            7 => OColumnName::Indices,
63            8 => OColumnName::Shapes,
64            9 => OColumnName::BBox,
65            _ => OColumnName::String,
66        }
67    }
68}
69impl From<OColumnName> for u64 {
70    fn from(col: OColumnName) -> Self {
71        col as u64
72    }
73}
74
75//? READING
76
77/// note: base1 type allows you to decode as needed for each grouping of data.
78/// for instance OColumnString is an array of strings, but you may only need a few strings on use.
79/// Store either data itself or a reference to the position in the protobuf to deserialize
80#[derive(Debug)]
81pub enum ColumnContainer<T> {
82    /// reference to a position in the protobuf
83    Pos(usize),
84    /// data itself
85    Data(T),
86}
87
88/// Column Cache Reader
89/// Stores all data in a column format.
90/// Upon construction, all columns are decoded from the protobuf.
91/// This allows for quick and easy access to data in a column format.
92#[derive(Debug, Default)]
93pub struct ColumnCacheReader {
94    /// strings are stored in a column of strings
95    string: Vec<String>,
96    /// unsigned whole numbers are stored in unsigned
97    unsigned: Vec<u64>,
98    /// negative numbers are stored in signed
99    signed: Vec<i64>,
100    /// non-whole 32-bit numbers are stored in float
101    float: Vec<f32>,
102    /// non-whole numbers greater than 32-bit are stored in double
103    double: Vec<f64>,
104    /// for geometry types each column is individually weaved and delta encoded
105    points: Vec<VectorPoints>,
106    /// for geometry types each column is individually weaved and delta encoded
107    points_3d: Vec<VectorPoints3D>,
108    /// store M-Value indices>, geometry indices>, and geometry shapes
109    indices: Vec<Vec<u32>>,
110    /// shapes and possibly value indices are stored in a number[] to be decoded by readShape
111    shapes: Vec<Vec<usize>>,
112    /// Stores both BBox and BBox3D in a single column
113    bbox: Vec<BBOX>,
114}
115impl ColumnCacheReader {
116    /// create an instance
117    pub fn new() -> Self {
118        ColumnCacheReader { ..Default::default() }
119    }
120
121    /// get a string
122    pub fn get_string(&mut self, index: usize) -> String {
123        self.string[index].clone()
124    }
125
126    /// get an unsigned integer
127    pub fn get_unsigned(&self, index: usize) -> u64 {
128        self.unsigned[index]
129    }
130
131    /// get a signed integer
132    pub fn get_signed(&self, index: usize) -> i64 {
133        self.signed[index]
134    }
135
136    /// get a float
137    pub fn get_float(&self, index: usize) -> f32 {
138        self.float[index]
139    }
140
141    /// get a double
142    pub fn get_double(&self, index: usize) -> f64 {
143        self.double[index]
144    }
145
146    /// get a vector of points used by all geometry types
147    pub fn get_points(&mut self, index: usize) -> VectorPoints {
148        self.points[index].clone()
149    }
150
151    /// get a vector of 3D points used by all geometry types
152    pub fn get_points_3d(&mut self, index: usize) -> VectorPoints3D {
153        self.points_3d[index].clone()
154    }
155
156    /// get a vector of indices used by all geometry types
157    pub fn get_indices(&mut self, index: usize) -> Vec<u32> {
158        self.indices[index].clone()
159    }
160
161    /// get a vector of encoded data that helps decode shapes
162    pub fn get_shapes(&mut self, index: usize) -> Vec<usize> {
163        self.shapes[index].clone()
164    }
165
166    /// get a BBox
167    pub fn get_bbox(&mut self, index: usize) -> BBOX {
168        self.bbox[index]
169    }
170}
171impl ProtoRead for ColumnCacheReader {
172    fn read(&mut self, tag: u64, pb: &mut Protobuf) {
173        match tag {
174            0 => self.string.push(pb.read_string()),
175            1 => self.unsigned.push(pb.read_varint::<u64>()),
176            2 => self.signed.push(pb.read_s_varint::<i64>()),
177            3 => self.float.push(pb.read_varint::<f32>()),
178            4 => self.double.push(pb.read_varint::<f64>()),
179            5 => self.points.push(unweave_and_delta_decode_array(&pb.read_packed::<u64>())),
180            6 => self.points_3d.push(unweave_and_delta_decode_3d_array(&pb.read_packed::<u64>())),
181            7 => self.indices.push(delta_decode_array(&pb.read_packed::<u32>())),
182            8 => self.shapes.push(pb.read_packed::<usize>()),
183            9 => self.bbox.push(BBOX::dequantize(&pb.read_packed::<u8>()[..])),
184            #[tarpaulin::skip]
185            _ => panic!("Unknown column type"),
186        }
187    }
188}
189
190//? WRITING
191
192/// Numbers track their own index for sorting purposes
193#[derive(Debug, Default, Clone, PartialEq, PartialOrd, Eq, Ord)]
194pub struct OColumnBaseChunk {
195    /// The index in the column. Will be updated during the writing phase when converted
196    /// from a map to an array
197    pub index: usize,
198    /// track how many times this chunk is reused
199    pub count: usize,
200}
201/// A value is a collection of lookup devices. A number is decoded by the appropriate function,
202/// but the object is a reference to one of the number columns.
203/// Number types are eventually sorted, so we track the column and index with the data.
204#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
205pub enum ColumnValue {
206    /// raw number index pointing to a location in the cache column
207    Number(usize),
208    /// a reference to a column
209    Column(RefCell<OColumnBaseChunk>),
210}
211impl From<usize> for ColumnValue {
212    fn from(index: usize) -> Self {
213        ColumnValue::Number(index)
214    }
215}
216impl From<RefCell<OColumnBaseChunk>> for ColumnValue {
217    fn from(chunk: RefCell<OColumnBaseChunk>) -> Self {
218        ColumnValue::Column(chunk)
219    }
220}
221/// A building block for all column types.
222pub type OColumnBaseWrite<K> = BTreeMap<K, RefCell<OColumnBaseChunk>>;
223
224/// A building block for all number column types.
225pub type OColumnBaseFloatWrite<K> = BTreeMap<CustomOrdWrapper<K>, RefCell<OColumnBaseChunk>>;
226
227/// The cache where all data is stored in a column format.
228/// Each column type has its own array of data.
229/// Number types maintain their own index for sorting purposes.
230#[derive(Debug, Default)]
231pub struct ColumnCacheWriter {
232    /// strings are grouped by their bytes.
233    string: OColumnBaseWrite<String>,
234    /// Unsigned integers are sorted prior to storing
235    unsigned: OColumnBaseWrite<u64>,
236    /// Signed integers are sorted prior to storing
237    signed: OColumnBaseWrite<i64>,
238    /// 32-bit partial values are sorted prior to storing
239    float: OColumnBaseFloatWrite<f32>,
240    /// 64-bit partial values are sorted prior to storing
241    double: OColumnBaseFloatWrite<f64>,
242    /// for geometry types each column is individually weaved and delta encoded
243    points: OColumnBaseWrite<Vec<Point>>,
244    /// for geometry types each column is individually weaved and delta encoded
245    points_3d: OColumnBaseWrite<Vec<Point3D>>,
246    /// Indices track geometry indices, geometry shapes, or other indexing data
247    indices: OColumnBaseWrite<Vec<u32>>,
248    /// Contains number arrays of how to rebuild objects
249    shapes: OColumnBaseWrite<Vec<ColumnValue>>,
250    /// Features should be sorted by id prior to building a column
251    bbox: OColumnBaseWrite<BBOX>,
252}
253impl ColumnCacheWriter {
254    /// add string to cache
255    pub fn add_string(&mut self, value: String) -> usize {
256        add(&mut self.string, value)
257    }
258
259    /// add u64 to cache
260    pub fn add_u64(&mut self, value: u64) -> RefCell<OColumnBaseChunk> {
261        add_number(&mut self.unsigned, value)
262    }
263
264    /// add i64 to cache
265    pub fn add_i64(&mut self, value: i64) -> RefCell<OColumnBaseChunk> {
266        add_number(&mut self.signed, value)
267    }
268
269    /// add f32 to cache
270    pub fn add_f32(&mut self, value: f32) -> RefCell<OColumnBaseChunk> {
271        add_number(&mut self.float, CustomOrdWrapper(value))
272    }
273
274    /// add f64 to cache
275    pub fn add_f64(&mut self, value: f64) -> RefCell<OColumnBaseChunk> {
276        add_number(&mut self.double, CustomOrdWrapper(value))
277    }
278
279    /// add points to cache
280    pub fn add_points(&mut self, value: Vec<Point>) -> usize {
281        add(&mut self.points, value)
282    }
283
284    /// add points_3d to cache
285    pub fn add_points_3d(&mut self, value: Vec<Point3D>) -> usize {
286        add(&mut self.points_3d, value)
287    }
288
289    /// add indices to cache
290    pub fn add_indices(&mut self, value: Vec<u32>) -> usize {
291        add(&mut self.indices, value)
292    }
293
294    /// add shapes to cache
295    pub fn add_shapes(&mut self, value: Vec<ColumnValue>) -> usize {
296        add(&mut self.shapes, value)
297    }
298
299    /// add bbox to cache
300    pub fn add_bbox(&mut self, value: BBOX) -> usize {
301        add(&mut self.bbox, value)
302    }
303}
304impl ProtoWrite for ColumnCacheWriter {
305    fn write(&self, pbf: &mut Protobuf) {
306        // setup
307        let mut strings: Vec<(&String, &RefCell<OColumnBaseChunk>)> = self.string.iter().collect();
308        let mut unsigned: Vec<(&u64, &RefCell<OColumnBaseChunk>)> = self.unsigned.iter().collect();
309        let mut signed: Vec<(&i64, &RefCell<OColumnBaseChunk>)> = self.signed.iter().collect();
310        let mut float: Vec<(&CustomOrdWrapper<f32>, &RefCell<OColumnBaseChunk>)> =
311            self.float.iter().collect();
312        let mut double: Vec<(&CustomOrdWrapper<f64>, &RefCell<OColumnBaseChunk>)> =
313            self.double.iter().collect();
314        let mut points: Vec<(&Vec<Point>, &RefCell<OColumnBaseChunk>)> =
315            self.points.iter().collect();
316        let mut points_3d: Vec<(&Vec<Point3D>, &RefCell<OColumnBaseChunk>)> =
317            self.points_3d.iter().collect();
318        let mut indices: Vec<(&Vec<u32>, &RefCell<OColumnBaseChunk>)> =
319            self.indices.iter().collect();
320        let mut shapes: Vec<(&Vec<ColumnValue>, &RefCell<OColumnBaseChunk>)> =
321            self.shapes.iter().collect();
322        let mut bbox: Vec<(&BBOX, &RefCell<OColumnBaseChunk>)> = self.bbox.iter().collect();
323
324        // sort them
325        // TODO: bring this back
326        // sort_column(&mut unsigned);
327        // sort_column(&mut signed);
328        // sort_column(&mut float);
329        // sort_column(&mut double);
330        strings.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
331        unsigned.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
332        signed.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
333        float.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
334        double.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
335        points.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
336        points_3d.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
337        indices.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
338        shapes.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
339        bbox.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
340
341        // store
342        // strings
343        for string in strings {
344            pbf.write_string_field(OColumnName::String.into(), string.0);
345        }
346        // u64
347        for u in unsigned {
348            pbf.write_varint_field(OColumnName::Unsigned.into(), *u.0);
349        }
350        // i64
351        for s in signed {
352            pbf.write_s_varint_field(OColumnName::Signed.into(), *s.0);
353        }
354        // f32
355        for f in float {
356            pbf.write_varint_field(OColumnName::Float.into(), f.0 .0);
357        }
358        // f64
359        for d in double {
360            pbf.write_varint_field(OColumnName::Double.into(), d.0 .0);
361        }
362        // points
363        for p in points {
364            pbf.write_packed_varint(OColumnName::Points.into(), &weave_and_delta_encode_array(p.0));
365        }
366        // points 3D
367        for p_3d in points_3d {
368            pbf.write_packed_varint(
369                OColumnName::Points3D.into(),
370                &weave_and_delta_encode_3d_array(p_3d.0),
371            );
372        }
373        // indices
374        for i in indices {
375            pbf.write_packed_varint(OColumnName::Indices.into(), &delta_encode_array(i.0));
376        }
377        // shapes
378        for s in shapes {
379            let packed: Vec<usize> =
380                s.0.iter()
381                    .map(|v| match v {
382                        ColumnValue::Number(n) => *n,
383                        ColumnValue::Column(c) => c.borrow().index,
384                    })
385                    .collect();
386            pbf.write_packed_varint(OColumnName::Shapes.into(), &packed);
387        }
388        // bbox
389        for bbox in bbox {
390            pbf.write_packed_varint(OColumnName::BBox.into(), &bbox.0.quantize());
391        }
392    }
393}
394
395/// Add value to column and return index
396pub fn add<T>(col: &mut OColumnBaseWrite<T>, value: T) -> usize
397where
398    T: Ord,
399{
400    if let Some(col) = col.get_mut(&value) {
401        let mut chunk = col.borrow_mut();
402        chunk.count += 1;
403        chunk.index
404    } else {
405        let index = col.len();
406        col.insert(value, RefCell::new(OColumnBaseChunk { index, count: 1 }));
407        index
408    }
409}
410
411/// Add a **number** value to column and return index
412pub fn add_number<T>(col: &mut OColumnBaseWrite<T>, value: T) -> RefCell<OColumnBaseChunk>
413where
414    T: Ord,
415{
416    if let Some(chunk) = col.get_mut(&value) {
417        {
418            let mut chunk_mut = chunk.borrow_mut();
419            chunk_mut.count += 1;
420        }
421        chunk.clone()
422    } else {
423        let index = col.len();
424        let new_chunk = RefCell::new(OColumnBaseChunk { index, count: 1 });
425        col.insert(value, new_chunk.clone());
426        new_chunk
427    }
428}
429
430// /// Sort number types and value types by index then update the index of each row for better
431// /// compression down the line.
432// pub fn sort_column<T: CustomOrd + core::fmt::Debug>(
433//     input: &mut [(&T, &RefCell<OColumnBaseChunk>)],
434// ) {
435//     // first sort
436//     input.sort_by(|a, b| {
437//         // First sort by count in descending order
438//         match b.1.borrow().count.cmp(&a.1.borrow().count) {
439//             Ordering::Equal => a.0.custom_cmp(b.0), // Then sort by data if counts are equal
440//             other => other,
441//         }
442//     });
443//     // than update indexes
444//     input
445//         .iter_mut()
446//         .enumerate()
447//         .for_each(|(i, v)| v.1.borrow_mut().index = i);
448// }