open_vector_tile/open/
column_cache.rs

1use crate::{
2    BBoxQuantization, CustomOrdWrapper, Point, Point3D, VectorPoints, VectorPoints3D,
3    delta_decode_array, delta_encode_array, unweave_and_delta_decode_3d_array,
4    unweave_and_delta_decode_array, weave_and_delta_encode_3d_array, weave_and_delta_encode_array,
5};
6use alloc::{collections::BTreeMap, string::String, vec::Vec};
7use core::cell::RefCell;
8use pbf::{ProtoRead, ProtoWrite, Protobuf};
9use s2json::BBOX;
10
11/// Column Types take up 3 bits.
12/// ColumnNames define various common data structures to be stored in a column fashion
13#[derive(Debug, Default, Clone, PartialEq, PartialOrd, Eq, Ord)]
14pub enum OColumnName {
15    /// stores string values
16    #[default]
17    String = 1,
18    /// Note: IDs are stored in unsigned
19    /// Number types are sorted prior to storing
20    Unsigned = 2,
21    /// Number types are sorted prior to storing
22    Signed = 3,
23    /// Floating precision helps ensure only 32 bit cost
24    /// Number types are sorted prior to storing
25    Float = 4,
26    /// worst case, no compression
27    /// Number types are sorted prior to storing
28    Double = 5,
29    /// points is an array of { x: number, y: number }
30    /// points also stores lines.
31    /// if a line is stored, note that it has an acompanying offset and potentially mValues
32    /// Polygons are stored as a collection of lines.
33    /// The points feature type that has more than one will be stored here as well.
34    Points = 6,
35    /// points3D is an array of { x: number, y: number, z: number }
36    /// points3D also stores lines.
37    /// if a line is stored, note that it has an acompanying offset and potentially mValues
38    /// Polygons are stored as a collection of lines.
39    /// The points 3D feature type that has more than one will be stored here as well.
40    Points3D = 7,
41    /// store M-Value, Shape, and Value encodings
42    /// store geometry shapes.
43    /// store geometry indices.
44    Indices = 8,
45    /// Shapes describe how to rebuild objects
46    Shapes = 9,
47    /// BBox - specially compressed to reduce byte cost. each value is only 3 bytes worst case
48    /// BBox3D - specially compressed to reduce byte cost. each value is only 3 bytes worst case.
49    /// The z values are stored as floats and cost 4 bytes.
50    BBox = 10,
51}
52impl From<u64> for OColumnName {
53    fn from(value: u64) -> Self {
54        match value {
55            1 => OColumnName::String,
56            2 => OColumnName::Unsigned,
57            3 => OColumnName::Signed,
58            4 => OColumnName::Float,
59            5 => OColumnName::Double,
60            6 => OColumnName::Points,
61            7 => OColumnName::Points3D,
62            8 => OColumnName::Indices,
63            9 => OColumnName::Shapes,
64            10 => OColumnName::BBox,
65            _ => OColumnName::String,
66        }
67    }
68}
69impl From<OColumnName> for u64 {
70    fn from(col: OColumnName) -> Self {
71        col as u64
72    }
73}
74
75//? READING
76
77/// note: base1 type allows you to decode as needed for each grouping of data.
78/// for instance OColumnString is an array of strings, but you may only need a few strings on use.
79/// Store either data itself or a reference to the position in the protobuf to deserialize
80#[derive(Debug)]
81pub enum ColumnContainer<T> {
82    /// reference to a position in the protobuf
83    Pos(usize),
84    /// data itself
85    Data(T),
86}
87
88/// Column Cache Reader
89/// Stores all data in a column format.
90/// Upon construction, all columns are decoded from the protobuf.
91/// This allows for quick and easy access to data in a column format.
92#[derive(Debug, Default)]
93pub struct ColumnCacheReader {
94    /// strings are stored in a column of strings
95    string: Vec<String>,
96    /// unsigned whole numbers are stored in unsigned
97    unsigned: Vec<u64>,
98    /// negative numbers are stored in signed
99    signed: Vec<i64>,
100    /// non-whole 32-bit numbers are stored in float
101    float: Vec<f32>,
102    /// non-whole numbers greater than 32-bit are stored in double
103    double: Vec<f64>,
104    /// for geometry types each column is individually weaved and delta encoded
105    points: Vec<VectorPoints>,
106    /// for geometry types each column is individually weaved and delta encoded
107    points_3d: Vec<VectorPoints3D>,
108    /// store M-Value indices>, geometry indices>, and geometry shapes
109    indices: Vec<Vec<u32>>,
110    /// shapes and possibly value indices are stored in a number[] to be decoded by readShape
111    shapes: Vec<Vec<usize>>,
112    /// Stores both BBox and BBox3D in a single column
113    bbox: Vec<BBOX>,
114}
115impl ColumnCacheReader {
116    /// create an instance
117    pub fn new() -> Self {
118        ColumnCacheReader { ..Default::default() }
119    }
120
121    /// get a string
122    pub fn get_string(&mut self, index: usize) -> String {
123        self.string[index].clone()
124    }
125
126    /// get an unsigned integer
127    pub fn get_unsigned(&self, index: usize) -> u64 {
128        self.unsigned[index]
129    }
130
131    /// get a signed integer
132    pub fn get_signed(&self, index: usize) -> i64 {
133        self.signed[index]
134    }
135
136    /// get a float
137    pub fn get_float(&self, index: usize) -> f32 {
138        self.float[index]
139    }
140
141    /// get a double
142    pub fn get_double(&self, index: usize) -> f64 {
143        self.double[index]
144    }
145
146    /// get a vector of points used by all geometry types
147    pub fn get_points(&mut self, index: usize) -> VectorPoints {
148        self.points[index].clone()
149    }
150
151    /// get a vector of 3D points used by all geometry types
152    pub fn get_points_3d(&mut self, index: usize) -> VectorPoints3D {
153        self.points_3d[index].clone()
154    }
155
156    /// get a vector of indices used by all geometry types
157    pub fn get_indices(&mut self, index: usize) -> Vec<u32> {
158        self.indices[index].clone()
159    }
160
161    /// get a vector of encoded data that helps decode shapes
162    pub fn get_shapes(&mut self, index: usize) -> Vec<usize> {
163        self.shapes[index].clone()
164    }
165
166    /// get a BBox
167    pub fn get_bbox(&mut self, index: usize) -> BBOX {
168        self.bbox[index]
169    }
170}
171impl ProtoRead for ColumnCacheReader {
172    fn read(&mut self, tag: u64, pb: &mut Protobuf) {
173        let col: OColumnName = tag.into();
174        match col {
175            OColumnName::String => self.string.push(pb.read_string()),
176            OColumnName::Unsigned => self.unsigned.push(pb.read_varint::<u64>()),
177            OColumnName::Signed => self.signed.push(pb.read_s_varint::<i64>()),
178            OColumnName::Float => self.float.push(pb.read_varint::<f32>()),
179            OColumnName::Double => self.double.push(pb.read_varint::<f64>()),
180            OColumnName::Points => {
181                self.points.push(unweave_and_delta_decode_array(&pb.read_packed::<u64>()))
182            }
183            OColumnName::Points3D => {
184                self.points_3d.push(unweave_and_delta_decode_3d_array(&pb.read_packed::<u64>()))
185            }
186            OColumnName::Indices => self.indices.push(delta_decode_array(&pb.read_packed::<u32>())),
187            OColumnName::Shapes => self.shapes.push(pb.read_packed::<usize>()),
188            OColumnName::BBox => self.bbox.push(BBOX::dequantize(&pb.read_packed::<u8>()[..])),
189        }
190    }
191}
192
193//? WRITING
194
195/// Numbers track their own index for sorting purposes
196#[derive(Debug, Default, Clone, PartialEq, PartialOrd, Eq, Ord)]
197pub struct OColumnBaseChunk {
198    /// The index in the column. Will be updated during the writing phase when converted
199    /// from a map to an array
200    pub index: usize,
201    /// track how many times this chunk is reused
202    pub count: usize,
203}
204/// A value is a collection of lookup devices. A number is decoded by the appropriate function,
205/// but the object is a reference to one of the number columns.
206/// Number types are eventually sorted, so we track the column and index with the data.
207#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
208pub enum ColumnValue {
209    /// raw number index pointing to a location in the cache column
210    Number(usize),
211    /// a reference to a column
212    Column(RefCell<OColumnBaseChunk>),
213}
214impl From<usize> for ColumnValue {
215    fn from(index: usize) -> Self {
216        ColumnValue::Number(index)
217    }
218}
219impl From<RefCell<OColumnBaseChunk>> for ColumnValue {
220    fn from(chunk: RefCell<OColumnBaseChunk>) -> Self {
221        ColumnValue::Column(chunk)
222    }
223}
224/// A building block for all column types.
225pub type OColumnBaseWrite<K> = BTreeMap<K, RefCell<OColumnBaseChunk>>;
226
227/// A building block for all number column types.
228pub type OColumnBaseFloatWrite<K> = BTreeMap<CustomOrdWrapper<K>, RefCell<OColumnBaseChunk>>;
229
230/// The cache where all data is stored in a column format.
231/// Each column type has its own array of data.
232/// Number types maintain their own index for sorting purposes.
233#[derive(Debug, Default)]
234pub struct ColumnCacheWriter {
235    /// strings are grouped by their bytes.
236    string: OColumnBaseWrite<String>,
237    /// Unsigned integers are sorted prior to storing
238    unsigned: OColumnBaseWrite<u64>,
239    /// Signed integers are sorted prior to storing
240    signed: OColumnBaseWrite<i64>,
241    /// 32-bit partial values are sorted prior to storing
242    float: OColumnBaseFloatWrite<f32>,
243    /// 64-bit partial values are sorted prior to storing
244    double: OColumnBaseFloatWrite<f64>,
245    /// for geometry types each column is individually weaved and delta encoded
246    points: OColumnBaseWrite<Vec<Point>>,
247    /// for geometry types each column is individually weaved and delta encoded
248    points_3d: OColumnBaseWrite<Vec<Point3D>>,
249    /// Indices track geometry indices, geometry shapes, or other indexing data
250    indices: OColumnBaseWrite<Vec<u32>>,
251    /// Contains number arrays of how to rebuild objects
252    shapes: OColumnBaseWrite<Vec<ColumnValue>>,
253    /// Features should be sorted by id prior to building a column
254    bbox: OColumnBaseWrite<BBOX>,
255}
256impl ColumnCacheWriter {
257    /// add string to cache
258    pub fn add_string(&mut self, value: String) -> usize {
259        add(&mut self.string, value)
260    }
261
262    /// add u64 to cache
263    pub fn add_u64(&mut self, value: u64) -> RefCell<OColumnBaseChunk> {
264        add_number(&mut self.unsigned, value)
265    }
266
267    /// add i64 to cache
268    pub fn add_i64(&mut self, value: i64) -> RefCell<OColumnBaseChunk> {
269        add_number(&mut self.signed, value)
270    }
271
272    /// add f32 to cache
273    pub fn add_f32(&mut self, value: f32) -> RefCell<OColumnBaseChunk> {
274        add_number(&mut self.float, CustomOrdWrapper(value))
275    }
276
277    /// add f64 to cache
278    pub fn add_f64(&mut self, value: f64) -> RefCell<OColumnBaseChunk> {
279        add_number(&mut self.double, CustomOrdWrapper(value))
280    }
281
282    /// add points to cache
283    pub fn add_points(&mut self, value: Vec<Point>) -> usize {
284        add(&mut self.points, value)
285    }
286
287    /// add points_3d to cache
288    pub fn add_points_3d(&mut self, value: Vec<Point3D>) -> usize {
289        add(&mut self.points_3d, value)
290    }
291
292    /// add indices to cache
293    pub fn add_indices(&mut self, value: Vec<u32>) -> usize {
294        add(&mut self.indices, value)
295    }
296
297    /// add shapes to cache
298    pub fn add_shapes(&mut self, value: Vec<ColumnValue>) -> usize {
299        add(&mut self.shapes, value)
300    }
301
302    /// add bbox to cache
303    pub fn add_bbox(&mut self, value: BBOX) -> usize {
304        add(&mut self.bbox, value)
305    }
306}
307impl ProtoWrite for ColumnCacheWriter {
308    fn write(&self, pbf: &mut Protobuf) {
309        // setup
310        let mut strings: Vec<(&String, &RefCell<OColumnBaseChunk>)> = self.string.iter().collect();
311        let mut unsigned: Vec<(&u64, &RefCell<OColumnBaseChunk>)> = self.unsigned.iter().collect();
312        let mut signed: Vec<(&i64, &RefCell<OColumnBaseChunk>)> = self.signed.iter().collect();
313        let mut float: Vec<(&CustomOrdWrapper<f32>, &RefCell<OColumnBaseChunk>)> =
314            self.float.iter().collect();
315        let mut double: Vec<(&CustomOrdWrapper<f64>, &RefCell<OColumnBaseChunk>)> =
316            self.double.iter().collect();
317        let mut points: Vec<(&Vec<Point>, &RefCell<OColumnBaseChunk>)> =
318            self.points.iter().collect();
319        let mut points_3d: Vec<(&Vec<Point3D>, &RefCell<OColumnBaseChunk>)> =
320            self.points_3d.iter().collect();
321        let mut indices: Vec<(&Vec<u32>, &RefCell<OColumnBaseChunk>)> =
322            self.indices.iter().collect();
323        let mut shapes: Vec<(&Vec<ColumnValue>, &RefCell<OColumnBaseChunk>)> =
324            self.shapes.iter().collect();
325        let mut bbox: Vec<(&BBOX, &RefCell<OColumnBaseChunk>)> = self.bbox.iter().collect();
326
327        // sort them
328        // TODO: bring this back
329        // sort_column(&mut unsigned);
330        // sort_column(&mut signed);
331        // sort_column(&mut float);
332        // sort_column(&mut double);
333        strings.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
334        unsigned.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
335        signed.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
336        float.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
337        double.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
338        points.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
339        points_3d.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
340        indices.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
341        shapes.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
342        bbox.sort_by(|a, b| a.1.borrow().index.cmp(&b.1.borrow().index));
343
344        // store
345        // strings
346        for string in strings {
347            pbf.write_string_field(OColumnName::String.into(), string.0);
348        }
349        // u64
350        for u in unsigned {
351            pbf.write_varint_field(OColumnName::Unsigned.into(), *u.0);
352        }
353        // i64
354        for s in signed {
355            pbf.write_s_varint_field(OColumnName::Signed.into(), *s.0);
356        }
357        // f32
358        for f in float {
359            pbf.write_varint_field(OColumnName::Float.into(), f.0.0);
360        }
361        // f64
362        for d in double {
363            pbf.write_varint_field(OColumnName::Double.into(), d.0.0);
364        }
365        // points
366        for p in points {
367            pbf.write_packed_varint(OColumnName::Points.into(), &weave_and_delta_encode_array(p.0));
368        }
369        // points 3D
370        for p_3d in points_3d {
371            pbf.write_packed_varint(
372                OColumnName::Points3D.into(),
373                &weave_and_delta_encode_3d_array(p_3d.0),
374            );
375        }
376        // indices
377        for i in indices {
378            pbf.write_packed_varint(OColumnName::Indices.into(), &delta_encode_array(i.0));
379        }
380        // shapes
381        for s in shapes {
382            let packed: Vec<usize> =
383                s.0.iter()
384                    .map(|v| match v {
385                        ColumnValue::Number(n) => *n,
386                        ColumnValue::Column(c) => c.borrow().index,
387                    })
388                    .collect();
389            pbf.write_packed_varint(OColumnName::Shapes.into(), &packed);
390        }
391        // bbox
392        for bbox in bbox {
393            pbf.write_packed_varint(OColumnName::BBox.into(), &bbox.0.quantize());
394        }
395    }
396}
397
398/// Add value to column and return index
399pub fn add<T>(col: &mut OColumnBaseWrite<T>, value: T) -> usize
400where
401    T: Ord,
402{
403    if let Some(col) = col.get_mut(&value) {
404        let mut chunk = col.borrow_mut();
405        chunk.count += 1;
406        chunk.index
407    } else {
408        let index = col.len();
409        col.insert(value, RefCell::new(OColumnBaseChunk { index, count: 1 }));
410        index
411    }
412}
413
414/// Add a **number** value to column and return index
415pub fn add_number<T>(col: &mut OColumnBaseWrite<T>, value: T) -> RefCell<OColumnBaseChunk>
416where
417    T: Ord,
418{
419    if let Some(chunk) = col.get_mut(&value) {
420        {
421            let mut chunk_mut = chunk.borrow_mut();
422            chunk_mut.count += 1;
423        }
424        chunk.clone()
425    } else {
426        let index = col.len();
427        let new_chunk = RefCell::new(OColumnBaseChunk { index, count: 1 });
428        col.insert(value, new_chunk.clone());
429        new_chunk
430    }
431}
432
433// /// Sort number types and value types by index then update the index of each row for better
434// /// compression down the line.
435// pub fn sort_column<T: CustomOrd + core::fmt::Debug>(
436//     input: &mut [(&T, &RefCell<OColumnBaseChunk>)],
437// ) {
438//     // first sort
439//     input.sort_by(|a, b| {
440//         // First sort by count in descending order
441//         match b.1.borrow().count.cmp(&a.1.borrow().count) {
442//             Ordering::Equal => a.0.custom_cmp(b.0), // Then sort by data if counts are equal
443//             other => other,
444//         }
445//     });
446//     // than update indexes
447//     input
448//         .iter_mut()
449//         .enumerate()
450//         .for_each(|(i, v)| v.1.borrow_mut().index = i);
451// }