hdi/hash_path/
shard.rs

1use crate::hash_path::path::Component;
2use crate::hash_path::path::Path;
3use std::str::FromStr;
4
5/// Separates the shard width and depth.
6pub const SHARDSPLIT: &str = ":";
7/// Terminates the end of a shard shorthand.
8pub const SHARDEND: &str = "#";
9
10/// The width of a shard is how many bytes/characters to use for each path component in sharding.
11/// e.g. abcdef with width 1 shards to a.b.c.d.e.f.abcdef and 2 shards to ab.cd.ef.abcdef.
12pub type ShardWidth = u32;
13/// The depth of a shard is the number of path components to stretch out for shards.
14/// e.g. abcdef with a depth of 1 and width 1 shards to a.abcdef and depth 2 shards to a.b.abcdef.
15pub type ShardDepth = u32;
16
17#[derive(Debug)]
18/// A valid strategy for sharding requires both a width and a depth.
19/// At the moment sharding only works well for data that is reliably longer than width/depth.
20/// For example, sharding the username foo with width 4 doesn't make sense.
21/// There is no magic padding or extending of the provided data to make up undersized shards.
22// @todo stretch short shards out in a nice balanced way (append some bytes from the hash?)
23pub struct ShardStrategy(ShardWidth, ShardDepth);
24
25/// impl [`ShardStrategy`] as an immutable/read-only thingy.
26impl ShardStrategy {
27    fn width(&self) -> ShardWidth {
28        self.0
29    }
30
31    fn depth(&self) -> ShardDepth {
32        self.1
33    }
34}
35
36#[derive(Debug)]
37pub enum ParseShardStrategyError {
38    /// Could not parse the shard depth.
39    BadDepth,
40    /// Could not parse the shard width.
41    BadWidth,
42    /// Failed to find the separator between width and depth.
43    ShardSplitNotFound,
44    /// Failed to find the end of the sharding definition.
45    ShardEndNotFound,
46    /// The sharding definition does not start with a number.
47    FirstCharNotADigit,
48    /// The sharding definition is empty.
49    EmptyString,
50}
51
52/// Attempt to parse a "width:depth#" shard out of a string.
53/// This function looks way scarier than it is.
54/// Each level of nesting is just handling a potential parse failure.
55impl FromStr for ShardStrategy {
56    type Err = ParseShardStrategyError;
57
58    /// A shard strategy is parsed as "width:depth#..." at the start of a string.
59    fn from_str(s: &str) -> Result<Self, Self::Err> {
60        // The first char needs to be a digit.
61        match s.chars().next() {
62            Some(first_char) => {
63                match u32::from_str(&first_char.to_string()) {
64                    Ok(_) => {
65                        // There needs to be a #
66                        match s.find(SHARDEND) {
67                            Some(end_index) => {
68                                let (maybe_strategy, _) = s.split_at(end_index);
69                                match maybe_strategy.find(SHARDSPLIT) {
70                                    Some(split_index) => {
71                                        let (maybe_width, maybe_depth) =
72                                            maybe_strategy.split_at(split_index);
73                                        match u32::from_str(maybe_width) {
74                                            Ok(width) => {
75                                                match u32::from_str(
76                                                    maybe_depth.trim_start_matches(SHARDSPLIT),
77                                                ) {
78                                                    Ok(depth) => Ok(ShardStrategy(width, depth)),
79                                                    Err(_) => {
80                                                        Err(ParseShardStrategyError::BadDepth)
81                                                    }
82                                                }
83                                            }
84                                            Err(_) => Err(ParseShardStrategyError::BadWidth),
85                                        }
86                                    }
87                                    None => Err(ParseShardStrategyError::ShardSplitNotFound),
88                                }
89                            }
90                            None => Err(ParseShardStrategyError::ShardEndNotFound),
91                        }
92                    }
93                    Err(_) => Err(ParseShardStrategyError::FirstCharNotADigit),
94                }
95            }
96            None => Err(ParseShardStrategyError::EmptyString),
97        }
98    }
99}
100
101/// Builds a path for a shard strategy and some binary bytes.
102/// This is the trivial case, we just split the bytes out one by one and make a path from it.
103impl From<(&ShardStrategy, &[u8])> for Path {
104    fn from((strategy, bytes): (&ShardStrategy, &[u8])) -> Path {
105        let full_length = strategy.width() * strategy.depth();
106        // Fold a flat slice of bytes into `strategy.depth` number of `strategy.width` length byte
107        // [`Component`]s.
108        let sharded: Vec<Component> = bytes
109            .iter()
110            .take(full_length as _)
111            .fold((vec![], vec![]), |acc, b| {
112                let (mut ret, mut build) = acc;
113                build.push(b);
114                if build.len() == strategy.width() as usize {
115                    ret.push(build.clone());
116                    build.clear();
117                }
118                (ret, build)
119            })
120            .0
121            .iter()
122            .map(|bytes| {
123                let bytes_vec: Vec<u8> = bytes.iter().map(|b| **b).collect();
124                Component::from(bytes_vec)
125            })
126            .collect();
127        Path::from(sharded)
128    }
129}
130/// Wrapper around `&Vec<u8>` to work the same as &[u8].
131impl From<(&ShardStrategy, &Vec<u8>)> for Path {
132    fn from((strategy, bytes): (&ShardStrategy, &Vec<u8>)) -> Path {
133        let bytes: &[u8] = bytes.as_ref();
134        Path::from((strategy, bytes))
135    }
136}
137/// Wrapper around `Vec<u8>` to work the same as &[u8].
138impl From<(&ShardStrategy, Vec<u8>)> for Path {
139    fn from((strategy, bytes): (&ShardStrategy, Vec<u8>)) -> Path {
140        let bytes: &[u8] = bytes.as_ref();
141        Path::from((strategy, bytes))
142    }
143}
144/// Create [`Path`] from [`String`].
145/// To ensure that this works for all utf8, which can have anywhere from 1-4 bytes for a single
146/// character, we first represent each character as a utf32 so it gets padded out with 0 bytes.
147/// This means the width is 4x what it would be for raw bytes with the same strategy.
148impl From<(&ShardStrategy, &str)> for Path {
149    fn from((strategy, s): (&ShardStrategy, &str)) -> Path {
150        // Truncate the string to only relevant chars.
151        let full_length = strategy.width() * strategy.depth();
152        let shard_string: String = s.chars().take(full_length as _).collect();
153
154        Path::from((
155            &ShardStrategy(
156                // Relies on the fact that we're encoding string characters as fixed width u32
157                // bytes rather than variable width utf8 bytes.
158                strategy.width() * std::mem::size_of::<u32>() as u32,
159                strategy.depth(),
160            ),
161            // Defer to the standard utf32 string handling to get the fixed size byte and endian
162            // handling correct.
163            Component::from(&shard_string).as_ref(),
164        ))
165    }
166}
167
168/// [`&String`](std::string::String) wrapper mimicking [`&str`] for [`Path`] building.
169impl From<(&ShardStrategy, &String)> for Path {
170    fn from((strategy, s): (&ShardStrategy, &String)) -> Path {
171        Path::from((strategy, s.as_str()))
172    }
173}
174// [`String`] wrapper mimicking [`&str`] for [`Path`] building.
175impl From<(&ShardStrategy, String)> for Path {
176    fn from((strategy, s): (&ShardStrategy, String)) -> Path {
177        Path::from((strategy, s.as_str()))
178    }
179}
180
181#[test]
182#[cfg(test)]
183fn hash_path_shard_bytes() {
184    for (width, depth, b, output) in vec![
185        // Anything with a zero results in an empty path.
186        (0, 0, vec![1, 2, 3, 4, 5], Path::from(vec![])),
187        (0, 1, vec![1, 2, 3, 4, 5], Path::from(vec![])),
188        (1, 0, vec![1, 2, 3, 4, 5], Path::from(vec![])),
189        (0, 2, vec![1, 2, 3, 4, 5], Path::from(vec![])),
190        (2, 0, vec![1, 2, 3, 4, 5], Path::from(vec![])),
191        // Basic sharding behaviour.
192        (
193            1,
194            1,
195            vec![1, 2, 3, 4, 5],
196            Path::from(vec![Component::from(vec![1_u8])]),
197        ),
198        (
199            2,
200            1,
201            vec![1, 2, 3, 4, 5],
202            Path::from(vec![Component::from(vec![1_u8, 2_u8])]),
203        ),
204        (
205            1,
206            2,
207            vec![1, 2, 3, 4, 5],
208            Path::from(vec![
209                Component::from(vec![1_u8]),
210                Component::from(vec![2_u8]),
211            ]),
212        ),
213        (
214            2,
215            2,
216            vec![1, 2, 3, 4, 5],
217            Path::from(vec![
218                Component::from(vec![1_u8, 2_u8]),
219                Component::from(vec![3_u8, 4_u8]),
220            ]),
221        ),
222    ] {
223        assert_eq!(output, Path::from((&ShardStrategy(width, depth), &b)));
224        let bytes: &[u8] = b.as_ref();
225        assert_eq!(output, Path::from((&ShardStrategy(width, depth), bytes)));
226        assert_eq!(output, Path::from((&ShardStrategy(width, depth), b)));
227    }
228}
229
230#[test]
231#[cfg(test)]
232fn hash_path_shard_string() {
233    for (width, depth, s, output) in vec![
234        // Anything with a zero results in an empty path.
235        (0, 0, "foobar", Path::from("")),
236        (0, 1, "foobar", Path::from("")),
237        (1, 0, "foobar", Path::from("")),
238        (0, 2, "foobar", Path::from("")),
239        (2, 0, "foobar", Path::from("")),
240        // Basic sharding behaviour.
241        (1, 1, "foobar", Path::from("f")),
242        (2, 1, "foobar", Path::from("fo")),
243        (1, 2, "foobar", Path::from("f.o")),
244        (2, 2, "foobar", Path::from("fo.ob")),
245        // Multibyte characters should be handled the way a naive understanding of strings would
246        // expect, i.e. that a 2-byte utf8 character is represented as 1 4-byte utf32 character and
247        // so counts as 1 "width" and 1 "depth" for the purpose of sharding.
248        (2, 2, "€€€€", Path::from("€€.€€")),
249        // If the string is shorter than the width and depth we go as deep as we can cleanly and
250        // truncate the end.
251        (4, 4, "foobar", Path::from("foob")),
252        (4, 4, "foobarbaz", Path::from("foob.arba")),
253        (4, 4, "€€€€€€€€€", Path::from("€€€€.€€€€")),
254    ] {
255        assert_eq!(output, Path::from((&ShardStrategy(width, depth), s)));
256        assert_eq!(
257            output,
258            Path::from((&ShardStrategy(width, depth), s.to_string()))
259        );
260        assert_eq!(
261            output,
262            Path::from((&ShardStrategy(width, depth), &s.to_string()))
263        );
264    }
265}