hdi/hash_path/shard.rs
1use crate::hash_path::path::Component;
2use crate::hash_path::path::Path;
3use std::str::FromStr;
4
5/// Separates the shard width and depth.
6pub const SHARDSPLIT: &str = ":";
7/// Terminates the end of a shard shorthand.
8pub const SHARDEND: &str = "#";
9
10/// The width of a shard is how many bytes/characters to use for each path component in sharding.
11/// e.g. abcdef with width 1 shards to a.b.c.d.e.f.abcdef and 2 shards to ab.cd.ef.abcdef.
12pub type ShardWidth = u32;
13/// The depth of a shard is the number of path components to stretch out for shards.
14/// e.g. abcdef with a depth of 1 and width 1 shards to a.abcdef and depth 2 shards to a.b.abcdef.
15pub type ShardDepth = u32;
16
17#[derive(Debug)]
18/// A valid strategy for sharding requires both a width and a depth.
19/// At the moment sharding only works well for data that is reliably longer than width/depth.
20/// For example, sharding the username foo with width 4 doesn't make sense.
21/// There is no magic padding or extending of the provided data to make up undersized shards.
22// @todo stretch short shards out in a nice balanced way (append some bytes from the hash?)
23pub struct ShardStrategy(ShardWidth, ShardDepth);
24
25/// impl [`ShardStrategy`] as an immutable/read-only thingy.
26impl ShardStrategy {
27 fn width(&self) -> ShardWidth {
28 self.0
29 }
30
31 fn depth(&self) -> ShardDepth {
32 self.1
33 }
34}
35
36#[derive(Debug)]
37pub enum ParseShardStrategyError {
38 /// Could not parse the shard depth.
39 BadDepth,
40 /// Could not parse the shard width.
41 BadWidth,
42 /// Failed to find the separator between width and depth.
43 ShardSplitNotFound,
44 /// Failed to find the end of the sharding definition.
45 ShardEndNotFound,
46 /// The sharding definition does not start with a number.
47 FirstCharNotADigit,
48 /// The sharding definition is empty.
49 EmptyString,
50}
51
52/// Attempt to parse a "width:depth#" shard out of a string.
53/// This function looks way scarier than it is.
54/// Each level of nesting is just handling a potential parse failure.
55impl FromStr for ShardStrategy {
56 type Err = ParseShardStrategyError;
57
58 /// A shard strategy is parsed as "width:depth#..." at the start of a string.
59 fn from_str(s: &str) -> Result<Self, Self::Err> {
60 // The first char needs to be a digit.
61 match s.chars().next() {
62 Some(first_char) => {
63 match u32::from_str(&first_char.to_string()) {
64 Ok(_) => {
65 // There needs to be a #
66 match s.find(SHARDEND) {
67 Some(end_index) => {
68 let (maybe_strategy, _) = s.split_at(end_index);
69 match maybe_strategy.find(SHARDSPLIT) {
70 Some(split_index) => {
71 let (maybe_width, maybe_depth) =
72 maybe_strategy.split_at(split_index);
73 match u32::from_str(maybe_width) {
74 Ok(width) => {
75 match u32::from_str(
76 maybe_depth.trim_start_matches(SHARDSPLIT),
77 ) {
78 Ok(depth) => Ok(ShardStrategy(width, depth)),
79 Err(_) => {
80 Err(ParseShardStrategyError::BadDepth)
81 }
82 }
83 }
84 Err(_) => Err(ParseShardStrategyError::BadWidth),
85 }
86 }
87 None => Err(ParseShardStrategyError::ShardSplitNotFound),
88 }
89 }
90 None => Err(ParseShardStrategyError::ShardEndNotFound),
91 }
92 }
93 Err(_) => Err(ParseShardStrategyError::FirstCharNotADigit),
94 }
95 }
96 None => Err(ParseShardStrategyError::EmptyString),
97 }
98 }
99}
100
101/// Builds a path for a shard strategy and some binary bytes.
102/// This is the trivial case, we just split the bytes out one by one and make a path from it.
103impl From<(&ShardStrategy, &[u8])> for Path {
104 fn from((strategy, bytes): (&ShardStrategy, &[u8])) -> Path {
105 let full_length = strategy.width() * strategy.depth();
106 // Fold a flat slice of bytes into `strategy.depth` number of `strategy.width` length byte
107 // [`Component`]s.
108 let sharded: Vec<Component> = bytes
109 .iter()
110 .take(full_length as _)
111 .fold((vec![], vec![]), |acc, b| {
112 let (mut ret, mut build) = acc;
113 build.push(b);
114 if build.len() == strategy.width() as usize {
115 ret.push(build.clone());
116 build.clear();
117 }
118 (ret, build)
119 })
120 .0
121 .iter()
122 .map(|bytes| {
123 let bytes_vec: Vec<u8> = bytes.iter().map(|b| **b).collect();
124 Component::from(bytes_vec)
125 })
126 .collect();
127 Path::from(sharded)
128 }
129}
130/// Wrapper around `&Vec<u8>` to work the same as &[u8].
131impl From<(&ShardStrategy, &Vec<u8>)> for Path {
132 fn from((strategy, bytes): (&ShardStrategy, &Vec<u8>)) -> Path {
133 let bytes: &[u8] = bytes.as_ref();
134 Path::from((strategy, bytes))
135 }
136}
137/// Wrapper around `Vec<u8>` to work the same as &[u8].
138impl From<(&ShardStrategy, Vec<u8>)> for Path {
139 fn from((strategy, bytes): (&ShardStrategy, Vec<u8>)) -> Path {
140 let bytes: &[u8] = bytes.as_ref();
141 Path::from((strategy, bytes))
142 }
143}
144/// Create [`Path`] from [`String`].
145/// To ensure that this works for all utf8, which can have anywhere from 1-4 bytes for a single
146/// character, we first represent each character as a utf32 so it gets padded out with 0 bytes.
147/// This means the width is 4x what it would be for raw bytes with the same strategy.
148impl From<(&ShardStrategy, &str)> for Path {
149 fn from((strategy, s): (&ShardStrategy, &str)) -> Path {
150 // Truncate the string to only relevant chars.
151 let full_length = strategy.width() * strategy.depth();
152 let shard_string: String = s.chars().take(full_length as _).collect();
153
154 Path::from((
155 &ShardStrategy(
156 // Relies on the fact that we're encoding string characters as fixed width u32
157 // bytes rather than variable width utf8 bytes.
158 strategy.width() * std::mem::size_of::<u32>() as u32,
159 strategy.depth(),
160 ),
161 // Defer to the standard utf32 string handling to get the fixed size byte and endian
162 // handling correct.
163 Component::from(&shard_string).as_ref(),
164 ))
165 }
166}
167
168/// [`&String`](std::string::String) wrapper mimicking [`&str`] for [`Path`] building.
169impl From<(&ShardStrategy, &String)> for Path {
170 fn from((strategy, s): (&ShardStrategy, &String)) -> Path {
171 Path::from((strategy, s.as_str()))
172 }
173}
174// [`String`] wrapper mimicking [`&str`] for [`Path`] building.
175impl From<(&ShardStrategy, String)> for Path {
176 fn from((strategy, s): (&ShardStrategy, String)) -> Path {
177 Path::from((strategy, s.as_str()))
178 }
179}
180
181#[test]
182#[cfg(test)]
183fn hash_path_shard_bytes() {
184 for (width, depth, b, output) in vec![
185 // Anything with a zero results in an empty path.
186 (0, 0, vec![1, 2, 3, 4, 5], Path::from(vec![])),
187 (0, 1, vec![1, 2, 3, 4, 5], Path::from(vec![])),
188 (1, 0, vec![1, 2, 3, 4, 5], Path::from(vec![])),
189 (0, 2, vec![1, 2, 3, 4, 5], Path::from(vec![])),
190 (2, 0, vec![1, 2, 3, 4, 5], Path::from(vec![])),
191 // Basic sharding behaviour.
192 (
193 1,
194 1,
195 vec![1, 2, 3, 4, 5],
196 Path::from(vec![Component::from(vec![1_u8])]),
197 ),
198 (
199 2,
200 1,
201 vec![1, 2, 3, 4, 5],
202 Path::from(vec![Component::from(vec![1_u8, 2_u8])]),
203 ),
204 (
205 1,
206 2,
207 vec![1, 2, 3, 4, 5],
208 Path::from(vec![
209 Component::from(vec![1_u8]),
210 Component::from(vec![2_u8]),
211 ]),
212 ),
213 (
214 2,
215 2,
216 vec![1, 2, 3, 4, 5],
217 Path::from(vec![
218 Component::from(vec![1_u8, 2_u8]),
219 Component::from(vec![3_u8, 4_u8]),
220 ]),
221 ),
222 ] {
223 assert_eq!(output, Path::from((&ShardStrategy(width, depth), &b)));
224 let bytes: &[u8] = b.as_ref();
225 assert_eq!(output, Path::from((&ShardStrategy(width, depth), bytes)));
226 assert_eq!(output, Path::from((&ShardStrategy(width, depth), b)));
227 }
228}
229
230#[test]
231#[cfg(test)]
232fn hash_path_shard_string() {
233 for (width, depth, s, output) in vec![
234 // Anything with a zero results in an empty path.
235 (0, 0, "foobar", Path::from("")),
236 (0, 1, "foobar", Path::from("")),
237 (1, 0, "foobar", Path::from("")),
238 (0, 2, "foobar", Path::from("")),
239 (2, 0, "foobar", Path::from("")),
240 // Basic sharding behaviour.
241 (1, 1, "foobar", Path::from("f")),
242 (2, 1, "foobar", Path::from("fo")),
243 (1, 2, "foobar", Path::from("f.o")),
244 (2, 2, "foobar", Path::from("fo.ob")),
245 // Multibyte characters should be handled the way a naive understanding of strings would
246 // expect, i.e. that a 2-byte utf8 character is represented as 1 4-byte utf32 character and
247 // so counts as 1 "width" and 1 "depth" for the purpose of sharding.
248 (2, 2, "€€€€", Path::from("€€.€€")),
249 // If the string is shorter than the width and depth we go as deep as we can cleanly and
250 // truncate the end.
251 (4, 4, "foobar", Path::from("foob")),
252 (4, 4, "foobarbaz", Path::from("foob.arba")),
253 (4, 4, "€€€€€€€€€", Path::from("€€€€.€€€€")),
254 ] {
255 assert_eq!(output, Path::from((&ShardStrategy(width, depth), s)));
256 assert_eq!(
257 output,
258 Path::from((&ShardStrategy(width, depth), s.to_string()))
259 );
260 assert_eq!(
261 output,
262 Path::from((&ShardStrategy(width, depth), &s.to_string()))
263 );
264 }
265}