rust_unixfs/dir/
sharded_lookup.rs

1use super::{try_convert_cid, MaybeResolved, MultipleMatchingLinks, ResolveError};
2use crate::pb::{FlatUnixFs, PBLink, ParsingFailed, UnixFsType};
3use crate::{InvalidCidInLink, UnexpectedNodeType};
4use alloc::borrow::Cow;
5use alloc::collections::VecDeque;
6use core::convert::TryFrom;
7use core::fmt;
8use ipld_core::cid::Cid;
9
10/// A cache of data structures used while traversing. Reduces allocations when walking over multiple
11/// path segments.
12pub struct Cache {
13    buffer: VecDeque<Cid>,
14}
15
16impl From<VecDeque<Cid>> for Cache {
17    fn from(mut buffer: VecDeque<Cid>) -> Self {
18        buffer.clear();
19        Cache { buffer }
20    }
21}
22impl fmt::Debug for Cache {
23    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
24        write!(fmt, "Cache {{ buffer: {} }}", self.buffer.capacity())
25    }
26}
27
28/// `ShardedLookup` can walk over multiple HAMT sharded directory nodes which allows multiple block
29/// spanning directories.
30pub struct ShardedLookup<'needle> {
31    links: VecDeque<Cid>,
32    // this will be tricky if we ever need to have case-insensitive resolving *but* we can then
33    // make a custom Cow type; important not to expose Cow in any API.
34    needle: Cow<'needle, str>,
35}
36
37impl fmt::Debug for ShardedLookup<'_> {
38    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
39        write!(
40            fmt,
41            "ShardedLookup {{ links: {}, needle: {:?} }}",
42            self.links.len(),
43            self.needle.as_ref(),
44        )
45    }
46}
47
48impl<'needle> ShardedLookup<'needle> {
49    /// Returns the next pending link and an iterator over the rest.
50    pub fn pending_links(&self) -> (&Cid, impl Iterator<Item = &Cid>) {
51        let mut iter = self.links.iter();
52        let first = iter.next().expect("Already validated there are links");
53        (first, iter)
54    }
55
56    /// Continues the walk in the DAG of HAMT buckets searching for the original `needle`.
57    #[allow(clippy::result_large_err)]
58    pub fn continue_walk(
59        mut self,
60        next: &[u8],
61        cache: &mut Option<Cache>,
62    ) -> Result<MaybeResolved<'needle>, LookupError> {
63        // just to make sure not to mess this up
64        debug_assert_eq!(Some(self.pending_links().0), self.links.front());
65
66        self.links
67            .pop_front()
68            .expect("Already validated there are links");
69
70        let mut hamt = match FlatUnixFs::try_from(next) {
71            Ok(hamt) if hamt.data.Type == UnixFsType::HAMTShard => hamt,
72            Ok(other) => return Err(LookupError::UnexpectedBucketType(other.data.Type.into())),
73            Err(ParsingFailed::InvalidDagPb(e)) | Err(ParsingFailed::InvalidUnixFs(e, _)) => {
74                *cache = Some(self.links.into());
75                return Err(LookupError::Read(Some(e)));
76            }
77            Err(ParsingFailed::NoData(_)) => {
78                *cache = Some(self.links.into());
79                return Err(LookupError::Read(None));
80            }
81        };
82
83        Self::check_supported(&mut hamt)?;
84
85        let found = Self::partition(
86            hamt.links.into_iter(),
87            self.needle.as_ref(),
88            &mut self.links,
89        )?;
90
91        if let Some(cid) = found {
92            *cache = Some(self.links.into());
93            Ok(MaybeResolved::Found(cid))
94        } else if self.links.is_empty() {
95            *cache = Some(self.links.into());
96            Ok(MaybeResolved::NotFound)
97        } else {
98            Ok(MaybeResolved::NeedToLoadMore(self))
99        }
100    }
101
102    /// Transforms this `ShardedLookup` into a `ShardedLookup<'static>` by taking ownership of the
103    /// needle we are trying to find.
104    pub fn with_owned_needle(self) -> ShardedLookup<'static> {
105        let ShardedLookup { links, needle } = self;
106        let needle = Cow::Owned(needle.into_owned());
107        ShardedLookup { links, needle }
108    }
109
110    /// Finds or starts a lookup of multiple buckets.
111    ///
112    /// Returns the found link, the definitive negative or the means to continue the traversal.
113    #[allow(clippy::result_large_err)]
114    pub(crate) fn lookup_or_start(
115        mut hamt: FlatUnixFs<'_>,
116        needle: &'needle str,
117        cache: &mut Option<Cache>,
118    ) -> Result<MaybeResolved<'needle>, LookupError> {
119        Self::check_supported(&mut hamt)?;
120
121        let mut links = cache.take().map(|c| c.buffer).unwrap_or_default();
122
123        let found = Self::partition(hamt.links.into_iter(), needle, &mut links)?;
124
125        if let Some(cid) = found {
126            *cache = Some(links.into());
127            Ok(MaybeResolved::Found(cid))
128        } else if links.is_empty() {
129            *cache = Some(links.into());
130            Ok(MaybeResolved::NotFound)
131        } else {
132            Ok(MaybeResolved::NeedToLoadMore(ShardedLookup {
133                links,
134                needle: Cow::Borrowed(needle),
135            }))
136        }
137    }
138
139    /// Takes the validated object as mutable reference to move data out of it in case of error.
140    ///
141    /// Returns an error if we don't support the properties on the HAMTShard-typed node
142    pub(crate) fn check_supported(hamt: &mut FlatUnixFs<'_>) -> Result<(), ShardError> {
143        assert_eq!(hamt.data.Type, UnixFsType::HAMTShard);
144
145        if hamt.data.fanout != Some(256) || hamt.data.hashType != Some(34) {
146            Err(ShardError::UnsupportedProperties {
147                hash_type: hamt.data.hashType,
148                fanout: hamt.data.fanout,
149            })
150        } else if hamt.data.filesize.is_some() || !hamt.data.blocksizes.is_empty() {
151            Err(ShardError::UnexpectedProperties {
152                filesize: hamt.data.filesize,
153                blocksizes: core::mem::take(&mut hamt.data.blocksizes),
154            })
155        } else {
156            Ok(())
157        }
158    }
159
160    /// Partition the original links based on their kind; if the link:
161    ///
162    ///  - matches the needle uniquely, it will be returned as `Some(cid)`
163    ///  - is a bucket, it is pushed back to the work
164    #[allow(clippy::result_large_err)]
165    fn partition<'a>(
166        iter: impl Iterator<Item = PBLink<'a>>,
167        needle: &str,
168        work: &mut VecDeque<Cid>,
169    ) -> Result<Option<Cid>, PartitioningError> {
170        let mut found = None;
171
172        for (i, link) in iter.enumerate() {
173            let name = link.Name.as_deref().unwrap_or_default();
174
175            if name.len() > 2 && &name[2..] == needle {
176                if let Some(first) = found.take() {
177                    return Err(MultipleMatchingLinks::from((first, (i, link))).into());
178                } else {
179                    found = Some((i, try_convert_cid(i, link)?));
180                }
181            } else if name.len() == 2 {
182                // the magic number of two comes from the fanout (256) probably
183                let cid = try_convert_cid(i, link)?;
184                work.push_back(cid);
185            } else {
186                // no match, not interesting for us
187            }
188        }
189
190        Ok(found.map(|(_, cid)| cid))
191    }
192}
193
194pub(crate) enum PartitioningError {
195    Multiple(MultipleMatchingLinks),
196    InvalidCid(InvalidCidInLink),
197}
198
199impl From<InvalidCidInLink> for PartitioningError {
200    fn from(e: InvalidCidInLink) -> PartitioningError {
201        PartitioningError::InvalidCid(e)
202    }
203}
204
205impl From<MultipleMatchingLinks> for PartitioningError {
206    fn from(e: MultipleMatchingLinks) -> PartitioningError {
207        PartitioningError::Multiple(e)
208    }
209}
210
211impl From<PartitioningError> for ResolveError {
212    fn from(e: PartitioningError) -> ResolveError {
213        ResolveError::Lookup(LookupError::from(e))
214    }
215}
216
217impl From<PartitioningError> for LookupError {
218    fn from(e: PartitioningError) -> LookupError {
219        use PartitioningError::*;
220        match e {
221            Multiple(m) => LookupError::Multiple(m),
222            InvalidCid(e) => LookupError::InvalidCid(e),
223        }
224    }
225}
226
227/// Shard does not fit into expectations.
228#[derive(Debug)]
229pub enum ShardError {
230    /// Encountered an HAMT sharded directory which had an unsupported configuration.
231    UnsupportedProperties {
232        /// Unsupported multihash hash.
233        hash_type: Option<u64>,
234        /// Unsupported fanout value.
235        fanout: Option<u64>,
236    },
237    /// Encountered an HAMT sharded directory which had a unexpected properties.
238    UnexpectedProperties {
239        /// Filesize is used with UnixFS files.
240        filesize: Option<u64>,
241        /// Blocksizes are in general used with UnixFS files.
242        blocksizes: Vec<u64>,
243    },
244}
245
246impl fmt::Display for ShardError {
247    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
248        use ShardError::*;
249        match self {
250            UnsupportedProperties { hash_type, fanout } => write!(
251                fmt,
252                "unsupported HAMTShard properties: hash_type={hash_type:?}, fanout={fanout:?}"
253            ),
254            UnexpectedProperties {
255                filesize,
256                blocksizes,
257            } => write!(
258                fmt,
259                "unexpected HAMTShard properties: filesize=({:?}), {} blocksizes",
260                filesize,
261                blocksizes.len()
262            ),
263        }
264    }
265}
266
267impl std::error::Error for ShardError {}
268
269/// Errors which can occur when looking up a HAMTSharded directory.
270#[derive(Debug)]
271pub enum LookupError {
272    /// Multiple matching links were found
273    Multiple(MultipleMatchingLinks),
274    /// Invalid Cid was matched
275    InvalidCid(InvalidCidInLink),
276    /// Unexpected HAMT shard bucket type
277    UnexpectedBucketType(UnexpectedNodeType),
278    /// Unsupported or unexpected property of the UnixFS node
279    Shard(ShardError),
280    /// Parsing failed or the inner dag-pb data was contained no bytes.
281    Read(Option<quick_protobuf::Error>),
282}
283
284impl LookupError {
285    /// Converts this HAMT lookup error to the more general ResolveError
286    pub fn into_resolve_error(self) -> ResolveError {
287        self.into()
288    }
289}
290
291impl From<MultipleMatchingLinks> for LookupError {
292    fn from(e: MultipleMatchingLinks) -> Self {
293        LookupError::Multiple(e)
294    }
295}
296
297impl From<InvalidCidInLink> for LookupError {
298    fn from(e: InvalidCidInLink) -> Self {
299        LookupError::InvalidCid(e)
300    }
301}
302
303impl From<ShardError> for LookupError {
304    fn from(e: ShardError) -> Self {
305        LookupError::Shard(e)
306    }
307}
308
309impl fmt::Display for LookupError {
310    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
311        use LookupError::*;
312
313        match self {
314            InvalidCid(e) => write!(fmt, "Invalid link: {e:?}"),
315            Multiple(e) => write!(fmt, "Multiple matching links found: {e:?}"),
316            UnexpectedBucketType(ut) => write!(fmt, "unexpected type for HAMT bucket: {ut:?}"),
317            Shard(e) => write!(fmt, "{e}"),
318            Read(Some(e)) => write!(
319                fmt,
320                "failed to parse the block as unixfs or dag-pb node: {e}"
321            ),
322            Read(None) => write!(fmt, "HAMTDirectory not found in empty dag-pb node"),
323        }
324    }
325}
326
327impl std::error::Error for LookupError {
328    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
329        use LookupError::*;
330        match self {
331            Read(Some(e)) => Some(e),
332            _ => None,
333        }
334    }
335}
336
337#[cfg(test)]
338mod tests {
339    use super::{LookupError, MaybeResolved, ShardError, ShardedLookup};
340    use crate::pb::FlatUnixFs;
341    use core::convert::TryFrom;
342    use hex_literal::hex;
343
344    // a directory from some linux kernel tree import: linux-5.5-rc5/tools/testing/selftests/rcutorture/
345    const DIR: &[u8] = &hex!("122e0a2212204baf5104fe53d495223f8e2ba95375a31fda6b18e926cb54edd61f30b5f1de6512053641646f6318b535122c0a221220fd9f545068048e647d5d0b275ed171596e0c1c04b8fed09dc13bee7607e75bc7120242391883c00312330a2212208a4a68f6b88594ce373419586c12d24bde2d519ab636b1d2dcc986eb6265b7a3120a43444d616b6566696c65189601122f0a2212201ededc99d23a7ef43a8f17e6dd8b89934993245ef39e18936a37e412e536ed681205463562696e18c5ad030a280805121f200000000020000200000000000000000004000000000000000000000000002822308002");
346    const FILE: &[u8] = &hex!("0a130802120d666f6f6261720a666f6f626172180d");
347
348    #[test]
349    fn direct_hit() {
350        let parsed = FlatUnixFs::try_from(DIR).unwrap();
351
352        // calling shardedlookup directly makes little sense, but through `resolve` it would make
353        // sense
354
355        // testing this is a bit ... not nice, since we can only find out if there is a negative
356        // hit through exhausting the buckets
357        let found = ShardedLookup::lookup_or_start(parsed, "bin", &mut None);
358
359        match found {
360            Ok(MaybeResolved::Found(cid))
361                if cid.to_string() == "QmQRA3JX9JNSccQpXjuKzMVCpfTaP4XpHbrrefqaQFWf5Z" => {}
362            x => unreachable!("{:?}", x),
363        }
364    }
365
366    #[test]
367    fn found_in_the_other_bucket() {
368        let parsed = FlatUnixFs::try_from(DIR).unwrap();
369
370        // there is a single bin "B9" which would contain "formal" but our implementation cannot
371        // see it, it just guesses to start looking up in other buckets
372        let see_next = ShardedLookup::lookup_or_start(parsed, "formal", &mut None);
373
374        let next = match see_next {
375            Ok(MaybeResolved::NeedToLoadMore(next)) => next,
376            x => unreachable!("{:?}", x),
377        };
378
379        {
380            let (first, mut rest) = next.pending_links();
381
382            // there is only one bin: in other cases we would just walk in BFS order
383            assert_eq!(
384                first.to_string(),
385                "QmfQgmYMYmGQP4X6V3JhTELkQmGVP9kpJgv9duejQ8vWez"
386            );
387            assert!(rest.next().is_none());
388        }
389
390        // then we error on anything other than HAMTShard
391        let err = next.continue_walk(FILE, &mut None).unwrap_err();
392        match err {
393            LookupError::UnexpectedBucketType(ut) if ut.is_file() => {}
394            x => unreachable!("{:?}", x),
395        }
396    }
397
398    #[test]
399    fn unsupported_hash_type_or_fanout() {
400        use crate::pb::{FlatUnixFs, UnixFs, UnixFsType};
401        use alloc::borrow::Cow;
402
403        let example = FlatUnixFs {
404            data: UnixFs {
405                Type: UnixFsType::HAMTShard,
406                Data: Some(Cow::Borrowed(
407                    b"this cannot be interpreted yet but would be an error",
408                )),
409                filesize: None,
410                blocksizes: Vec::new(),
411                hashType: Some(33), // supported 34 or murmur128 le cut as u64?
412                fanout: Some(255),  // supported 256
413                mode: None,         // these are not read by the lookup
414                mtime: None,
415            },
416            links: Vec::new(),
417        };
418
419        let err = ShardedLookup::lookup_or_start(example, "doesnt matter", &mut None).unwrap_err();
420        assert!(
421            matches!(
422                err,
423                LookupError::Shard(ShardError::UnsupportedProperties { .. })
424            ),
425            "{err:?}"
426        );
427    }
428
429    #[test]
430    fn unexpected_properties() {
431        use crate::pb::{FlatUnixFs, UnixFs, UnixFsType};
432        use alloc::borrow::Cow;
433
434        let example = FlatUnixFs {
435            data: UnixFs {
436                Type: UnixFsType::HAMTShard,
437                Data: Some(Cow::Borrowed(
438                    b"this cannot be interpreted yet but would be an error",
439                )),
440                filesize: Some(1),   // err
441                blocksizes: vec![1], // err
442                hashType: Some(34),
443                fanout: Some(256),
444                mode: None,
445                mtime: None,
446            },
447            links: Vec::new(),
448        };
449
450        let err = ShardedLookup::lookup_or_start(example, "doesnt matter", &mut None).unwrap_err();
451        assert!(
452            matches!(
453                err,
454                LookupError::Shard(ShardError::UnexpectedProperties { .. })
455            ),
456            "{err:?}"
457        );
458    }
459}