rust_unixfs/dir/builder/
iter.rs

1use super::{
2    CustomFlatUnixFs, DirBuilder, Entry, Leaf, NamedLeaf, TreeConstructionFailed, TreeOptions,
3};
4use core::fmt;
5use ipld_core::cid::Cid;
6use multihash::Multihash;
7use multihash_codetable::Code;
8use std::collections::HashMap;
9
10/// Constructs the directory nodes required for a tree.
11///
12/// Implements the Iterator interface for owned values and the borrowed version, `next_borrowed`.
13/// The tree is fully constructed once this has been exhausted.
14pub struct PostOrderIterator {
15    full_path: String,
16    old_depth: usize,
17    block_buffer: Vec<u8>,
18    // our stack of pending work
19    pending: Vec<Visited>,
20    // "communication channel" from nested entries back to their parents; this hashmap is only used
21    // in the event of mixed child nodes (leaves and nodes).
22    persisted_cids: HashMap<u64, Vec<Option<NamedLeaf>>>,
23    reused_children: Vec<Visited>,
24    cid: Option<Cid>,
25    total_size: u64,
26    // from TreeOptions
27    opts: TreeOptions,
28}
29
30/// The link list used to create the directory node. This list is created from a the BTreeMap
31/// inside DirBuilder, and initially it will have `Some` values only for the initial leaves and
32/// `None` values for subnodes which are not yet ready. At the time of use, this list is expected
33/// to have only `Some` values.
34type Leaves = Vec<Option<NamedLeaf>>;
35
36/// The nodes in the visit. We need to do a post-order visit, which starts from a single
37/// `DescentRoot`, followed by N `Descents` where N is the deepest directory in the tree. On each
38/// descent, we'll need to first schedule a `Post` (or `PostRoot`) followed the immediate children
39/// of the node. Directories are rendered when all of their direct and indirect descendants have
40/// been serialized into NamedLeafs.
41#[derive(Debug)]
42enum Visited {
43    // handle root differently not to infect with the Option<String> and Option<usize>
44    DescentRoot(DirBuilder),
45    Descent {
46        node: DirBuilder,
47        name: String,
48        depth: usize,
49        /// The index in the parents `Leaves` accessible through `PostOrderIterator::persisted_cids`.
50        index: usize,
51    },
52    Post {
53        parent_id: u64,
54        depth: usize,
55        name: String,
56        index: usize,
57        /// Leaves will be stored directly in this field when there are no DirBuilder descendants,
58        /// in the `PostOrderIterator::persisted_cids` otherwise.
59        leaves: LeafStorage,
60    },
61    PostRoot {
62        leaves: LeafStorage,
63    },
64}
65
66impl PostOrderIterator {
67    pub(super) fn new(root: DirBuilder, opts: TreeOptions, longest_path: usize) -> Self {
68        let root = Visited::DescentRoot(root);
69        PostOrderIterator {
70            full_path: String::with_capacity(longest_path),
71            old_depth: 0,
72            block_buffer: Default::default(),
73            pending: vec![root],
74            persisted_cids: Default::default(),
75            reused_children: Vec::new(),
76            cid: None,
77            total_size: 0,
78            opts,
79        }
80    }
81
82    fn render_directory(
83        links: &[Option<NamedLeaf>],
84        buffer: &mut Vec<u8>,
85        block_size_limit: &Option<u64>,
86    ) -> Result<Leaf, TreeConstructionFailed> {
87        use crate::pb::{UnixFs, UnixFsType};
88        use quick_protobuf::{BytesWriter, MessageWrite, Writer};
89        use sha2::{Digest, Sha256};
90
91        // FIXME: ideas on how to turn this into a HAMT sharding on some heuristic. we probably
92        // need to introduce states in to the "iterator":
93        //
94        // 1. bucketization
95        // 2. another post order visit of the buckets?
96        //
97        // the nested post order visit should probably re-use the existing infra ("message
98        // passing") and new ids can be generated by giving this iterator the counter from
99        // BufferedTreeBuilder.
100        //
101        // could also be that the HAMT shard building should start earlier, since the same
102        // heuristic can be detected *at* bufferedtreewriter. there the split would be easier, and
103        // this would "just" be a single node rendering, and not need any additional states..
104
105        let node = CustomFlatUnixFs {
106            links,
107            data: UnixFs {
108                Type: UnixFsType::Directory,
109                ..Default::default()
110            },
111        };
112
113        let size = node.get_size();
114
115        if let Some(limit) = block_size_limit {
116            let size = size as u64;
117            if *limit < size {
118                // FIXME: this could probably be detected at builder
119                return Err(TreeConstructionFailed::TooLargeBlock(size));
120            }
121        }
122
123        let cap = buffer.capacity();
124
125        if let Some(additional) = size.checked_sub(cap) {
126            buffer.reserve(additional);
127        }
128
129        if let Some(mut needed_zeroes) = size.checked_sub(buffer.len()) {
130            let zeroes = [0; 8];
131
132            while needed_zeroes > 8 {
133                buffer.extend_from_slice(&zeroes[..]);
134                needed_zeroes -= zeroes.len();
135            }
136
137            buffer.extend(core::iter::repeat(0).take(needed_zeroes));
138        }
139
140        let mut writer = Writer::new(BytesWriter::new(&mut buffer[..]));
141        node.write_message(&mut writer)
142            .map_err(TreeConstructionFailed::Protobuf)?;
143
144        buffer.truncate(size);
145
146        #[allow(clippy::needless_borrows_for_generic_args)]
147        let mh = Multihash::wrap(Code::Sha2_256.into(), &Sha256::digest(&buffer)).unwrap();
148        let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0");
149
150        let combined_from_links = links
151            .iter()
152            .map(|opt| {
153                opt.as_ref()
154                    .map(|NamedLeaf(_, _, total_size)| total_size)
155                    .unwrap()
156            })
157            .sum::<u64>();
158
159        Ok(Leaf {
160            link: cid,
161            total_size: buffer.len() as u64 + combined_from_links,
162        })
163    }
164
165    /// Construct the next dag-pb node, if any.
166    ///
167    /// Returns a `TreeNode` of the latest constructed tree node.
168    pub fn next_borrowed(&mut self) -> Option<Result<TreeNode<'_>, TreeConstructionFailed>> {
169        while let Some(visited) = self.pending.pop() {
170            let (name, depth) = match &visited {
171                Visited::DescentRoot(_) => (None, 0),
172                Visited::Descent { name, depth, .. } => (Some(name.as_ref()), *depth),
173                Visited::Post { name, depth, .. } => (Some(name.as_ref()), *depth),
174                Visited::PostRoot { .. } => (None, 0),
175            };
176
177            update_full_path((&mut self.full_path, &mut self.old_depth), name, depth);
178
179            match visited {
180                Visited::DescentRoot(node) => {
181                    let children = &mut self.reused_children;
182                    let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children);
183                    let any_children = !children.is_empty();
184
185                    let leaves = if any_children {
186                        self.persisted_cids.insert(node.id, leaves);
187                        LeafStorage::from(node.id)
188                    } else {
189                        leaves.into()
190                    };
191
192                    self.pending.push(Visited::PostRoot { leaves });
193                    self.pending.append(children);
194                }
195                Visited::Descent {
196                    node,
197                    name,
198                    depth,
199                    index,
200                } => {
201                    let children = &mut self.reused_children;
202                    let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children);
203                    let any_children = !children.is_empty();
204                    let parent_id = node.parent_id.expect("only roots parent_id is None");
205
206                    let leaves = if any_children {
207                        self.persisted_cids.insert(node.id, leaves);
208                        node.id.into()
209                    } else {
210                        leaves.into()
211                    };
212
213                    self.pending.push(Visited::Post {
214                        parent_id,
215                        name,
216                        depth,
217                        leaves,
218                        index,
219                    });
220
221                    self.pending.append(children);
222                }
223                Visited::Post {
224                    parent_id,
225                    name,
226                    leaves,
227                    index,
228                    ..
229                } => {
230                    let leaves = leaves.into_inner(&mut self.persisted_cids);
231                    let buffer = &mut self.block_buffer;
232
233                    let leaf = match Self::render_directory(
234                        &leaves,
235                        buffer,
236                        &self.opts.block_size_limit,
237                    ) {
238                        Ok(leaf) => leaf,
239                        Err(e) => return Some(Err(e)),
240                    };
241
242                    self.cid = Some(leaf.link);
243                    self.total_size = leaf.total_size;
244
245                    {
246                        // name is None only for wrap_with_directory, which cannot really be
247                        // propagated up but still the parent_id is allowed to be None
248                        let parent_leaves = self.persisted_cids.get_mut(&parent_id);
249
250                        match (parent_id, parent_leaves, index) {
251                            (pid, None, index) => {
252                                panic!("leaves not found for parent_id = {pid} and index = {index}")
253                            }
254                            (_, Some(vec), index) => {
255                                let cell = &mut vec[index];
256                                // all
257                                assert!(cell.is_none());
258                                *cell = Some(NamedLeaf(name, leaf.link, leaf.total_size));
259                            }
260                        }
261                    }
262
263                    return Some(Ok(TreeNode {
264                        path: self.full_path.as_str(),
265                        cid: self.cid.as_ref().unwrap(),
266                        total_size: self.total_size,
267                        block: &self.block_buffer,
268                    }));
269                }
270                Visited::PostRoot { leaves } => {
271                    let leaves = leaves.into_inner(&mut self.persisted_cids);
272
273                    if !self.opts.wrap_with_directory {
274                        break;
275                    }
276
277                    let buffer = &mut self.block_buffer;
278
279                    let leaf = match Self::render_directory(
280                        &leaves,
281                        buffer,
282                        &self.opts.block_size_limit,
283                    ) {
284                        Ok(leaf) => leaf,
285                        Err(e) => return Some(Err(e)),
286                    };
287
288                    self.cid = Some(leaf.link);
289                    self.total_size = leaf.total_size;
290
291                    return Some(Ok(TreeNode {
292                        path: self.full_path.as_str(),
293                        cid: self.cid.as_ref().unwrap(),
294                        total_size: self.total_size,
295                        block: &self.block_buffer,
296                    }));
297                }
298            }
299        }
300        None
301    }
302}
303
304impl Iterator for PostOrderIterator {
305    type Item = Result<OwnedTreeNode, TreeConstructionFailed>;
306
307    fn next(&mut self) -> Option<Self::Item> {
308        self.next_borrowed()
309            .map(|res| res.map(TreeNode::into_owned))
310    }
311}
312
313/// Borrowed representation of a node in the tree.
314pub struct TreeNode<'a> {
315    /// Full path to the node.
316    pub path: &'a str,
317    /// The Cid of the document.
318    pub cid: &'a Cid,
319    /// Cumulative total size of the subtree in bytes.
320    pub total_size: u64,
321    /// Raw dag-pb document.
322    pub block: &'a [u8],
323}
324
325impl<'a> fmt::Debug for TreeNode<'a> {
326    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
327        fmt.debug_struct("TreeNode")
328            .field("path", &format_args!("{:?}", self.path))
329            .field("cid", &format_args!("{}", self.cid))
330            .field("total_size", &self.total_size)
331            .field("size", &self.block.len())
332            .finish()
333    }
334}
335
336impl TreeNode<'_> {
337    /// Convert to an owned and detached representation.
338    pub fn into_owned(self) -> OwnedTreeNode {
339        OwnedTreeNode {
340            path: self.path.to_owned(),
341            cid: self.cid.to_owned(),
342            total_size: self.total_size,
343            block: self.block.into(),
344        }
345    }
346}
347
348/// Owned representation of a node in the tree.
349pub struct OwnedTreeNode {
350    /// Full path to the node.
351    pub path: String,
352    /// The Cid of the document.
353    pub cid: Cid,
354    /// Cumulative total size of the subtree in bytes.
355    pub total_size: u64,
356    /// Raw dag-pb document.
357    pub block: Box<[u8]>,
358}
359
360fn update_full_path(
361    (full_path, old_depth): (&mut String, &mut usize),
362    name: Option<&str>,
363    depth: usize,
364) {
365    if depth < 2 {
366        // initially thought it might be a good idea to add a slash to all components; removing it made
367        // it impossible to get back down to empty string, so fixing this for depths 0 and 1.
368        full_path.clear();
369        *old_depth = 0;
370    } else {
371        while *old_depth >= depth && *old_depth > 0 {
372            // we now want to pop the last segment
373            // this would be easier with PathBuf
374            let slash_at = full_path.bytes().rposition(|ch| ch == b'/');
375            if let Some(slash_at) = slash_at {
376                if *old_depth == depth && Some(&full_path[(slash_at + 1)..]) == name {
377                    // minor unmeasurable perf optimization:
378                    // going from a/b/foo/zz => a/b/foo does not need to go through the a/b
379                    return;
380                }
381                full_path.truncate(slash_at);
382                *old_depth -= 1;
383            } else {
384                todo!(
385                    "no last slash_at in {:?} yet {} >= {}",
386                    full_path,
387                    old_depth,
388                    depth
389                );
390            }
391        }
392    }
393
394    debug_assert!(*old_depth <= depth);
395
396    if let Some(name) = name {
397        if !full_path.is_empty() {
398            full_path.push('/');
399        }
400        full_path.push_str(name);
401        *old_depth += 1;
402    }
403
404    assert_eq!(*old_depth, depth);
405}
406
407/// Returns a Vec of the links in order with only the leaves, the given `children` will contain yet
408/// incomplete nodes of the tree.
409fn partition_children_leaves(
410    depth: usize,
411    it: impl Iterator<Item = (String, Entry)>,
412    children: &mut Vec<Visited>,
413) -> Leaves {
414    let mut leaves = Vec::new();
415
416    for (i, (k, v)) in it.enumerate() {
417        match v {
418            Entry::Directory(node) => {
419                children.push(Visited::Descent {
420                    node,
421                    // this needs to be pushed down to update the full_path
422                    name: k,
423                    depth: depth + 1,
424                    index: i,
425                });
426
427                // this will be overwritten later, but the order is fixed
428                leaves.push(None);
429            }
430            Entry::Leaf(leaf) => leaves.push(Some(NamedLeaf(k, leaf.link, leaf.total_size))),
431        }
432    }
433
434    leaves
435}
436
437#[derive(Debug)]
438enum LeafStorage {
439    Direct(Leaves),
440    Stashed(u64),
441}
442
443impl LeafStorage {
444    fn into_inner(self, stash: &mut HashMap<u64, Leaves>) -> Leaves {
445        use LeafStorage::*;
446
447        match self {
448            Direct(leaves) => leaves,
449            Stashed(id) => stash
450                .remove(&id)
451                .ok_or(id)
452                .expect("leaves are either stashed or direct, must able to find with id"),
453        }
454    }
455}
456
457impl From<u64> for LeafStorage {
458    fn from(key: u64) -> LeafStorage {
459        LeafStorage::Stashed(key)
460    }
461}
462
463impl From<Leaves> for LeafStorage {
464    fn from(leaves: Leaves) -> LeafStorage {
465        LeafStorage::Direct(leaves)
466    }
467}