unixfs_v1/dir/builder/
iter.rs

1use crate::pb::DAG_PB;
2
3use super::{
4    CustomFlatUnixFs, DirBuilder, Entry, Leaf, NamedLeaf, TreeConstructionFailed, TreeOptions,
5};
6use core::fmt;
7use libipld::multihash::{Code, MultihashDigest};
8use libipld::Cid;
9use std::collections::HashMap;
10
11/// Constructs the directory nodes required for a tree.
12///
13/// Implements the Iterator interface for owned values and the borrowed version, `next_borrowed`.
14/// The tree is fully constructed once this has been exhausted.
15pub struct PostOrderIterator {
16    full_path: String,
17    old_depth: usize,
18    block_buffer: Vec<u8>,
19    // our stack of pending work
20    pending: Vec<Visited>,
21    // "communication channel" from nested entries back to their parents; this hashmap is only used
22    // in the event of mixed child nodes (leaves and nodes).
23    persisted_cids: HashMap<u64, Vec<Option<NamedLeaf>>>,
24    reused_children: Vec<Visited>,
25    cid: Option<Cid>,
26    total_size: u64,
27    // from TreeOptions
28    opts: TreeOptions,
29}
30
31/// The link list used to create the directory node. This list is created from a the BTreeMap
32/// inside DirBuilder, and initially it will have `Some` values only for the initial leaves and
33/// `None` values for subnodes which are not yet ready. At the time of use, this list is expected
34/// to have only `Some` values.
35type Leaves = Vec<Option<NamedLeaf>>;
36
37/// The nodes in the visit. We need to do a post-order visit, which starts from a single
38/// `DescentRoot`, followed by N `Descents` where N is the deepest directory in the tree. On each
39/// descent, we'll need to first schedule a `Post` (or `PostRoot`) followed the immediate children
40/// of the node. Directories are rendered when all of their direct and indirect descendants have
41/// been serialized into NamedLeafs.
42#[derive(Debug)]
43enum Visited {
44    // handle root differently not to infect with the Option<String> and Option<usize>
45    DescentRoot(DirBuilder),
46    Descent {
47        node: DirBuilder,
48        name: String,
49        depth: usize,
50        /// The index in the parents `Leaves` accessible through `PostOrderIterator::persisted_cids`.
51        index: usize,
52    },
53    Post {
54        parent_id: u64,
55        depth: usize,
56        name: String,
57        index: usize,
58        /// Leaves will be stored directly in this field when there are no DirBuilder descendants,
59        /// in the `PostOrderIterator::persisted_cids` otherwise.
60        leaves: LeafStorage,
61    },
62    PostRoot {
63        leaves: LeafStorage,
64    },
65}
66
67impl PostOrderIterator {
68    pub(super) fn new(root: DirBuilder, opts: TreeOptions, longest_path: usize) -> Self {
69        let root = Visited::DescentRoot(root);
70        PostOrderIterator {
71            full_path: String::with_capacity(longest_path),
72            old_depth: 0,
73            block_buffer: Default::default(),
74            pending: vec![root],
75            persisted_cids: Default::default(),
76            reused_children: Vec::new(),
77            cid: None,
78            total_size: 0,
79            opts,
80        }
81    }
82
83    fn render_directory(
84        links: &[Option<NamedLeaf>],
85        buffer: &mut Vec<u8>,
86        block_size_limit: &Option<u64>,
87    ) -> Result<Leaf, TreeConstructionFailed> {
88        use crate::pb::{UnixFs, UnixFsType};
89        use quick_protobuf::{BytesWriter, MessageWrite, Writer};
90
91        // FIXME: ideas on how to turn this into a HAMT sharding on some heuristic. we probably
92        // need to introduce states in to the "iterator":
93        //
94        // 1. bucketization
95        // 2. another post order visit of the buckets?
96        //
97        // the nested post order visit should probably re-use the existing infra ("message
98        // passing") and new ids can be generated by giving this iterator the counter from
99        // BufferedTreeBuilder.
100        //
101        // could also be that the HAMT shard building should start earlier, since the same
102        // heuristic can be detected *at* bufferedtreewriter. there the split would be easier, and
103        // this would "just" be a single node rendering, and not need any additional states..
104
105        let node = CustomFlatUnixFs {
106            links,
107            data: UnixFs {
108                Type: UnixFsType::Directory,
109                ..Default::default()
110            },
111        };
112
113        let size = node.get_size();
114
115        if let Some(limit) = block_size_limit {
116            let size = size as u64;
117            if *limit < size {
118                // FIXME: this could probably be detected at builder
119                return Err(TreeConstructionFailed::TooLargeBlock(size));
120            }
121        }
122
123        let cap = buffer.capacity();
124
125        if let Some(additional) = size.checked_sub(cap) {
126            buffer.reserve(additional);
127        }
128
129        if let Some(mut needed_zeroes) = size.checked_sub(buffer.len()) {
130            let zeroes = [0; 8];
131
132            while needed_zeroes > 8 {
133                buffer.extend_from_slice(&zeroes[..]);
134                needed_zeroes -= zeroes.len();
135            }
136
137            buffer.extend(core::iter::repeat(0).take(needed_zeroes));
138        }
139
140        let mut writer = Writer::new(BytesWriter::new(&mut buffer[..]));
141        node.write_message(&mut writer)
142            .map_err(TreeConstructionFailed::Protobuf)?;
143
144        buffer.truncate(size);
145
146        let mh = Code::Sha2_256.digest(buffer);
147        let cid = Cid::new_v1(DAG_PB, mh);
148
149        let combined_from_links = links
150            .iter()
151            .map(|opt| {
152                opt.as_ref()
153                    .map(|NamedLeaf(_, _, total_size)| total_size)
154                    .unwrap()
155            })
156            .sum::<u64>();
157
158        Ok(Leaf {
159            link: cid,
160            total_size: buffer.len() as u64 + combined_from_links,
161        })
162    }
163
164    /// Construct the next dag-pb node, if any.
165    ///
166    /// Returns a `TreeNode` of the latest constructed tree node.
167    pub fn next_borrowed(&mut self) -> Option<Result<TreeNode<'_>, TreeConstructionFailed>> {
168        while let Some(visited) = self.pending.pop() {
169            let (name, depth) = match &visited {
170                Visited::DescentRoot(_) => (None, 0),
171                Visited::Descent { name, depth, .. } => (Some(name.as_ref()), *depth),
172                Visited::Post { name, depth, .. } => (Some(name.as_ref()), *depth),
173                Visited::PostRoot { .. } => (None, 0),
174            };
175
176            update_full_path((&mut self.full_path, &mut self.old_depth), name, depth);
177
178            match visited {
179                Visited::DescentRoot(node) => {
180                    let children = &mut self.reused_children;
181                    let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children);
182                    let any_children = !children.is_empty();
183
184                    let leaves = if any_children {
185                        self.persisted_cids.insert(node.id, leaves);
186                        LeafStorage::from(node.id)
187                    } else {
188                        leaves.into()
189                    };
190
191                    self.pending.push(Visited::PostRoot { leaves });
192                    self.pending.append(children);
193                }
194                Visited::Descent {
195                    node,
196                    name,
197                    depth,
198                    index,
199                } => {
200                    let children = &mut self.reused_children;
201                    let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children);
202                    let any_children = !children.is_empty();
203                    let parent_id = node.parent_id.expect("only roots parent_id is None");
204
205                    let leaves = if any_children {
206                        self.persisted_cids.insert(node.id, leaves);
207                        node.id.into()
208                    } else {
209                        leaves.into()
210                    };
211
212                    self.pending.push(Visited::Post {
213                        parent_id,
214                        name,
215                        depth,
216                        leaves,
217                        index,
218                    });
219
220                    self.pending.append(children);
221                }
222                Visited::Post {
223                    parent_id,
224                    name,
225                    leaves,
226                    index,
227                    ..
228                } => {
229                    let leaves = leaves.into_inner(&mut self.persisted_cids);
230                    let buffer = &mut self.block_buffer;
231
232                    let leaf = match Self::render_directory(
233                        &leaves,
234                        buffer,
235                        &self.opts.block_size_limit,
236                    ) {
237                        Ok(leaf) => leaf,
238                        Err(e) => return Some(Err(e)),
239                    };
240
241                    self.cid = Some(leaf.link);
242                    self.total_size = leaf.total_size;
243
244                    {
245                        // name is None only for wrap_with_directory, which cannot really be
246                        // propagated up but still the parent_id is allowed to be None
247                        let parent_leaves = self.persisted_cids.get_mut(&parent_id);
248
249                        match (parent_id, parent_leaves, index) {
250                            (pid, None, index) => panic!(
251                                "leaves not found for parent_id = {} and index = {}",
252                                pid, index
253                            ),
254                            (_, Some(vec), index) => {
255                                let cell = &mut vec[index];
256                                // all
257                                assert!(cell.is_none());
258                                *cell = Some(NamedLeaf(name, leaf.link, leaf.total_size));
259                            }
260                        }
261                    }
262
263                    return Some(Ok(TreeNode {
264                        path: self.full_path.as_str(),
265                        cid: self.cid.as_ref().unwrap(),
266                        total_size: self.total_size,
267                        block: &self.block_buffer,
268                    }));
269                }
270                Visited::PostRoot { leaves } => {
271                    let leaves = leaves.into_inner(&mut self.persisted_cids);
272
273                    if !self.opts.wrap_with_directory {
274                        break;
275                    }
276
277                    let buffer = &mut self.block_buffer;
278
279                    let leaf = match Self::render_directory(
280                        &leaves,
281                        buffer,
282                        &self.opts.block_size_limit,
283                    ) {
284                        Ok(leaf) => leaf,
285                        Err(e) => return Some(Err(e)),
286                    };
287
288                    self.cid = Some(leaf.link);
289                    self.total_size = leaf.total_size;
290
291                    return Some(Ok(TreeNode {
292                        path: self.full_path.as_str(),
293                        cid: self.cid.as_ref().unwrap(),
294                        total_size: self.total_size,
295                        block: &self.block_buffer,
296                    }));
297                }
298            }
299        }
300        None
301    }
302}
303
304impl Iterator for PostOrderIterator {
305    type Item = Result<OwnedTreeNode, TreeConstructionFailed>;
306
307    fn next(&mut self) -> Option<Self::Item> {
308        self.next_borrowed()
309            .map(|res| res.map(TreeNode::into_owned))
310    }
311}
312
313/// Borrowed representation of a node in the tree.
314pub struct TreeNode<'a> {
315    /// Full path to the node.
316    pub path: &'a str,
317    /// The Cid of the document.
318    pub cid: &'a Cid,
319    /// Cumulative total size of the subtree in bytes.
320    pub total_size: u64,
321    /// Raw dag-pb document.
322    pub block: &'a [u8],
323}
324
325impl<'a> fmt::Debug for TreeNode<'a> {
326    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
327        fmt.debug_struct("TreeNode")
328            .field("path", &format_args!("{:?}", self.path))
329            .field("cid", &format_args!("{}", self.cid))
330            .field("total_size", &self.total_size)
331            .field("size", &self.block.len())
332            .finish()
333    }
334}
335
336impl TreeNode<'_> {
337    /// Convert to an owned and detached representation.
338    pub fn into_owned(self) -> OwnedTreeNode {
339        OwnedTreeNode {
340            path: self.path.to_owned(),
341            cid: self.cid.to_owned(),
342            total_size: self.total_size,
343            block: self.block.into(),
344        }
345    }
346}
347
348/// Owned representation of a node in the tree.
349pub struct OwnedTreeNode {
350    /// Full path to the node.
351    pub path: String,
352    /// The Cid of the document.
353    pub cid: Cid,
354    /// Cumulative total size of the subtree in bytes.
355    pub total_size: u64,
356    /// Raw dag-pb document.
357    pub block: Box<[u8]>,
358}
359
360fn update_full_path(
361    (full_path, old_depth): (&mut String, &mut usize),
362    name: Option<&str>,
363    depth: usize,
364) {
365    if depth < 2 {
366        // initially thought it might be a good idea to add a slash to all components; removing it made
367        // it impossible to get back down to empty string, so fixing this for depths 0 and 1.
368        full_path.clear();
369        *old_depth = 0;
370    } else {
371        while *old_depth >= depth && *old_depth > 0 {
372            // we now want to pop the last segment
373            // this would be easier with PathBuf
374            let slash_at = full_path.bytes().rposition(|ch| ch == b'/');
375            if let Some(slash_at) = slash_at {
376                if *old_depth == depth && Some(&full_path[(slash_at + 1)..]) == name {
377                    // minor unmeasurable perf optimization:
378                    // going from a/b/foo/zz => a/b/foo does not need to go through the a/b
379                    return;
380                }
381                full_path.truncate(slash_at);
382                *old_depth -= 1;
383            } else {
384                todo!(
385                    "no last slash_at in {:?} yet {} >= {}",
386                    full_path,
387                    old_depth,
388                    depth
389                );
390            }
391        }
392    }
393
394    debug_assert!(*old_depth <= depth);
395
396    if let Some(name) = name {
397        if !full_path.is_empty() {
398            full_path.push('/');
399        }
400        full_path.push_str(name);
401        *old_depth += 1;
402    }
403
404    assert_eq!(*old_depth, depth);
405}
406
407/// Returns a Vec of the links in order with only the leaves, the given `children` will contain yet
408/// incomplete nodes of the tree.
409fn partition_children_leaves(
410    depth: usize,
411    it: impl Iterator<Item = (String, Entry)>,
412    children: &mut Vec<Visited>,
413) -> Leaves {
414    let mut leaves = Vec::new();
415
416    for (i, (k, v)) in it.enumerate() {
417        match v {
418            Entry::Directory(node) => {
419                children.push(Visited::Descent {
420                    node,
421                    // this needs to be pushed down to update the full_path
422                    name: k,
423                    depth: depth + 1,
424                    index: i,
425                });
426
427                // this will be overwritten later, but the order is fixed
428                leaves.push(None);
429            }
430            Entry::Leaf(leaf) => leaves.push(Some(NamedLeaf(k, leaf.link, leaf.total_size))),
431        }
432    }
433
434    leaves
435}
436
437#[derive(Debug)]
438enum LeafStorage {
439    Direct(Leaves),
440    Stashed(u64),
441}
442
443impl LeafStorage {
444    fn into_inner(self, stash: &mut HashMap<u64, Leaves>) -> Leaves {
445        use LeafStorage::*;
446
447        match self {
448            Direct(leaves) => leaves,
449            Stashed(id) => stash
450                .remove(&id)
451                .ok_or(id)
452                .expect("leaves are either stashed or direct, must able to find with id"),
453        }
454    }
455}
456
457impl From<u64> for LeafStorage {
458    fn from(key: u64) -> LeafStorage {
459        LeafStorage::Stashed(key)
460    }
461}
462
463impl From<Leaves> for LeafStorage {
464    fn from(leaves: Leaves) -> LeafStorage {
465        LeafStorage::Direct(leaves)
466    }
467}