ipfs_unixfs/dir/builder/
iter.rs

1use super::{
2    CustomFlatUnixFs, DirBuilder, Entry, Leaf, NamedLeaf, TreeConstructionFailed, TreeOptions,
3};
4use cid::Cid;
5use core::fmt;
6use std::collections::HashMap;
7
8/// Constructs the directory nodes required for a tree.
9///
10/// Implements the Iterator interface for owned values and the borrowed version, `next_borrowed`.
11/// The tree is fully constructed once this has been exhausted.
12pub struct PostOrderIterator {
13    full_path: String,
14    old_depth: usize,
15    block_buffer: Vec<u8>,
16    // our stack of pending work
17    pending: Vec<Visited>,
18    // "communication channel" from nested entries back to their parents; this hashmap is only used
19    // in the event of mixed child nodes (leaves and nodes).
20    persisted_cids: HashMap<u64, Vec<Option<NamedLeaf>>>,
21    reused_children: Vec<Visited>,
22    cid: Option<Cid>,
23    total_size: u64,
24    // from TreeOptions
25    opts: TreeOptions,
26}
27
28/// The link list used to create the directory node. This list is created from a the BTreeMap
29/// inside DirBuilder, and initially it will have `Some` values only for the initial leaves and
30/// `None` values for subnodes which are not yet ready. At the time of use, this list is expected
31/// to have only `Some` values.
32type Leaves = Vec<Option<NamedLeaf>>;
33
34/// The nodes in the visit. We need to do a post-order visit, which starts from a single
35/// `DescentRoot`, followed by N `Descents` where N is the deepest directory in the tree. On each
36/// descent, we'll need to first schedule a `Post` (or `PostRoot`) followed the immediate children
37/// of the node. Directories are rendered when all of their direct and indirect descendants have
38/// been serialized into NamedLeafs.
39#[derive(Debug)]
40enum Visited {
41    // handle root differently not to infect with the Option<String> and Option<usize>
42    DescentRoot(DirBuilder),
43    Descent {
44        node: DirBuilder,
45        name: String,
46        depth: usize,
47        /// The index in the parents `Leaves` accessible through `PostOrderIterator::persisted_cids`.
48        index: usize,
49    },
50    Post {
51        parent_id: u64,
52        depth: usize,
53        name: String,
54        index: usize,
55        /// Leaves will be stored directly in this field when there are no DirBuilder descendants,
56        /// in the `PostOrderIterator::persisted_cids` otherwise.
57        leaves: LeafStorage,
58    },
59    PostRoot {
60        leaves: LeafStorage,
61    },
62}
63
64impl PostOrderIterator {
65    pub(super) fn new(root: DirBuilder, opts: TreeOptions, longest_path: usize) -> Self {
66        let root = Visited::DescentRoot(root);
67        PostOrderIterator {
68            full_path: String::with_capacity(longest_path),
69            old_depth: 0,
70            block_buffer: Default::default(),
71            pending: vec![root],
72            persisted_cids: Default::default(),
73            reused_children: Vec::new(),
74            cid: None,
75            total_size: 0,
76            opts,
77        }
78    }
79
80    fn render_directory(
81        links: &[Option<NamedLeaf>],
82        buffer: &mut Vec<u8>,
83        block_size_limit: &Option<u64>,
84    ) -> Result<Leaf, TreeConstructionFailed> {
85        use crate::pb::{UnixFs, UnixFsType};
86        use quick_protobuf::{BytesWriter, MessageWrite, Writer};
87        use sha2::{Digest, Sha256};
88
89        // FIXME: ideas on how to turn this into a HAMT sharding on some heuristic. we probably
90        // need to introduce states in to the "iterator":
91        //
92        // 1. bucketization
93        // 2. another post order visit of the buckets?
94        //
95        // the nested post order visit should probably re-use the existing infra ("message
96        // passing") and new ids can be generated by giving this iterator the counter from
97        // BufferedTreeBuilder.
98        //
99        // could also be that the HAMT shard building should start earlier, since the same
100        // heuristic can be detected *at* bufferedtreewriter. there the split would be easier, and
101        // this would "just" be a single node rendering, and not need any additional states..
102
103        let node = CustomFlatUnixFs {
104            links,
105            data: UnixFs {
106                Type: UnixFsType::Directory,
107                ..Default::default()
108            },
109        };
110
111        let size = node.get_size();
112
113        if let Some(limit) = block_size_limit {
114            let size = size as u64;
115            if *limit < size {
116                // FIXME: this could probably be detected at builder
117                return Err(TreeConstructionFailed::TooLargeBlock(size));
118            }
119        }
120
121        let cap = buffer.capacity();
122
123        if let Some(additional) = size.checked_sub(cap) {
124            buffer.reserve(additional);
125        }
126
127        if let Some(mut needed_zeroes) = size.checked_sub(buffer.len()) {
128            let zeroes = [0; 8];
129
130            while needed_zeroes > 8 {
131                buffer.extend_from_slice(&zeroes[..]);
132                needed_zeroes -= zeroes.len();
133            }
134
135            buffer.extend(core::iter::repeat(0).take(needed_zeroes));
136        }
137
138        let mut writer = Writer::new(BytesWriter::new(&mut buffer[..]));
139        node.write_message(&mut writer)
140            .map_err(TreeConstructionFailed::Protobuf)?;
141
142        buffer.truncate(size);
143
144        let mh = multihash::wrap(multihash::Code::Sha2_256, &Sha256::digest(&buffer));
145        let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0");
146
147        let combined_from_links = links
148            .iter()
149            .map(|opt| {
150                opt.as_ref()
151                    .map(|NamedLeaf(_, _, total_size)| total_size)
152                    .unwrap()
153            })
154            .sum::<u64>();
155
156        Ok(Leaf {
157            link: cid,
158            total_size: buffer.len() as u64 + combined_from_links,
159        })
160    }
161
162    /// Construct the next dag-pb node, if any.
163    ///
164    /// Returns a `TreeNode` of the latest constructed tree node.
165    pub fn next_borrowed(&mut self) -> Option<Result<TreeNode<'_>, TreeConstructionFailed>> {
166        while let Some(visited) = self.pending.pop() {
167            let (name, depth) = match &visited {
168                Visited::DescentRoot(_) => (None, 0),
169                Visited::Descent { name, depth, .. } => (Some(name.as_ref()), *depth),
170                Visited::Post { name, depth, .. } => (Some(name.as_ref()), *depth),
171                Visited::PostRoot { .. } => (None, 0),
172            };
173
174            update_full_path((&mut self.full_path, &mut self.old_depth), name, depth);
175
176            match visited {
177                Visited::DescentRoot(node) => {
178                    let children = &mut self.reused_children;
179                    let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children);
180                    let any_children = !children.is_empty();
181
182                    let leaves = if any_children {
183                        self.persisted_cids.insert(node.id, leaves);
184                        LeafStorage::from(node.id)
185                    } else {
186                        leaves.into()
187                    };
188
189                    self.pending.push(Visited::PostRoot { leaves });
190                    self.pending.extend(children.drain(..));
191                }
192                Visited::Descent {
193                    node,
194                    name,
195                    depth,
196                    index,
197                } => {
198                    let children = &mut self.reused_children;
199                    let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children);
200                    let any_children = !children.is_empty();
201                    let parent_id = node.parent_id.expect("only roots parent_id is None");
202
203                    let leaves = if any_children {
204                        self.persisted_cids.insert(node.id, leaves);
205                        node.id.into()
206                    } else {
207                        leaves.into()
208                    };
209
210                    self.pending.push(Visited::Post {
211                        parent_id,
212                        name,
213                        depth,
214                        leaves,
215                        index,
216                    });
217
218                    self.pending.extend(children.drain(..));
219                }
220                Visited::Post {
221                    parent_id,
222                    name,
223                    leaves,
224                    index,
225                    ..
226                } => {
227                    let leaves = leaves.into_inner(&mut self.persisted_cids);
228                    let buffer = &mut self.block_buffer;
229
230                    let leaf = match Self::render_directory(
231                        &leaves,
232                        buffer,
233                        &self.opts.block_size_limit,
234                    ) {
235                        Ok(leaf) => leaf,
236                        Err(e) => return Some(Err(e)),
237                    };
238
239                    self.cid = Some(leaf.link.clone());
240                    self.total_size = leaf.total_size;
241
242                    {
243                        // name is None only for wrap_with_directory, which cannot really be
244                        // propagated up but still the parent_id is allowed to be None
245                        let parent_leaves = self.persisted_cids.get_mut(&parent_id);
246
247                        match (parent_id, parent_leaves, index) {
248                            (pid, None, index) => panic!(
249                                "leaves not found for parent_id = {} and index = {}",
250                                pid, index
251                            ),
252                            (_, Some(vec), index) => {
253                                let cell = &mut vec[index];
254                                // all
255                                assert!(cell.is_none());
256                                *cell = Some(NamedLeaf(name, leaf.link, leaf.total_size));
257                            }
258                        }
259                    }
260
261                    return Some(Ok(TreeNode {
262                        path: self.full_path.as_str(),
263                        cid: self.cid.as_ref().unwrap(),
264                        total_size: self.total_size,
265                        block: &self.block_buffer,
266                    }));
267                }
268                Visited::PostRoot { leaves } => {
269                    let leaves = leaves.into_inner(&mut self.persisted_cids);
270
271                    if !self.opts.wrap_with_directory {
272                        break;
273                    }
274
275                    let buffer = &mut self.block_buffer;
276
277                    let leaf = match Self::render_directory(
278                        &leaves,
279                        buffer,
280                        &self.opts.block_size_limit,
281                    ) {
282                        Ok(leaf) => leaf,
283                        Err(e) => return Some(Err(e)),
284                    };
285
286                    self.cid = Some(leaf.link.clone());
287                    self.total_size = leaf.total_size;
288
289                    return Some(Ok(TreeNode {
290                        path: self.full_path.as_str(),
291                        cid: self.cid.as_ref().unwrap(),
292                        total_size: self.total_size,
293                        block: &self.block_buffer,
294                    }));
295                }
296            }
297        }
298        None
299    }
300}
301
302impl Iterator for PostOrderIterator {
303    type Item = Result<OwnedTreeNode, TreeConstructionFailed>;
304
305    fn next(&mut self) -> Option<Self::Item> {
306        self.next_borrowed()
307            .map(|res| res.map(TreeNode::into_owned))
308    }
309}
310
311/// Borrowed representation of a node in the tree.
312pub struct TreeNode<'a> {
313    /// Full path to the node.
314    pub path: &'a str,
315    /// The Cid of the document.
316    pub cid: &'a Cid,
317    /// Cumulative total size of the subtree in bytes.
318    pub total_size: u64,
319    /// Raw dag-pb document.
320    pub block: &'a [u8],
321}
322
323impl<'a> fmt::Debug for TreeNode<'a> {
324    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
325        fmt.debug_struct("TreeNode")
326            .field("path", &format_args!("{:?}", self.path))
327            .field("cid", &format_args!("{}", self.cid))
328            .field("total_size", &self.total_size)
329            .field("size", &self.block.len())
330            .finish()
331    }
332}
333
334impl TreeNode<'_> {
335    /// Convert to an owned and detached representation.
336    pub fn into_owned(self) -> OwnedTreeNode {
337        OwnedTreeNode {
338            path: self.path.to_owned(),
339            cid: self.cid.to_owned(),
340            total_size: self.total_size,
341            block: self.block.into(),
342        }
343    }
344}
345
346/// Owned representation of a node in the tree.
347pub struct OwnedTreeNode {
348    /// Full path to the node.
349    pub path: String,
350    /// The Cid of the document.
351    pub cid: Cid,
352    /// Cumulative total size of the subtree in bytes.
353    pub total_size: u64,
354    /// Raw dag-pb document.
355    pub block: Box<[u8]>,
356}
357
358fn update_full_path(
359    (full_path, old_depth): (&mut String, &mut usize),
360    name: Option<&str>,
361    depth: usize,
362) {
363    if depth < 2 {
364        // initially thought it might be a good idea to add a slash to all components; removing it made
365        // it impossible to get back down to empty string, so fixing this for depths 0 and 1.
366        full_path.clear();
367        *old_depth = 0;
368    } else {
369        while *old_depth >= depth && *old_depth > 0 {
370            // we now want to pop the last segment
371            // this would be easier with PathBuf
372            let slash_at = full_path.bytes().rposition(|ch| ch == b'/');
373            if let Some(slash_at) = slash_at {
374                if *old_depth == depth && Some(&full_path[(slash_at + 1)..]) == name {
375                    // minor unmeasurable perf optimization:
376                    // going from a/b/foo/zz => a/b/foo does not need to go through the a/b
377                    return;
378                }
379                full_path.truncate(slash_at);
380                *old_depth -= 1;
381            } else {
382                todo!(
383                    "no last slash_at in {:?} yet {} >= {}",
384                    full_path,
385                    old_depth,
386                    depth
387                );
388            }
389        }
390    }
391
392    debug_assert!(*old_depth <= depth);
393
394    if let Some(name) = name {
395        if !full_path.is_empty() {
396            full_path.push_str("/");
397        }
398        full_path.push_str(name);
399        *old_depth += 1;
400    }
401
402    assert_eq!(*old_depth, depth);
403}
404
405/// Returns a Vec of the links in order with only the leaves, the given `children` will contain yet
406/// incomplete nodes of the tree.
407fn partition_children_leaves(
408    depth: usize,
409    it: impl Iterator<Item = (String, Entry)>,
410    children: &mut Vec<Visited>,
411) -> Leaves {
412    let mut leaves = Vec::new();
413
414    for (i, (k, v)) in it.enumerate() {
415        match v {
416            Entry::Directory(node) => {
417                children.push(Visited::Descent {
418                    node,
419                    // this needs to be pushed down to update the full_path
420                    name: k,
421                    depth: depth + 1,
422                    index: i,
423                });
424
425                // this will be overwritten later, but the order is fixed
426                leaves.push(None);
427            }
428            Entry::Leaf(leaf) => leaves.push(Some(NamedLeaf(k, leaf.link, leaf.total_size))),
429        }
430    }
431
432    leaves
433}
434
435#[derive(Debug)]
436enum LeafStorage {
437    Direct(Leaves),
438    Stashed(u64),
439}
440
441impl LeafStorage {
442    fn into_inner(self, stash: &mut HashMap<u64, Leaves>) -> Leaves {
443        use LeafStorage::*;
444
445        match self {
446            Direct(leaves) => leaves,
447            Stashed(id) => stash
448                .remove(&id)
449                .ok_or(id)
450                .expect("leaves are either stashed or direct, must able to find with id"),
451        }
452    }
453}
454
455impl From<u64> for LeafStorage {
456    fn from(key: u64) -> LeafStorage {
457        LeafStorage::Stashed(key)
458    }
459}
460
461impl From<Leaves> for LeafStorage {
462    fn from(leaves: Leaves) -> LeafStorage {
463        LeafStorage::Direct(leaves)
464    }
465}