rust_unixfs/file/
visit.rs

1use core::convert::TryFrom;
2use core::ops::Range;
3use ipld_core::cid::Cid;
4
5use crate::file::reader::{FileContent, FileReader, Traversal};
6use crate::file::{FileReadFailed, Metadata};
7use crate::pb::{merkledag::PBLink, FlatUnixFs};
8use crate::InvalidCidInLink;
9
10/// IdleFileVisit represents a prepared file visit over a tree. The user has to know the CID and be
11/// able to get the block for the visit.
12///
13/// **Note**: For easier to use interface, you should consider using `ipfs_unixfs::walk::Walker`.
14/// It uses `IdleFileVisit` and `FileVisit` internally but has a better API.
15#[derive(Default, Debug)]
16pub struct IdleFileVisit {
17    range: Option<Range<u64>>,
18}
19
20type FileVisitResult<'a> = (&'a [u8], u64, Metadata, Option<FileVisit>);
21
22impl IdleFileVisit {
23    /// Target range represents the target byte range of the file we are interested in visiting.
24    pub fn with_target_range(self, range: Range<u64>) -> Self {
25        Self { range: Some(range) }
26    }
27
28    /// Begins the visitation by processing the first block to be visited.
29    ///
30    /// Returns (on success) a tuple of file bytes, total file size, any metadata associated, and
31    /// optionally a `FileVisit` to continue the walk.
32    pub fn start(self, block: &'_ [u8]) -> Result<FileVisitResult<'_>, FileReadFailed> {
33        let fr = FileReader::from_block(block)?;
34        self.start_from_reader(fr, &mut None)
35    }
36
37    pub(crate) fn start_from_parsed<'a>(
38        self,
39        block: FlatUnixFs<'a>,
40        cache: &'_ mut Option<Cache>,
41    ) -> Result<FileVisitResult<'a>, FileReadFailed> {
42        let fr = FileReader::from_parsed(block)?;
43        self.start_from_reader(fr, cache)
44    }
45
46    fn start_from_reader<'a>(
47        self,
48        fr: FileReader<'a>,
49        cache: &'_ mut Option<Cache>,
50    ) -> Result<FileVisitResult<'a>, FileReadFailed> {
51        let metadata = fr.as_ref().to_owned();
52
53        let (content, traversal) = fr.content();
54
55        match content {
56            FileContent::Bytes(content) => {
57                let block = 0..content.len() as u64;
58                let content = maybe_target_slice(content, &block, self.range.as_ref());
59                Ok((content, traversal.file_size(), metadata, None))
60            }
61            FileContent::Links(iter) => {
62                // we need to select suitable here
63                let mut links = cache.take().unwrap_or_default().inner;
64
65                let pending = iter.enumerate().filter_map(|(i, (link, range))| {
66                    if !block_is_in_target_range(&range, self.range.as_ref()) {
67                        return None;
68                    }
69
70                    Some(to_pending(i, link, range))
71                });
72
73                for item in pending {
74                    links.push(item?);
75                }
76
77                // order is reversed to consume them in the depth first order
78                links.reverse();
79
80                if links.is_empty() {
81                    *cache = Some(links.into());
82                    Ok((&[][..], traversal.file_size(), metadata, None))
83                } else {
84                    Ok((
85                        &[][..],
86                        traversal.file_size(),
87                        metadata,
88                        Some(FileVisit {
89                            pending: links,
90                            state: traversal,
91                            range: self.range,
92                        }),
93                    ))
94                }
95            }
96        }
97    }
98}
99
100/// Optional cache for datastructures which can be re-used without re-allocation between walks of
101/// different files.
102#[derive(Default)]
103pub struct Cache {
104    inner: Vec<(Cid, Range<u64>)>,
105}
106
107impl From<Vec<(Cid, Range<u64>)>> for Cache {
108    fn from(mut inner: Vec<(Cid, Range<u64>)>) -> Self {
109        inner.clear();
110        Cache { inner }
111    }
112}
113
114/// FileVisit represents an ongoing visitation over an UnixFs File tree.
115///
116/// The file visitor does **not** implement size validation of merkledag links at the moment. This
117/// could be implmented with generational storage and it would require an u64 per link.
118///
119/// **Note**: For easier to use interface, you should consider using `ipfs_unixfs::walk::Walker`.
120/// It uses `IdleFileVisit` and `FileVisit` internally but has a better API.
121#[derive(Debug)]
122pub struct FileVisit {
123    /// The internal cache for pending work. Order is such that the next is always the last item,
124    /// so it can be popped. This currently does use a lot of memory for very large files.
125    ///
126    /// One workaround would be to transform excess links to relative links to some block of a Cid.
127    pending: Vec<(Cid, Range<u64>)>,
128    /// Target range, if any. Used to filter the links so that we will only visit interesting
129    /// parts.
130    range: Option<Range<u64>>,
131    state: Traversal,
132}
133
134impl FileVisit {
135    /// Access hashes of all pending links for prefetching purposes. The block for the first item
136    /// returned by this method is the one which needs to be processed next with `continue_walk`.
137    ///
138    /// Returns tuple of the next Cid which needs to be processed and an iterator over the
139    /// remaining.
140    pub fn pending_links(&self) -> (&Cid, impl Iterator<Item = &Cid>) {
141        let mut iter = self.pending.iter().rev().map(|(link, _)| link);
142        let first = iter
143            .next()
144            .expect("the presence of links has been validated");
145        (first, iter)
146    }
147
148    /// Continues the walk with the data for the first `pending_link` key.
149    ///
150    /// Returns on success a tuple of bytes and new version of `FileVisit` to continue the visit,
151    /// when there is something more to visit.
152    pub fn continue_walk<'a>(
153        mut self,
154        next: &'a [u8],
155        cache: &mut Option<Cache>,
156    ) -> Result<(&'a [u8], Option<Self>), FileReadFailed> {
157        let traversal = self.state;
158        let (_, range) = self
159            .pending
160            .pop()
161            .expect("User called continue_walk there must have been a next link");
162
163        // interesting, validation doesn't trigger if the range is the same?
164        let fr = traversal.continue_walk(next, &range)?;
165        let (content, traversal) = fr.content();
166        match content {
167            FileContent::Bytes(content) => {
168                let content = maybe_target_slice(content, &range, self.range.as_ref());
169
170                if !self.pending.is_empty() {
171                    self.state = traversal;
172                    Ok((content, Some(self)))
173                } else {
174                    *cache = Some(self.pending.into());
175                    Ok((content, None))
176                }
177            }
178            FileContent::Links(iter) => {
179                let before = self.pending.len();
180
181                for (i, (link, range)) in iter.enumerate() {
182                    if !block_is_in_target_range(&range, self.range.as_ref()) {
183                        continue;
184                    }
185
186                    self.pending.push(to_pending(i, link, range)?);
187                }
188
189                // reverse to keep the next link we need to traverse as last, where pop() operates.
190                self.pending[before..].reverse();
191
192                self.state = traversal;
193                Ok((&[][..], Some(self)))
194            }
195        }
196    }
197
198    /// Returns the total size of the file in bytes.
199    pub fn file_size(&self) -> u64 {
200        self.state.file_size()
201    }
202}
203
204impl AsRef<Metadata> for FileVisit {
205    fn as_ref(&self) -> &Metadata {
206        self.state.as_ref()
207    }
208}
209
210fn to_pending(
211    nth: usize,
212    link: PBLink<'_>,
213    range: Range<u64>,
214) -> Result<(Cid, Range<u64>), FileReadFailed> {
215    let hash = link.Hash.as_deref().unwrap_or_default();
216
217    match Cid::try_from(hash) {
218        Ok(cid) => Ok((cid, range)),
219        Err(e) => Err(FileReadFailed::InvalidCid(InvalidCidInLink::from((
220            nth, link, e,
221        )))),
222    }
223}
224
225/// Returns true if the blocks byte offsets are interesting for our target range, false otherwise.
226/// If there is no target, all blocks are of interest.
227fn block_is_in_target_range(block: &Range<u64>, target: Option<&Range<u64>>) -> bool {
228    use core::cmp::{max, min};
229
230    if let Some(target) = target {
231        max(block.start, target.start) <= min(block.end, target.end)
232    } else {
233        true
234    }
235}
236
237/// Whenever we propagate the content from the tree upwards, we need to make sure it's inside the
238/// range we were originally interested in.
239fn maybe_target_slice<'a>(
240    content: &'a [u8],
241    block: &Range<u64>,
242    target: Option<&Range<u64>>,
243) -> &'a [u8] {
244    if let Some(target) = target {
245        target_slice(content, block, target)
246    } else {
247        content
248    }
249}
250
251fn target_slice<'a>(content: &'a [u8], block: &Range<u64>, target: &Range<u64>) -> &'a [u8] {
252    use core::cmp::min;
253
254    if !block_is_in_target_range(block, Some(target)) {
255        // defaulting to empty slice is good, and similar to the "cat" HTTP API operation.
256        &[][..]
257    } else {
258        let start;
259        let end;
260
261        // FIXME: these must have bugs and must be possible to simplify
262        if target.start < block.start {
263            // we mostly need something before
264            start = 0;
265            end = (min(target.end, block.end) - block.start) as usize;
266        } else if target.end > block.end {
267            // we mostly need something after
268            start = (target.start - block.start) as usize;
269            end = (min(target.end, block.end) - block.start) as usize;
270        } else {
271            // inside
272            start = (target.start - block.start) as usize;
273            end = start + (target.end - target.start) as usize;
274        }
275
276        &content[start..end]
277    }
278}
279
280#[cfg(test)]
281mod tests {
282    use super::target_slice;
283
284    #[test]
285    #[allow(clippy::type_complexity)]
286    fn slice_for_target() {
287        use core::ops::Range;
288
289        // turns out these examples are not easy to determine at all
290        // writing out the type here avoids &b""[..] inside the array.
291        let cases: &[(&[u8], u64, Range<u64>, &[u8])] = &[
292            // xxxx xxxx cont ent_
293            // ^^^^ ^^^^
294            (b"content_", 8, 0..8, b""),
295            // xxxx xxxx cont ent_
296            // ^^^^ ^^^^ ^
297            (b"content_", 8, 0..9, b"c"),
298            // xxxx xxxx cont ent_
299            //  ^^^ ^^^^ ^^^^ ^^^^ ...
300            (b"content_", 8, 1..20, b"content_"),
301            // xxxx xxxx cont ent_
302            //         ^ ^^^^ ^^^^ ...
303            (b"content_", 8, 7..20, b"content_"),
304            // xxxx xxxx cont ent_
305            //           ^^^^ ^^^^ ...
306            (b"content_", 8, 8..20, b"content_"),
307            // xxxx xxxx cont ent_
308            //            ^^^ ^^^^ ...
309            (b"content_", 8, 9..20, b"ontent_"),
310            // xxxx xxxx cont ent_
311            //                   ^ ...
312            (b"content_", 8, 15..20, b"_"),
313            // xxxx xxxx cont ent_ yyyy
314            //                     ^^^^
315            (b"content_", 8, 16..20, b""),
316        ];
317
318        for (block_data, block_offset, target_range, expected) in cases {
319            let block_range = *block_offset..(block_offset + block_data.len() as u64);
320            let sliced = target_slice(block_data, &block_range, target_range);
321            assert_eq!(
322                sliced, *expected,
323                "slice {target_range:?} of block {block_range:?}"
324            );
325        }
326    }
327}