Skip to main content

dua/
traverse.rs

1use crate::{Throttle, WalkOptions, crossdev, get_size_or_panic, inodefilter::InodeFilter};
2
3use crossbeam::channel::Receiver;
4use filesize::PathExt;
5use petgraph::{Directed, Direction, graph::NodeIndex, stable_graph::StableGraph};
6use std::time::Instant;
7use std::{
8    fmt,
9    fs::Metadata,
10    io,
11    path::{Path, PathBuf},
12    sync::Arc,
13    time::{Duration, SystemTime, UNIX_EPOCH},
14};
15
16/// Node index type used by the traversal tree graph.
17pub type TreeIndex = NodeIndex;
18/// Graph type used to represent traversed filesystem entries.
19pub type Tree = StableGraph<EntryData, (), Directed>;
20
21/// Data stored for each filesystem entry in the traversal tree.
22#[derive(Eq, PartialEq, Clone)]
23pub struct EntryData {
24    /// The entry name relative to its parent.
25    pub name: PathBuf,
26    /// The entry's size in bytes. If it's a directory, the size is the aggregated file size of all children
27    /// plus the  size of the directory entry itself
28    pub size: u128,
29    /// Last modification time if available.
30    pub mtime: SystemTime,
31    /// Recursive entry count for directories, or `None` for files.
32    pub entry_count: Option<u64>,
33    /// If set, the item meta-data could not be obtained
34    pub metadata_io_error: bool,
35    /// `true` if the entry is a directory.
36    pub is_dir: bool,
37}
38
39impl Default for EntryData {
40    fn default() -> EntryData {
41        EntryData {
42            name: PathBuf::default(),
43            size: u128::default(),
44            mtime: UNIX_EPOCH,
45            entry_count: None,
46            metadata_io_error: bool::default(),
47            is_dir: false,
48        }
49    }
50}
51
52impl fmt::Debug for EntryData {
53    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54        f.debug_struct("EntryData")
55            .field("name", &self.name)
56            .field("size", &self.size)
57            .field("entry_count", &self.entry_count)
58            // Skip mtime
59            .field("metadata_io_error", &self.metadata_io_error)
60            .finish()
61    }
62}
63
64/// The result of the previous filesystem traversal
65#[derive(Debug)]
66pub struct Traversal {
67    /// A tree representing the entire filestem traversal
68    pub tree: Tree,
69    /// The top-level node of the tree.
70    pub root_index: TreeIndex,
71    /// The time at which the instance was created, typically the start of the traversal.
72    pub start_time: Instant,
73    /// The time it cost to compute the traversal, when done.
74    pub cost: Option<Duration>,
75}
76
77impl Default for Traversal {
78    fn default() -> Self {
79        Self::new()
80    }
81}
82
83impl Traversal {
84    /// Create a new empty traversal with a synthetic root node.
85    pub fn new() -> Self {
86        let mut tree = Tree::new();
87        let root_index = tree.add_node(EntryData::default());
88        Self {
89            tree,
90            root_index,
91            start_time: Instant::now(),
92            cost: None,
93        }
94    }
95
96    /// Recompute the recursive size of `node_index` from its direct children.
97    pub(crate) fn recompute_node_size(&self, node_index: TreeIndex) -> u128 {
98        self.tree
99            .neighbors_directed(node_index, Direction::Outgoing)
100            .map(|idx| get_size_or_panic(&self.tree, idx))
101            .sum()
102    }
103
104    /// Return `true` if this traversal is considered expensive to recompute.
105    pub fn is_costly(&self) -> bool {
106        self.cost.is_none_or(|d| d.as_secs_f32() > 10.0)
107    }
108}
109
110/// Runtime statistics gathered while traversal is running.
111#[derive(Clone, Copy)]
112pub struct TraversalStats {
113    /// Amount of files or directories we have seen during the filesystem traversal
114    pub entries_traversed: u64,
115    /// The time at which the traversal started.
116    pub start: std::time::Instant,
117    /// The amount of time it took to finish the traversal. Set only once done.
118    pub elapsed: Option<std::time::Duration>,
119    /// Total amount of IO errors encountered when traversing the filesystem
120    pub io_errors: u64,
121    /// Total amount of bytes seen during the traversal
122    pub total_bytes: Option<u128>,
123}
124
125impl Default for TraversalStats {
126    fn default() -> Self {
127        Self {
128            entries_traversed: 0,
129            start: std::time::Instant::now(),
130            elapsed: None,
131            io_errors: 0,
132            total_bytes: None,
133        }
134    }
135}
136
137/// Accumulator used while rolling up directory information.
138#[derive(Default, Copy, Clone)]
139struct EntryInfo {
140    /// Accumulated size in bytes.
141    size: u128,
142    /// Accumulated entry count if known.
143    entries_count: Option<u64>,
144}
145
146impl EntryInfo {
147    /// Add `other`'s `entries_count` into `self`, preserving `None` semantics.
148    fn add_count(&mut self, other: &Self) {
149        self.entries_count = match (self.entries_count, other.entries_count) {
150            (Some(a), Some(b)) => Some(a + b),
151            (None, Some(b)) => Some(b),
152            (Some(a), None) => Some(a),
153            (None, None) => None,
154        };
155    }
156}
157
158/// Update `node_idx` with aggregated directory information.
159///
160/// `node_own_size` is added on top of `EntryInfo::size`.
161fn set_entry_info_or_panic(
162    tree: &mut Tree,
163    node_idx: TreeIndex,
164    node_own_size: u128,
165    EntryInfo {
166        size,
167        entries_count,
168    }: EntryInfo,
169) {
170    let node = tree
171        .node_weight_mut(node_idx)
172        .expect("node for parent index we just retrieved");
173    node.size = size + node_own_size;
174    node.entry_count = entries_count.map(|n| n + 1);
175}
176
177/// Return the parent of `parent_node_idx` or panic if none exists.
178fn parent_or_panic(tree: &mut Tree, parent_node_idx: TreeIndex) -> TreeIndex {
179    tree.neighbors_directed(parent_node_idx, Direction::Incoming)
180        .next()
181        .expect("every node in the iteration has a parent")
182}
183
184/// Pop one element from `v` or panic if `v` is empty.
185fn pop_or_panic<T>(v: &mut Vec<T>) -> T {
186    v.pop().expect("sizes per level to be in sync with graph")
187}
188
189/// A single result emitted by the jwalk iterator.
190type TraversalEntry =
191    Result<jwalk::DirEntry<((), Option<Result<std::fs::Metadata, jwalk::Error>>)>, jwalk::Error>;
192
193#[allow(clippy::large_enum_variant)]
194/// Events emitted by a background filesystem traversal.
195pub enum TraversalEvent {
196    /// A discovered entry and its traversal context.
197    Entry(TraversalEntry, Arc<PathBuf>, u64),
198    /// Traversal completed with additional I/O error count.
199    Finished(u64),
200}
201
202/// An in-progress traversal which exposes newly obtained entries
203pub struct BackgroundTraversal {
204    walk_options: WalkOptions,
205    /// Tree node index that acts as root for this traversal integration.
206    pub root_idx: TreeIndex,
207    /// Running traversal statistics.
208    pub stats: TraversalStats,
209    previous_node_idx: TreeIndex,
210    parent_node_idx: TreeIndex,
211    directory_info_per_depth_level: Vec<EntryInfo>,
212    current_directory_at_depth: EntryInfo,
213    parent_node_size: u128,
214    parent_node_size_per_depth_level: Vec<u128>,
215    previous_node_size: u128,
216    previous_depth: usize,
217    inodes: InodeFilter,
218    throttle: Option<Throttle>,
219    skip_root: bool,
220    use_root_path: bool,
221    /// Receiver used to obtain traversal events from the worker thread.
222    pub event_rx: Receiver<TraversalEvent>,
223}
224
225impl BackgroundTraversal {
226    /// Start a background thread to perform the actual tree walk, and dispatch the results
227    /// as events to be received on [BackgroundTraversal::event_rx].
228    pub fn start(
229        root_idx: TreeIndex,
230        walk_options: &WalkOptions,
231        input: Vec<PathBuf>,
232        skip_root: bool,
233        use_root_path: bool,
234    ) -> anyhow::Result<BackgroundTraversal> {
235        let (entry_tx, entry_rx) = crossbeam::channel::bounded(100);
236        std::thread::Builder::new()
237            .name("dua-fs-walk-dispatcher".to_string())
238            .spawn({
239                let walk_options = walk_options.clone();
240                let mut io_errors: u64 = 0;
241                move || {
242                    for root_path in input.into_iter() {
243                        log::info!("Walking {root_path:?}");
244                        let device_id = match crossdev::init(root_path.as_ref()) {
245                            Ok(id) => id,
246                            Err(_) => {
247                                io_errors += 1;
248                                continue;
249                            }
250                        };
251
252                        let root_path = Arc::new(root_path);
253                        for entry in walk_options
254                            .iter_from_path(root_path.as_ref(), device_id, skip_root)
255                            .into_iter()
256                        {
257                            if entry_tx
258                                .send(TraversalEvent::Entry(
259                                    entry,
260                                    Arc::clone(&root_path),
261                                    device_id,
262                                ))
263                                .is_err()
264                            {
265                                // The channel is closed, this means the user has
266                                // requested to quit the app. Abort the walking.
267                                return;
268                            }
269                        }
270                    }
271                    if entry_tx.send(TraversalEvent::Finished(io_errors)).is_err() {
272                        log::error!("Failed to send TraversalEvents::Finished event");
273                    }
274                }
275            })?;
276
277        Ok(Self {
278            walk_options: walk_options.clone(),
279            root_idx,
280            stats: TraversalStats::default(),
281            previous_node_idx: root_idx,
282            parent_node_idx: root_idx,
283            previous_node_size: 0,
284            parent_node_size: 0,
285            parent_node_size_per_depth_level: Vec::new(),
286            directory_info_per_depth_level: Vec::new(),
287            current_directory_at_depth: EntryInfo::default(),
288            previous_depth: 0,
289            inodes: InodeFilter::default(),
290            throttle: Some(Throttle::new(Duration::from_millis(250), None)),
291            skip_root,
292            use_root_path,
293            event_rx: entry_rx,
294        })
295    }
296
297    /// Integrate `event` into traversal `t` so its information is represented by it.
298    /// This builds the traversal tree from a directory-walk.
299    ///
300    /// Returns
301    /// * `Some(true)` if the traversal is finished
302    /// * `Some(false)` if the caller may update its state after throttling kicked in
303    /// * `None` - the event was written into the traversal, but there is nothing else to do
304    pub fn integrate_traversal_event(
305        &mut self,
306        traversal: &mut Traversal,
307        event: TraversalEvent,
308    ) -> Option<bool> {
309        match event {
310            TraversalEvent::Entry(entry, root_path, device_id) => {
311                self.stats.entries_traversed += 1;
312                let mut data = EntryData::default();
313                match entry {
314                    Ok(mut entry) => {
315                        if self.skip_root {
316                            entry.depth -= 1;
317                            data.name = entry.file_name.into()
318                        } else {
319                            data.name = if entry.depth < 1 && self.use_root_path {
320                                (*root_path).clone()
321                            } else {
322                                entry.file_name.into()
323                            }
324                        }
325
326                        let mut file_size = 0u128;
327                        let mut mtime: SystemTime = UNIX_EPOCH;
328                        let mut file_count = 0u64;
329                        match &entry.client_state {
330                            Some(Ok(m)) => {
331                                if self.walk_options.count_hard_links
332                                    || self.inodes.add(m)
333                                        && (self.walk_options.cross_filesystems
334                                            || crossdev::is_same_device(device_id, m))
335                                {
336                                    file_count = 1;
337                                    if self.walk_options.apparent_size {
338                                        file_size = m.len() as u128;
339                                    } else {
340                                        file_size = size_on_disk(&entry.parent_path, &data.name, m)
341                                            .unwrap_or_else(|_| {
342                                                self.stats.io_errors += 1;
343                                                data.metadata_io_error = true;
344                                                0
345                                            })
346                                            as u128;
347                                    }
348                                } else {
349                                    data.entry_count = Some(0);
350                                    data.is_dir = true;
351                                }
352
353                                match m.modified() {
354                                    Ok(modified) => {
355                                        mtime = modified;
356                                    }
357                                    Err(_) => {
358                                        self.stats.io_errors += 1;
359                                        data.metadata_io_error = true;
360                                    }
361                                }
362                            }
363                            Some(Err(_)) => {
364                                self.stats.io_errors += 1;
365                                data.metadata_io_error = true;
366                            }
367                            None => {}
368                        }
369
370                        match (entry.depth, self.previous_depth) {
371                            (n, p) if n > p => {
372                                self.directory_info_per_depth_level
373                                    .push(self.current_directory_at_depth);
374                                self.current_directory_at_depth = EntryInfo {
375                                    size: file_size,
376                                    entries_count: Some(file_count),
377                                };
378
379                                self.parent_node_size_per_depth_level
380                                    .push(self.previous_node_size);
381
382                                self.parent_node_idx = self.previous_node_idx;
383                                self.parent_node_size = self.previous_node_size;
384                            }
385                            (n, p) if n < p => {
386                                for _ in n..p {
387                                    set_entry_info_or_panic(
388                                        &mut traversal.tree,
389                                        self.parent_node_idx,
390                                        self.parent_node_size,
391                                        self.current_directory_at_depth,
392                                    );
393                                    let dir_info =
394                                        pop_or_panic(&mut self.directory_info_per_depth_level);
395
396                                    self.current_directory_at_depth.size += dir_info.size;
397                                    self.current_directory_at_depth.add_count(&dir_info);
398
399                                    self.parent_node_idx =
400                                        parent_or_panic(&mut traversal.tree, self.parent_node_idx);
401                                    self.parent_node_size =
402                                        pop_or_panic(&mut self.parent_node_size_per_depth_level);
403                                }
404                                self.current_directory_at_depth.size += file_size;
405                                *self
406                                    .current_directory_at_depth
407                                    .entries_count
408                                    .get_or_insert(0) += file_count;
409                                set_entry_info_or_panic(
410                                    &mut traversal.tree,
411                                    self.parent_node_idx,
412                                    self.parent_node_size,
413                                    self.current_directory_at_depth,
414                                );
415                            }
416                            _ => {
417                                self.current_directory_at_depth.size += file_size;
418                                *self
419                                    .current_directory_at_depth
420                                    .entries_count
421                                    .get_or_insert(0) += file_count;
422                            }
423                        };
424
425                        data.mtime = mtime;
426                        data.size = file_size;
427                        let entry_index = traversal.tree.add_node(data);
428
429                        traversal
430                            .tree
431                            .add_edge(self.parent_node_idx, entry_index, ());
432                        self.previous_node_idx = entry_index;
433                        self.previous_node_size = file_size;
434                        self.previous_depth = entry.depth;
435                    }
436                    Err(_) => {
437                        if self.previous_depth == 0 {
438                            data.name.clone_from(&(*root_path));
439                            let entry_index = traversal.tree.add_node(data);
440                            traversal
441                                .tree
442                                .add_edge(self.parent_node_idx, entry_index, ());
443                        }
444
445                        self.stats.io_errors += 1
446                    }
447                }
448
449                if self.throttle.as_ref().is_some_and(|t| t.can_update()) {
450                    return Some(false);
451                }
452            }
453            TraversalEvent::Finished(io_errors) => {
454                self.stats.io_errors += io_errors;
455
456                self.throttle = None;
457                self.directory_info_per_depth_level
458                    .push(self.current_directory_at_depth);
459                self.current_directory_at_depth = EntryInfo::default();
460                for _ in 0..self.previous_depth {
461                    let dir_info = pop_or_panic(&mut self.directory_info_per_depth_level);
462                    self.current_directory_at_depth.size += dir_info.size;
463                    self.current_directory_at_depth.add_count(&dir_info);
464
465                    set_entry_info_or_panic(
466                        &mut traversal.tree,
467                        self.parent_node_idx,
468                        self.parent_node_size,
469                        self.current_directory_at_depth,
470                    );
471                    self.parent_node_idx =
472                        parent_or_panic(&mut traversal.tree, self.parent_node_idx);
473                }
474                let root_size = traversal.recompute_node_size(self.root_idx);
475                set_entry_info_or_panic(
476                    &mut traversal.tree,
477                    self.root_idx,
478                    root_size,
479                    EntryInfo {
480                        size: root_size,
481                        entries_count: (self.stats.entries_traversed > 0)
482                            .then_some(self.stats.entries_traversed),
483                    },
484                );
485                self.stats.total_bytes = Some(root_size);
486                self.stats.elapsed = Some(self.stats.start.elapsed());
487
488                return Some(true);
489            }
490        }
491        None
492    }
493}
494
495#[cfg(not(windows))]
496/// Return disk usage for `name` on Unix-like platforms.
497fn size_on_disk(_parent: &Path, name: &Path, meta: &Metadata) -> io::Result<u64> {
498    name.size_on_disk_fast(meta)
499}
500
501#[cfg(windows)]
502/// Return disk usage for `name` on Windows platforms.
503fn size_on_disk(parent: &Path, name: &Path, meta: &Metadata) -> io::Result<u64> {
504    parent.join(name).size_on_disk_fast(meta)
505}
506
507#[cfg(test)]
508mod tests {
509    use super::*;
510
511    #[test]
512    fn size_of_entry_data() {
513        assert!(
514            std::mem::size_of::<EntryData>() <= 80,
515            "the size of this ({}) should not exceed 80 as it affects overall memory consumption",
516            std::mem::size_of::<EntryData>()
517        );
518    }
519}