dua/
traverse.rs

1use crate::{Throttle, WalkOptions, crossdev, get_size_or_panic, inodefilter::InodeFilter};
2
3use crossbeam::channel::Receiver;
4use filesize::PathExt;
5use petgraph::{Directed, Direction, graph::NodeIndex, stable_graph::StableGraph};
6use std::time::Instant;
7use std::{
8    fmt,
9    fs::Metadata,
10    io,
11    path::{Path, PathBuf},
12    sync::Arc,
13    time::{Duration, SystemTime, UNIX_EPOCH},
14};
15
16pub type TreeIndex = NodeIndex;
17pub type Tree = StableGraph<EntryData, (), Directed>;
18
19#[derive(Eq, PartialEq, Clone)]
20pub struct EntryData {
21    pub name: PathBuf,
22    /// The entry's size in bytes. If it's a directory, the size is the aggregated file size of all children
23    /// plus the  size of the directory entry itself
24    pub size: u128,
25    pub mtime: SystemTime,
26    pub entry_count: Option<u64>,
27    /// If set, the item meta-data could not be obtained
28    pub metadata_io_error: bool,
29    pub is_dir: bool,
30}
31
32impl Default for EntryData {
33    fn default() -> EntryData {
34        EntryData {
35            name: PathBuf::default(),
36            size: u128::default(),
37            mtime: UNIX_EPOCH,
38            entry_count: None,
39            metadata_io_error: bool::default(),
40            is_dir: false,
41        }
42    }
43}
44
45impl fmt::Debug for EntryData {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        f.debug_struct("EntryData")
48            .field("name", &self.name)
49            .field("size", &self.size)
50            .field("entry_count", &self.entry_count)
51            // Skip mtime
52            .field("metadata_io_error", &self.metadata_io_error)
53            .finish()
54    }
55}
56
57/// The result of the previous filesystem traversal
58#[derive(Debug)]
59pub struct Traversal {
60    /// A tree representing the entire filestem traversal
61    pub tree: Tree,
62    /// The top-level node of the tree.
63    pub root_index: TreeIndex,
64    /// The time at which the instance was created, typically the start of the traversal.
65    pub start_time: Instant,
66    /// The time it cost to compute the traversal, when done.
67    pub cost: Option<Duration>,
68}
69
70impl Default for Traversal {
71    fn default() -> Self {
72        Self::new()
73    }
74}
75
76impl Traversal {
77    pub fn new() -> Self {
78        let mut tree = Tree::new();
79        let root_index = tree.add_node(EntryData::default());
80        Self {
81            tree,
82            root_index,
83            start_time: Instant::now(),
84            cost: None,
85        }
86    }
87
88    pub fn recompute_node_size(&self, node_index: TreeIndex) -> u128 {
89        self.tree
90            .neighbors_directed(node_index, Direction::Outgoing)
91            .map(|idx| get_size_or_panic(&self.tree, idx))
92            .sum()
93    }
94
95    pub fn is_costly(&self) -> bool {
96        self.cost.is_none_or(|d| d.as_secs_f32() > 10.0)
97    }
98}
99
100#[derive(Clone, Copy)]
101pub struct TraversalStats {
102    /// Amount of files or directories we have seen during the filesystem traversal
103    pub entries_traversed: u64,
104    /// The time at which the traversal started.
105    pub start: std::time::Instant,
106    /// The amount of time it took to finish the traversal. Set only once done.
107    pub elapsed: Option<std::time::Duration>,
108    /// Total amount of IO errors encountered when traversing the filesystem
109    pub io_errors: u64,
110    /// Total amount of bytes seen during the traversal
111    pub total_bytes: Option<u128>,
112}
113
114impl Default for TraversalStats {
115    fn default() -> Self {
116        Self {
117            entries_traversed: 0,
118            start: std::time::Instant::now(),
119            elapsed: None,
120            io_errors: 0,
121            total_bytes: None,
122        }
123    }
124}
125
126#[derive(Default, Copy, Clone)]
127pub struct EntryInfo {
128    pub size: u128,
129    pub entries_count: Option<u64>,
130}
131
132impl EntryInfo {
133    pub fn add_count(&mut self, other: &Self) {
134        self.entries_count = match (self.entries_count, other.entries_count) {
135            (Some(a), Some(b)) => Some(a + b),
136            (None, Some(b)) => Some(b),
137            (Some(a), None) => Some(a),
138            (None, None) => None,
139        };
140    }
141}
142
143pub fn set_entry_info_or_panic(
144    tree: &mut Tree,
145    node_idx: TreeIndex,
146    node_own_size: u128,
147    EntryInfo {
148        size,
149        entries_count,
150    }: EntryInfo,
151) {
152    let node = tree
153        .node_weight_mut(node_idx)
154        .expect("node for parent index we just retrieved");
155    node.size = size + node_own_size;
156    node.entry_count = entries_count.map(|n| n + 1);
157}
158
159pub fn parent_or_panic(tree: &mut Tree, parent_node_idx: TreeIndex) -> TreeIndex {
160    tree.neighbors_directed(parent_node_idx, Direction::Incoming)
161        .next()
162        .expect("every node in the iteration has a parent")
163}
164
165pub fn pop_or_panic<T>(v: &mut Vec<T>) -> T {
166    v.pop().expect("sizes per level to be in sync with graph")
167}
168
169pub type TraversalEntry =
170    Result<jwalk::DirEntry<((), Option<Result<std::fs::Metadata, jwalk::Error>>)>, jwalk::Error>;
171
172#[allow(clippy::large_enum_variant)]
173pub enum TraversalEvent {
174    Entry(TraversalEntry, Arc<PathBuf>, u64),
175    Finished(u64),
176}
177
178/// An in-progress traversal which exposes newly obtained entries
179pub struct BackgroundTraversal {
180    walk_options: WalkOptions,
181    pub root_idx: TreeIndex,
182    pub stats: TraversalStats,
183    previous_node_idx: TreeIndex,
184    parent_node_idx: TreeIndex,
185    directory_info_per_depth_level: Vec<EntryInfo>,
186    current_directory_at_depth: EntryInfo,
187    parent_node_size: u128,
188    parent_node_size_per_depth_level: Vec<u128>,
189    previous_node_size: u128,
190    previous_depth: usize,
191    inodes: InodeFilter,
192    throttle: Option<Throttle>,
193    skip_root: bool,
194    use_root_path: bool,
195    pub event_rx: Receiver<TraversalEvent>,
196}
197
198impl BackgroundTraversal {
199    /// Start a background thread to perform the actual tree walk, and dispatch the results
200    /// as events to be received on [BackgroundTraversal::event_rx].
201    pub fn start(
202        root_idx: TreeIndex,
203        walk_options: &WalkOptions,
204        input: Vec<PathBuf>,
205        skip_root: bool,
206        use_root_path: bool,
207    ) -> anyhow::Result<BackgroundTraversal> {
208        let (entry_tx, entry_rx) = crossbeam::channel::bounded(100);
209        std::thread::Builder::new()
210            .name("dua-fs-walk-dispatcher".to_string())
211            .spawn({
212                let walk_options = walk_options.clone();
213                let mut io_errors: u64 = 0;
214                move || {
215                    for root_path in input.into_iter() {
216                        log::info!("Walking {root_path:?}");
217                        let device_id = match crossdev::init(root_path.as_ref()) {
218                            Ok(id) => id,
219                            Err(_) => {
220                                io_errors += 1;
221                                continue;
222                            }
223                        };
224
225                        let root_path = Arc::new(root_path);
226                        for entry in walk_options
227                            .iter_from_path(root_path.as_ref(), device_id, skip_root)
228                            .into_iter()
229                        {
230                            if entry_tx
231                                .send(TraversalEvent::Entry(
232                                    entry,
233                                    Arc::clone(&root_path),
234                                    device_id,
235                                ))
236                                .is_err()
237                            {
238                                // The channel is closed, this means the user has
239                                // requested to quit the app. Abort the walking.
240                                return;
241                            }
242                        }
243                    }
244                    if entry_tx.send(TraversalEvent::Finished(io_errors)).is_err() {
245                        log::error!("Failed to send TraversalEvents::Finished event");
246                    }
247                }
248            })?;
249
250        Ok(Self {
251            walk_options: walk_options.clone(),
252            root_idx,
253            stats: TraversalStats::default(),
254            previous_node_idx: root_idx,
255            parent_node_idx: root_idx,
256            previous_node_size: 0,
257            parent_node_size: 0,
258            parent_node_size_per_depth_level: Vec::new(),
259            directory_info_per_depth_level: Vec::new(),
260            current_directory_at_depth: EntryInfo::default(),
261            previous_depth: 0,
262            inodes: InodeFilter::default(),
263            throttle: Some(Throttle::new(Duration::from_millis(250), None)),
264            skip_root,
265            use_root_path,
266            event_rx: entry_rx,
267        })
268    }
269
270    /// Integrate `event` into traversal `t` so its information is represented by it.
271    /// This builds the traversal tree from a directory-walk.
272    ///
273    /// Returns
274    /// * `Some(true)` if the traversal is finished
275    /// * `Some(false)` if the caller may update its state after throttling kicked in
276    /// * `None` - the event was written into the traversal, but there is nothing else to do
277    pub fn integrate_traversal_event(
278        &mut self,
279        traversal: &mut Traversal,
280        event: TraversalEvent,
281    ) -> Option<bool> {
282        match event {
283            TraversalEvent::Entry(entry, root_path, device_id) => {
284                self.stats.entries_traversed += 1;
285                let mut data = EntryData::default();
286                match entry {
287                    Ok(mut entry) => {
288                        if self.skip_root {
289                            entry.depth -= 1;
290                            data.name = entry.file_name.into()
291                        } else {
292                            data.name = if entry.depth < 1 && self.use_root_path {
293                                (*root_path).clone()
294                            } else {
295                                entry.file_name.into()
296                            }
297                        }
298
299                        let mut file_size = 0u128;
300                        let mut mtime: SystemTime = UNIX_EPOCH;
301                        let mut file_count = 0u64;
302                        match &entry.client_state {
303                            Some(Ok(m)) => {
304                                if self.walk_options.count_hard_links
305                                    || self.inodes.add(m)
306                                        && (self.walk_options.cross_filesystems
307                                            || crossdev::is_same_device(device_id, m))
308                                {
309                                    file_count = 1;
310                                    if self.walk_options.apparent_size {
311                                        file_size = m.len() as u128;
312                                    } else {
313                                        file_size = size_on_disk(&entry.parent_path, &data.name, m)
314                                            .unwrap_or_else(|_| {
315                                                self.stats.io_errors += 1;
316                                                data.metadata_io_error = true;
317                                                0
318                                            })
319                                            as u128;
320                                    }
321                                } else {
322                                    data.entry_count = Some(0);
323                                    data.is_dir = true;
324                                }
325
326                                match m.modified() {
327                                    Ok(modified) => {
328                                        mtime = modified;
329                                    }
330                                    Err(_) => {
331                                        self.stats.io_errors += 1;
332                                        data.metadata_io_error = true;
333                                    }
334                                }
335                            }
336                            Some(Err(_)) => {
337                                self.stats.io_errors += 1;
338                                data.metadata_io_error = true;
339                            }
340                            None => {}
341                        }
342
343                        match (entry.depth, self.previous_depth) {
344                            (n, p) if n > p => {
345                                self.directory_info_per_depth_level
346                                    .push(self.current_directory_at_depth);
347                                self.current_directory_at_depth = EntryInfo {
348                                    size: file_size,
349                                    entries_count: Some(file_count),
350                                };
351
352                                self.parent_node_size_per_depth_level
353                                    .push(self.previous_node_size);
354
355                                self.parent_node_idx = self.previous_node_idx;
356                                self.parent_node_size = self.previous_node_size;
357                            }
358                            (n, p) if n < p => {
359                                for _ in n..p {
360                                    set_entry_info_or_panic(
361                                        &mut traversal.tree,
362                                        self.parent_node_idx,
363                                        self.parent_node_size,
364                                        self.current_directory_at_depth,
365                                    );
366                                    let dir_info =
367                                        pop_or_panic(&mut self.directory_info_per_depth_level);
368
369                                    self.current_directory_at_depth.size += dir_info.size;
370                                    self.current_directory_at_depth.add_count(&dir_info);
371
372                                    self.parent_node_idx =
373                                        parent_or_panic(&mut traversal.tree, self.parent_node_idx);
374                                    self.parent_node_size =
375                                        pop_or_panic(&mut self.parent_node_size_per_depth_level);
376                                }
377                                self.current_directory_at_depth.size += file_size;
378                                *self
379                                    .current_directory_at_depth
380                                    .entries_count
381                                    .get_or_insert(0) += file_count;
382                                set_entry_info_or_panic(
383                                    &mut traversal.tree,
384                                    self.parent_node_idx,
385                                    self.parent_node_size,
386                                    self.current_directory_at_depth,
387                                );
388                            }
389                            _ => {
390                                self.current_directory_at_depth.size += file_size;
391                                *self
392                                    .current_directory_at_depth
393                                    .entries_count
394                                    .get_or_insert(0) += file_count;
395                            }
396                        };
397
398                        data.mtime = mtime;
399                        data.size = file_size;
400                        let entry_index = traversal.tree.add_node(data);
401
402                        traversal
403                            .tree
404                            .add_edge(self.parent_node_idx, entry_index, ());
405                        self.previous_node_idx = entry_index;
406                        self.previous_node_size = file_size;
407                        self.previous_depth = entry.depth;
408                    }
409                    Err(_) => {
410                        if self.previous_depth == 0 {
411                            data.name.clone_from(&(*root_path));
412                            let entry_index = traversal.tree.add_node(data);
413                            traversal
414                                .tree
415                                .add_edge(self.parent_node_idx, entry_index, ());
416                        }
417
418                        self.stats.io_errors += 1
419                    }
420                }
421
422                if self.throttle.as_ref().is_some_and(|t| t.can_update()) {
423                    return Some(false);
424                }
425            }
426            TraversalEvent::Finished(io_errors) => {
427                self.stats.io_errors += io_errors;
428
429                self.throttle = None;
430                self.directory_info_per_depth_level
431                    .push(self.current_directory_at_depth);
432                self.current_directory_at_depth = EntryInfo::default();
433                for _ in 0..self.previous_depth {
434                    let dir_info = pop_or_panic(&mut self.directory_info_per_depth_level);
435                    self.current_directory_at_depth.size += dir_info.size;
436                    self.current_directory_at_depth.add_count(&dir_info);
437
438                    set_entry_info_or_panic(
439                        &mut traversal.tree,
440                        self.parent_node_idx,
441                        self.parent_node_size,
442                        self.current_directory_at_depth,
443                    );
444                    self.parent_node_idx =
445                        parent_or_panic(&mut traversal.tree, self.parent_node_idx);
446                }
447                let root_size = traversal.recompute_node_size(self.root_idx);
448                set_entry_info_or_panic(
449                    &mut traversal.tree,
450                    self.root_idx,
451                    root_size,
452                    EntryInfo {
453                        size: root_size,
454                        entries_count: (self.stats.entries_traversed > 0)
455                            .then_some(self.stats.entries_traversed),
456                    },
457                );
458                self.stats.total_bytes = Some(root_size);
459                self.stats.elapsed = Some(self.stats.start.elapsed());
460
461                return Some(true);
462            }
463        }
464        None
465    }
466}
467
468#[cfg(not(windows))]
469pub fn size_on_disk(_parent: &Path, name: &Path, meta: &Metadata) -> io::Result<u64> {
470    name.size_on_disk_fast(meta)
471}
472
473#[cfg(windows)]
474pub fn size_on_disk(parent: &Path, name: &Path, meta: &Metadata) -> io::Result<u64> {
475    parent.join(name).size_on_disk_fast(meta)
476}
477
478#[cfg(test)]
479mod tests {
480    use super::*;
481
482    #[test]
483    fn size_of_entry_data() {
484        assert!(
485            std::mem::size_of::<EntryData>() <= 80,
486            "the size of this ({}) should not exceed 80 as it affects overall memory consumption",
487            std::mem::size_of::<EntryData>()
488        );
489    }
490}