Skip to main content

swh_graph_stdlib/
fs.rs

1// Copyright (C) 2024-2026  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Filesystem manipulation functions, ie. directories and contents.
7
8use std::collections::HashMap;
9
10use anyhow::{Context, Result, ensure};
11use log::warn;
12use rapidhash::RapidHashMap;
13use smallvec::{SmallVec, smallvec};
14use sux::bits::BitFieldVec;
15use sux::traits::bit_field_slice::BitFieldSlice;
16
17use swh_graph::NodeType;
18use swh_graph::graph::*;
19use swh_graph::labels::{EdgeLabel, LabelNameId, Permission};
20use swh_graph::properties;
21
22fn msg_no_label_name_id(name: impl AsRef<[u8]>) -> String {
23    format!(
24        "no label_name id found for entry \"{}\"",
25        String::from_utf8_lossy(name.as_ref())
26    )
27}
28
29/// An ordered collection of [`LabelNameId`].
30///
31/// Usually this is `Box<[LabelNameId]>` for simplicity, but [`BitFieldIdPath`] is an
32/// implementation backed by [`BitFieldVec`] for cases where memory usage matters.
33pub trait IdPath {
34    fn iter(&self) -> impl Iterator<Item = LabelNameId>;
35}
36
37impl IdPath for Vec<LabelNameId> {
38    fn iter(&self) -> impl Iterator<Item = LabelNameId> {
39        #[allow(clippy::into_iter_on_ref)] // false positive
40        self.into_iter().copied()
41    }
42}
43
44impl IdPath for Box<[LabelNameId]> {
45    fn iter(&self) -> impl Iterator<Item = LabelNameId> {
46        self.into_iter().copied()
47    }
48}
49
50impl IdPath for &[LabelNameId] {
51    fn iter(&self) -> impl Iterator<Item = LabelNameId> {
52        #[allow(clippy::into_iter_on_ref)] // false positive
53        self.into_iter().copied()
54    }
55}
56
57/// Newtype wrapping a [`BitFieldVec`] (or other implementations of [`BitFieldSlice`])
58/// to interpret them as an [`IdPath`].
59#[derive(Debug, Clone, PartialEq, Eq, Hash)]
60pub struct BitFieldIdPath<B: BitFieldSlice<u64>>(B);
61
62impl<B: BitFieldSlice<u64>> From<B> for BitFieldIdPath<B> {
63    fn from(v: B) -> Self {
64        Self(v)
65    }
66}
67
68#[inline]
69fn bitfieldvec_from_slice(b: &[LabelNameId]) -> BitFieldVec<u64, Vec<u64>> {
70    let bit_width = u64::BITS
71        - b.iter()
72            .map(|LabelNameId(item)| item)
73            .max()
74            .unwrap_or(&1)
75            .leading_zeros();
76    let mut v = BitFieldVec::with_capacity(
77        bit_width
78            .try_into()
79            .expect("overflowed BitFieldVec item length"),
80        b.len(),
81    );
82    for &LabelNameId(item) in b {
83        v.push(item);
84    }
85
86    v
87}
88
89impl From<Box<[LabelNameId]>> for BitFieldIdPath<BitFieldVec<u64, Box<[u64]>>> {
90    #[inline(always)]
91    fn from(b: Box<[LabelNameId]>) -> Self {
92        BitFieldIdPath(bitfieldvec_from_slice(&b).into())
93    }
94}
95
96impl From<Vec<LabelNameId>> for BitFieldIdPath<BitFieldVec<u64, Box<[u64]>>> {
97    #[inline(always)]
98    fn from(v: Vec<LabelNameId>) -> Self {
99        BitFieldIdPath(bitfieldvec_from_slice(&v).into())
100    }
101}
102
103impl From<&[LabelNameId]> for BitFieldIdPath<BitFieldVec<u64, Box<[u64]>>> {
104    #[inline(always)]
105    fn from(s: &[LabelNameId]) -> Self {
106        BitFieldIdPath(bitfieldvec_from_slice(s).into())
107    }
108}
109
110impl<B: BitFieldSlice<u64>> From<BitFieldIdPath<B>> for Box<[LabelNameId]> {
111    #[inline(always)]
112    fn from(v: BitFieldIdPath<B>) -> Self {
113        v.iter().collect()
114    }
115}
116
117impl<B: BitFieldSlice<u64>> IdPath for BitFieldIdPath<B> {
118    #[inline(always)]
119    fn iter(&self) -> impl Iterator<Item = LabelNameId> {
120        (0..self.0.len()).map(|i| LabelNameId(self.0.get_value(i).unwrap()))
121    }
122}
123
124/// Given a graph and a directory node, return the node id of a named directory
125/// entry located (not recursively) in that directory, if it exists.
126///
127/// See [resolve_path] for a version of this function that traverses
128/// sub-directories recursively.
129///
130/// ```ignore
131/// if let Ok(Some(node)) == resolve_name(&graph, 42, "README.md") {
132///     // do something with node
133/// }
134/// ```
135pub fn resolve_name<G>(graph: &G, dir: NodeId, name: impl AsRef<[u8]>) -> Result<Option<NodeId>>
136where
137    G: SwhLabeledForwardGraph + SwhGraphWithProperties,
138    <G as SwhGraphWithProperties>::LabelNames: properties::LabelNames,
139    <G as SwhGraphWithProperties>::Maps: properties::Maps,
140{
141    let props = graph.properties();
142    let name_id = props
143        .label_name_id(name.as_ref())
144        .with_context(|| msg_no_label_name_id(name))?;
145    resolve_name_by_id(&graph, dir, name_id)
146}
147
148/// Same as [resolve_name], but using a pre-resolved [LabelNameId] as entry
149/// name. Using this function is more efficient in case the same name (e.g.,
150/// "README.md") is to be looked up in many directories.
151pub fn resolve_name_by_id<G>(graph: &G, dir: NodeId, name: LabelNameId) -> Result<Option<NodeId>>
152where
153    G: SwhLabeledForwardGraph + SwhGraphWithProperties,
154    <G as SwhGraphWithProperties>::Maps: properties::Maps,
155{
156    let node_type = graph.properties().node_type(dir);
157    ensure!(
158        node_type == NodeType::Directory,
159        "Cannot resolve name in {}: not a directory",
160        graph.properties().swhid(dir),
161    );
162
163    for (succ, label) in graph.labeled_successors(dir).flatten_labels() {
164        #[allow(clippy::collapsible_if)]
165        if let EdgeLabel::DirEntry(dentry) = label {
166            if dentry.label_name_id() == name {
167                return Ok(Some(succ));
168            }
169        }
170    }
171    Ok(None)
172}
173
174/// Given a graph and a directory node, return the node id of a directory entry
175/// located at a given path within that directory, if it exists.
176///
177/// Slashes (`/`) contained in `path` are interpreted as path separators.
178///
179/// See [resolve_name] for a non-recursive version of this function.
180///
181/// ```ignore
182/// if let Ok(Some(node)) == resolve_path(&graph, 42, "src/main.c") {
183///     // do something with node
184/// }
185/// ```
186pub fn resolve_path<G>(graph: &G, dir: NodeId, path: impl AsRef<[u8]>) -> Result<Option<NodeId>>
187where
188    G: SwhLabeledForwardGraph + SwhGraphWithProperties,
189    <G as SwhGraphWithProperties>::LabelNames: properties::LabelNames,
190    <G as SwhGraphWithProperties>::Maps: properties::Maps,
191{
192    let props = graph.properties();
193    let path = path
194        .as_ref()
195        .split(|byte| *byte == b'/')
196        .map(|name| {
197            props
198                .label_name_id(name)
199                .with_context(|| msg_no_label_name_id(name))
200        })
201        .collect::<Result<Vec<LabelNameId>, _>>()?;
202    resolve_path_by_id(&graph, dir, path)
203}
204
205/// Same as [resolve_path], but using as path a sequence of pre-resolved
206/// [LabelNameId]-s. Using this function is more efficient in case the same path
207/// (e.g., "src/main.c") is to be looked up in many directories.
208pub fn resolve_path_by_id<G, P: IdPath>(graph: &G, dir: NodeId, path: P) -> Result<Option<NodeId>>
209where
210    G: SwhLabeledForwardGraph + SwhGraphWithProperties,
211    <G as SwhGraphWithProperties>::Maps: properties::Maps,
212{
213    let mut cur_entry = dir;
214    for name in path.iter() {
215        match resolve_name_by_id(graph, cur_entry, name)? {
216            None => return Ok(None),
217            Some(entry) => cur_entry = entry,
218        }
219    }
220    Ok(Some(cur_entry))
221}
222
223/// Optimised version of [`resolve_path_by_id`] when the same path is queried from many different
224/// directories.
225///
226/// This internally caches partial resolutions.
227pub struct CachedPathResolver<'g, G, P: IdPath>
228where
229    G: SwhLabeledForwardGraph + SwhGraphWithProperties,
230    <G as SwhGraphWithProperties>::Maps: properties::Maps,
231{
232    graph: &'g G,
233    path: P,
234    /// cache that maps `(midpoint, depth)` to `resolve_path_by_id(graph, midpoint, path[depth..]`
235    partial_resolutions: RapidHashMap<(NodeId, usize), NodeId>,
236}
237
238impl<'g, G, P: IdPath> CachedPathResolver<'g, G, P>
239where
240    G: SwhLabeledForwardGraph + SwhGraphWithProperties,
241    <G as SwhGraphWithProperties>::Maps: properties::Maps,
242{
243    pub fn new(graph: &'g G, path: P) -> Self {
244        Self {
245            graph,
246            path,
247            partial_resolutions: RapidHashMap::default(),
248        }
249    }
250
251    pub fn resolve_from(&mut self, dir: NodeId) -> Result<Option<NodeId>> {
252        let mut cur_entry = dir;
253        let mut midpoints: SmallVec<[_; 4]> = smallvec![]; // 4 is arbitrary
254        for (depth, name) in self.path.iter().enumerate() {
255            midpoints.push(cur_entry);
256            if let Some(&entry) = self.partial_resolutions.get(&(cur_entry, depth)) {
257                cur_entry = entry;
258                break;
259            }
260
261            match resolve_name_by_id(self.graph, cur_entry, name)? {
262                None => return Ok(None), // TODO: cache this too?
263                Some(entry) => cur_entry = entry,
264            }
265        }
266
267        for (midpoint_depth, midpoint) in midpoints.into_iter().enumerate() {
268            self.partial_resolutions
269                .insert((midpoint, midpoint_depth), cur_entry);
270        }
271
272        Ok(Some(cur_entry))
273    }
274}
275
276/// Recursive representation of a directory tree, ignoring sharing.
277///
278/// Note that a `Revision` variant can in fact point to either revision or
279/// release nodes.
280#[derive(Debug, Default, PartialEq)]
281pub enum FsTree {
282    #[default]
283    Content,
284    Directory(HashMap<Vec<u8>, (FsTree, Option<Permission>)>),
285    Revision(NodeId),
286}
287
288/// Given a graph and a directory node in it (usually, but not necessarily, the
289/// *root* directory of a repository), return a recursive list of the contained
290/// files and directories.
291///
292/// Note that symlinks are not followed during listing and are reported as
293/// files in the returned tree. To recognize them as links, check the
294/// permissions of the associated directory entries.
295pub fn ls_tree<G>(graph: &G, dir: NodeId) -> Result<FsTree>
296where
297    G: SwhLabeledForwardGraph + SwhGraphWithProperties,
298    <G as SwhGraphWithProperties>::LabelNames: properties::LabelNames,
299    <G as SwhGraphWithProperties>::Maps: properties::Maps,
300{
301    let props = graph.properties();
302    let node_type = props.node_type(dir);
303    ensure!(
304        node_type == NodeType::Directory,
305        "Cannot resolve list {}: not a directory",
306        graph.properties().swhid(dir),
307    );
308
309    let mut dir_entries = HashMap::new();
310    for (succ, labels) in graph.labeled_successors(dir) {
311        let node_type = props.node_type(succ);
312        for label in labels {
313            if let EdgeLabel::DirEntry(dentry) = label {
314                let file_name = props.label_name(dentry.label_name_id());
315                let perm = dentry.permission();
316                match node_type {
317                    NodeType::Content => {
318                        dir_entries.insert(file_name, (FsTree::Content, perm));
319                    }
320                    NodeType::Directory => {
321                        // recurse into subdir
322                        if let Ok(subdir) = ls_tree(graph, succ) {
323                            dir_entries.insert(file_name, (subdir, perm));
324                        } else {
325                            warn!("Cannot list (sub-)directory {succ}, skipping it");
326                        }
327                    }
328                    NodeType::Revision | NodeType::Release => {
329                        dir_entries.insert(file_name, (FsTree::Revision(succ), perm));
330                    }
331                    NodeType::Origin | NodeType::Snapshot => {
332                        warn!("Ignoring dir entry with unexpected type {node_type}");
333                    }
334                }
335            }
336        }
337    }
338
339    Ok(FsTree::Directory(dir_entries))
340}