swiftide_indexing/loaders/
file_loader.rs

1//! Load files from a directory
2use std::{
3    io::Read as _,
4    path::{Path, PathBuf},
5};
6
7use anyhow::Context as _;
8use ignore::{DirEntry, Walk};
9use swiftide_core::{Loader, indexing::IndexingStream, indexing::TextNode};
10use tracing::{Span, debug_span, instrument};
11
/// Loads files found under a root directory, optionally restricted to a set of file extensions.
///
/// The struct only holds configuration. The directory is walked when the loader is iterated,
/// listed, or turned into an indexing stream.
///
/// # Example
///
/// Create a pipeline that loads the current directory and indexes all files with the `.rs`
/// extension:
///
/// ```no_run
/// # use swiftide_indexing as indexing;
/// # use swiftide_indexing::loaders::FileLoader;
/// indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"]));
/// ```
#[derive(Debug, Clone)]
pub struct FileLoader {
    /// Directory at which the walk starts.
    pub(crate) root: PathBuf,
    /// Extensions a file must have to be loaded; `None` disables extension filtering.
    pub(crate) extensions: Option<Vec<String>>,
}
29
30impl FileLoader {
31    /// Creates a new `FileLoader` with the specified path.
32    ///
33    /// # Arguments
34    ///
35    /// - `root`: The root directory to load files from.
36    ///
37    /// # Returns
38    ///
39    /// A new instance of `FileLoader`.
40    pub fn new(root: impl AsRef<Path>) -> Self {
41        Self {
42            root: root.as_ref().to_path_buf(),
43            extensions: None,
44        }
45    }
46
47    /// Adds extensions to the loader.
48    ///
49    /// # Arguments
50    ///
51    /// - `extensions`: A list of extensions to add without the leading dot.
52    ///
53    /// # Returns
54    ///
55    /// The `FileLoader` instance with the added extensions.
56    #[must_use]
57    pub fn with_extensions(mut self, extensions: &[impl AsRef<str>]) -> Self {
58        let existing = self.extensions.get_or_insert_default();
59        existing.extend(extensions.iter().map(|ext| ext.as_ref().to_string()));
60        self
61    }
62
63    /// Lists the nodes (files) that match the specified extensions.
64    ///
65    /// # Returns
66    ///
67    /// A vector of `TextNode` representing the matching files.
68    ///
69    /// # Panics
70    ///
71    /// This method will panic if it fails to read a file's content.
72    pub fn list_nodes(&self) -> Vec<TextNode> {
73        self.iter().filter_map(Result::ok).collect()
74    }
75
76    /// Iterates over the files in the directory
77    pub fn iter(&self) -> impl Iterator<Item = anyhow::Result<TextNode>> + use<> {
78        Iter::new(&self.root, self.extensions.clone()).fuse()
79    }
80}
81
82/// An iterator that walks over the files in a directory and loads them.
83///
84/// This is a private struct that is used to implement the `FileLoader` iterator.
85struct Iter {
86    /// The walk instance that iterates over the files in the directory.
87    walk: Walk,
88    /// The extensions to include.
89    include_extensions: Option<Vec<String>>,
90    /// A span that tracks the current file loader.
91    span: Span,
92}
93
94impl Iterator for Iter {
95    type Item = anyhow::Result<TextNode>;
96
97    fn next(&mut self) -> Option<Self::Item> {
98        let _span = self.span.enter();
99        loop {
100            // stop the iteration if there are no more entries
101            let entry = self.walk.next()?;
102
103            // propagate any errors that occur during the directory traversal
104            let entry = match entry {
105                Ok(entry) => entry,
106                Err(err) => return Some(Err(err.into())),
107            };
108
109            if let Some(node) = self.load(&entry) {
110                return Some(node);
111            }
112        }
113    }
114}
115
116impl Iter {
117    /// Creates a new `Iter` instance.
118    fn new(root: &Path, include_extensions: Option<Vec<String>>) -> Self {
119        let span = debug_span!("file_loader", root = %root.display());
120        tracing::debug!(parent: &span, extensions = ?include_extensions, "Loading files");
121        Self {
122            walk: Walk::new(root),
123            include_extensions,
124            span,
125        }
126    }
127
128    #[instrument(skip_all, fields(path = %entry.path().display()))]
129    fn load(&self, entry: &DirEntry) -> Option<anyhow::Result<TextNode>> {
130        if entry.file_type().is_some_and(|ft| !ft.is_file()) {
131            // Skip directories and non-files
132            return None;
133        }
134        if let Some(extensions) = &self.include_extensions {
135            let Some(extension) = entry.path().extension() else {
136                tracing::trace!("Skipping file without extension");
137                return None;
138            };
139            let extension = extension.to_string_lossy();
140            if !extensions.iter().any(|ext| ext == &extension) {
141                tracing::trace!("Skipping file with extension {extension}");
142                return None;
143            }
144        }
145        tracing::debug!("Loading file");
146        match read_node(entry) {
147            Ok(node) => {
148                tracing::debug!(node_id = %node.id(), "Loaded file");
149                Some(Ok(node))
150            }
151            Err(err) => {
152                tracing::error!(error = %err, "Failed to load file");
153                Some(Err(err))
154            }
155        }
156    }
157}
158
159fn read_node(entry: &DirEntry) -> anyhow::Result<TextNode> {
160    // Files might be invalid utf-8, so we need to read them as bytes and convert it lossy, as
161    // Swiftide (currently) works internally with strings.
162    let mut file = fs_err::File::open(entry.path()).context("Failed to open file")?;
163    let mut buf = vec![];
164    file.read_to_end(&mut buf).context("Failed to read file")?;
165    let content = String::from_utf8_lossy(&buf);
166
167    let original_size = content.len();
168
169    TextNode::builder()
170        .path(entry.path())
171        .chunk(content)
172        .original_size(original_size)
173        .build()
174}
175
176impl Loader for FileLoader {
177    type Output = String;
178
179    /// Converts the `FileLoader` into a stream of `TextNode`.
180    ///
181    /// # Returns
182    ///
183    /// An `IndexingStream` representing the stream of files.
184    ///
185    /// # Errors
186    /// This method will return an error if it fails to read a file's content.
187    fn into_stream(self) -> IndexingStream<String> {
188        IndexingStream::iter(self.iter())
189    }
190
191    fn into_stream_boxed(self: Box<Self>) -> IndexingStream<String> {
192        self.into_stream()
193    }
194}
195
#[cfg(test)]
mod test {

    use tokio_stream::StreamExt as _;

    use super::*;

    /// `with_extensions` stores the given extensions on the loader.
    #[test]
    fn test_with_extensions() {
        let expected = Some(vec!["rs".to_string()]);
        let loader = FileLoader::new("/tmp").with_extensions(&["rs"]);
        assert_eq!(loader.extensions, expected);
    }

    /// A file with invalid UTF-8 is still loaded, with the bad bytes replaced lossily.
    #[tokio::test]
    async fn test_ignores_invalid_utf8() {
        let tempdir = temp_dir::TempDir::new().unwrap();
        let invalid_file = tempdir.child("invalid.txt");
        fs_err::write(invalid_file, [0x80, 0x80, 0x80]).unwrap();

        let result = FileLoader::new(tempdir.path())
            .with_extensions(&["txt"])
            .into_stream()
            .collect::<Vec<_>>()
            .await;

        assert_eq!(result.len(), 1);

        let node = result.first().unwrap().as_ref().unwrap();
        assert_eq!(node.chunk, "���".to_string());
    }
}
224}