swiftide_indexing/loaders/
file_loader.rs

1//! Load files from a directory
2use std::{
3    io::Read as _,
4    path::{Path, PathBuf},
5};
6
7use anyhow::Context as _;
8use swiftide_core::{indexing::IndexingStream, indexing::Node, Loader};
9
/// Loader that walks a directory and yields one node per file it finds.
///
/// A `FileLoader` holds the root path to walk and an optional allow-list of file extensions.
/// When no allow-list is configured, no extension filtering is applied.
///
/// # Example
///
/// ```no_run
/// // Create a pipeline that loads the current directory
/// // and indexes all files with the ".rs" extension.
/// # use swiftide_indexing as indexing;
/// # use swiftide_indexing::loaders::FileLoader;
/// indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"]));
/// ```
#[derive(Debug, Clone)]
pub struct FileLoader {
    /// Root path handed to the directory walker.
    pub(crate) path: PathBuf,
    /// Extensions (without the leading dot) a file must have to be loaded;
    /// `None` disables extension filtering.
    pub(crate) extensions: Option<Vec<String>>,
}
27
28impl FileLoader {
29    /// Creates a new `FileLoader` with the specified path.
30    ///
31    /// # Arguments
32    /// * `path` - The path to the directory to load files from.
33    ///
34    /// # Returns
35    /// A new instance of `FileLoader`.
36    pub fn new(path: impl Into<PathBuf>) -> Self {
37        Self {
38            path: path.into(),
39            extensions: None,
40        }
41    }
42
43    /// Adds extensions to the loader.
44    ///
45    /// # Arguments
46    /// * `extensions` - A list of extensions to add without the leading dot.
47    ///
48    /// # Returns
49    /// The `FileLoader` instance with the added extensions.
50    #[must_use]
51    pub fn with_extensions(mut self, extensions: &[impl AsRef<str>]) -> Self {
52        self.extensions = Some(
53            self.extensions
54                .unwrap_or_default()
55                .into_iter()
56                .chain(extensions.iter().map(|ext| ext.as_ref().to_string()))
57                .collect(),
58        );
59        self
60    }
61
62    /// Lists the nodes (files) that match the specified extensions.
63    ///
64    /// # Returns
65    /// A vector of `Node` representing the matching files.
66    ///
67    /// # Panics
68    /// This method will panic if it fails to read a file's content.
69    pub fn list_nodes(&self) -> Vec<Node> {
70        self.iter().filter_map(Result::ok).collect()
71    }
72
73    /// Iterates over the files in the directory
74    pub fn iter(&self) -> impl Iterator<Item = anyhow::Result<Node>> {
75        let path = self.path.clone();
76        let extensions = self.extensions.clone();
77
78        ignore::Walk::new(path)
79            .filter_map(Result::ok)
80            .filter(|entry| entry.file_type().is_some_and(|ft| ft.is_file()))
81            .filter(move |entry| file_has_extension(extensions.as_deref(), entry.path()))
82            .map(move |entry| {
83                tracing::debug!("Reading file: {:?}", entry);
84
85                // Files might be invalid utf-8, so we need to read them as bytes and convert it
86                // lossy, as Swiftide (currently) works internally with strings.
87                let mut file = std::fs::File::open(entry.path()).context("Failed to open file")?;
88                let mut buf = vec![];
89                file.read_to_end(&mut buf).context("Failed to read file")?;
90                let content = String::from_utf8_lossy(&buf);
91
92                let original_size = content.len();
93
94                Node::builder()
95                    .path(entry.path())
96                    .chunk(content)
97                    .original_size(original_size)
98                    .build()
99            })
100    }
101}
102
/// Checks whether `path` passes an optional extension allow-list.
///
/// * If `extensions` is `None`, every path is accepted.
/// * If the path has no extension, it is rejected.
/// * Otherwise the path is accepted when its extension equals one of the listed extensions.
///   The comparison is case-sensitive. A listed extension may be written with or without a
///   leading dot (`"rs"` and `".rs"` are treated the same).
///
/// A non-UTF-8 extension is compared after lossy conversion to a string.
fn file_has_extension(extensions: Option<&[impl AsRef<str>]>, path: &Path) -> bool {
    let Some(allowed) = extensions else {
        return true;
    };
    let Some(ext) = path.extension() else {
        return false;
    };

    // Convert once up front instead of once per candidate.
    let ext = ext.to_string_lossy();

    allowed.iter().any(|candidate| {
        let candidate = candidate.as_ref();
        // `Path::extension` never includes the dot, so drop one if the caller supplied it.
        candidate.strip_prefix('.').unwrap_or(candidate) == &*ext
    })
}
115
116impl Loader for FileLoader {
117    /// Converts the `FileLoader` into a stream of `Node`.
118    ///
119    /// # Returns
120    /// An `IndexingStream` representing the stream of files.
121    ///
122    /// # Errors
123    /// This method will return an error if it fails to read a file's content.
124    fn into_stream(self) -> IndexingStream {
125        IndexingStream::iter(self.iter())
126    }
127
128    fn into_stream_boxed(self: Box<Self>) -> IndexingStream {
129        self.into_stream()
130    }
131}
132
#[cfg(test)]
mod test {

    use tokio_stream::StreamExt as _;

    use super::*;

    /// A fresh loader stores exactly the extensions it was given.
    #[test]
    fn test_with_extensions() {
        let expected = Some(vec!["rs".to_string()]);
        let actual = FileLoader::new("/tmp").with_extensions(&["rs"]).extensions;
        assert_eq!(actual, expected);
    }

    /// A file made only of invalid UTF-8 bytes is still loaded, with each bad byte replaced.
    #[tokio::test]
    async fn test_ignores_invalid_utf8() {
        let workdir = temp_dir::TempDir::new().unwrap();
        std::fs::write(workdir.child("invalid.txt"), [0x80, 0x80, 0x80]).unwrap();

        let nodes = FileLoader::new(workdir.path())
            .with_extensions(&["txt"])
            .into_stream()
            .collect::<Vec<_>>()
            .await;

        assert_eq!(nodes.len(), 1);

        let node = nodes.first().unwrap().as_ref().unwrap();
        assert_eq!(node.chunk, "���".to_string());
    }
}