lychee_lib/types/input/
input.rs

1//! Core input types and construction logic.
2//!
3//! The `Input` type handles the construction and validation of various input
4//! sources including URLs, file paths, glob patterns, and stdin.
5
6use super::InputResolver;
7use super::content::InputContent;
8use super::source::{InputSource, ResolvedInputSource};
9use crate::Preprocessor;
10use crate::filter::PathExcludes;
11use crate::types::{FileType, RequestError, file::FileExtensions, resolver::UrlContentResolver};
12use crate::{ErrorKind, LycheeResult};
13use async_stream::try_stream;
14use futures::stream::{Stream, StreamExt};
15use log::debug;
16use std::path::{Path, PathBuf};
17use tokio::io::{AsyncReadExt, stdin};
18
19/// Lychee Input with optional file hint for parsing
20#[derive(Clone, Debug, PartialEq, Eq, Hash)]
21pub struct Input {
22    /// Origin of input
23    pub source: InputSource,
24
25    /// Hint to indicate which extractor to use
26    ///
27    /// If this is not provided, the extractor will be guessed from the input
28    /// (e.g. file extension or URL path)
29    pub file_type_hint: Option<FileType>,
30}
31
32impl Input {
33    /// Construct a new `Input` source. In case the input is a `glob` pattern,
34    /// `glob_ignore_case` decides whether matching files against the `glob` is
35    /// case-insensitive or not
36    ///
37    /// # Errors
38    ///
39    /// Returns an error if:
40    /// - the input does not exist (i.e. the path is invalid)
41    /// - the input cannot be parsed as a URL
42    pub fn new(
43        input: &str,
44        file_type_hint: Option<FileType>,
45        glob_ignore_case: bool,
46    ) -> LycheeResult<Self> {
47        let source = InputSource::new(input, glob_ignore_case)?;
48        Ok(Self {
49            source,
50            file_type_hint,
51        })
52    }
53
54    /// Convenience constructor with default settings
55    ///
56    /// # Errors
57    ///
58    /// Returns an error if:
59    /// - the input does not exist (i.e. the path is invalid)
60    /// - the input cannot be parsed as a URL
61    pub fn from_value(value: &str) -> LycheeResult<Self> {
62        Self::new(value, None, false)
63    }
64
65    /// Create an `Input` from an existing `InputSource`
66    ///
67    /// The file type will be determined later when processing the input.
68    #[must_use]
69    pub const fn from_input_source(source: InputSource) -> Self {
70        Self {
71            source,
72            file_type_hint: None,
73        }
74    }
75
76    /// Retrieve the contents from the input
77    ///
78    /// If the input is a path, only search through files that match the given
79    /// file extensions.
80    ///
81    /// # Errors
82    ///
83    /// Returns an error if the contents can not be retrieved because of an
84    /// underlying I/O error (e.g. an error while making a network request or
85    /// retrieving the contents from the file system)
86    #[allow(
87        clippy::too_many_arguments,
88        reason = "https://github.com/lycheeverse/lychee/issues/1898"
89    )]
90    pub fn get_contents(
91        self,
92        skip_missing: bool,
93        skip_hidden: bool,
94        skip_ignored: bool,
95        file_extensions: FileExtensions,
96        resolver: UrlContentResolver,
97        excluded_paths: PathExcludes,
98        preprocessor: Option<Preprocessor>,
99    ) -> impl Stream<Item = Result<InputContent, RequestError>> {
100        try_stream! {
101            let source = self.source.clone();
102
103            let user_input_error =
104                move |e: ErrorKind| RequestError::UserInputContent(source.clone(), e);
105            let discovered_input_error =
106                |e: ErrorKind| RequestError::GetInputContent(self.source.clone(), e);
107
108            // Handle simple cases that don't need resolution. Also, perform
109            // simple *stateful* checks for more complex input sources.
110            //
111            // Stateless well-formedness checks (e.g., checking glob syntax)
112            // are done in InputSource::new.
113            match self.source {
114                InputSource::RemoteUrl(url) => {
115                    match resolver.url_contents(*url).await {
116                        Err(_) if skip_missing => (),
117                        Err(e) => Err(user_input_error(e))?,
118                        Ok(content) => yield content,
119                    }
120                    return;
121                }
122                InputSource::FsPath(ref path) => {
123                    let is_readable = if path.is_dir() {
124                        path.read_dir()
125                            .map(|_| ())
126                            .map_err(|e| ErrorKind::DirTraversal(ignore::Error::Io(e)))
127                    } else {
128                        // This checks existence without requiring an open. Opening here,
129                        // then re-opening later, might cause problems with pipes. This
130                        // does not validate permissions.
131                        path.metadata()
132                            .map(|_| ())
133                            .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))
134                    };
135
136                    is_readable.map_err(user_input_error)?;
137                }
138                InputSource::Stdin => {
139                    yield Self::stdin_content(self.file_type_hint)
140                        .await
141                        .map_err(user_input_error)?;
142                    return;
143                }
144                InputSource::String(ref s) => {
145                    yield Self::string_content(s, self.file_type_hint);
146                    return;
147                }
148                _ => {}
149            }
150
151            // Handle complex cases that need resolution (FsPath, FsGlob)
152            let mut sources_stream = InputResolver::resolve(
153                &self,
154                file_extensions,
155                skip_hidden,
156                skip_ignored,
157                &excluded_paths,
158            );
159
160            let mut sources_empty = true;
161
162            while let Some(source_result) = sources_stream.next().await {
163                match source_result {
164                    Ok(source) => {
165                        let content_result = match source {
166                            ResolvedInputSource::FsPath(path) => {
167                                Self::path_content(&path, preprocessor.as_ref()).await
168                            },
169                            ResolvedInputSource::RemoteUrl(url) => {
170                                resolver.url_contents(*url).await
171                            }
172                            ResolvedInputSource::Stdin => {
173                                Self::stdin_content(self.file_type_hint).await
174                            }
175                            ResolvedInputSource::String(s) => {
176                                Ok(Self::string_content(&s, self.file_type_hint))
177                            }
178                        };
179
180                        match content_result {
181                            Err(_) if skip_missing => (),
182                            Err(e) if matches!(&e, ErrorKind::ReadFileInput(io_err, _) if io_err.kind() == std::io::ErrorKind::InvalidData) =>
183                            {
184                                // If the file contains invalid UTF-8 (e.g. binary), we skip it
185                                if let ErrorKind::ReadFileInput(_, path) = &e {
186                                    log::warn!(
187                                        "Skipping file with invalid UTF-8 content: {}",
188                                        path.display()
189                                    );
190                                }
191                            }
192                            Err(e) => Err(discovered_input_error(e))?,
193                            Ok(content) => {
194                                sources_empty = false;
195                                yield content
196                            }
197                        }
198                    }
199                    Err(e) => Err(discovered_input_error(e))?,
200                }
201            }
202
203            if sources_empty {
204                log::warn!("{}: No files found for this input source", self.source);
205            }
206        }
207    }
208
209    /// Retrieve all sources from this input. The output depends on the type of
210    /// input:
211    ///
212    /// - Remote URLs are returned as is, in their full form
213    /// - Glob patterns are expanded and each matched entry is returned
214    /// - Absolute or relative filepaths are returned as-is
215    /// - Stdin input is returned as the special string "<stdin>"
216    /// - A raw string input is returned as the special string "<raw string>"
217    ///
218    /// # Errors
219    ///
220    /// Returns an error if [`InputResolver::resolve`] returns an error.
221    pub fn get_sources(
222        self,
223        file_extensions: FileExtensions,
224        skip_hidden: bool,
225        skip_ignored: bool,
226        excluded_paths: &PathExcludes,
227    ) -> impl Stream<Item = LycheeResult<String>> {
228        InputResolver::resolve(
229            &self,
230            file_extensions,
231            skip_hidden,
232            skip_ignored,
233            excluded_paths,
234        )
235        .map(|res| {
236            res.map(|src| match src {
237                ResolvedInputSource::FsPath(path) => path.to_string_lossy().to_string(),
238                ResolvedInputSource::RemoteUrl(url) => url.to_string(),
239                ResolvedInputSource::Stdin => "<stdin>".to_string(),
240                ResolvedInputSource::String(_) => "<raw string>".to_string(),
241            })
242        })
243    }
244
245    /// Get the content for a given path.
246    ///
247    /// # Errors
248    ///
249    /// Returns an error if the file cannot be read
250    /// or [`Preprocessor`] failed
251    pub async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(
252        path: P,
253        preprocessor: Option<&Preprocessor>,
254    ) -> LycheeResult<InputContent> {
255        let path = path.into();
256        let content = Self::get_content(&path, preprocessor).await?;
257
258        Ok(InputContent {
259            file_type: FileType::from(&path),
260            source: ResolvedInputSource::FsPath(path),
261            content,
262        })
263    }
264
265    /// Create `InputContent` from stdin.
266    ///
267    /// # Errors
268    ///
269    /// Returns an error if stdin cannot be read
270    pub async fn stdin_content(file_type_hint: Option<FileType>) -> LycheeResult<InputContent> {
271        let mut content = String::new();
272        let mut stdin = stdin();
273
274        debug!("Reading content from stdin"); // useful info when nothing piped and process blocks
275        stdin.read_to_string(&mut content).await?;
276
277        let input_content = InputContent {
278            source: ResolvedInputSource::Stdin,
279            file_type: file_type_hint.unwrap_or_default(),
280            content,
281        };
282
283        Ok(input_content)
284    }
285
286    /// Create `InputContent` from a string.
287    #[must_use]
288    pub fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
289        InputContent::from_string(s, file_type_hint.unwrap_or_default())
290    }
291
292    /// Get content of file.
293    /// Get preprocessed file content if [`Preprocessor`] is [`Some`]
294    async fn get_content(
295        path: &PathBuf,
296        preprocessor: Option<&Preprocessor>,
297    ) -> LycheeResult<String> {
298        if let Some(pre) = preprocessor {
299            pre.process(path)
300        } else {
301            Ok(tokio::fs::read_to_string(path)
302                .await
303                .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?)
304        }
305    }
306}
307
308impl TryFrom<&str> for Input {
309    type Error = crate::ErrorKind;
310
311    fn try_from(value: &str) -> Result<Self, Self::Error> {
312        Self::from_value(value)
313    }
314}
315
#[cfg(test)]
mod tests {
    use super::*;
    use crate::filter::PathExcludes;

    /// Report whether `path` matches `excluded_paths`.
    ///
    /// Kept as a standalone function so that the path exclusion logic is
    /// easier to test.
    pub fn is_excluded_path(excluded_paths: &PathExcludes, path: &Path) -> bool {
        let candidate = path.to_string_lossy();
        excluded_paths.is_match(&candidate)
    }

    #[test]
    fn test_input_handles_real_relative_paths() {
        let test_file = "./Cargo.toml";
        let path = Path::new(test_file);

        assert!(path.exists());
        assert!(path.is_relative());

        let input = Input::new(test_file, None, false)
            .expect("an existing relative path should be accepted");
        assert!(matches!(input.source, InputSource::FsPath(_)));
        assert!(input.file_type_hint.is_none());
    }

    #[test]
    fn test_input_handles_nonexistent_relative_paths() {
        let test_file = "./nonexistent/relative/path";
        let path = Path::new(test_file);

        assert!(!path.exists());
        assert!(path.is_relative());

        let result = Input::from_value(test_file);
        assert!(matches!(result, Err(ErrorKind::InvalidFile(_))));
    }

    #[test]
    fn test_no_exclusions() {
        let dir = tempfile::tempdir().unwrap();
        let excludes = PathExcludes::empty();
        assert!(!is_excluded_path(&excludes, dir.path()));
    }

    #[test]
    fn test_excluded() {
        let dir = tempfile::tempdir().unwrap();
        let excludes = PathExcludes::new([dir.path().to_string_lossy()]).unwrap();
        assert!(is_excluded_path(&excludes, dir.path()));
    }

    #[test]
    fn test_excluded_subdir() {
        let parent_dir = tempfile::tempdir().unwrap();
        let child_dir = tempfile::tempdir_in(parent_dir.path()).unwrap();

        // Excluding the parent must also exclude everything below it
        let excludes = PathExcludes::new([parent_dir.path().to_string_lossy()]).unwrap();
        assert!(is_excluded_path(&excludes, child_dir.path()));
    }

    #[test]
    fn test_url_without_scheme() {
        let input = Input::from_value("example.com").unwrap();
        assert_eq!(input.source.to_string(), "http://example.com/");
    }

    // Ensure that a Windows file path is not mistaken for a URL.
    #[cfg(windows)]
    #[test]
    fn test_windows_style_filepath_not_existing() {
        let err = Input::from_value("C:\\example\\project\\here").unwrap_err();
        assert!(
            matches!(err, ErrorKind::InvalidFile(_)),
            "Should have received InvalidFile error"
        );
    }

    // Ensure that a Windows-style file path to an existing file is recognized
    #[cfg(windows)]
    #[test]
    fn test_windows_style_filepath_existing() {
        use std::env::temp_dir;
        use tempfile::NamedTempFile;

        let file = NamedTempFile::new_in(temp_dir()).unwrap();
        let input = Input::from_value(file.path().to_str().unwrap()).unwrap();

        assert!(
            matches!(input.source, InputSource::FsPath(_)),
            "Input source should be FsPath but was not"
        );
    }

    #[test]
    fn test_url_scheme_check_succeeding() {
        // Valid http and https URLs
        let valid_urls = [
            "http://example.com",
            "https://example.com",
            "http://subdomain.example.com/path?query=value",
            "https://example.com:8080",
        ];

        for url in valid_urls {
            assert!(
                matches!(
                    Input::from_value(url),
                    Ok(Input {
                        source: InputSource::RemoteUrl(_),
                        ..
                    })
                ),
                "expected a remote URL input for {url}"
            );
        }
    }

    #[test]
    fn test_url_scheme_check_failing() {
        // Invalid schemes
        let invalid_urls = [
            "ftp://example.com",
            "httpx://example.com",
            "file:///path/to/file",
            "mailto:user@example.com",
        ];

        for url in invalid_urls {
            assert!(
                matches!(Input::from_value(url), Err(ErrorKind::InvalidFile(_))),
                "expected an InvalidFile error for {url}"
            );
        }
    }

    #[test]
    fn test_non_url_inputs() {
        // A path that does not exist is rejected
        let missing = Input::from_value("./local/path");
        assert!(matches!(missing, Err(ErrorKind::InvalidFile(_))));

        // A glob pattern is accepted as such
        let glob = Input::from_value("*.md");
        assert!(matches!(
            glob,
            Ok(Input {
                source: InputSource::FsGlob { .. },
                ..
            })
        ));

        // Assuming the current directory exists
        let current_dir = Input::from_value(".");
        assert!(matches!(
            current_dir,
            Ok(Input {
                source: InputSource::FsPath(_),
                ..
            })
        ));
    }
}