Skip to main content

lychee_lib/types/input/
input.rs

1//! Core input types and construction logic.
2//!
3//! The `Input` type handles the construction and validation of various input
4//! sources including URLs, file paths, glob patterns, and stdin.
5
6use super::InputResolver;
7use super::content::InputContent;
8use super::source::{InputSource, ResolvedInputSource};
9use crate::Preprocessor;
10use crate::filter::PathExcludes;
11use crate::types::{FileType, RequestError, file::FileExtensions, resolver::UrlContentResolver};
12use crate::{ErrorKind, LycheeResult};
13use async_stream::try_stream;
14use futures::stream::{Stream, StreamExt};
15use log::debug;
16use std::io::IsTerminal;
17use std::path::{Path, PathBuf};
18use tokio::io::{AsyncReadExt, stdin};
19
20/// Lychee Input with optional file hint for parsing
21#[derive(Clone, Debug, PartialEq, Eq, Hash)]
22pub struct Input {
23    /// Origin of input
24    pub source: InputSource,
25
26    /// Hint to indicate which extractor to use
27    ///
28    /// If this is not provided, the extractor will be guessed from the input
29    /// (e.g. file extension or URL path)
30    pub file_type_hint: Option<FileType>,
31}
32
33impl Input {
34    /// Construct a new `Input` source. In case the input is a `glob` pattern,
35    /// `glob_ignore_case` decides whether matching files against the `glob` is
36    /// case-insensitive or not
37    ///
38    /// # Errors
39    ///
40    /// Returns an error if:
41    /// - the input does not exist (i.e. the path is invalid)
42    /// - the input cannot be parsed as a URL
43    pub fn new(
44        input: &str,
45        file_type_hint: Option<FileType>,
46        glob_ignore_case: bool,
47    ) -> LycheeResult<Self> {
48        let source = InputSource::new(input, glob_ignore_case)?;
49        Ok(Self {
50            source,
51            file_type_hint,
52        })
53    }
54
55    /// Convenience constructor with default settings
56    ///
57    /// # Errors
58    ///
59    /// Returns an error if:
60    /// - the input does not exist (i.e. the path is invalid)
61    /// - the input cannot be parsed as a URL
62    pub fn from_value(value: &str) -> LycheeResult<Self> {
63        Self::new(value, None, false)
64    }
65
66    /// Create an `Input` from an existing `InputSource`
67    ///
68    /// The file type will be determined later when processing the input.
69    #[must_use]
70    pub const fn from_input_source(source: InputSource) -> Self {
71        Self {
72            source,
73            file_type_hint: None,
74        }
75    }
76
77    /// Retrieve the contents from the input
78    ///
79    /// If the input is a path, only search through files that match the given
80    /// file extensions.
81    ///
82    /// # Errors
83    ///
84    /// Returns an error if the contents can not be retrieved because of an
85    /// underlying I/O error (e.g. an error while making a network request or
86    /// retrieving the contents from the file system)
87    #[allow(
88        clippy::too_many_arguments,
89        reason = "https://github.com/lycheeverse/lychee/issues/1898"
90    )]
91    pub fn get_contents(
92        self,
93        skip_missing: bool,
94        skip_hidden: bool,
95        skip_ignored: bool,
96        file_extensions: FileExtensions,
97        resolver: UrlContentResolver,
98        excluded_paths: PathExcludes,
99        preprocessor: Option<Preprocessor>,
100    ) -> impl Stream<Item = Result<InputContent, RequestError>> {
101        try_stream! {
102            let source = self.source.clone();
103
104            let user_input_error =
105                move |e: ErrorKind| RequestError::UserInputContent(source.clone(), e.into());
106            let discovered_input_error =
107                |e: ErrorKind| RequestError::GetInputContent(self.source.clone(), e.into());
108
109            // Handle simple cases that don't need resolution. Also, perform
110            // simple *stateful* checks for more complex input sources.
111            //
112            // Stateless well-formedness checks (e.g., checking glob syntax)
113            // are done in InputSource::new.
114            match self.source {
115                InputSource::RemoteUrl(url) => {
116                    match resolver.url_contents(*url).await {
117                        Err(_) if skip_missing => (),
118                        Err(e) => Err(user_input_error(e))?,
119                        Ok(content) => yield content,
120                    }
121                    return;
122                }
123                InputSource::FsPath(ref path) => {
124                    // We check if the file is readable before processing. This catches
125                    // permission errors and missing files early.
126                    let is_readable = if path.is_dir() {
127                        path.read_dir()
128                            .map(|_| ())
129                            .map_err(|e| ErrorKind::DirTraversal(ignore::Error::Io(e)))
130                    } else {
131                        // We check existence without opening the file to avoid issues with
132                        // pipes and special files. This does not validate permissions.
133                        path.metadata()
134                            .map(|_| ())
135                            .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))
136                    };
137                    is_readable.map_err(user_input_error)?;
138                }
139                InputSource::Stdin => {
140                    yield Self::stdin_content(self.file_type_hint)
141                        .await
142                        .map_err(user_input_error)?;
143                    return;
144                }
145                InputSource::String(ref s) => {
146                    yield Self::string_content(s, self.file_type_hint);
147                    return;
148                }
149                _ => {}
150            }
151
152            // Handle complex cases that need resolution (FsPath, FsGlob)
153            let mut sources_stream = InputResolver::resolve(
154                &self,
155                file_extensions,
156                skip_hidden,
157                skip_ignored,
158                &excluded_paths,
159            );
160
161            let mut sources_empty = true;
162
163            while let Some(source_result) = sources_stream.next().await {
164                match source_result {
165                    Ok(source) => {
166                        let content_result = match source {
167                            ResolvedInputSource::FsPath(path) => {
168                                Self::path_content(&path, preprocessor.as_ref()).await
169                            },
170                            ResolvedInputSource::RemoteUrl(url) => {
171                                resolver.url_contents(*url).await
172                            }
173                            ResolvedInputSource::Stdin => {
174                                Self::stdin_content(self.file_type_hint).await
175                            }
176                            ResolvedInputSource::String(s) => {
177                                Ok(Self::string_content(&s, self.file_type_hint))
178                            }
179                        };
180
181                        match content_result {
182                            Err(_) if skip_missing => (),
183                            Err(e) if matches!(&e, ErrorKind::ReadFileInput(io_err, _) if io_err.kind() == std::io::ErrorKind::InvalidData) =>
184                            {
185                                // If the file contains invalid UTF-8 (e.g. binary), we skip it
186                                if let ErrorKind::ReadFileInput(_, path) = &e {
187                                    log::warn!(
188                                        "Skipping file with invalid UTF-8 content: {}",
189                                        path.display()
190                                    );
191                                }
192                            }
193                            Err(e) => Err(discovered_input_error(e))?,
194                            Ok(content) => {
195                                sources_empty = false;
196                                yield content
197                            }
198                        }
199                    }
200                    Err(e) => Err(discovered_input_error(e))?,
201                }
202            }
203
204            if sources_empty {
205                log::warn!("{}: No files found for this input source", self.source);
206            }
207        }
208    }
209
210    /// Retrieve all sources from this input. The output depends on the type of
211    /// input:
212    ///
213    /// - Remote URLs are returned as is, in their full form
214    /// - Glob patterns are expanded and each matched entry is returned
215    /// - Absolute or relative filepaths are returned as-is
216    /// - Stdin input is returned as the special string "<stdin>"
217    /// - A raw string input is returned as the special string "<raw string>"
218    ///
219    /// # Errors
220    ///
221    /// Returns an error if [`InputResolver::resolve`] returns an error.
222    pub fn get_sources(
223        self,
224        file_extensions: FileExtensions,
225        skip_hidden: bool,
226        skip_ignored: bool,
227        excluded_paths: &PathExcludes,
228    ) -> impl Stream<Item = LycheeResult<String>> {
229        InputResolver::resolve(
230            &self,
231            file_extensions,
232            skip_hidden,
233            skip_ignored,
234            excluded_paths,
235        )
236        .map(|res| {
237            res.map(|src| match src {
238                ResolvedInputSource::FsPath(path) => path.to_string_lossy().to_string(),
239                ResolvedInputSource::RemoteUrl(url) => url.to_string(),
240                ResolvedInputSource::Stdin => "<stdin>".to_string(),
241                ResolvedInputSource::String(_) => "<raw string>".to_string(),
242            })
243        })
244    }
245
246    /// Get the content for a given path.
247    ///
248    /// # Errors
249    ///
250    /// Returns an error if the file cannot be read
251    /// or [`Preprocessor`] failed
252    pub async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(
253        path: P,
254        preprocessor: Option<&Preprocessor>,
255    ) -> LycheeResult<InputContent> {
256        let path = path.into();
257        let content = Self::get_content(&path, preprocessor).await?;
258
259        Ok(InputContent {
260            file_type: FileType::from(&path),
261            source: ResolvedInputSource::FsPath(path),
262            content,
263        })
264    }
265
266    /// Create `InputContent` from stdin.
267    ///
268    /// # Errors
269    ///
270    /// Returns an error if stdin cannot be read
271    pub async fn stdin_content(file_type_hint: Option<FileType>) -> LycheeResult<InputContent> {
272        let mut content = String::new();
273        let mut stdin = stdin();
274
275        if std::io::stdin().is_terminal() {
276            // useful info when nothing piped and process blocks
277            debug!("Reading content from stdin");
278        }
279        stdin.read_to_string(&mut content).await?;
280
281        let input_content = InputContent {
282            source: ResolvedInputSource::Stdin,
283            file_type: file_type_hint.unwrap_or_default(),
284            content,
285        };
286
287        Ok(input_content)
288    }
289
290    /// Create `InputContent` from a string.
291    #[must_use]
292    pub fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
293        InputContent::from_string(s, file_type_hint.unwrap_or_default())
294    }
295
296    /// Get content of file.
297    /// Get preprocessed file content if [`Preprocessor`] is [`Some`]
298    async fn get_content(
299        path: &PathBuf,
300        preprocessor: Option<&Preprocessor>,
301    ) -> LycheeResult<String> {
302        if let Some(pre) = preprocessor {
303            pre.process(path)
304        } else {
305            Ok(tokio::fs::read_to_string(path)
306                .await
307                .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?)
308        }
309    }
310}
311
312impl TryFrom<&str> for Input {
313    type Error = crate::ErrorKind;
314
315    fn try_from(value: &str) -> Result<Self, Self::Error> {
316        Self::from_value(value)
317    }
318}
319
#[cfg(test)]
mod tests {
    use super::*;
    use crate::filter::PathExcludes;

    /// A standalone function to allow for easier testing of path exclusion logic
    pub fn is_excluded_path(excluded_paths: &PathExcludes, path: &Path) -> bool {
        let candidate = path.to_string_lossy();
        excluded_paths.is_match(&candidate)
    }

    /// Whether `value` is accepted and classified as a remote URL.
    fn is_remote_url(value: &str) -> bool {
        matches!(
            Input::from_value(value),
            Ok(Input {
                source: InputSource::RemoteUrl(_),
                ..
            })
        )
    }

    /// Whether `value` is rejected with `ErrorKind::InvalidInput`.
    fn is_invalid_input(value: &str) -> bool {
        matches!(Input::from_value(value), Err(ErrorKind::InvalidInput(_)))
    }

    #[test]
    fn test_input_handles_real_relative_paths() {
        let test_file = "./Cargo.toml";
        let path = Path::new(test_file);

        // Precondition: the fixture exists and is addressed relatively.
        assert!(path.exists());
        assert!(path.is_relative());

        let input = Input::new(test_file, None, false);
        assert!(input.is_ok());
        assert!(matches!(
            input,
            Ok(Input {
                source: InputSource::FsPath(_),
                file_type_hint: None,
            })
        ));
    }

    #[test]
    fn test_input_handles_nonexistent_relative_paths() {
        let test_file = "./nonexistent/relative/path";
        let path = Path::new(test_file);

        // Precondition: nothing lives at this relative path.
        assert!(!path.exists());
        assert!(path.is_relative());

        assert!(is_invalid_input(test_file));
    }

    #[test]
    fn test_no_exclusions() {
        let dir = tempfile::tempdir().unwrap();
        let excludes = PathExcludes::empty();
        assert!(!is_excluded_path(&excludes, dir.path()));
    }

    #[test]
    fn test_excluded() {
        let dir = tempfile::tempdir().unwrap();
        let excludes = PathExcludes::new([dir.path().to_string_lossy()]).unwrap();
        assert!(is_excluded_path(&excludes, dir.path()));
    }

    #[test]
    fn test_excluded_subdir() {
        let parent_dir = tempfile::tempdir().unwrap();
        let child_dir = tempfile::tempdir_in(parent_dir.path()).unwrap();

        // Excluding the parent must also exclude everything below it.
        let excludes = PathExcludes::new([parent_dir.path().to_string_lossy()]).unwrap();
        assert!(is_excluded_path(&excludes, child_dir.path()));
    }

    #[test]
    fn test_url_without_scheme() {
        let input = Input::from_value("example.com");
        assert!(matches!(input, Err(ErrorKind::InvalidInput(_))));

        let error_msg = input.unwrap_err().to_string();
        assert!(error_msg.contains("Use full URL"));
    }

    // Ensure that a Windows file path is not mistaken for a URL.
    #[cfg(windows)]
    #[test]
    fn test_windows_style_filepath_not_existing() {
        let error = Input::from_value("C:\\example\\project\\here").unwrap_err();

        assert!(
            matches!(error, ErrorKind::InvalidInput(_)),
            "Should have received InvalidInput error, got: {error:?}"
        );
    }

    // Ensure that a Windows-style file path to an existing file is recognized
    #[cfg(windows)]
    #[test]
    fn test_windows_style_filepath_existing() {
        use std::env::temp_dir;
        use tempfile::NamedTempFile;

        let file = NamedTempFile::new_in(temp_dir()).unwrap();
        let input = Input::from_value(file.path().to_str().unwrap()).unwrap();

        assert!(
            matches!(input.source, InputSource::FsPath(_)),
            "Input source should be FsPath but was not"
        );
    }

    #[test]
    fn test_url_scheme_check_succeeding() {
        // Valid http and https URLs
        assert!(is_remote_url("http://example.com"));
        assert!(is_remote_url("https://example.com"));
        assert!(is_remote_url("http://subdomain.example.com/path?query=value"));
        assert!(is_remote_url("https://example.com:8080"));
    }

    #[test]
    fn test_url_scheme_check_passing() {
        // Valid schemes should be accepted (future compatibility)
        assert!(is_remote_url("ftp://example.com"));
        assert!(is_remote_url("httpx://example.com"));
        assert!(is_remote_url("file:///path/to/file"));
        assert!(is_remote_url("mailto:user@example.com"));
    }

    #[test]
    fn test_non_url_inputs() {
        // Non-URL inputs
        assert!(is_invalid_input("./local/path"));
        assert!(matches!(
            Input::from_value("*.md"),
            Ok(Input {
                source: InputSource::FsGlob { .. },
                ..
            })
        ));
        // Assuming the current directory exists
        assert!(matches!(
            Input::from_value("."),
            Ok(Input {
                source: InputSource::FsPath(_),
                ..
            })
        ));
    }
}
518}