lychee_lib/types/input/
input.rs

1//! Core input types and construction logic.
2//!
3//! The `Input` type handles the construction and validation of various input
4//! sources including URLs, file paths, glob patterns, and stdin.
5
6use super::InputResolver;
7use super::content::InputContent;
8use super::source::InputSource;
9use super::source::ResolvedInputSource;
10use crate::filter::PathExcludes;
11use crate::types::FileType;
12use crate::types::file::FileExtensions;
13use crate::types::resolver::UrlContentResolver;
14use crate::{ErrorKind, Result};
15use async_stream::try_stream;
16use futures::stream::{Stream, StreamExt};
17use glob::glob_with;
18use ignore::WalkBuilder;
19use reqwest::Url;
20use shellexpand::tilde;
21use std::path::{Path, PathBuf};
22use tokio::io::{AsyncReadExt, stdin};
23
24const STDIN: &str = "-";
25
26/// Lychee Input with optional file hint for parsing
27#[derive(Clone, Debug, PartialEq, Eq, Hash)]
28pub struct Input {
29    /// Origin of input
30    pub source: InputSource,
31
32    /// Hint to indicate which extractor to use
33    ///
34    /// If this is not provided, the extractor will be guessed from the input
35    /// (e.g. file extension or URL path)
36    pub file_type_hint: Option<FileType>,
37}
38
39impl Input {
40    /// Construct a new `Input` source. In case the input is a `glob` pattern,
41    /// `glob_ignore_case` decides whether matching files against the `glob` is
42    /// case-insensitive or not
43    ///
44    /// # Errors
45    ///
46    /// Returns an error if:
47    /// - the input does not exist (i.e. the path is invalid)
48    /// - the input cannot be parsed as a URL
49    pub fn new(
50        input: &str,
51        file_type_hint: Option<FileType>,
52        glob_ignore_case: bool,
53    ) -> Result<Self> {
54        let source = if input == STDIN {
55            InputSource::Stdin
56        } else {
57            // We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
58            match Url::parse(input) {
59                // Weed out non-HTTP schemes, including Windows drive
60                // specifiers, which can be parsed by the
61                // [url](https://crates.io/crates/url) crate
62                Ok(url) if url.scheme() == "http" || url.scheme() == "https" => {
63                    InputSource::RemoteUrl(Box::new(url))
64                }
65                Ok(_) => {
66                    // URL parsed successfully, but it's not HTTP or HTTPS
67                    return Err(ErrorKind::InvalidFile(PathBuf::from(input)));
68                }
69                _ => {
70                    // This seems to be the only way to determine if this is a glob pattern
71                    let is_glob = glob::Pattern::escape(input) != input;
72
73                    if is_glob {
74                        InputSource::FsGlob {
75                            pattern: input.to_owned(),
76                            ignore_case: glob_ignore_case,
77                        }
78                    } else {
79                        // It might be a file path; check if it exists
80                        let path = PathBuf::from(input);
81
82                        // On Windows, a filepath can never be mistaken for a
83                        // URL, because Windows filepaths use `\` and URLs use
84                        // `/`
85                        #[cfg(windows)]
86                        if path.exists() {
87                            // The file exists, so we return the path
88                            InputSource::FsPath(path)
89                        } else {
90                            // We have a valid filepath, but the file does not
91                            // exist so we return an error
92                            return Err(ErrorKind::InvalidFile(path));
93                        }
94
95                        #[cfg(unix)]
96                        if path.exists() {
97                            InputSource::FsPath(path)
98                        } else if input.starts_with('~') || input.starts_with('.') {
99                            // The path is not valid, but it might still be a
100                            // valid URL.
101                            //
102                            // Check if the path starts with a tilde (`~`) or a
103                            // dot and exit early if it does.
104                            //
105                            // This check might not be sufficient to cover all cases
106                            // but it catches the most common ones
107                            return Err(ErrorKind::InvalidFile(path));
108                        } else {
109                            // Invalid path; check if a valid URL can be constructed from the input
110                            // by prefixing it with a `http://` scheme.
111                            //
112                            // Curl also uses http (i.e. not https), see
113                            // https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
114                            //
115                            // TODO: We should get rid of this heuristic and
116                            // require users to provide a full URL with scheme.
117                            // This is a big source of confusion to users.
118                            let url = Url::parse(&format!("http://{input}")).map_err(|e| {
119                                ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string())
120                            })?;
121                            InputSource::RemoteUrl(Box::new(url))
122                        }
123                    }
124                }
125            }
126        };
127        Ok(Self {
128            source,
129            file_type_hint,
130        })
131    }
132
133    /// Convenience constructor with default settings
134    ///
135    /// # Errors
136    ///
137    /// Returns an error if:
138    /// - the input does not exist (i.e. the path is invalid)
139    /// - the input cannot be parsed as a URL
140    pub fn from_value(value: &str) -> Result<Self> {
141        Self::new(value, None, false)
142    }
143
144    /// Create an `Input` from an existing `InputSource`
145    ///
146    /// The file type will be determined later when processing the input.
147    #[must_use]
148    pub const fn from_input_source(source: InputSource) -> Self {
149        Self {
150            source,
151            file_type_hint: None,
152        }
153    }
154
155    /// Retrieve the contents from the input
156    ///
157    /// If the input is a path, only search through files that match the given
158    /// file extensions.
159    ///
160    /// # Errors
161    ///
162    /// Returns an error if the contents can not be retrieved because of an
163    /// underlying I/O error (e.g. an error while making a network request or
164    /// retrieving the contents from the file system)
165    pub fn get_contents(
166        self,
167        skip_missing: bool,
168        skip_hidden: bool,
169        skip_gitignored: bool,
170        file_extensions: FileExtensions,
171        resolver: UrlContentResolver,
172        excluded_paths: PathExcludes,
173    ) -> impl Stream<Item = Result<InputContent>> {
174        try_stream! {
175            // Handle simple cases that don't need resolution
176            match self.source {
177                InputSource::RemoteUrl(url) => {
178                    match resolver.url_contents(*url).await {
179                        Err(_) if skip_missing => (),
180                        Err(e) => Err(e)?,
181                        Ok(content) => yield content,
182                    }
183                    return;
184                }
185                InputSource::Stdin => {
186                    yield Self::stdin_content(self.file_type_hint).await?;
187                    return;
188                }
189                InputSource::String(ref s) => {
190                    yield Self::string_content(s, self.file_type_hint);
191                    return;
192                }
193                _ => {}
194            }
195
196            // Handle complex cases that need resolution (FsPath, FsGlob)
197            let mut sources_stream = Box::pin(InputResolver::resolve(
198                &self,
199                file_extensions,
200                skip_hidden,
201                skip_gitignored,
202                &excluded_paths,
203            ));
204
205            let mut sources_empty = true;
206
207            while let Some(source_result) = sources_stream.next().await {
208                match source_result {
209                    Ok(source) => {
210                        let content_result = match source {
211                            ResolvedInputSource::FsPath(path) => {
212                                Self::path_content(&path).await
213                            },
214                            ResolvedInputSource::RemoteUrl(url) => {
215                                resolver.url_contents(*url).await
216                            },
217                            ResolvedInputSource::Stdin => {
218                                Self::stdin_content(self.file_type_hint).await
219                            },
220                            ResolvedInputSource::String(s) => {
221                                Ok(Self::string_content(&s, self.file_type_hint))
222                            },
223                        };
224
225                        match content_result {
226                            Err(_) if skip_missing => (),
227                            Err(e) if matches!(&e, ErrorKind::ReadFileInput(io_err, _) if io_err.kind() == std::io::ErrorKind::InvalidData) => {
228                                // If the file contains invalid UTF-8 (e.g. binary), we skip it
229                                if let ErrorKind::ReadFileInput(_, path) = &e {
230                                    log::warn!("Skipping file with invalid UTF-8 content: {}", path.display());
231                                }
232                            },
233                            Err(e) => Err(e)?,
234                            Ok(content) => {
235                                sources_empty = false;
236                                yield content
237                            }
238                        }
239                    },
240                    Err(e) => Err(e)?,
241                }
242            }
243
244            if sources_empty {
245                log::warn!("{}: No files found for this input source", self.source);
246            }
247        }
248    }
249
250    /// Create a `WalkBuilder` for directory traversal
251    fn walk_entries(
252        path: &Path,
253        file_extensions: FileExtensions,
254        skip_hidden: bool,
255        skip_gitignored: bool,
256    ) -> Result<ignore::Walk> {
257        Ok(WalkBuilder::new(path)
258            // Enable standard filters if `skip_gitignored `is true.
259            // This will skip files ignored by `.gitignore` and other VCS ignore files.
260            .standard_filters(skip_gitignored)
261            // Override hidden file behavior to be controlled by the separate skip_hidden parameter
262            .hidden(skip_hidden)
263            // Configure the file types filter to only include files with matching extensions
264            .types(file_extensions.try_into()?)
265            .build())
266    }
267
268    /// Retrieve all sources from this input. The output depends on the type of
269    /// input:
270    ///
271    /// - Remote URLs are returned as is, in their full form
272    /// - Filepath Glob Patterns are expanded and each matched entry is returned
273    /// - Absolute or relative filepaths are returned as is
274    /// - All other input types are not returned
275    ///
276    /// # Errors
277    ///
278    /// Returns an error if:
279    /// - The glob pattern is invalid or expansion encounters I/O errors
280    /// - Directory traversal fails, including:
281    ///   - Permission denied when accessing directories or files
282    ///   - I/O errors while reading directory contents
283    ///   - Filesystem errors (disk errors, network filesystem issues, etc.)
284    ///   - Invalid file paths or symbolic link resolution failures
285    /// - Errors when reading or evaluating `.gitignore` or `.ignore` files
286    /// - Errors occur during file extension or path exclusion evaluation
287    ///
288    /// Note: Individual glob match failures are logged to stderr but don't terminate the stream.
289    /// However, directory traversal errors will stop processing and return the error immediately.
290    pub fn get_sources(
291        self,
292        file_extensions: FileExtensions,
293        skip_hidden: bool,
294        skip_gitignored: bool,
295        excluded_paths: &PathExcludes,
296    ) -> impl Stream<Item = Result<String>> {
297        try_stream! {
298            match self.source {
299                InputSource::RemoteUrl(url) => yield url.to_string(),
300                InputSource::FsGlob {
301                    ref pattern,
302                    ignore_case,
303                } => {
304                    let glob_expanded = tilde(&pattern).to_string();
305                    let mut match_opts = glob::MatchOptions::new();
306                    match_opts.case_sensitive = !ignore_case;
307                    for entry in glob_with(&glob_expanded, match_opts)? {
308                        match entry {
309                            Ok(path) => {
310                                if !Self::is_excluded_path(&path, excluded_paths) {
311                                    yield path.to_string_lossy().to_string();
312                                }
313                            },
314                            Err(e) => eprintln!("{e:?}"),
315                        }
316                    }
317                }
318                InputSource::FsPath(ref path) => {
319                    if path.is_dir() {
320                        for entry in Input::walk_entries(
321                            path,
322                            file_extensions,
323                            skip_hidden,
324                            skip_gitignored,
325                        )? {
326                            let entry = entry?;
327                            if !Self::is_excluded_path(entry.path(), excluded_paths) {
328                                // Only yield files, not directories
329                                if entry.file_type().is_some_and(|ft| ft.is_file()) {
330                                    yield entry.path().to_string_lossy().to_string();
331                                }
332                            }
333                        }
334                    } else if !Self::is_excluded_path(path, excluded_paths) {
335                        yield path.to_string_lossy().to_string();
336                    }
337                }
338                InputSource::Stdin => yield "<stdin>".into(),
339                InputSource::String(_) => yield "<raw string>".into(),
340            }
341        }
342    }
343
344    /// Check if the given path was excluded from link checking
345    fn is_excluded_path(path: &Path, excluded_paths: &PathExcludes) -> bool {
346        excluded_paths.is_match(&path.to_string_lossy())
347    }
348
349    /// Get the content for a given path.
350    ///
351    /// # Errors
352    ///
353    /// Returns an error if the file cannot be read
354    pub async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(
355        path: P,
356    ) -> Result<InputContent> {
357        let path = path.into();
358
359        let content = tokio::fs::read_to_string(&path)
360            .await
361            .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?;
362
363        let input_content = InputContent {
364            file_type: FileType::from(&path),
365            source: ResolvedInputSource::FsPath(path),
366            content,
367        };
368
369        Ok(input_content)
370    }
371
372    /// Create `InputContent` from stdin.
373    ///
374    /// # Errors
375    ///
376    /// Returns an error if stdin cannot be read
377    pub async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
378        let mut content = String::new();
379        let mut stdin = stdin();
380        stdin.read_to_string(&mut content).await?;
381
382        let input_content = InputContent {
383            source: ResolvedInputSource::Stdin,
384            file_type: file_type_hint.unwrap_or_default(),
385            content,
386        };
387
388        Ok(input_content)
389    }
390
391    /// Create `InputContent` from a string.
392    #[must_use]
393    pub fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
394        InputContent::from_string(s, file_type_hint.unwrap_or_default())
395    }
396}
397
398impl TryFrom<&str> for Input {
399    type Error = crate::ErrorKind;
400
401    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
402        Self::from_value(value)
403    }
404}
405
406#[cfg(test)]
407mod tests {
408    use super::*;
409    use crate::filter::PathExcludes;
410
411    /// A standalone function to allow for easier testing of path exclusion logic
412    pub fn is_excluded_path(excluded_paths: &PathExcludes, path: &Path) -> bool {
413        excluded_paths.is_match(&path.to_string_lossy())
414    }
415
416    #[test]
417    fn test_input_handles_real_relative_paths() {
418        let test_file = "./Cargo.toml";
419        let path = Path::new(test_file);
420
421        assert!(path.exists());
422        assert!(path.is_relative());
423
424        let input = Input::new(test_file, None, false);
425        assert!(input.is_ok());
426        assert!(matches!(
427            input,
428            Ok(Input {
429                source: InputSource::FsPath(PathBuf { .. }),
430                file_type_hint: None,
431            })
432        ));
433    }
434
435    #[test]
436    fn test_input_handles_nonexistent_relative_paths() {
437        let test_file = "./nonexistent/relative/path";
438        let path = Path::new(test_file);
439
440        assert!(!path.exists());
441        assert!(path.is_relative());
442
443        let input = Input::from_value(test_file);
444        assert!(input.is_err());
445        assert!(matches!(input, Err(ErrorKind::InvalidFile(PathBuf { .. }))));
446    }
447
448    #[test]
449    fn test_no_exclusions() {
450        let dir = tempfile::tempdir().unwrap();
451        assert!(!is_excluded_path(&PathExcludes::empty(), dir.path()));
452    }
453
454    #[test]
455    fn test_excluded() {
456        let dir = tempfile::tempdir().unwrap();
457        let path = dir.path();
458        let excludes = PathExcludes::new([path.to_string_lossy()]).unwrap();
459        assert!(is_excluded_path(&excludes, path));
460    }
461
462    #[test]
463    fn test_excluded_subdir() {
464        let parent_dir = tempfile::tempdir().unwrap();
465        let parent = parent_dir.path();
466        let child_dir = tempfile::tempdir_in(parent).unwrap();
467        let child = child_dir.path();
468
469        let excludes = PathExcludes::new([parent.to_string_lossy()]).unwrap();
470        assert!(is_excluded_path(&excludes, child));
471    }
472
473    #[test]
474    fn test_url_without_scheme() {
475        let input = Input::from_value("example.com");
476        assert_eq!(
477            input.unwrap().source.to_string(),
478            String::from("http://example.com/")
479        );
480    }
481
482    // Ensure that a Windows file path is not mistaken for a URL.
483    #[cfg(windows)]
484    #[test]
485    fn test_windows_style_filepath_not_existing() {
486        let input = Input::from_value("C:\\example\\project\\here");
487        assert!(input.is_err());
488        let input = input.unwrap_err();
489
490        match input {
491            ErrorKind::InvalidFile(_) => (),
492            _ => panic!("Should have received InvalidFile error"),
493        }
494    }
495
496    // Ensure that a Windows-style file path to an existing file is recognized
497    #[cfg(windows)]
498    #[test]
499    fn test_windows_style_filepath_existing() {
500        use std::env::temp_dir;
501        use tempfile::NamedTempFile;
502
503        let dir = temp_dir();
504        let file = NamedTempFile::new_in(dir).unwrap();
505        let path = file.path();
506        let input = Input::from_value(path.to_str().unwrap()).unwrap();
507
508        match input.source {
509            InputSource::FsPath(_) => (),
510            _ => panic!("Input source should be FsPath but was not"),
511        }
512    }
513
514    #[test]
515    fn test_url_scheme_check_succeeding() {
516        // Valid http and https URLs
517        assert!(matches!(
518            Input::from_value("http://example.com"),
519            Ok(Input {
520                source: InputSource::RemoteUrl(_),
521                ..
522            })
523        ));
524        assert!(matches!(
525            Input::from_value("https://example.com"),
526            Ok(Input {
527                source: InputSource::RemoteUrl(_),
528                ..
529            })
530        ));
531        assert!(matches!(
532            Input::from_value("http://subdomain.example.com/path?query=value",),
533            Ok(Input {
534                source: InputSource::RemoteUrl(_),
535                ..
536            })
537        ));
538        assert!(matches!(
539            Input::from_value("https://example.com:8080"),
540            Ok(Input {
541                source: InputSource::RemoteUrl(_),
542                ..
543            })
544        ));
545    }
546
547    #[test]
548    fn test_url_scheme_check_failing() {
549        // Invalid schemes
550        assert!(matches!(
551            Input::from_value("ftp://example.com"),
552            Err(ErrorKind::InvalidFile(_))
553        ));
554        assert!(matches!(
555            Input::from_value("httpx://example.com"),
556            Err(ErrorKind::InvalidFile(_))
557        ));
558        assert!(matches!(
559            Input::from_value("file:///path/to/file"),
560            Err(ErrorKind::InvalidFile(_))
561        ));
562        assert!(matches!(
563            Input::from_value("mailto:user@example.com"),
564            Err(ErrorKind::InvalidFile(_))
565        ));
566    }
567
568    #[test]
569    fn test_non_url_inputs() {
570        // Non-URL inputs
571        assert!(matches!(
572            Input::from_value("./local/path"),
573            Err(ErrorKind::InvalidFile(_))
574        ));
575        assert!(matches!(
576            Input::from_value("*.md"),
577            Ok(Input {
578                source: InputSource::FsGlob { .. },
579                ..
580            })
581        ));
582        // Assuming the current directory exists
583        assert!(matches!(
584            Input::from_value("."),
585            Ok(Input {
586                source: InputSource::FsPath(_),
587                ..
588            })
589        ));
590    }
591}