lychee_lib/types/input/
source.rs

1//! Input source type definitions.
2//!
3//! lychee can handle different kinds of input sources:
4//! - URLs (of HTTP/HTTPS scheme)
5//! - File system paths (to files or directories)
6//! - Unix shell-style glob patterns (e.g. `./docs/**/*.md`)
7//! - Standard input (`stdin`)
8//! - Raw strings (UTF-8 only for now)
9//!
10//! Each input source is handled differently:
11//! - File paths are walked (if they are directories) and filtered by
12//!   extension
13//! - Glob patterns are expanded to matching file paths, which are then walked
14//!   and filtered by extension
15//! - URLs, raw strings, and standard input (`stdin`) are read directly
16
17use crate::ErrorKind;
18
19use glob::Pattern;
20use reqwest::Url;
21use serde::{Deserialize, Deserializer, Serialize};
22use std::borrow::Cow;
23use std::fmt::Display;
24use std::path::PathBuf;
25use std::result::Result;
26
/// Input types which lychee supports.
///
/// Values are usually created with [`InputSource::new`], which detects the
/// kind of input from a plain string.
///
/// Note that serialization is implemented by hand further down in this file
/// (every variant is written as the string produced by its `Display`
/// implementation), whereas deserialization is derived.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Deserialize)]
#[non_exhaustive]
pub enum InputSource {
    /// URL (of HTTP/HTTPS scheme).
    RemoteUrl(Box<Url>),
    /// Unix shell-style glob pattern.
    FsGlob {
        /// The glob pattern matching all input files.
        ///
        /// Deserialized from its string form by
        /// `InputSource::deserialize_pattern`.
        #[serde(deserialize_with = "InputSource::deserialize_pattern")]
        pattern: Pattern,
        /// Whether files should be matched against the glob pattern
        /// case-insensitively.
        ignore_case: bool,
    },
    /// File path.
    ///
    /// [`InputSource::new`] only produces this variant for paths that exist.
    FsPath(PathBuf),
    /// Standard Input.
    ///
    /// [`InputSource::new`] maps the input `-` to this variant.
    Stdin,
    /// Raw string input.
    String(Cow<'static, str>),
}
48
49impl InputSource {
50    const STDIN: &str = "-";
51
52    /// Parses a [`InputSource`] from the given string. The kind of input source will be
53    /// automatically detected according to certain rules and precedences.
54    ///
55    /// # Errors
56    ///
57    /// Returns an error if:
58    /// - the input does not exist (i.e. the path is invalid)
59    /// - the input cannot be parsed as a URL
60    pub fn new(input: &str, glob_ignore_case: bool) -> Result<Self, ErrorKind> {
61        if input == Self::STDIN {
62            return Ok(InputSource::Stdin);
63        }
64
65        // We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
66        if let Ok(url) = Url::parse(input) {
67            // Weed out non-HTTP schemes, including Windows drive
68            // specifiers, which can be parsed by the
69            // [url](https://crates.io/crates/url) crate
70            return match url.scheme() {
71                "http" | "https" => Ok(InputSource::RemoteUrl(Box::new(url))),
72                _ => Err(ErrorKind::InvalidFile(PathBuf::from(input))),
73            };
74        }
75
76        // This seems to be the only way to determine if this is a glob pattern
77        let is_glob = glob::Pattern::escape(input) != input;
78
79        if is_glob {
80            return Ok(InputSource::FsGlob {
81                pattern: Pattern::new(input)?,
82                ignore_case: glob_ignore_case,
83            });
84        }
85
86        // It might be a file path; check if it exists
87        let path = PathBuf::from(input);
88
89        // On Windows, a filepath can never be mistaken for a
90        // URL, because Windows filepaths use `\` and URLs use
91        // `/`
92        #[cfg(windows)]
93        if path.exists() {
94            // The file exists, so we return the path
95            Ok(InputSource::FsPath(path))
96        } else {
97            // We have a valid filepath, but the file does not
98            // exist so we return an error
99            Err(ErrorKind::InvalidFile(path))
100        }
101
102        #[cfg(unix)]
103        if path.exists() {
104            Ok(InputSource::FsPath(path))
105        } else if input.starts_with('~') || input.starts_with('.') {
106            // The path is not valid, but it might still be a
107            // valid URL.
108            //
109            // Check if the path starts with a tilde (`~`) or a
110            // dot and exit early if it does.
111            //
112            // This check might not be sufficient to cover all cases
113            // but it catches the most common ones
114            Err(ErrorKind::InvalidFile(path))
115        } else {
116            // Invalid path; check if a valid URL can be constructed from the input
117            // by prefixing it with a `http://` scheme.
118            //
119            // Curl also uses http (i.e. not https), see
120            // https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
121            //
122            // TODO: We should get rid of this heuristic and
123            // require users to provide a full URL with scheme.
124            // This is a big source of confusion to users.
125            let url = Url::parse(&format!("http://{input}"))
126                .map_err(|e| ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string()))?;
127            Ok(InputSource::RemoteUrl(Box::new(url)))
128        }
129    }
130
131    fn deserialize_pattern<'de, D>(deserializer: D) -> Result<Pattern, D::Error>
132    where
133        D: Deserializer<'de>,
134    {
135        use serde::de::Error;
136        let s = String::deserialize(deserializer)?;
137        Pattern::new(&s).map_err(D::Error::custom)
138    }
139}
140
/// Resolved input sources that can be processed for content.
///
/// This represents input sources after glob pattern expansion.
/// It is identical to `InputSource`, except that glob patterns
/// have been resolved to concrete file paths.
///
/// We use a separate type to avoid handling the (no longer applicable)
/// glob case in downstream processing.
///
/// A resolved source can be converted back into an `InputSource` through
/// the `From<ResolvedInputSource>` implementation in this file.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ResolvedInputSource {
    /// URL (of HTTP/HTTPS scheme).
    RemoteUrl(Box<Url>),
    /// File path.
    FsPath(PathBuf),
    /// Standard Input.
    Stdin,
    /// Raw string input.
    String(Cow<'static, str>),
}
160
161impl From<ResolvedInputSource> for InputSource {
162    fn from(resolved: ResolvedInputSource) -> Self {
163        match resolved {
164            ResolvedInputSource::RemoteUrl(url) => InputSource::RemoteUrl(url),
165            ResolvedInputSource::FsPath(path) => InputSource::FsPath(path),
166            ResolvedInputSource::Stdin => InputSource::Stdin,
167            ResolvedInputSource::String(s) => InputSource::String(s),
168        }
169    }
170}
171
172impl Display for ResolvedInputSource {
173    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
174        f.write_str(match self {
175            Self::RemoteUrl(url) => url.as_str(),
176            Self::FsPath(path) => path.to_str().unwrap_or_default(),
177            Self::Stdin => "stdin",
178            Self::String(s) => s.as_ref(),
179        })
180    }
181}
182
183/// Custom serialization for the `InputSource` enum.
184///
185/// This implementation serializes all variants as strings to ensure
186/// compatibility with JSON serialization, which requires string keys for enums.
187///
188/// Without this custom implementation, attempting to serialize `InputSource` to
189/// JSON would result in a "key must be a string" error.
190///
191/// See: <https://github.com/serde-rs/json/issues/45>
192impl Serialize for InputSource {
193    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
194    where
195        S: serde::Serializer,
196    {
197        serializer.collect_str(self)
198    }
199}
200
201impl Display for InputSource {
202    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
203        f.write_str(match self {
204            Self::RemoteUrl(url) => url.as_str(),
205            Self::FsGlob { pattern, .. } => pattern.as_str(),
206            Self::FsPath(path) => path.to_str().unwrap_or_default(),
207            Self::Stdin => "stdin",
208            Self::String(s) => s.as_ref(),
209        })
210    }
211}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// An `FsGlob` source is serialized via its `Display` output, which is
    /// the pattern's string form. This test checks that the serialized value
    /// is exactly the pattern text the source was built from.
    #[test]
    fn test_pattern_serialization_is_original_pattern() {
        let raw = "asd[f]*";
        let source = InputSource::FsGlob {
            pattern: Pattern::new(raw).unwrap(),
            ignore_case: false,
        };

        let actual = serde_json::to_string(&source).unwrap();
        let expected = serde_json::to_string(raw).unwrap();

        assert_eq!(actual, expected);
    }
}