lychee_lib/types/input/source.rs
//! Input source type definitions.
//!
//! lychee can handle different kinds of input sources:
//! - URLs (of HTTP/HTTPS scheme)
//! - File system paths (to files or directories)
//! - Unix shell-style glob patterns (e.g. `./docs/**/*.md`)
//! - Standard input (`stdin`)
//! - Raw strings (UTF-8 only for now)
//!
//! Each input source is handled differently:
//! - File paths are walked (if they are directories) and filtered by
//!   extension
//! - Glob patterns are expanded to matching file paths, which are then walked
//!   and filtered by extension
//! - URLs, raw strings, and standard input (`stdin`) are read directly
16
17use crate::ErrorKind;
18
19use glob::Pattern;
20use reqwest::Url;
21use serde::{Deserialize, Deserializer, Serialize};
22use std::borrow::Cow;
23use std::fmt::Display;
24use std::path::PathBuf;
25use std::result::Result;
26
27/// Input types which lychee supports
28#[derive(Debug, Clone, PartialEq, Eq, Hash, Deserialize)]
29#[non_exhaustive]
30pub enum InputSource {
31 /// URL (of HTTP/HTTPS scheme).
32 RemoteUrl(Box<Url>),
33 /// Unix shell-style glob pattern.
34 FsGlob {
35 /// The glob pattern matching all input files
36 #[serde(deserialize_with = "InputSource::deserialize_pattern")]
37 pattern: Pattern,
38 /// Don't be case sensitive when matching files against a glob pattern
39 ignore_case: bool,
40 },
41 /// File path.
42 FsPath(PathBuf),
43 /// Standard Input.
44 Stdin,
45 /// Raw string input.
46 String(Cow<'static, str>),
47}
48
49impl InputSource {
50 const STDIN: &str = "-";
51
52 /// Parses a [`InputSource`] from the given string. The kind of input source will be
53 /// automatically detected according to certain rules and precedences.
54 ///
55 /// # Errors
56 ///
57 /// Returns an error if:
58 /// - the input does not exist (i.e. the path is invalid)
59 /// - the input cannot be parsed as a URL
60 pub fn new(input: &str, glob_ignore_case: bool) -> Result<Self, ErrorKind> {
61 if input == Self::STDIN {
62 return Ok(InputSource::Stdin);
63 }
64
65 // We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
66 if let Ok(url) = Url::parse(input) {
67 // Weed out non-HTTP schemes, including Windows drive
68 // specifiers, which can be parsed by the
69 // [url](https://crates.io/crates/url) crate
70 return match url.scheme() {
71 "http" | "https" => Ok(InputSource::RemoteUrl(Box::new(url))),
72 _ => Err(ErrorKind::InvalidFile(PathBuf::from(input))),
73 };
74 }
75
76 // This seems to be the only way to determine if this is a glob pattern
77 let is_glob = glob::Pattern::escape(input) != input;
78
79 if is_glob {
80 return Ok(InputSource::FsGlob {
81 pattern: Pattern::new(input)?,
82 ignore_case: glob_ignore_case,
83 });
84 }
85
86 // It might be a file path; check if it exists
87 let path = PathBuf::from(input);
88
89 // On Windows, a filepath can never be mistaken for a
90 // URL, because Windows filepaths use `\` and URLs use
91 // `/`
92 #[cfg(windows)]
93 if path.exists() {
94 // The file exists, so we return the path
95 Ok(InputSource::FsPath(path))
96 } else {
97 // We have a valid filepath, but the file does not
98 // exist so we return an error
99 Err(ErrorKind::InvalidFile(path))
100 }
101
102 #[cfg(unix)]
103 if path.exists() {
104 Ok(InputSource::FsPath(path))
105 } else if input.starts_with('~') || input.starts_with('.') {
106 // The path is not valid, but it might still be a
107 // valid URL.
108 //
109 // Check if the path starts with a tilde (`~`) or a
110 // dot and exit early if it does.
111 //
112 // This check might not be sufficient to cover all cases
113 // but it catches the most common ones
114 Err(ErrorKind::InvalidFile(path))
115 } else {
116 // Invalid path; check if a valid URL can be constructed from the input
117 // by prefixing it with a `http://` scheme.
118 //
119 // Curl also uses http (i.e. not https), see
120 // https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
121 //
122 // TODO: We should get rid of this heuristic and
123 // require users to provide a full URL with scheme.
124 // This is a big source of confusion to users.
125 let url = Url::parse(&format!("http://{input}"))
126 .map_err(|e| ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string()))?;
127 Ok(InputSource::RemoteUrl(Box::new(url)))
128 }
129 }
130
131 fn deserialize_pattern<'de, D>(deserializer: D) -> Result<Pattern, D::Error>
132 where
133 D: Deserializer<'de>,
134 {
135 use serde::de::Error;
136 let s = String::deserialize(deserializer)?;
137 Pattern::new(&s).map_err(D::Error::custom)
138 }
139}
140
141/// Resolved input sources that can be processed for content.
142///
143/// This represents input sources after glob pattern expansion.
144/// It is identical to `InputSource`, except that glob patterns
145/// have been resolved to concrete file paths.
146///
147/// We use a separate type to avoid handling the (no longer applicable)
148/// glob case in downstream processing.
149#[derive(Debug, Clone, PartialEq, Eq, Hash)]
150pub enum ResolvedInputSource {
151 /// URL (of HTTP/HTTPS scheme).
152 RemoteUrl(Box<Url>),
153 /// File path.
154 FsPath(PathBuf),
155 /// Standard Input.
156 Stdin,
157 /// Raw string input.
158 String(Cow<'static, str>),
159}
160
161impl From<ResolvedInputSource> for InputSource {
162 fn from(resolved: ResolvedInputSource) -> Self {
163 match resolved {
164 ResolvedInputSource::RemoteUrl(url) => InputSource::RemoteUrl(url),
165 ResolvedInputSource::FsPath(path) => InputSource::FsPath(path),
166 ResolvedInputSource::Stdin => InputSource::Stdin,
167 ResolvedInputSource::String(s) => InputSource::String(s),
168 }
169 }
170}
171
172impl Display for ResolvedInputSource {
173 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
174 f.write_str(match self {
175 Self::RemoteUrl(url) => url.as_str(),
176 Self::FsPath(path) => path.to_str().unwrap_or_default(),
177 Self::Stdin => "stdin",
178 Self::String(s) => s.as_ref(),
179 })
180 }
181}
182
183/// Custom serialization for the `InputSource` enum.
184///
185/// This implementation serializes all variants as strings to ensure
186/// compatibility with JSON serialization, which requires string keys for enums.
187///
188/// Without this custom implementation, attempting to serialize `InputSource` to
189/// JSON would result in a "key must be a string" error.
190///
191/// See: <https://github.com/serde-rs/json/issues/45>
192impl Serialize for InputSource {
193 fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
194 where
195 S: serde::Serializer,
196 {
197 serializer.collect_str(self)
198 }
199}
200
201impl Display for InputSource {
202 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
203 f.write_str(match self {
204 Self::RemoteUrl(url) => url.as_str(),
205 Self::FsGlob { pattern, .. } => pattern.as_str(),
206 Self::FsPath(path) => path.to_str().unwrap_or_default(),
207 Self::Stdin => "stdin",
208 Self::String(s) => s.as_ref(),
209 })
210 }
211}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// An `FsGlob` is serialized as the text of its [`glob::Pattern`].
    /// This pins that the serialized form is exactly the pattern string the
    /// `Pattern` was built from.
    #[test]
    fn test_pattern_serialization_is_original_pattern() {
        let raw_pattern = "asd[f]*";
        let source = InputSource::FsGlob {
            pattern: Pattern::new(raw_pattern).unwrap(),
            ignore_case: false,
        };

        let serialized_source = serde_json::to_string(&source).unwrap();
        let serialized_pattern = serde_json::to_string(raw_pattern).unwrap();

        assert_eq!(serialized_source, serialized_pattern);
    }
}