Skip to main content

lychee_lib/types/input/
source.rs

//! Input source type definitions.
//!
//! lychee can handle different kinds of input sources:
//! - URLs (of HTTP/HTTPS scheme)
//! - File system paths (to files or directories)
//! - Unix shell-style glob patterns (e.g. `./docs/**/*.md`)
//! - Standard input (`stdin`)
//! - Raw strings (UTF-8 only for now)
//!
//! Each input source is handled differently:
//! - File paths are walked (if they are directories) and filtered by
//!   extension
//! - Glob patterns are expanded to matching file paths, which are then walked
//!   and filtered by extension
//! - URLs, raw strings, and standard input (`stdin`) are read directly
16
17use crate::BaseInfo;
18use crate::ErrorKind;
19use crate::utils;
20
21use glob::Pattern;
22use reqwest::Url;
23use serde::{Deserialize, Deserializer, Serialize};
24use std::borrow::Cow;
25use std::fmt::Display;
26use std::path::PathBuf;
27use std::result::Result;
28
29/// Input types which lychee supports
30#[derive(Debug, Clone, PartialEq, Eq, Hash, Deserialize)]
31#[non_exhaustive]
32pub enum InputSource {
33    /// URL (of HTTP/HTTPS scheme).
34    RemoteUrl(Box<Url>),
35    /// Unix shell-style glob pattern.
36    FsGlob {
37        /// The glob pattern matching all input files
38        #[serde(deserialize_with = "InputSource::deserialize_pattern")]
39        pattern: Pattern,
40        /// Don't be case sensitive when matching files against a glob pattern
41        ignore_case: bool,
42    },
43    /// File path.
44    FsPath(PathBuf),
45    /// Standard Input.
46    Stdin,
47    /// Raw string input.
48    String(Cow<'static, str>),
49}
50
51impl InputSource {
52    const STDIN: &str = "-";
53
54    /// Parses a [`InputSource`] from the given string. The kind of input source will be
55    /// automatically detected according to certain rules and precedences.
56    ///
57    /// # Errors
58    ///
59    /// Returns an error if:
60    /// - the input does not exist (i.e. the path is invalid)
61    /// - the input cannot be parsed as a URL
62    pub fn new(input: &str, glob_ignore_case: bool) -> Result<Self, ErrorKind> {
63        if input == Self::STDIN {
64            return Ok(InputSource::Stdin);
65        }
66
67        if let Ok(url) = utils::url::parse_url_or_path(input) {
68            return Ok(InputSource::RemoteUrl(Box::new(url)));
69        }
70
71        // This seems to be the only way to determine if this is a glob pattern
72        let is_glob = glob::Pattern::escape(input) != input;
73
74        if is_glob {
75            return Ok(InputSource::FsGlob {
76                pattern: Pattern::new(input)?,
77                ignore_case: glob_ignore_case,
78            });
79        }
80
81        // It might be a file path; check if it exists
82        let path = PathBuf::from(input);
83
84        if path.exists() {
85            Ok(InputSource::FsPath(path))
86        } else {
87            Err(ErrorKind::InvalidInput(input.to_owned()))
88        }
89    }
90
91    fn deserialize_pattern<'de, D>(deserializer: D) -> Result<Pattern, D::Error>
92    where
93        D: Deserializer<'de>,
94    {
95        use serde::de::Error;
96        let s = String::deserialize(deserializer)?;
97        Pattern::new(&s).map_err(D::Error::custom)
98    }
99}
100
101/// Resolved input sources that can be processed for content.
102///
103/// This represents input sources after glob pattern expansion.
104/// It is identical to `InputSource`, except that glob patterns
105/// have been resolved to concrete file paths.
106///
107/// We use a separate type to avoid handling the (no longer applicable)
108/// glob case in downstream processing.
109#[derive(Debug, Clone, PartialEq, Eq, Hash)]
110pub enum ResolvedInputSource {
111    /// URL (of HTTP/HTTPS scheme).
112    RemoteUrl(Box<Url>),
113    /// File path.
114    FsPath(PathBuf),
115    /// Standard Input.
116    Stdin,
117    /// Raw string input.
118    String(Cow<'static, str>),
119}
120
121impl ResolvedInputSource {
122    /// Converts a [`ResolvedInputSource::RemoteUrl`] or
123    /// [`ResolvedInputSource::FsPath`] to a [`BaseInfo`] for the source.
124    ///
125    /// For other variants (i.e., those without a URL), [`BaseInfo::None`]
126    /// is returned.
127    ///
128    /// # Errors
129    ///
130    /// Returns an error if building a URL from a [`ResolvedInputSource::FsPath`]
131    /// fails.
132    pub fn to_base_info(&self) -> Result<BaseInfo, ErrorKind> {
133        let url = match self {
134            Self::RemoteUrl(url) => Cow::Borrowed(&**url),
135            Self::FsPath(path) => std::path::absolute(path)
136                .ok()
137                .and_then(|x| Url::from_file_path(x).ok())
138                .map(Cow::Owned)
139                .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned()))?,
140            _ => return Ok(BaseInfo::none()),
141        };
142
143        Ok(BaseInfo::from_source_url(&url))
144    }
145}
146
147impl From<ResolvedInputSource> for InputSource {
148    fn from(resolved: ResolvedInputSource) -> Self {
149        match resolved {
150            ResolvedInputSource::RemoteUrl(url) => InputSource::RemoteUrl(url),
151            ResolvedInputSource::FsPath(path) => InputSource::FsPath(path),
152            ResolvedInputSource::Stdin => InputSource::Stdin,
153            ResolvedInputSource::String(s) => InputSource::String(s),
154        }
155    }
156}
157
158impl Display for ResolvedInputSource {
159    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
160        f.write_str(match self {
161            Self::RemoteUrl(url) => url.as_str(),
162            Self::FsPath(path) => path.to_str().unwrap_or_default(),
163            Self::Stdin => "stdin",
164            Self::String(s) => s.as_ref(),
165        })
166    }
167}
168
169/// Custom serialization for the `InputSource` enum.
170///
171/// This implementation serializes all variants as strings to ensure
172/// compatibility with JSON serialization, which requires string keys for enums.
173///
174/// Without this custom implementation, attempting to serialize `InputSource` to
175/// JSON would result in a "key must be a string" error.
176///
177/// See: <https://github.com/serde-rs/json/issues/45>
178impl Serialize for InputSource {
179    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
180    where
181        S: serde::Serializer,
182    {
183        serializer.collect_str(self)
184    }
185}
186
187impl Display for InputSource {
188    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
189        f.write_str(match self {
190            Self::RemoteUrl(url) => url.as_str(),
191            Self::FsGlob { pattern, .. } => pattern.as_str(),
192            Self::FsPath(path) => path.to_str().unwrap_or_default(),
193            Self::Stdin => "stdin",
194            Self::String(s) => s.as_ref(),
195        })
196    }
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// `FsGlob` is serialized through `Display`, which writes
    /// [`glob::Pattern::as_str`]. This checks that the serialized value is
    /// exactly the text the pattern was built from.
    #[test]
    fn test_pattern_serialization_is_original_pattern() {
        let pat = "asd[f]*";
        let source = InputSource::FsGlob {
            pattern: Pattern::new(pat).unwrap(),
            ignore_case: false,
        };

        let actual = serde_json::to_string(&source).unwrap();
        let expected = serde_json::to_string(pat).unwrap();

        assert_eq!(actual, expected);
    }
}