linguist/
resolver.rs

1use std::collections::HashMap;
2use std::ffi::OsString;
3use std::fmt::Display;
4use std::io::{BufRead, BufReader};
5use std::path::Path;
6
7#[cfg(feature = "matcher")]
8use regex::Regex;
9
10use crate::container::Container;
11use crate::error::LinguistError;
12use crate::utils::{determine_multiline_exec, has_shebang, is_binary};
13
14/// A `Language` exposes the properties of a language definition.
15#[derive(Debug, Clone, PartialEq, Eq)]
16#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
17pub struct Language {
18    pub parent: Option<String>,
19    pub name: String,
20    pub aliases: Vec<String>,
21    pub scope: Scope,
22    pub extensions: Vec<OsString>,
23    pub filenames: Vec<OsString>,
24    pub interpreters: Vec<String>,
25    pub color: Option<String>,
26}
27
28impl Display for Language {
29    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30        write!(f, "{}", self.name)
31    }
32}
33
/// The type of a [`Language`].
///
/// Values are normally produced from a type string through the `From<&str>` / `From<String>`
/// conversions, which lower-case their input before comparing.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
pub enum Scope {
    /// Produced from the type string `"programming"`.
    Programming,
    /// Produced from the type string `"markup"`.
    Markup,
    /// Produced from the type string `"data"`.
    Data,
    /// Produced from the type string `"prose"`.
    Prose,
    /// Fallback for every type string that is none of the above.
    Unknown,
}
44
45impl From<String> for Scope {
46    fn from(value: String) -> Self {
47        match value.to_lowercase().as_str() {
48            "programming" => Scope::Programming,
49            "markup" => Scope::Markup,
50            "data" => Scope::Data,
51            "prose" => Scope::Prose,
52            _ => Scope::Unknown,
53        }
54    }
55}
56
57impl From<&str> for Scope {
58    fn from(value: &str) -> Self {
59        match value.to_lowercase().as_str() {
60            "programming" => Scope::Programming,
61            "markup" => Scope::Markup,
62            "data" => Scope::Data,
63            "prose" => Scope::Prose,
64            _ => Scope::Unknown,
65        }
66    }
67}
68
69impl std::fmt::Display for Scope {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        match self {
72            Scope::Programming => write!(f, "Programming"),
73            Scope::Markup => write!(f, "Markup"),
74            Scope::Data => write!(f, "Data"),
75            Scope::Prose => write!(f, "Prose"),
76            Scope::Unknown => write!(f, "Unknown"),
77        }
78    }
79}
80
/// A `HeuristicRule` represents a check for a [`Language`] based on the content of a file.
///
/// Serde support is derived when either the `serde` feature (the gate used by [`Language`] and
/// [`Scope`]) or the `matcher` feature is enabled, so the type is serializable whenever its
/// sibling types are.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(
    any(feature = "serde", feature = "matcher"),
    derive(serde::Serialize, serde::Deserialize)
)]
pub struct HeuristicRule {
    /// The reference to the [`Language`] that is matched by this rule.
    pub language: String,
    /// A list of extensions that are used to check whether this rule applies.
    pub extensions: Vec<OsString>,
    /// A list of regular-expression patterns that are used to check whether this rule applies.
    /// The content resolver joins them with `|`, so one matching pattern is enough.
    pub patterns: Vec<String>,
}
92
93/// Used to resolve all possible [`Language`]s by the given filename. 
94pub fn resolve_languages_by_filename(
95    file: impl AsRef<Path>,
96    container: &impl Container,
97) -> Result<Vec<&Language>, LinguistError> {
98    match container.get_languages_by_filename(file) {
99        Some(langs) => Ok(langs),
100        _ => Err(LinguistError::LanguageNotFound),
101    }
102}
103
104/// Used to resolve all possible [`Language`]s by the given extension.
105pub fn resolve_languages_by_extension(
106    file: impl AsRef<Path>,
107    container: &impl Container,
108) -> Result<Vec<&Language>, LinguistError> {
109    match container.get_languages_by_extension(file) {
110        Some(langs) => Ok(langs),
111        _ => Err(LinguistError::LanguageNotFound),
112    }
113}
114
/// Resolves a single [`Language`] by matching the file contents against heuristic rules.
///
/// The whole file is read as UTF-8 text. The heuristic rules the container returns for the
/// file's extension are tried in order; the patterns of a rule are joined with `|` into one
/// regular expression, and the first rule whose expression matches decides the result.
///
/// Returns `Ok(None)` when a rule matched but the container does not know the language the
/// rule refers to.
///
/// # Errors
///
/// * [`LinguistError::FileNotFound`] when the file cannot be read as text,
/// * the converted `regex` error when a rule's patterns do not compile,
/// * [`LinguistError::LanguageNotFound`] when there are no rules or none of them matched.
#[cfg(feature = "matcher")]
pub fn resolve_language_by_content(
    file: impl AsRef<Path>,
    container: &impl Container,
) -> Result<Option<&Language>, LinguistError> {
    let path: &Path = file.as_ref();
    let content = std::fs::read_to_string(path).map_err(|_| LinguistError::FileNotFound)?;

    // A missing rule set simply yields an empty iteration.
    for rule in container
        .get_heuristics_by_extension(path)
        .into_iter()
        .flatten()
    {
        let expression = rule.patterns.join("|");
        if Regex::new(&expression)?.is_match(&content) {
            return Ok(container.get_language_by_name(&rule.language));
        }
    }

    Err(LinguistError::LanguageNotFound)
}
138
139/// Used to resolve all possible [`Language`]s by the file contents.
140pub fn resolve_languages_by_shebang(
141    file: impl AsRef<Path>,
142    container: &impl Container,
143) -> Result<Option<Vec<&Language>>, LinguistError> {
144    // load first line of file
145    let file = match std::fs::File::open(&file) {
146        Ok(file) => file,
147        Err(err) => return Err(LinguistError::IOError(err)),
148    };
149    let mut buf = BufReader::new(file);
150    let mut line = String::new();
151    let _ = buf.read_line(&mut line);
152
153    // check whether the first line of the file is a shebang
154    if !has_shebang(line.as_bytes()) {
155        return Ok(None);
156    }
157
158    let line = line[2..].trim();
159    let mut fields = line.split_whitespace().collect::<Vec<&str>>();
160    if fields.is_empty() {
161        return Ok(None);
162    }
163
164    let mut interpreter = Path::new(fields[0])
165        .file_name()
166        .unwrap()
167        .to_str()
168        .unwrap()
169        .to_owned();
170
171    if interpreter == "env" {
172        if fields.len() == 1 {
173            return Ok(None);
174        }
175
176        let env_opt_args = Regex::new(r"^-[a-zA-Z]+$").unwrap();
177        let env_var_args = Regex::new(r"^\$[a-zA-Z_]+$").unwrap();
178
179        let _i = 1;
180        while fields.len() > 2 {
181            if env_opt_args.is_match(fields[1]) || env_var_args.is_match(fields[1]) {
182                fields.remove(1);
183                continue;
184            }
185            break;
186        }
187        interpreter = Path::new(fields[1])
188            .file_name()
189            .unwrap()
190            .to_str()
191            .unwrap()
192            .to_owned();
193    }
194
195    let mut interpreter = interpreter;
196    if interpreter == "sh" {
197        interpreter = determine_multiline_exec(buf.buffer()).unwrap();
198    }
199
200    let python_version = Regex::new(r"^python[0-9]*\.[0-9]*").unwrap();
201    if python_version.is_match(&interpreter) {
202        interpreter = interpreter.split('.').next().unwrap().to_owned();
203    }
204    // If osascript is called with argument -l it could be different language so do not rely on it
205    // To match linguist behavior, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63
206    if interpreter == "osascript" && line.contains("-l") {
207        interpreter = "".to_string();
208    }
209
210    let results = container.get_languages_by_interpreter(&interpreter);
211    if results.is_some() {
212        Ok(results)
213    } else {
214        Ok(None)
215    }
216}
217
218/// Resolve the [`Language`] of the given file. It will try to resolve the language by the filename,
219/// extension, shebang and content. The most likely language will be returned.
220pub fn resolve_language(
221    file: impl AsRef<Path>,
222    container: &impl Container,
223) -> Result<Option<&Language>, LinguistError> {
224    if is_binary(&file)? {
225        return Ok(None);
226    }
227
228    let mut probabilities: HashMap<String, usize> = HashMap::new();
229
230    if let Ok(candidates) = resolve_languages_by_filename(&file, container) {
231        for candidate in candidates {
232            *probabilities
233                .entry(candidate.name.clone().to_lowercase())
234                .or_insert(1) += 1;
235        }
236    }
237
238    if let Ok(Some(candidate)) = resolve_languages_by_shebang(&file, container) {
239        for lang in candidate {
240            *probabilities
241                .entry(lang.name.clone().to_lowercase())
242                .or_insert(1) += 1;
243        }
244    }
245
246    if let Ok(candidates) = resolve_languages_by_extension(&file, container) {
247        for candidate in candidates {
248            *probabilities
249                .entry(candidate.name.clone().to_lowercase())
250                .or_insert(1) += 1;
251        }
252    }
253
254    if let Ok(Some(candidate)) = resolve_language_by_content(&file, container) {
255        *probabilities
256            .entry(candidate.name.clone().to_lowercase())
257            .or_insert(1) += 1;
258    }
259
260    let mut ordered: Vec<(&String, &usize)> = probabilities.iter().collect();
261    ordered.sort_by_key(|&(_, v)| v);
262    ordered.reverse();
263
264    if !ordered.is_empty() {
265        return Ok(Some(
266            container
267                .get_language_by_name(ordered.get(0).unwrap().0)
268                .unwrap(),
269        ));
270    }
271    Err(LinguistError::LanguageNotFound)
272}