Skip to main content

hyperpolyglot/
lib.rs

1//! # Hyperpolyglot
2//! `hyperpolyglot` is a fast programming language detector.
3
4use ignore::{overrides::OverrideBuilder, WalkBuilder};
5use std::{
6    collections::HashMap,
7    convert::TryFrom,
8    env, fmt,
9    fs::File,
10    io::{BufReader, Read, Seek, SeekFrom},
11    path::{Path, PathBuf},
12    sync::mpsc,
13};
14
15mod detectors;
16mod filters;
17
18// Include the map that stores language info
19// static LANGUAGE_INFO: phf::Map<&'static str, Language> = ...;
20include!("codegen/language-info-map.rs");
21
/// Maximum number of bytes of a file's content that `detect` keeps for the heuristics and
/// classifier strategies (50 KiB).
const MAX_CONTENT_SIZE_BYTES: usize = 50 * 1024;
23
24/// The language struct that contains the name and other interesting information about a
25/// language.
26///
27/// # Examples
28/// ```
29/// use hyperpolyglot::{Language, LanguageType};
30/// use std::convert::TryFrom;
31///
32/// let language = Language::try_from("Rust").unwrap();
33/// let expected = Language {
34///     name: "Rust",
35///     language_type: LanguageType::Programming,
36///     color: Some("#dea584"),
37///     group: None,
38/// };
39/// assert_eq!(language, expected)
40/// ```
41///
42/// # Errors
43/// `try_from` will error if the langauge name is not one of the known languages
44///
45/// If try_from is called with a language returned from [`detect`] or [`get_language_breakdown`]
46/// the value is guaranteed to be there and can be unwrapped
47#[derive(Debug, Copy, Clone, Eq, PartialEq)]
48pub struct Language {
49    /// The name of the language
50    pub name: &'static str,
51    /// Type of language. ex/ Data, Programming, Markup, Prose
52    pub language_type: LanguageType,
53    /// The css hex color used to represent the language on github. ex/ #dea584
54    pub color: Option<&'static str>,
55    /// Name of the parent language. ex/ The group for TSX would be TypeScript
56    pub group: Option<&'static str>,
57}
58
59impl TryFrom<&str> for Language {
60    type Error = &'static str;
61    fn try_from(name: &str) -> Result<Self, Self::Error> {
62        LANGUAGE_INFO.get(name).copied().ok_or("Language not found")
63    }
64}
65
/// The categories a [`Language`] can be classified under.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LanguageType {
    Data,
    Markup,
    Programming,
    Prose,
}

impl fmt::Display for LanguageType {
    /// Writes the variant's name (`Data`, `Markup`, `Programming` or `Prose`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            LanguageType::Data => "Data",
            LanguageType::Markup => "Markup",
            LanguageType::Programming => "Programming",
            LanguageType::Prose => "Prose",
        };
        f.write_str(label)
    }
}
85
/// The outcome of a successful detection.
///
/// The variant records which strategy identified the language and the wrapped value is the name
/// of that language.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Detection {
    /// The language was identified from the file's name
    Filename(&'static str),
    /// The language was identified from the file's extension
    Extension(&'static str),
    /// The language was identified from the file's shebang line
    Shebang(&'static str),
    /// The language was identified by the content heuristics
    Heuristics(&'static str),
    /// The language was chosen by the classifier
    Classifier(&'static str),
}

impl Detection {
    /// Returns the name of the detected language
    pub fn language(&self) -> &'static str {
        match *self {
            Detection::Filename(language)
            | Detection::Extension(language)
            | Detection::Shebang(language)
            | Detection::Heuristics(language)
            | Detection::Classifier(language) => language,
        }
    }

    /// Returns the name of the strategy used to detect the language
    ///
    /// The name is a string literal, so it is returned as `&'static str` and can outlive the
    /// `Detection` it was obtained from (just like [`Detection::language`]).
    pub fn variant(&self) -> &'static str {
        match self {
            Detection::Filename(_) => "Filename",
            Detection::Extension(_) => "Extension",
            Detection::Shebang(_) => "Shebang",
            Detection::Heuristics(_) => "Heuristics",
            Detection::Classifier(_) => "Classifier",
        }
    }
}
120
121/// Detects the programming language of the file at a given path
122///
123/// If the language cannot be determined, None will be returned.
124/// `detect` will error on an io error or if the parser returns an error when tokenizing the
125/// contents of the file
126///
127/// # Examples
128/// ```
129/// use std::path::Path;
130/// use hyperpolyglot::{detect, Detection};
131///
132/// let path = Path::new("src/bin/main.rs");
133/// let language = detect(path).unwrap().unwrap();
134/// assert_eq!(Detection::Heuristics("Rust"), language);
135/// ```
136pub fn detect(path: &Path) -> Result<Option<Detection>, std::io::Error> {
137    let filename = match path.file_name() {
138        Some(filename) => filename.to_str(),
139        None => return Ok(None),
140    };
141
142    let candidate = filename.and_then(|filename| detectors::get_language_from_filename(filename));
143    if let Some(candidate) = candidate {
144        return Ok(Some(Detection::Filename(candidate)));
145    };
146
147    let extension = filename.and_then(|filename| detectors::get_extension(filename));
148
149    let candidates = extension
150        .map(|ext| detectors::get_languages_from_extension(ext))
151        .unwrap_or_else(Vec::new);
152
153    if candidates.len() == 1 {
154        return Ok(Some(Detection::Extension(candidates[0])));
155    };
156
157    let file = File::open(path)?;
158    let mut reader = BufReader::new(file);
159
160    let candidates = filter_candidates(
161        candidates,
162        detectors::get_languages_from_shebang(&mut reader)?,
163    );
164    if candidates.len() == 1 {
165        return Ok(Some(Detection::Shebang(candidates[0])));
166    };
167    reader.seek(SeekFrom::Start(0))?;
168
169    let mut content = String::new();
170    reader.read_to_string(&mut content)?;
171    let content = truncate_to_char_boundary(&content, MAX_CONTENT_SIZE_BYTES);
172
173    // using heuristics is only going to be useful if we have more than one candidate
174    // if the extension didn't result in candidate languages then the heuristics won't either
175    let candidates = if candidates.len() > 1 {
176        if let Some(extension) = extension {
177            let languages =
178                detectors::get_languages_from_heuristics(&extension[..], &candidates, &content);
179            filter_candidates(candidates, languages)
180        } else {
181            candidates
182        }
183    } else {
184        candidates
185    };
186
187    match candidates.len() {
188        0 => Ok(None),
189        1 => Ok(Some(Detection::Heuristics(candidates[0]))),
190        _ => Ok(Some(Detection::Classifier(detectors::classify(
191            &content,
192            &candidates,
193        )))),
194    }
195}
196
// Adapted from https://doc.rust-lang.org/nightly/src/core/str/mod.rs.html
//
// Returns the longest prefix of `s` that is at most `max` bytes long and ends on a UTF-8
// character boundary. `s` is returned unchanged when it already fits.
fn truncate_to_char_boundary(s: &str, max: usize) -> &str {
    if max >= s.len() {
        return s;
    }

    // Walk backwards from `max` to the closest boundary; index 0 always is one.
    let end = (0..=max)
        .rev()
        .find(|&index| s.is_char_boundary(index))
        .unwrap_or(0);
    &s[..end]
}
208
209/// Walks the path provided and tallies the programming languages detected in the given path
210///
211/// Returns a map from the programming languages to a Vec of the files that were detected and the
212/// strategy used
213///
214/// # Examples
215/// ```
216/// use hyperpolyglot::get_language_breakdown;
217/// let breakdown = get_language_breakdown("src/");
218/// let total_detections = breakdown.iter().fold(0, |sum, (language, detections)| sum + detections.len());
219/// println!("Total files detected: {}", total_detections);
220/// ```
221pub fn get_language_breakdown<P: AsRef<Path>>(
222    path: P,
223) -> HashMap<&'static str, Vec<(Detection, PathBuf)>> {
224    let override_builder = OverrideBuilder::new(&path);
225    let override_builder = filters::add_documentation_override(override_builder);
226    let override_builder = filters::add_vendor_override(override_builder);
227
228    let num_threads = env::var_os("HYPLY_THREADS")
229        .and_then(|threads| threads.into_string().ok())
230        .and_then(|threads| threads.parse().ok())
231        .unwrap_or_else(num_cpus::get);
232
233    let (tx, rx) = mpsc::channel::<(Detection, PathBuf)>();
234    let walker = WalkBuilder::new(path)
235        .threads(num_threads)
236        .overrides(override_builder.build().unwrap())
237        .build_parallel();
238    walker.run(|| {
239        let tx = tx.clone();
240        Box::new(move |result| {
241            use ignore::WalkState::*;
242
243            if let Ok(path) = result {
244                let path = path.into_path();
245                if !path.is_dir() {
246                    if let Ok(Some(detection)) = detect(&path) {
247                        tx.send((detection, path)).unwrap();
248                    }
249                }
250            }
251            Continue
252        })
253    });
254    drop(tx);
255
256    let mut language_breakdown = HashMap::new();
257    for (detection, file) in rx {
258        let files = language_breakdown
259            .entry(detection.language())
260            .or_insert_with(Vec::new);
261        files.push((detection, file));
262    }
263
264    language_breakdown
265}
266
/// Narrows `previous_candidates` down to the languages that also appear in `new_candidates`.
///
/// When either list is empty the other one is returned as is. When both are non-empty but share
/// no language, `previous_candidates` is returned unchanged. The order of `previous_candidates`
/// is preserved in the result.
fn filter_candidates(
    previous_candidates: Vec<&'static str>,
    new_candidates: Vec<&'static str>,
) -> Vec<&'static str> {
    match (previous_candidates.is_empty(), new_candidates.is_empty()) {
        (true, _) => new_candidates,
        (false, true) => previous_candidates,
        (false, false) => {
            let shared: Vec<&'static str> = previous_candidates
                .iter()
                .copied()
                .filter(|candidate| new_candidates.contains(candidate))
                .collect();

            if shared.is_empty() {
                previous_candidates
            } else {
                shared
            }
        }
    }
}
290
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::prelude::*;
    use std::iter;

    /// Creates (or truncates) the file at `path` and writes all of `contents` to it.
    ///
    /// `write_all` is used rather than `write` because `write` may stop after a partial write,
    /// which would leave the fixture silently truncated.
    fn write_fixture(path: &Path, contents: &[u8]) {
        let mut file = File::create(path).unwrap();
        file.write_all(contents).unwrap();
        file.flush().unwrap();
    }

    #[test]
    fn test_detect_filename() {
        let path = Path::new("APKBUILD");
        let detected_language = detect(path).unwrap().unwrap();

        assert_eq!(detected_language, Detection::Filename("Alpine Abuild"));
    }

    #[test]
    fn test_detect_extension() {
        let path = Path::new("pizza.purs");
        let detected_language = detect(path).unwrap().unwrap();

        assert_eq!(detected_language, Detection::Extension("PureScript"));
    }

    #[test]
    fn test_detect_shebang() {
        let path = Path::new("a");
        write_fixture(path, b"#!/usr/bin/python");

        let detected_language = detect(path).unwrap().unwrap();

        fs::remove_file(path).unwrap();

        assert_eq!(detected_language, Detection::Shebang("Python"));
    }

    #[test]
    fn test_detect_heuristics() {
        let path = Path::new("a.es");
        write_fixture(path, b"'use strict'");

        let detected_language = detect(path).unwrap().unwrap();

        fs::remove_file(path).unwrap();

        assert_eq!(detected_language, Detection::Heuristics("JavaScript"));
    }

    #[test]
    fn test_detect_classify() {
        let path = Path::new("peep.rs");
        write_fixture(
            path,
            b"
            match optional {
                Some(pattern) => println!(\"Hello World\"),
                None => println!(\"u missed\")
            }
            ",
        );

        let detected_language = detect(path).unwrap().unwrap();

        fs::remove_file(path).unwrap();
        assert_eq!(detected_language, Detection::Classifier("Rust"));
    }

    #[test]
    fn test_detect_none() {
        let path = Path::new("y");
        write_fixture(
            path,
            b"
            use std::io;
            fn main() {
                println!(\"{}\", \"Hello World\");
            }",
        );

        let detected_language = detect(path).unwrap();

        fs::remove_file(path).unwrap();

        assert_eq!(detected_language, None);
    }

    #[test]
    fn test_detect_accuracy() {
        let mut total = 0;
        let mut correct = 0;
        // Every directory under `samples` is named after the language of the files it contains.
        fs::read_dir("samples")
            .unwrap()
            .map(|entry| entry.unwrap())
            .filter(|entry| entry.path().is_dir())
            .flat_map(|language_dir| {
                let path = language_dir.path();
                let language = path.file_name().unwrap();
                let language = language.to_string_lossy().into_owned();

                let file_paths = fs::read_dir(language_dir.path())
                    .unwrap()
                    .map(|entry| entry.unwrap().path())
                    .filter(|path| path.is_file());

                file_paths.zip(iter::repeat(language))
            })
            .for_each(|(file, language)| {
                // Skip the files we can't detect. The reason the detect function fails on these is
                // because of a heuristic added to .h files that defaults to C if none of the
                // Objective-C or C++ rules match. This makes us fail on two of the sample files
                // but tends to perform better on non training data
                if file.file_name().unwrap() == "rpc.h" || file.file_name().unwrap() == "Field.h" {
                    return;
                }
                // F* uses the name Fstar in the file system
                let language = match &language[..] {
                    "Fstar" => "F*",
                    l => l,
                };
                if let Ok(Some(detection)) = detect(&file) {
                    total += 1;
                    if detection.language() == language {
                        correct += 1;
                    } else {
                        println!("Incorrect detection: {:?} {:?}", file, detection)
                    }
                }
            });

        let accuracy = (correct as f64) / (total as f64);
        assert_eq!(accuracy, 1.0);
    }

    #[test]
    fn test_filter_candidates() {
        let previous_candidates = vec!["JavaScript", "Python"];
        let new_candidates = vec!["Python", "Bibbity"];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["Python"]
        );
    }

    #[test]
    fn test_filter_candidates_no_new() {
        let previous_candidates = vec!["JavaScript", "Python"];
        let new_candidates = vec![];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["JavaScript", "Python"]
        );
    }

    #[test]
    fn test_filter_candidates_no_prev() {
        let previous_candidates = vec![];
        let new_candidates = vec!["JavaScript", "Erlang"];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["JavaScript", "Erlang"]
        );
    }

    #[test]
    fn test_filter_candidates_no_matches() {
        let previous_candidates = vec!["Python"];
        let new_candidates = vec!["JavaScript", "Erlang"];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["Python"]
        );
    }

    #[test]
    fn test_get_language_breakdown_ignores_overrides_documentation() {
        fs::create_dir_all("temp-testing-dir").unwrap();
        fs::File::create("temp-testing-dir/README.md").unwrap();
        assert!(get_language_breakdown("temp-testing-dir").is_empty());

        fs::remove_dir_all("temp-testing-dir").unwrap();
    }

    #[test]
    fn test_get_language_breakdown_ignores_overrides_vendor() {
        fs::create_dir_all("temp-testing-dir2/node_modules").unwrap();
        fs::File::create("temp-testing-dir2/node_modules/hello.go").unwrap();
        assert!(get_language_breakdown("temp-testing-dir2").is_empty());

        fs::remove_dir_all("temp-testing-dir2").unwrap();
    }
}
490}