Skip to main content

gengo/
lib.rs

1//! Gengo is a language detection library for collections of files.
//! It can read files from a local directory or from a git repository, depending on
//! which of the features below are enabled.
3//!
4//! # Features
5//!
6//! ## `directory`
7//!
8//! Provides the `Directory` file source, which reads files from a local directory.
9//!
10//! ## `git`
11//!
12//! Provides the `Git` file source, which reads files from a git repository. It reads
13//! from a specified revision and supports git attributes for overrides, making it the
14//! most similar to [GitHub Linguist][github-linguist]. Because of this, it also works
15//! on bare repositories.
16//!
17//! # Example
18//!
19//! ```no_run
20//! # #[cfg(feature = "git")]
21//! # {
22//! use gengo::{Builder, Git};
23//! let git = Git::new("path/to/repo", "HEAD").unwrap();
24//! let gengo = Builder::new(git).build().unwrap();
25//! let results = gengo.analyze().unwrap();
26//! # }
27//! ```
28//!
29//! [github-linguist]: https://github.com/github-linguist/linguist
30
31pub use analysis::Analysis;
32use binary::Binary;
33pub use builder::Builder;
34use documentation::Documentation;
35
36pub use error::{Error, ErrorKind};
37use generated::Generated;
38
39#[cfg(feature = "directory")]
40pub use file_source::Directory;
41
42#[cfg(feature = "git")]
43pub use file_source::Git;
44
45pub use file_source::FileSource;
46use glob::MatchOptions;
47use indexmap::IndexMap;
48use language::Category;
49pub use language::Language;
50
51use std::error::Error as ErrorTrait;
52use std::path::Path;
53
54use vendored::Vendored;
55
56use rayon::prelude::{FromParallelIterator, ParallelBridge, ParallelIterator};
57use serde::Serialize;
58
59pub mod analysis;
60mod binary;
61mod builder;
62mod documentation;
63mod error;
64mod file_source;
65mod generated;
66pub mod language;
67mod vendored;
68
/// Boxed, type-erased error used as the default error type in this crate.
type GenericError = Box<dyn ErrorTrait>;

/// `Result` alias whose error type defaults to [`GenericError`] when omitted.
type Result<T, E = GenericError> = std::result::Result<T, E>;
71
72/// Shared match options for consistent behavior.
73const GLOB_MATCH_OPTIONS: MatchOptions = MatchOptions {
74    case_sensitive: true,
75    require_literal_separator: true,
76    require_literal_leading_dot: false,
77};
78
79/// The main entry point for Gengo.
80pub struct Gengo<FS: for<'fs> FileSource<'fs>> {
81    file_source: FS,
82    read_limit: usize,
83    binary: Binary,
84    documentation: Documentation,
85    generated: Generated,
86    vendored: Vendored,
87}
88
89impl<FS: for<'fs> FileSource<'fs>> Gengo<FS> {
90    /// Analyzes each file in the repository at the given revision.
91    pub fn analyze(&self) -> Result<Analysis> {
92        let state = self.file_source.state()?;
93        let entries = self
94            .file_source
95            .entries()?
96            .par_bridge()
97            .map_with(state, |state, entry| {
98                let filepath = self.file_source.filepath(&entry, state).ok()?;
99                let contents = self.file_source.contents(&entry, state).ok()?;
100
101                let entry = self.analyze_blob(&filepath, contents, state)?;
102                Some((filepath.as_ref().to_owned(), entry))
103            })
104            .filter_map(|entry| entry);
105        let entries = IndexMap::from_par_iter(entries);
106
107        Ok(Analysis(entries))
108    }
109
110    fn analyze_blob(
111        &self,
112        filepath: impl AsRef<Path>,
113        contents: impl AsRef<[u8]>,
114        state: &mut <FS as FileSource>::State,
115    ) -> Option<Entry> {
116        let overrides = self.file_source.overrides(&filepath, state);
117        let filepath = filepath.as_ref();
118        let contents = contents.as_ref();
119
120        // NOTE Users might be surprised if there is an override for a binary file but it
121        //      is still skipped, but this should be a rare case.
122        if self.is_binary(filepath, contents) {
123            return None;
124        }
125
126        let language = overrides
127            .language
128            .or_else(|| Language::pick(filepath, contents, self.read_limit))?;
129        let generated = overrides
130            .is_generated
131            .unwrap_or_else(|| self.is_generated(filepath, contents));
132        let documentation = overrides
133            .is_documentation
134            .unwrap_or_else(|| self.is_documentation(filepath, contents));
135        let vendored = overrides
136            .is_vendored
137            .unwrap_or_else(|| self.is_vendored(filepath, contents));
138
139        let detectable = match language.category() {
140            Category::Data | Category::Prose => false,
141            Category::Pattern | Category::Programming | Category::Markup | Category::Query => {
142                !(generated || documentation || vendored)
143            }
144            category => unimplemented!("Failed to check if category {category:?} is detectable"),
145        };
146        let detectable = overrides.is_detectable.unwrap_or(detectable);
147
148        let size = contents.len();
149        let entry = Entry {
150            language,
151            size,
152            detectable,
153            generated,
154            documentation,
155            vendored,
156        };
157        Some(entry)
158    }
159
160    /// Guesses if a file is generated.
161    pub fn is_generated(&self, filepath: impl AsRef<Path>, contents: &[u8]) -> bool {
162        self.generated.is_generated(filepath, contents)
163    }
164
165    /// Guesses if a file is documentation.
166    pub fn is_documentation(&self, filepath: impl AsRef<Path>, contents: &[u8]) -> bool {
167        self.documentation.is_documentation(filepath, contents)
168    }
169
170    /// Guesses if a file is vendored.
171    pub fn is_vendored(&self, filepath: impl AsRef<Path>, contents: &[u8]) -> bool {
172        self.vendored.is_vendored(filepath, contents)
173    }
174
175    /// Guesses if a file is binary.
176    pub fn is_binary(&self, filepath: impl AsRef<Path>, contents: &[u8]) -> bool {
177        self.binary.is_binary(filepath, contents)
178    }
179}
180
181/// A single entry in the language statistics.
182#[derive(Debug, Serialize)]
183pub struct Entry {
184    /// The detected language.
185    language: Language,
186    /// The size of the file.
187    size: usize,
188    /// If the file is detectable (should not be ignored).
189    detectable: bool,
190    /// If the file was generated.
191    generated: bool,
192    /// If the file is documentation.
193    documentation: bool,
194    /// If the file is vendored.
195    vendored: bool,
196}
197
198impl Entry {
199    /// The detected language.
200    pub fn language(&self) -> &Language {
201        &self.language
202    }
203
204    /// The size of the file.
205    pub fn size(&self) -> usize {
206        self.size
207    }
208
209    /// If the file is detectable (should not be ignored).
210    pub fn detectable(&self) -> bool {
211        self.detectable
212    }
213
214    /// If the file was generated.
215    pub fn generated(&self) -> bool {
216        self.generated
217    }
218
219    /// If the file is documentation.
220    pub fn documentation(&self) -> bool {
221        self.documentation
222    }
223
224    /// If the file is vendored.
225    pub fn vendored(&self) -> bool {
226        self.vendored
227    }
228}