1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
pub use builder::Builder;
use documentation::Documentation;
use generated::Generated;
use git2::{AttrCheckFlags, AttrValue, Blob, Commit, ObjectType, Repository, Tree};
use indexmap::IndexMap;
pub use languages::analyzer::Analyzers;
use languages::Category;
pub use languages::Language;
use std::error::Error;
use std::ffi::OsStr;
use std::path::Path;
use vendored::Vendored;

mod builder;
mod documentation;
mod generated;
pub mod languages;
mod vendored;

/// The main entry point for Gengo.
///
/// Analyzes the files of a git repository at a given revision and reports the
/// language and classification of each one (see [`Gengo::analyze`]).
pub struct Gengo {
    /// The git repository whose revisions, objects and attributes are read.
    repository: Repository,
    /// The analyzers used to look up a language by name and to pick a
    /// language for a file.
    analyzers: Analyzers,
    /// Passed to the analyzers when picking a language for a file.
    // NOTE(review): presumably the maximum number of bytes of a file's
    // contents that may be inspected — confirm against `Analyzers::pick`.
    read_limit: usize,
}

// NOTE Language overrides are read from the `gengo-language` git attribute (see `analyze_blob`).
impl Gengo {
    /// Flags that are OR-ed together and used for every git attribute lookup.
    const ATTR_CHECK_FLAGS: [AttrCheckFlags; 2] =
        [AttrCheckFlags::NO_SYSTEM, AttrCheckFlags::INDEX_THEN_FILE];

    /// Resolves a revision to a commit.
    ///
    /// # Errors
    ///
    /// Returns an error if `rev` cannot be resolved in the repository or if
    /// the resolved object cannot be peeled to a commit.
    fn rev(&self, rev: &str) -> Result<Commit, Box<dyn Error>> {
        let reference = self.repository.revparse_single(rev)?;
        let commit = reference.peel_to_commit()?;
        Ok(commit)
    }

    /// Analyzes each file in the repository at the given revision.
    ///
    /// The returned map is keyed by each file's path relative to the root of
    /// the tree. Files for which no language could be determined are omitted.
    ///
    /// # Errors
    ///
    /// Returns an error if the revision cannot be resolved, if a git object or
    /// attribute cannot be read, or if a path is not valid UTF-8.
    pub fn analyze(&self, rev: &str) -> Result<IndexMap<String, Entry>, Box<dyn Error>> {
        let mut results = IndexMap::new();
        let commit = self.rev(rev)?;
        let tree = commit.tree()?;
        self.analyze_tree("", &tree, &mut results)?;
        Ok(results)
    }

    /// Recursively walks `tree`, analyzing every blob found beneath it.
    ///
    /// `root` is the path of `tree` relative to the top-level tree (empty for
    /// the top-level tree itself). Entries that are neither trees nor blobs
    /// are skipped.
    ///
    /// # Errors
    ///
    /// Returns an error if an entry name or joined path is not valid UTF-8, if
    /// a tree or blob object cannot be loaded, or if analyzing a blob fails.
    fn analyze_tree(
        &self,
        root: &str,
        tree: &Tree,
        results: &mut IndexMap<String, Entry>,
    ) -> Result<(), Box<dyn Error>> {
        for entry in tree.iter() {
            // NOTE The object is only resolved inside the arms that need it.
            //      Gitlink (submodule) entries point at a commit that lives in
            //      another repository, so looking them up here would fail and
            //      abort the whole analysis instead of skipping the entry.
            match entry.kind() {
                Some(ObjectType::Tree) => {
                    let name = entry.name().ok_or("invalid path")?;
                    let path = Path::new(root).join(name);
                    let path = path.to_str().ok_or("invalid path")?;
                    let object = entry.to_object(&self.repository)?;
                    let subtree = object.as_tree().expect("object to be a tree");

                    self.analyze_tree(path, subtree, results)?;
                }
                Some(ObjectType::Blob) => {
                    // A non-UTF-8 name is reported as an error (as for trees)
                    // rather than causing a panic.
                    let name = entry.name().ok_or("invalid path")?;
                    let filepath = Path::new(root).join(name);
                    let object = entry.to_object(&self.repository)?;
                    let blob = object.as_blob().expect("object to be a blob");

                    self.analyze_blob(filepath.as_os_str(), blob, results)?;
                }
                // Anything else (e.g. submodule commits) is not analyzed.
                _ => {}
            }
        }
        Ok(())
    }

    /// Analyzes a single blob and, if a language is found, records an
    /// [`Entry`] for it in `results` under `filepath`.
    ///
    /// The language is taken from the `gengo-language` attribute when it names
    /// a known language (dashes in the value are replaced with spaces before
    /// the lookup); otherwise the analyzers pick one from the path and
    /// contents. The `gengo-generated`, `gengo-documentation`,
    /// `gengo-vendored` and `gengo-detectable` boolean attributes override the
    /// corresponding guessed values.
    ///
    /// # Errors
    ///
    /// Returns an error if an attribute lookup fails or if `filepath` is not
    /// valid UTF-8.
    fn analyze_blob(
        &self,
        filepath: &OsStr,
        blob: &Blob,
        results: &mut IndexMap<String, Entry>,
    ) -> Result<(), Box<dyn Error>> {
        let path = Path::new(filepath);
        let contents = blob.content();

        let lang_override = self
            .get_str_attr(path, "gengo-language")?
            .map(|s| s.replace('-', " "))
            .and_then(|s| self.analyzers.get(&s));

        let language =
            lang_override.or_else(|| self.analyzers.pick(filepath, contents, self.read_limit));

        // Files without a detected language are left out of the results.
        let language = match language {
            Some(language) => language.clone(),
            None => return Ok(()),
        };

        let size = contents.len();
        let generated = self
            .get_boolean_attr(path, "gengo-generated")?
            .unwrap_or_else(|| self.is_generated(filepath, contents));
        let documentation = self
            .get_boolean_attr(path, "gengo-documentation")?
            .unwrap_or_else(|| self.is_documentation(filepath, contents));
        let vendored = self
            .get_boolean_attr(path, "gengo-vendored")?
            .unwrap_or_else(|| self.is_vendored(filepath, contents));

        // Data and prose are never detectable by default; other categories are
        // detectable unless the file is generated, documentation or vendored.
        let detectable = match language.category() {
            Category::Data | Category::Prose => false,
            Category::Programming | Category::Markup | Category::Query => {
                !(generated || documentation || vendored)
            }
        };
        let detectable = self
            .get_boolean_attr(path, "gengo-detectable")?
            .unwrap_or(detectable);

        let key = String::from(filepath.to_str().ok_or("invalid path")?);
        let entry = Entry {
            language,
            size,
            detectable,
            generated,
            documentation,
            vendored,
        };

        results.insert(key, entry);

        Ok(())
    }

    /// Guesses if a file is generated.
    pub fn is_generated(&self, filepath: &OsStr, contents: &[u8]) -> bool {
        Generated::is_generated(filepath, contents)
    }

    /// Guesses if a file is documentation.
    pub fn is_documentation(&self, filepath: &OsStr, contents: &[u8]) -> bool {
        Documentation::is_documentation(filepath, contents)
    }

    /// Guesses if a file is vendored.
    pub fn is_vendored(&self, filepath: &OsStr, contents: &[u8]) -> bool {
        Vendored::is_vendored(filepath, contents)
    }

    /// Looks up the git attribute `attr` for `path`, using the combination of
    /// all [`Self::ATTR_CHECK_FLAGS`].
    ///
    /// # Errors
    ///
    /// Returns an error if the repository fails to look up the attribute.
    fn get_attr(&self, path: &Path, attr: &str) -> Result<AttrValue, Box<dyn Error>> {
        let flags = Self::ATTR_CHECK_FLAGS
            .into_iter()
            .reduce(|a, b| a | b)
            .expect("ATTR_CHECK_FLAGS to be non-empty");
        let value = self.repository.get_attr(path, attr, flags)?;
        Ok(AttrValue::from_string(value))
    }

    /// Reads `attr` for `path` as a boolean.
    ///
    /// Returns `Some(true)` / `Some(false)` when the attribute is set / unset,
    /// and `None` when it is unspecified or has a non-boolean value.
    ///
    /// # Errors
    ///
    /// Returns an error if the attribute lookup fails.
    fn get_boolean_attr(&self, path: &Path, attr: &str) -> Result<Option<bool>, Box<dyn Error>> {
        let value = match self.get_attr(path, attr)? {
            AttrValue::True => Some(true),
            AttrValue::False => Some(false),
            AttrValue::Unspecified => None,
            // NOTE To avoid being overly strict, we'll just ignore invalid attributes.
            _ => None,
        };
        Ok(value)
    }

    /// Reads `attr` for `path` as a string.
    ///
    /// Returns `Some` with the value when the attribute holds a string, and
    /// `None` when it is unspecified or has a non-string value.
    ///
    /// # Errors
    ///
    /// Returns an error if the attribute lookup fails.
    fn get_str_attr(&self, path: &Path, attr: &str) -> Result<Option<String>, Box<dyn Error>> {
        let value = match self.get_attr(path, attr)? {
            AttrValue::String(s) => Some(String::from(s)),
            AttrValue::Unspecified => None,
            // NOTE To avoid being overly strict, we'll just ignore invalid attributes.
            _ => None,
        };
        Ok(value)
    }
}

/// A single entry in the language statistics.
///
/// Holds the result of analyzing one file.
#[derive(Debug)]
pub struct Entry {
    /// The detected language, or the one selected by the `gengo-language`
    /// attribute when that names a known language.
    language: Language,
    /// The size of the file's contents, in bytes.
    size: usize,
    /// If the file is detectable (should not be ignored).
    ///
    /// Unless overridden by the `gengo-detectable` attribute, this is `false`
    /// for data and prose languages and for files that are generated,
    /// documentation or vendored.
    detectable: bool,
    /// If the file was generated (attribute `gengo-generated`, else guessed).
    generated: bool,
    /// If the file is documentation (attribute `gengo-documentation`, else guessed).
    documentation: bool,
    /// If the file is vendored (attribute `gengo-vendored`, else guessed).
    vendored: bool,
}

impl Entry {
    /// Returns a reference to the language detected for this file.
    pub fn language(&self) -> &Language {
        let Self { language, .. } = self;
        language
    }

    /// Returns the size recorded for this file.
    pub fn size(&self) -> usize {
        let Self { size, .. } = self;
        *size
    }

    /// Returns `true` if the file is detectable (should not be ignored).
    pub fn detectable(&self) -> bool {
        let Self { detectable, .. } = self;
        *detectable
    }

    /// Returns `true` if the file was generated.
    pub fn generated(&self) -> bool {
        let Self { generated, .. } = self;
        *generated
    }

    /// Returns `true` if the file is documentation.
    pub fn documentation(&self) -> bool {
        let Self { documentation, .. } = self;
        *documentation
    }

    /// Returns `true` if the file is vendored.
    pub fn vendored(&self) -> bool {
        let Self { vendored, .. } = self;
        *vendored
    }
}