1use ignore::{overrides::OverrideBuilder, WalkBuilder};
5use std::{
6 collections::HashMap,
7 convert::TryFrom,
8 env, fmt,
9 fs::File,
10 io::{BufReader, Read, Seek, SeekFrom},
11 path::{Path, PathBuf},
12 sync::mpsc,
13};
14
15mod detectors;
16mod filters;
17
18include!("codegen/language-info-map.rs");
21
/// Upper bound, in bytes, on how much of a file's content is handed to the content-based
/// detection steps (50 KiB).
const MAX_CONTENT_SIZE_BYTES: usize = 50 * 1024;
23
24#[derive(Debug, Copy, Clone, Eq, PartialEq)]
48pub struct Language {
49 pub name: &'static str,
51 pub language_type: LanguageType,
53 pub color: Option<&'static str>,
55 pub group: Option<&'static str>,
57}
58
59impl TryFrom<&str> for Language {
60 type Error = &'static str;
61 fn try_from(name: &str) -> Result<Self, Self::Error> {
62 LANGUAGE_INFO.get(name).copied().ok_or("Language not found")
63 }
64}
65
/// Broad category a language falls into.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LanguageType {
    Data,
    Markup,
    Programming,
    Prose,
}

impl fmt::Display for LanguageType {
    /// Writes the variant name exactly as it is spelled in the enum.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            LanguageType::Data => "Data",
            LanguageType::Markup => "Markup",
            LanguageType::Programming => "Programming",
            LanguageType::Prose => "Prose",
        };
        // `write_str` emits the label verbatim; width/padding flags are not applied.
        f.write_str(label)
    }
}
85
/// Result of a successful detection: the detected language name, wrapped in the variant that
/// names the strategy which produced it.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Detection {
    Filename(&'static str),
    Extension(&'static str),
    Shebang(&'static str),
    Heuristics(&'static str),
    Classifier(&'static str),
}

impl Detection {
    /// Returns the name of the detected language, regardless of the strategy that found it.
    pub fn language(&self) -> &'static str {
        match *self {
            Detection::Filename(language)
            | Detection::Extension(language)
            | Detection::Shebang(language)
            | Detection::Heuristics(language)
            | Detection::Classifier(language) => language,
        }
    }

    /// Returns the name of the strategy that produced this detection, i.e. the variant name.
    ///
    /// The names are string literals, so the result is `'static` and may be kept after the
    /// `Detection` itself has been dropped.
    pub fn variant(&self) -> &'static str {
        match self {
            Detection::Filename(_) => "Filename",
            Detection::Extension(_) => "Extension",
            Detection::Shebang(_) => "Shebang",
            Detection::Heuristics(_) => "Heuristics",
            Detection::Classifier(_) => "Classifier",
        }
    }
}
120
121pub fn detect(path: &Path) -> Result<Option<Detection>, std::io::Error> {
137 let filename = match path.file_name() {
138 Some(filename) => filename.to_str(),
139 None => return Ok(None),
140 };
141
142 let candidate = filename.and_then(|filename| detectors::get_language_from_filename(filename));
143 if let Some(candidate) = candidate {
144 return Ok(Some(Detection::Filename(candidate)));
145 };
146
147 let extension = filename.and_then(|filename| detectors::get_extension(filename));
148
149 let candidates = extension
150 .map(|ext| detectors::get_languages_from_extension(ext))
151 .unwrap_or_else(Vec::new);
152
153 if candidates.len() == 1 {
154 return Ok(Some(Detection::Extension(candidates[0])));
155 };
156
157 let file = File::open(path)?;
158 let mut reader = BufReader::new(file);
159
160 let candidates = filter_candidates(
161 candidates,
162 detectors::get_languages_from_shebang(&mut reader)?,
163 );
164 if candidates.len() == 1 {
165 return Ok(Some(Detection::Shebang(candidates[0])));
166 };
167 reader.seek(SeekFrom::Start(0))?;
168
169 let mut content = String::new();
170 reader.read_to_string(&mut content)?;
171 let content = truncate_to_char_boundary(&content, MAX_CONTENT_SIZE_BYTES);
172
173 let candidates = if candidates.len() > 1 {
176 if let Some(extension) = extension {
177 let languages =
178 detectors::get_languages_from_heuristics(&extension[..], &candidates, &content);
179 filter_candidates(candidates, languages)
180 } else {
181 candidates
182 }
183 } else {
184 candidates
185 };
186
187 match candidates.len() {
188 0 => Ok(None),
189 1 => Ok(Some(Detection::Heuristics(candidates[0]))),
190 _ => Ok(Some(Detection::Classifier(detectors::classify(
191 &content,
192 &candidates,
193 )))),
194 }
195}
196
/// Returns the longest prefix of `s` that is at most `max` bytes long and ends on a UTF-8
/// character boundary.
///
/// `s` is returned unchanged when it already fits within `max` bytes.
fn truncate_to_char_boundary(s: &str, max: usize) -> &str {
    if s.len() <= max {
        return s;
    }

    // Walk back from `max` to the nearest boundary. Index 0 is always a boundary, so the search
    // cannot come up empty; the fallback only satisfies the type.
    let end = (0..=max)
        .rev()
        .find(|&index| s.is_char_boundary(index))
        .unwrap_or(0);
    &s[..end]
}
208
209pub fn get_language_breakdown<P: AsRef<Path>>(
222 path: P,
223) -> HashMap<&'static str, Vec<(Detection, PathBuf)>> {
224 let override_builder = OverrideBuilder::new(&path);
225 let override_builder = filters::add_documentation_override(override_builder);
226 let override_builder = filters::add_vendor_override(override_builder);
227
228 let num_threads = env::var_os("HYPLY_THREADS")
229 .and_then(|threads| threads.into_string().ok())
230 .and_then(|threads| threads.parse().ok())
231 .unwrap_or_else(num_cpus::get);
232
233 let (tx, rx) = mpsc::channel::<(Detection, PathBuf)>();
234 let walker = WalkBuilder::new(path)
235 .threads(num_threads)
236 .overrides(override_builder.build().unwrap())
237 .build_parallel();
238 walker.run(|| {
239 let tx = tx.clone();
240 Box::new(move |result| {
241 use ignore::WalkState::*;
242
243 if let Ok(path) = result {
244 let path = path.into_path();
245 if !path.is_dir() {
246 if let Ok(Some(detection)) = detect(&path) {
247 tx.send((detection, path)).unwrap();
248 }
249 }
250 }
251 Continue
252 })
253 });
254 drop(tx);
255
256 let mut language_breakdown = HashMap::new();
257 for (detection, file) in rx {
258 let files = language_breakdown
259 .entry(detection.language())
260 .or_insert_with(Vec::new);
261 files.push((detection, file));
262 }
263
264 language_breakdown
265}
266
/// Narrows `previous_candidates` down to the languages that also appear in `new_candidates`.
///
/// An empty list carries no information, so narrowing never loses every candidate:
///
/// * if `previous_candidates` is empty, `new_candidates` is returned;
/// * if `new_candidates` is empty, or the two lists have nothing in common,
///   `previous_candidates` is returned unchanged;
/// * otherwise the common languages are returned, in the order of `previous_candidates`.
fn filter_candidates(
    previous_candidates: Vec<&'static str>,
    new_candidates: Vec<&'static str>,
) -> Vec<&'static str> {
    if previous_candidates.is_empty() {
        return new_candidates;
    }
    if new_candidates.is_empty() {
        return previous_candidates;
    }

    let mut shared = Vec::new();
    for candidate in &previous_candidates {
        if new_candidates.contains(candidate) {
            shared.push(*candidate);
        }
    }

    if shared.is_empty() {
        previous_candidates
    } else {
        shared
    }
}
290
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Writes `content` to `file_name` in the current directory, runs `detect` on it and
    /// removes the file again.
    ///
    /// `fs::write` writes the whole buffer, and the file is removed before the detection result
    /// is unwrapped so that a failed detection does not leave the fixture behind.
    fn detect_with_content(file_name: &str, content: &str) -> Option<Detection> {
        let path = Path::new(file_name);
        fs::write(path, content).unwrap();

        let detection = detect(path);

        fs::remove_file(path).unwrap();
        detection.unwrap()
    }

    // The two path-only tests below use files that do not exist: detection by filename or by an
    // unambiguous extension returns before the file is opened.

    #[test]
    fn test_detect_filename() {
        let path = Path::new("APKBUILD");
        let detected_language = detect(path).unwrap().unwrap();

        assert_eq!(detected_language, Detection::Filename("Alpine Abuild"));
    }

    #[test]
    fn test_detect_extension() {
        let path = Path::new("pizza.purs");
        let detected_language = detect(path).unwrap().unwrap();

        assert_eq!(detected_language, Detection::Extension("PureScript"));
    }

    #[test]
    fn test_detect_shebang() {
        assert_eq!(
            detect_with_content("a", "#!/usr/bin/python"),
            Some(Detection::Shebang("Python"))
        );
    }

    #[test]
    fn test_detect_heuristics() {
        assert_eq!(
            detect_with_content("a.es", "'use strict'"),
            Some(Detection::Heuristics("JavaScript"))
        );
    }

    #[test]
    fn test_detect_classify() {
        let detected_language = detect_with_content(
            "peep.rs",
            "
            match optional {
                Some(pattern) => println!(\"Hello World\"),
                None => println!(\"u missed\")
            }
            ",
        );

        assert_eq!(detected_language, Some(Detection::Classifier("Rust")));
    }

    #[test]
    fn test_detect_none() {
        let detected_language = detect_with_content(
            "y",
            "
            use std::io;
            fn main() {
                println!(\"{}\", \"Hello World\");
            }",
        );

        assert_eq!(detected_language, None);
    }

    /// Runs `detect` on every file under `samples/<language>/` and expects each detection to
    /// match the name of the directory the file lives in.
    ///
    /// Files for which `detect` returns an error or `None` are not counted.
    #[test]
    fn test_detect_accuracy() {
        let mut total = 0;
        let mut correct = 0;

        for language_dir in fs::read_dir("samples").unwrap() {
            let language_dir = language_dir.unwrap().path();
            if !language_dir.is_dir() {
                continue;
            }

            let dir_name = language_dir
                .file_name()
                .unwrap()
                .to_string_lossy()
                .into_owned();
            // The sample directory for "F*" is named "Fstar".
            let language = match &dir_name[..] {
                "Fstar" => "F*",
                name => name,
            };

            for file in fs::read_dir(&language_dir).unwrap() {
                let file = file.unwrap().path();
                if !file.is_file() {
                    continue;
                }

                // These two samples are excluded from the accuracy check.
                let file_name = file.file_name().unwrap();
                if file_name == "rpc.h" || file_name == "Field.h" {
                    continue;
                }

                if let Ok(Some(detection)) = detect(&file) {
                    total += 1;
                    if detection.language() == language {
                        correct += 1;
                    } else {
                        println!("Incorrect detection: {:?} {:?}", file, detection)
                    }
                }
            }
        }

        // Compare counts rather than a float ratio, and fail with a clear message when nothing
        // was detected at all instead of dividing zero by zero.
        assert!(total > 0, "no sample file was detected");
        assert_eq!(correct, total);
    }

    #[test]
    fn test_filter_candidates() {
        let previous_candidates = vec!["JavaScript", "Python"];
        let new_candidates = vec!["Python", "Bibbity"];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["Python"]
        );
    }

    #[test]
    fn test_filter_candidates_no_new() {
        let previous_candidates = vec!["JavaScript", "Python"];
        let new_candidates = vec![];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["JavaScript", "Python"]
        );
    }

    #[test]
    fn test_filter_candidates_no_prev() {
        let previous_candidates = vec![];
        let new_candidates = vec!["JavaScript", "Erlang"];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["JavaScript", "Erlang"]
        );
    }

    #[test]
    fn test_filter_candidates_no_matches() {
        let previous_candidates = vec!["Python"];
        let new_candidates = vec!["JavaScript", "Erlang"];
        assert_eq!(
            filter_candidates(previous_candidates, new_candidates),
            vec!["Python"]
        );
    }

    #[test]
    fn test_get_language_breakdown_ignores_overrides_documentation() {
        fs::create_dir_all("temp-testing-dir").unwrap();
        fs::File::create("temp-testing-dir/README.md").unwrap();

        let breakdown = get_language_breakdown("temp-testing-dir");

        // Clean up before asserting so a failure does not leave the directory behind.
        fs::remove_dir_all("temp-testing-dir").unwrap();
        assert!(breakdown.is_empty());
    }

    #[test]
    fn test_get_language_breakdown_ignores_overrides_vendor() {
        fs::create_dir_all("temp-testing-dir2/node_modules").unwrap();
        fs::File::create("temp-testing-dir2/node_modules/hello.go").unwrap();

        let breakdown = get_language_breakdown("temp-testing-dir2");

        // Clean up before asserting so a failure does not leave the directory behind.
        fs::remove_dir_all("temp-testing-dir2").unwrap();
        assert!(breakdown.is_empty());
    }
}