Skip to main content

rust_code_analysis_code_split/
tools.rs

1use std::cmp::Ordering;
2use std::collections::HashMap;
3use std::fs::{self, File};
4use std::io::{Read, Write};
5use std::path::{Component, Path, PathBuf};
6use std::sync::OnceLock;
7
8use regex::bytes::Regex;
9use termcolor::{Color, ColorSpec, StandardStreamLock, WriteColor};
10
11use crate::langs::fake;
12use crate::langs::*;
13
14/// Reads a file.
15///
16/// # Examples
17///
18/// ```
19/// use std::path::Path;
20///
21/// use rust_code_analysis::read_file;
22///
23/// let path = Path::new("Cargo.toml");
24/// read_file(&path).unwrap();
25/// ```
26pub fn read_file(path: &Path) -> std::io::Result<Vec<u8>> {
27    let mut file = File::open(path)?;
28    let mut data = Vec::new();
29    file.read_to_end(&mut data)?;
30
31    remove_blank_lines(&mut data);
32
33    Ok(data)
34}
35
36/// Reads a file and adds an `EOL` at its end.
37///
38/// # Examples
39///
40/// ```
41/// use std::path::Path;
42///
43/// use rust_code_analysis::read_file_with_eol;
44///
45/// let path = Path::new("Cargo.toml");
46/// read_file_with_eol(&path).unwrap();
47/// ```
48pub fn read_file_with_eol(path: &Path) -> std::io::Result<Option<Vec<u8>>> {
49    let file_size = fs::metadata(path).map_or(1024 * 1024, |m| m.len() as usize);
50    if file_size <= 3 {
51        // this file is very likely almost empty... so nothing to do on it
52        return Ok(None);
53    }
54
55    let mut file = File::open(path)?;
56
57    let mut start = vec![0; 64.min(file_size)];
58    let start = if file.read_exact(&mut start).is_ok() {
59        // Skip the bom if one
60        if start[..2] == [b'\xFE', b'\xFF'] || start[..2] == [b'\xFF', b'\xFE'] {
61            &start[2..]
62        } else if start[..3] == [b'\xEF', b'\xBB', b'\xBF'] {
63            &start[3..]
64        } else {
65            &start
66        }
67    } else {
68        return Ok(None);
69    };
70
71    // so start contains more or less 64 chars
72    let mut head = String::from_utf8_lossy(start).into_owned();
73    // The last char could be wrong because we were in the middle of an utf-8 sequence
74    head.pop();
75    // now check if there is an invalid char
76    if head.contains('\u{FFFD}') {
77        return Ok(None);
78    }
79
80    let mut data = Vec::with_capacity(file_size + 2);
81    data.extend_from_slice(start);
82
83    file.read_to_end(&mut data)?;
84
85    remove_blank_lines(&mut data);
86
87    Ok(Some(data))
88}
89
/// Writes `data` to the file at `path`.
///
/// The file is created when it does not exist and its previous content is
/// replaced when it does.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be created or
/// written.
///
/// # Examples
///
/// ```no_run
/// use std::path::Path;
///
/// use rust_code_analysis::write_file;
///
/// let path = Path::new("foo.txt");
/// let data: [u8; 4] = [0; 4];
/// write_file(&path, &data).unwrap();
/// ```
pub fn write_file(path: &Path, data: &[u8]) -> std::io::Result<()> {
    fs::write(path, data)
}
109
110/// Detects the language of a code using
111/// the extension of a file.
112///
113/// # Examples
114///
115/// ```
116/// use std::path::Path;
117///
118/// use rust_code_analysis::get_language_for_file;
119///
120/// let path = Path::new("build.rs");
121/// get_language_for_file(&path).unwrap();
122/// ```
123pub fn get_language_for_file(path: &Path) -> Option<LANG> {
124    if let Some(ext) = path.extension() {
125        let ext = ext.to_str().unwrap().to_lowercase();
126        get_from_ext(&ext)
127    } else {
128        None
129    }
130}
131
/// Converts the raw bytes of a captured mode name into a lowercase string.
///
/// Returns `None` when the bytes are not valid UTF-8.
fn mode_to_str(mode: &[u8]) -> Option<String> {
    match std::str::from_utf8(mode) {
        Ok(text) => Some(text.to_lowercase()),
        Err(_) => None,
    }
}
135
136// comment containing coding info are useful
137static RE1_EMACS: OnceLock<Regex> = OnceLock::new();
138static RE2_EMACS: OnceLock<Regex> = OnceLock::new();
139static RE1_VIM: OnceLock<Regex> = OnceLock::new();
140
141// Regular expressions
142const FIRST_EMACS_EXPRESSION: &str = r"(?i)-\*-.*[^-\w]mode\s*:\s*([^:;\s]+)";
143const SECOND_EMACS_EXPRESSION: &str = r"-\*-\s*([^:;\s]+)\s*-\*-";
144const VIM_EXPRESSION: &str = r"(?i)vim\s*:.*[^\w]ft\s*=\s*([^:\s]+)";
145
146#[inline(always)]
147fn get_regex<'a>(
148    once_lock: &OnceLock<Regex>,
149    line: &'a [u8],
150    regex: &'a str,
151) -> Option<regex::bytes::Captures<'a>> {
152    once_lock
153        .get_or_init(|| Regex::new(regex).unwrap())
154        .captures_iter(line)
155        .next()
156}
157
158fn get_emacs_mode(buf: &[u8]) -> Option<String> {
159    // we just try to use the emacs info (if there)
160    for (i, line) in buf.splitn(5, |c| *c == b'\n').enumerate() {
161        if let Some(cap) = get_regex(&RE1_EMACS, line, FIRST_EMACS_EXPRESSION) {
162            return mode_to_str(&cap[1]);
163        } else if let Some(cap) = get_regex(&RE2_EMACS, line, SECOND_EMACS_EXPRESSION) {
164            return mode_to_str(&cap[1]);
165        } else if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
166            return mode_to_str(&cap[1]);
167        }
168        if i == 3 {
169            break;
170        }
171    }
172
173    for (i, line) in buf.rsplitn(5, |c| *c == b'\n').enumerate() {
174        if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
175            return mode_to_str(&cap[1]);
176        }
177        if i == 3 {
178            break;
179        }
180    }
181
182    None
183}
184
185/// Guesses the language of a code.
186///
187/// Returns a tuple containing a [`LANG`] as first argument
188/// and the language name as a second one.
189///
190/// # Examples
191///
192/// ```
193/// use std::path::PathBuf;
194///
195/// use rust_code_analysis::guess_language;
196///
197/// let source_code = "int a = 42;";
198///
199/// // The path to a dummy file used to contain the source code
200/// let path = PathBuf::from("foo.c");
201/// let source_slice = source_code.as_bytes();
202///
203/// // Guess the language of a code
204/// guess_language(&source_slice, &path);
205/// ```
206///
207/// [`LANG`]: enum.LANG.html
208pub fn guess_language<'a, P: AsRef<Path>>(buf: &[u8], path: P) -> (Option<LANG>, &'a str) {
209    let ext = path
210        .as_ref()
211        .extension()
212        .map(|e| e.to_str().unwrap())
213        .map(|e| e.to_lowercase())
214        .unwrap_or_else(|| "".to_string());
215    let from_ext = get_from_ext(&ext);
216
217    let mode = get_emacs_mode(buf).unwrap_or_default();
218
219    let from_mode = get_from_emacs_mode(&mode);
220
221    if let Some(lang_ext) = from_ext {
222        if let Some(lang_mode) = from_mode {
223            if lang_ext == lang_mode {
224                (
225                    Some(lang_mode),
226                    fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name()),
227                )
228            } else {
229                // we should probably rely on extension here
230                (Some(lang_ext), lang_ext.get_name())
231            }
232        } else {
233            (
234                Some(lang_ext),
235                fake::get_true(&ext, &mode).unwrap_or_else(|| lang_ext.get_name()),
236            )
237        }
238    } else if let Some(lang_mode) = from_mode {
239        (
240            Some(lang_mode),
241            fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name()),
242        )
243    } else {
244        (None, fake::get_true(&ext, &mode).unwrap_or_default())
245    }
246}
247
/// Makes `data` end with exactly one `\n`.
///
/// All the trailing `\n` and `\r` bytes are removed and a single `\n` is
/// appended. Newlines elsewhere in the buffer are left untouched, and an
/// empty buffer becomes a lone `\n`.
pub(crate) fn remove_blank_lines(data: &mut Vec<u8>) {
    // Length of the buffer once its trailing line terminators are dropped.
    let kept = data
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |last| last + 1);
    data.truncate(kept);
    data.push(b'\n');
}
260
/// Normalizes a path lexically, without accessing the file system.
///
/// `.` components are dropped and each `..` component removes the
/// component before it. A `..` with nothing left to remove is dropped.
pub(crate) fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
    // Copied from Cargo sources: https://github.com/rust-lang/cargo/blob/master/src/cargo/util/paths.rs#L65
    let mut parts = path.as_ref().components().peekable();

    // A prefix can only be the first component: start from it when present.
    let mut normalized = match parts.peek().copied() {
        Some(prefix @ Component::Prefix(..)) => {
            parts.next();
            PathBuf::from(prefix.as_os_str())
        }
        _ => PathBuf::new(),
    };

    for part in parts {
        match part {
            Component::Prefix(..) => unreachable!(),
            Component::RootDir => normalized.push(part.as_os_str()),
            Component::CurDir => {}
            Component::ParentDir => {
                normalized.pop();
            }
            Component::Normal(name) => normalized.push(name),
        }
    }

    normalized
}
288
/// Computes a distance between two paths.
///
/// The distance is the number of components of `path1` plus the number of
/// components of `path2` that remain once their deepest common, non-empty
/// ancestor is removed. `None` is returned when the paths share no such
/// ancestor.
pub(crate) fn get_paths_dist(path1: &Path, path2: &Path) -> Option<usize> {
    // `ancestors` goes from `path1` itself up to its root, so the first hit
    // is the deepest common ancestor.
    let common = path1
        .ancestors()
        .find(|ancestor| !ancestor.as_os_str().is_empty() && path2.starts_with(ancestor))?;

    let rest1 = path1.strip_prefix(common).unwrap();
    let rest2 = path2.strip_prefix(common).unwrap();

    Some(rest1.components().count() + rest2.components().count())
}
299
300pub(crate) fn guess_file<S: ::std::hash::BuildHasher>(
301    current_path: &Path,
302    include_path: &str,
303    all_files: &HashMap<String, Vec<PathBuf>, S>,
304) -> Vec<PathBuf> {
305    let include_path = if let Some(end) = include_path.strip_prefix("mozilla/") {
306        end
307    } else {
308        include_path
309    };
310    let include_path = normalize_path(include_path);
311    if let Some(possibilities) = all_files.get(include_path.file_name().unwrap().to_str().unwrap())
312    {
313        if possibilities.len() == 1 {
314            // Only one file with this name
315            return possibilities.clone();
316        }
317
318        let mut new_possibilities = Vec::new();
319        for p in possibilities.iter() {
320            if p.ends_with(&include_path) && current_path != p {
321                new_possibilities.push(p.clone());
322            }
323        }
324        if new_possibilities.len() == 1 {
325            // Only one path is finishing with "foo/Bar.h"
326            return new_possibilities;
327        }
328        new_possibilities.clear();
329
330        if let Some(parent) = current_path.parent() {
331            for p in possibilities.iter() {
332                if p.starts_with(parent) && current_path != p {
333                    new_possibilities.push(p.clone());
334                }
335            }
336            if new_possibilities.len() == 1 {
337                // Only one path in the current working directory (current_path)
338                return new_possibilities;
339            }
340            new_possibilities.clear();
341        }
342
343        let mut dist_min = usize::MAX;
344        let mut path_min = Vec::new();
345        for p in possibilities.iter() {
346            if current_path == p {
347                continue;
348            }
349            if let Some(dist) = get_paths_dist(current_path, p) {
350                match dist.cmp(&dist_min) {
351                    Ordering::Less => {
352                        dist_min = dist;
353                        path_min.clear();
354                        path_min.push(p);
355                    }
356                    Ordering::Equal => {
357                        path_min.push(p);
358                    }
359                    Ordering::Greater => {}
360                }
361            }
362        }
363
364        let path_min: Vec<_> = path_min.drain(..).map(|p| p.to_path_buf()).collect();
365        return path_min;
366    }
367
368    vec![]
369}
370
371#[inline(always)]
372pub(crate) fn color(stdout: &mut StandardStreamLock, color: Color) -> std::io::Result<()> {
373    stdout.set_color(ColorSpec::new().set_fg(Some(color)))
374}
375
376#[inline(always)]
377pub(crate) fn intense_color(stdout: &mut StandardStreamLock, color: Color) -> std::io::Result<()> {
378    stdout.set_color(ColorSpec::new().set_fg(Some(color)).set_intense(true))
379}
380
/// Test helper: computes the metrics of `source` and hands the resulting
/// function space to `check`.
///
/// The trailing whitespace and the leading newlines of `source` are
/// removed and a single `\n` is appended before the code is given to the
/// parser `T`, with `filename` as its path.
///
/// # Panics
///
/// Panics when `crate::metrics` does not produce a function space.
#[cfg(test)]
pub(crate) fn check_func_space<T: crate::ParserTrait, F: Fn(crate::FuncSpace)>(
    source: &str,
    filename: &str,
    check: F,
) {
    let path = std::path::PathBuf::from(filename);

    let code = source.trim_end().trim_matches('\n');
    let mut bytes = Vec::with_capacity(code.len() + 1);
    bytes.extend_from_slice(code.as_bytes());
    bytes.push(b'\n');

    let parser = T::new(bytes, &path, None);
    let func_space = crate::metrics(&parser, &path).unwrap();

    check(func_space)
}
395
/// Test helper: computes the metrics of `source` and hands the metrics of
/// the resulting function space to `check`.
///
/// This is `check_func_space` narrowed down to the `metrics` field.
#[cfg(test)]
pub(crate) fn check_metrics<T: crate::ParserTrait>(
    source: &str,
    filename: &str,
    check: fn(crate::CodeMetrics) -> (),
) {
    let on_func_space = |space: crate::FuncSpace| check(space.metrics);
    check_func_space::<T, _>(source, filename, on_func_space)
}
404
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Checks BOM skipping, rejection of invalid UTF-8 and the final `EOL`
    /// of `read_file_with_eol`.
    #[test]
    fn test_read() {
        // A per-process file name keeps concurrent runs of the test suite
        // from overwriting each other's fixture in the shared temp directory.
        let tmp_path = std::env::temp_dir().join(format!("test_read_{}", std::process::id()));
        let data = vec![
            (b"\xFF\xFEabc".to_vec(), Some(b"abc\n".to_vec())),
            (b"\xFE\xFFabc".to_vec(), Some(b"abc\n".to_vec())),
            (b"\xEF\xBB\xBFabc".to_vec(), Some(b"abc\n".to_vec())),
            (b"\xEF\xBB\xBFabc\n".to_vec(), Some(b"abc\n".to_vec())),
            (b"\xEF\xBBabc\n".to_vec(), None),
            (b"abcdef\n".to_vec(), Some(b"abcdef\n".to_vec())),
            (b"abcdef".to_vec(), Some(b"abcdef\n".to_vec())),
        ];

        // Read everything back first so that the fixture is removed even
        // when one of the comparisons below fails.
        let mut results = Vec::with_capacity(data.len());
        for (d, expected) in data {
            write_file(&tmp_path, &d).unwrap();
            let res = read_file_with_eol(&tmp_path).unwrap();
            results.push((res, expected));
        }
        std::fs::remove_file(&tmp_path).unwrap();

        for (res, expected) in results {
            assert_eq!(res, expected);
        }
    }

    /// Checks how the extension and the editor mode are combined.
    #[test]
    fn test_guess_language() {
        let buf = b"// -*- foo: bar; mode: c++; hello: world\n";
        assert_eq!(guess_language(buf, "foo.cpp"), (Some(LANG::Cpp), "c/c++"));

        let buf = b"// -*- c++ -*-\n";
        assert_eq!(guess_language(buf, "foo.cpp"), (Some(LANG::Cpp), "c/c++"));

        let buf = b"// -*- foo: bar; bar-mode: c++; hello: world\n";
        assert_eq!(
            guess_language(buf, "foo.py"),
            (Some(LANG::Python), "python")
        );

        let buf = b"/* hello world */\n";
        assert_eq!(guess_language(buf, "foo.cpp"), (Some(LANG::Cpp), "c/c++"));

        let buf = b"\n\n\n\n\n\n\n\n\n// vim: set ts=4 ft=c++\n\n\n";
        assert_eq!(guess_language(buf, "foo.c"), (Some(LANG::Cpp), "c/c++"));

        let buf = b"\n\n\n\n\n\n\n\n\n\n\n\n";
        assert_eq!(guess_language(buf, "foo.txt"), (None, ""));

        let buf = b"// -*- foo: bar; mode: Objective-C++; hello: world\n";
        assert_eq!(
            guess_language(buf, "foo.mm"),
            (Some(LANG::Cpp), "obj-c/c++")
        );
    }
}
460}