rust_code_analysis/
tools.rs

1use std::cmp::Ordering;
2use std::collections::HashMap;
3use std::fs::{self, File};
4use std::io::{Read, Write};
5use std::path::{Component, Path, PathBuf};
6
7use lazy_static::lazy_static;
8use regex::bytes::Regex;
9
10use crate::langs::fake;
11use crate::langs::*;
12
13/// Reads a file.
14///
15/// # Examples
16///
17/// ```
18/// use std::path::Path;
19///
20/// use rust_code_analysis::read_file;
21///
22/// let path = Path::new("Cargo.toml");
23/// read_file(&path).unwrap();
24/// ```
25pub fn read_file(path: &Path) -> std::io::Result<Vec<u8>> {
26    let mut file = File::open(path)?;
27    let mut data = Vec::new();
28    file.read_to_end(&mut data)?;
29
30    remove_blank_lines(&mut data);
31
32    Ok(data)
33}
34
35/// Reads a file and adds an `EOL` at its end.
36///
37/// # Examples
38///
39/// ```
40/// use std::path::Path;
41///
42/// use rust_code_analysis::read_file_with_eol;
43///
44/// let path = Path::new("Cargo.toml");
45/// read_file_with_eol(&path).unwrap();
46/// ```
47pub fn read_file_with_eol(path: &Path) -> std::io::Result<Option<Vec<u8>>> {
48    let file_size = fs::metadata(path).map_or(1024 * 1024, |m| m.len() as usize);
49    if file_size <= 3 {
50        // this file is very likely almost empty... so nothing to do on it
51        return Ok(None);
52    }
53
54    let mut file = File::open(path)?;
55
56    let mut start = vec![0; 64.min(file_size)];
57    let start = if file.read_exact(&mut start).is_ok() {
58        // Skip the bom if one
59        if start[..2] == [b'\xFE', b'\xFF'] || start[..2] == [b'\xFF', b'\xFE'] {
60            &start[2..]
61        } else if start[..3] == [b'\xEF', b'\xBB', b'\xBF'] {
62            &start[3..]
63        } else {
64            &start
65        }
66    } else {
67        return Ok(None);
68    };
69
70    // so start contains more or less 64 chars
71    let mut head = String::from_utf8_lossy(start).into_owned();
72    // The last char could be wrong because we were in the middle of an utf-8 sequence
73    head.pop();
74    // now check if there is an invalid char
75    if head.contains('\u{FFFD}') {
76        return Ok(None);
77    }
78
79    let mut data = Vec::with_capacity(file_size + 2);
80    data.extend_from_slice(start);
81
82    file.read_to_end(&mut data)?;
83
84    remove_blank_lines(&mut data);
85
86    Ok(Some(data))
87}
88
/// Stores `data` in the file located at `path`.
///
/// The file is created when it does not exist and truncated otherwise.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be created or
/// when the data cannot be written entirely.
///
/// # Examples
///
/// ```no_run
/// use std::path::Path;
///
/// use rust_code_analysis::write_file;
///
/// let path = Path::new("foo.txt");
/// let data: [u8; 4] = [0; 4];
/// write_file(&path, &data).unwrap();
/// ```
pub fn write_file(path: &Path, data: &[u8]) -> std::io::Result<()> {
    File::create(path).and_then(|mut output| output.write_all(data))
}
108
109/// Detects the language of a code using
110/// the extension of a file.
111///
112/// # Examples
113///
114/// ```
115/// use std::path::Path;
116///
117/// use rust_code_analysis::get_language_for_file;
118///
119/// let path = Path::new("build.rs");
120/// get_language_for_file(&path).unwrap();
121/// ```
122pub fn get_language_for_file(path: &Path) -> Option<LANG> {
123    if let Some(ext) = path.extension() {
124        let ext = ext.to_str().unwrap().to_lowercase();
125        get_from_ext(&ext)
126    } else {
127        None
128    }
129}
130
/// Converts the raw bytes of a mode name into a lowercase string.
///
/// Returns `None` when the bytes are not valid UTF-8.
fn mode_to_str(mode: &[u8]) -> Option<String> {
    match std::str::from_utf8(mode) {
        Ok(name) => Some(name.to_lowercase()),
        Err(_) => None,
    }
}
134
135fn get_emacs_mode(buf: &[u8]) -> Option<String> {
136    // we just try to use the emacs info (if there)
137    lazy_static! {
138        // comment containing coding info are useful
139        static ref RE1_EMACS: Regex = Regex::new(r"(?i)-\*-.*[^-\w]mode\s*:\s*([^:;\s]+)").unwrap();
140        static ref RE2_EMACS: Regex = Regex::new(r"-\*-\s*([^:;\s]+)\s*-\*-").unwrap();
141        static ref RE1_VIM: Regex = Regex::new(r"(?i)vim\s*:.*[^\w]ft\s*=\s*([^:\s]+)").unwrap();
142    }
143
144    for (i, line) in buf.splitn(5, |c| *c == b'\n').enumerate() {
145        if let Some(cap) = RE1_EMACS.captures_iter(line).next() {
146            return mode_to_str(&cap[1]);
147        } else if let Some(cap) = RE2_EMACS.captures_iter(line).next() {
148            return mode_to_str(&cap[1]);
149        } else if let Some(cap) = RE1_VIM.captures_iter(line).next() {
150            return mode_to_str(&cap[1]);
151        }
152        if i == 3 {
153            break;
154        }
155    }
156
157    for (i, line) in buf.rsplitn(5, |c| *c == b'\n').enumerate() {
158        if let Some(cap) = RE1_VIM.captures_iter(line).next() {
159            return mode_to_str(&cap[1]);
160        }
161        if i == 3 {
162            break;
163        }
164    }
165
166    None
167}
168
169/// Guesses the language of a code.
170///
171/// Returns a tuple containing a [`LANG`] as first argument
172/// and the language name as a second one.
173///
174/// # Examples
175///
176/// ```
177/// use std::path::PathBuf;
178///
179/// use rust_code_analysis::guess_language;
180///
181/// let source_code = "int a = 42;";
182///
183/// // The path to a dummy file used to contain the source code
184/// let path = PathBuf::from("foo.c");
185/// let source_slice = source_code.as_bytes();
186///
187/// // Guess the language of a code
188/// guess_language(&source_slice, &path);
189/// ```
190///
191/// [`LANG`]: enum.LANG.html
192pub fn guess_language<P: AsRef<Path>>(buf: &[u8], path: P) -> (Option<LANG>, String) {
193    let ext = path
194        .as_ref()
195        .extension()
196        .map(|e| e.to_str().unwrap())
197        .map(|e| e.to_lowercase())
198        .unwrap_or_else(|| "".to_string());
199    let from_ext = get_from_ext(&ext);
200
201    let mode = get_emacs_mode(buf).unwrap_or_default();
202
203    let from_mode = get_from_emacs_mode(&mode);
204
205    if let Some(lang_ext) = from_ext {
206        if let Some(lang_mode) = from_mode {
207            if lang_ext == lang_mode {
208                (
209                    Some(lang_mode),
210                    fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name().to_string()),
211                )
212            } else {
213                // we should probably rely on extension here
214                (Some(lang_ext), lang_ext.get_name().to_string())
215            }
216        } else {
217            (
218                Some(lang_ext),
219                fake::get_true(&ext, &mode).unwrap_or_else(|| lang_ext.get_name().to_string()),
220            )
221        }
222    } else if let Some(lang_mode) = from_mode {
223        (
224            Some(lang_mode),
225            fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name().to_string()),
226        )
227    } else {
228        (None, fake::get_true(&ext, &mode).unwrap_or_default())
229    }
230}
231
/// Makes `data` end with exactly one `\n`.
///
/// Every trailing `\n` or `\r` byte is dropped, then a single `\n` is
/// appended. Line endings located before the last other byte are left
/// untouched; an empty buffer becomes `"\n"`.
pub(crate) fn remove_blank_lines(data: &mut Vec<u8>) {
    // Length of the buffer once its trailing line endings are removed.
    let kept = data
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |last| last + 1);
    data.truncate(kept);
    data.push(b'\n');
}
244
/// Simplifies a path lexically, without touching the filesystem.
///
/// `.` components are dropped and each `..` component removes the
/// component preceding it, when there is one to remove.
///
/// Copied from Cargo sources: https://github.com/rust-lang/cargo/blob/master/src/cargo/util/paths.rs#L65
pub(crate) fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
    let mut parts = path.as_ref().components().peekable();

    // A prefix can only come first: use it to seed the result.
    let mut normalized = match parts.peek().copied() {
        Some(prefix @ Component::Prefix(..)) => {
            parts.next();
            PathBuf::from(prefix.as_os_str())
        }
        _ => PathBuf::new(),
    };

    for part in parts {
        match part {
            Component::Prefix(..) => unreachable!(),
            Component::CurDir => {}
            Component::ParentDir => {
                normalized.pop();
            }
            Component::RootDir | Component::Normal(_) => normalized.push(part.as_os_str()),
        }
    }

    normalized
}
272
/// Computes a distance between two paths.
///
/// The distance is the number of components left in `path1` plus the
/// number of components left in `path2` once their deepest common,
/// non-empty, ancestor has been removed from both.
///
/// Returns `None` when the paths have no such ancestor.
pub(crate) fn get_paths_dist(path1: &Path, path2: &Path) -> Option<usize> {
    // `ancestors` goes from the deepest to the shallowest one, so the
    // first hit is the deepest common ancestor.
    let common = path1
        .ancestors()
        .find(|base| !base.as_os_str().is_empty() && path2.starts_with(base))?;

    let rest1 = path1.strip_prefix(common).unwrap();
    let rest2 = path2.strip_prefix(common).unwrap();

    Some(rest1.components().count() + rest2.components().count())
}
283
284pub(crate) fn guess_file<S: ::std::hash::BuildHasher>(
285    current_path: &Path,
286    include_path: &str,
287    all_files: &HashMap<String, Vec<PathBuf>, S>,
288) -> Vec<PathBuf> {
289    //let rpath = include_path.clone();
290    let include_path = if let Some(end) = include_path.strip_prefix("mozilla/") {
291        end
292    } else {
293        include_path
294    };
295    let include_path = normalize_path(include_path);
296    if let Some(possibilities) = all_files.get(include_path.file_name().unwrap().to_str().unwrap())
297    {
298        if possibilities.len() == 1 {
299            // Only one file with this name
300            return possibilities.clone();
301        }
302
303        let mut new_possibilities = Vec::new();
304        for p in possibilities.iter() {
305            if p.ends_with(&include_path) && current_path != p {
306                new_possibilities.push(p.clone());
307            }
308        }
309        if new_possibilities.len() == 1 {
310            // Only one path is finishing with "foo/Bar.h"
311            return new_possibilities;
312        }
313        new_possibilities.clear();
314
315        if let Some(parent) = current_path.parent() {
316            for p in possibilities.iter() {
317                if p.starts_with(parent) && current_path != p {
318                    new_possibilities.push(p.clone());
319                }
320            }
321            if new_possibilities.len() == 1 {
322                // Only one path in the current working directory (current_path)
323                return new_possibilities;
324            }
325            new_possibilities.clear();
326        }
327
328        let mut dist_min = std::usize::MAX;
329        let mut path_min = Vec::new();
330        for p in possibilities.iter() {
331            if current_path == p {
332                continue;
333            }
334            if let Some(dist) = get_paths_dist(current_path, p) {
335                match dist.cmp(&dist_min) {
336                    Ordering::Less => {
337                        dist_min = dist;
338                        path_min.clear();
339                        path_min.push(p);
340                    }
341                    Ordering::Equal => {
342                        path_min.push(p);
343                    }
344                    Ordering::Greater => {}
345                }
346            }
347        }
348
349        let path_min: Vec<_> = path_min.drain(..).map(|p| p.to_path_buf()).collect();
350        return path_min;
351    }
352
353    vec![]
354}
355
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Checks BOM removal, invalid UTF-8 detection and end-of-file
    /// normalization of `read_file_with_eol`.
    #[test]
    fn test_read() {
        // The file name is specific to this process, so that concurrent runs
        // sharing the same temporary directory do not overwrite each other.
        let tmp_path = std::env::temp_dir().join(format!("test_read_{}", std::process::id()));
        let data = vec![
            // UTF-16 and UTF-8 byte-order marks are skipped
            (b"\xFF\xFEabc".to_vec(), Some(b"abc\n".to_vec())),
            (b"\xFE\xFFabc".to_vec(), Some(b"abc\n".to_vec())),
            (b"\xEF\xBB\xBFabc".to_vec(), Some(b"abc\n".to_vec())),
            (b"\xEF\xBB\xBFabc\n".to_vec(), Some(b"abc\n".to_vec())),
            // A truncated BOM is an invalid UTF-8 sequence
            (b"\xEF\xBBabc\n".to_vec(), None),
            // The data always ends with a single `\n`
            (b"abcdef\n".to_vec(), Some(b"abcdef\n".to_vec())),
            (b"abcdef".to_vec(), Some(b"abcdef\n".to_vec())),
        ];
        let (inputs, expected): (Vec<_>, Vec<_>) = data.into_iter().unzip();

        let results: Vec<_> = inputs
            .iter()
            .map(|input| {
                write_file(&tmp_path, input).unwrap();
                read_file_with_eol(&tmp_path).unwrap()
            })
            .collect();

        // Remove the file before asserting, so that a failed assertion
        // does not leave it behind.
        std::fs::remove_file(&tmp_path).unwrap();

        assert_eq!(results, expected);
    }

    /// Checks how the extension and the editor modelines are combined by
    /// `guess_language`.
    #[test]
    fn test_guess_language() {
        // Emacs `mode:` entry in agreement with the extension
        let buf = b"// -*- foo: bar; mode: c++; hello: world\n";
        assert_eq!(
            guess_language(buf, "foo.cpp"),
            (Some(LANG::Cpp), "c/c++".to_string())
        );

        // Emacs short form
        let buf = b"// -*- c++ -*-\n";
        assert_eq!(
            guess_language(buf, "foo.cpp"),
            (Some(LANG::Cpp), "c/c++".to_string())
        );

        // `bar-mode` is not a `mode` entry: only the extension is used
        let buf = b"// -*- foo: bar; bar-mode: c++; hello: world\n";
        assert_eq!(
            guess_language(buf, "foo.py"),
            (Some(LANG::Python), "python".to_string())
        );

        // No modeline at all
        let buf = b"/* hello world */\n";
        assert_eq!(
            guess_language(buf, "foo.cpp"),
            (Some(LANG::Cpp), "c/c++".to_string())
        );

        // Vim modeline located in the last lines of the buffer
        let buf = b"\n\n\n\n\n\n\n\n\n// vim: set ts=4 ft=c++\n\n\n";
        assert_eq!(
            guess_language(buf, "foo.c"),
            (Some(LANG::Cpp), "c/c++".to_string())
        );

        // Unknown extension and no modeline
        let buf = b"\n\n\n\n\n\n\n\n\n\n\n\n";
        assert_eq!(guess_language(buf, "foo.txt"), (None, "".to_string()));

        // The mode name is matched case-insensitively
        let buf = b"// -*- foo: bar; mode: Objective-C++; hello: world\n";
        assert_eq!(
            guess_language(buf, "foo.mm"),
            (Some(LANG::Cpp), "obj-c/c++".to_string())
        );
    }
}
421        );
422    }
423}