rust_code_analysis_code_split/
tools.rs1use std::cmp::Ordering;
2use std::collections::HashMap;
3use std::fs::{self, File};
4use std::io::{Read, Write};
5use std::path::{Component, Path, PathBuf};
6use std::sync::OnceLock;
7
8use regex::bytes::Regex;
9use termcolor::{Color, ColorSpec, StandardStreamLock, WriteColor};
10
11use crate::langs::fake;
12use crate::langs::*;
13
14pub fn read_file(path: &Path) -> std::io::Result<Vec<u8>> {
27 let mut file = File::open(path)?;
28 let mut data = Vec::new();
29 file.read_to_end(&mut data)?;
30
31 remove_blank_lines(&mut data);
32
33 Ok(data)
34}
35
36pub fn read_file_with_eol(path: &Path) -> std::io::Result<Option<Vec<u8>>> {
49 let file_size = fs::metadata(path).map_or(1024 * 1024, |m| m.len() as usize);
50 if file_size <= 3 {
51 return Ok(None);
53 }
54
55 let mut file = File::open(path)?;
56
57 let mut start = vec![0; 64.min(file_size)];
58 let start = if file.read_exact(&mut start).is_ok() {
59 if start[..2] == [b'\xFE', b'\xFF'] || start[..2] == [b'\xFF', b'\xFE'] {
61 &start[2..]
62 } else if start[..3] == [b'\xEF', b'\xBB', b'\xBF'] {
63 &start[3..]
64 } else {
65 &start
66 }
67 } else {
68 return Ok(None);
69 };
70
71 let mut head = String::from_utf8_lossy(start).into_owned();
73 head.pop();
75 if head.contains('\u{FFFD}') {
77 return Ok(None);
78 }
79
80 let mut data = Vec::with_capacity(file_size + 2);
81 data.extend_from_slice(start);
82
83 file.read_to_end(&mut data)?;
84
85 remove_blank_lines(&mut data);
86
87 Ok(Some(data))
88}
89
/// Writes `data` to the file at `path`.
///
/// The file is created when missing and truncated when it already exists.
///
/// # Errors
///
/// Returns any I/O error raised while creating or writing the file.
pub fn write_file(path: &Path, data: &[u8]) -> std::io::Result<()> {
    fs::write(path, data)
}
109
110pub fn get_language_for_file(path: &Path) -> Option<LANG> {
124 if let Some(ext) = path.extension() {
125 let ext = ext.to_str().unwrap().to_lowercase();
126 get_from_ext(&ext)
127 } else {
128 None
129 }
130}
131
/// Converts the raw bytes of a captured mode name into a lowercase string.
///
/// Returns `None` when the bytes are not valid UTF-8.
fn mode_to_str(mode: &[u8]) -> Option<String> {
    match std::str::from_utf8(mode) {
        Ok(text) => Some(text.to_lowercase()),
        Err(_) => None,
    }
}
135
136static RE1_EMACS: OnceLock<Regex> = OnceLock::new();
138static RE2_EMACS: OnceLock<Regex> = OnceLock::new();
139static RE1_VIM: OnceLock<Regex> = OnceLock::new();
140
141const FIRST_EMACS_EXPRESSION: &str = r"(?i)-\*-.*[^-\w]mode\s*:\s*([^:;\s]+)";
143const SECOND_EMACS_EXPRESSION: &str = r"-\*-\s*([^:;\s]+)\s*-\*-";
144const VIM_EXPRESSION: &str = r"(?i)vim\s*:.*[^\w]ft\s*=\s*([^:\s]+)";
145
146#[inline(always)]
147fn get_regex<'a>(
148 once_lock: &OnceLock<Regex>,
149 line: &'a [u8],
150 regex: &'a str,
151) -> Option<regex::bytes::Captures<'a>> {
152 once_lock
153 .get_or_init(|| Regex::new(regex).unwrap())
154 .captures_iter(line)
155 .next()
156}
157
158fn get_emacs_mode(buf: &[u8]) -> Option<String> {
159 for (i, line) in buf.splitn(5, |c| *c == b'\n').enumerate() {
161 if let Some(cap) = get_regex(&RE1_EMACS, line, FIRST_EMACS_EXPRESSION) {
162 return mode_to_str(&cap[1]);
163 } else if let Some(cap) = get_regex(&RE2_EMACS, line, SECOND_EMACS_EXPRESSION) {
164 return mode_to_str(&cap[1]);
165 } else if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
166 return mode_to_str(&cap[1]);
167 }
168 if i == 3 {
169 break;
170 }
171 }
172
173 for (i, line) in buf.rsplitn(5, |c| *c == b'\n').enumerate() {
174 if let Some(cap) = get_regex(&RE1_VIM, line, VIM_EXPRESSION) {
175 return mode_to_str(&cap[1]);
176 }
177 if i == 3 {
178 break;
179 }
180 }
181
182 None
183}
184
185pub fn guess_language<'a, P: AsRef<Path>>(buf: &[u8], path: P) -> (Option<LANG>, &'a str) {
209 let ext = path
210 .as_ref()
211 .extension()
212 .map(|e| e.to_str().unwrap())
213 .map(|e| e.to_lowercase())
214 .unwrap_or_else(|| "".to_string());
215 let from_ext = get_from_ext(&ext);
216
217 let mode = get_emacs_mode(buf).unwrap_or_default();
218
219 let from_mode = get_from_emacs_mode(&mode);
220
221 if let Some(lang_ext) = from_ext {
222 if let Some(lang_mode) = from_mode {
223 if lang_ext == lang_mode {
224 (
225 Some(lang_mode),
226 fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name()),
227 )
228 } else {
229 (Some(lang_ext), lang_ext.get_name())
231 }
232 } else {
233 (
234 Some(lang_ext),
235 fake::get_true(&ext, &mode).unwrap_or_else(|| lang_ext.get_name()),
236 )
237 }
238 } else if let Some(lang_mode) = from_mode {
239 (
240 Some(lang_mode),
241 fake::get_true(&ext, &mode).unwrap_or_else(|| lang_mode.get_name()),
242 )
243 } else {
244 (None, fake::get_true(&ext, &mode).unwrap_or_default())
245 }
246}
247
/// Replaces every trailing `\n` and `\r` byte of `data` with a single
/// `\n`.
///
/// A buffer that is empty or made only of line terminators becomes
/// `b"\n"`.
pub(crate) fn remove_blank_lines(data: &mut Vec<u8>) {
    // Length of the buffer once the trailing line terminators are gone.
    let kept_len = data
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |last_kept| last_kept + 1);
    data.truncate(kept_len);
    data.push(b'\n');
}
260
/// Normalizes `path` lexically, without touching the file system.
///
/// `.` components are dropped and each `..` component removes the
/// component that precedes it. A `..` with nothing left to remove is
/// dropped, so `../a` becomes `a` and `/..` stays `/`.
pub(crate) fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
    let mut normalized = PathBuf::new();
    for part in path.as_ref().components() {
        match part {
            Component::CurDir => {}
            Component::ParentDir => {
                normalized.pop();
            }
            // A prefix can only be the first component, so it is pushed
            // onto an empty path and becomes its start.
            Component::Prefix(..) | Component::RootDir | Component::Normal(..) => {
                normalized.push(part.as_os_str());
            }
        }
    }
    normalized
}
288
/// Computes a distance between two paths: the number of components that
/// remain in each of them after their deepest common ancestor, added
/// together.
///
/// Returns `None` when the only common ancestor is the empty path, as
/// happens for two relative paths starting with different components.
pub(crate) fn get_paths_dist(path1: &Path, path2: &Path) -> Option<usize> {
    // `ancestors` goes from the deepest to the shallowest one, so the
    // first hit is the deepest common ancestor.
    let shared = path1
        .ancestors()
        .find(|ancestor| !ancestor.as_os_str().is_empty() && path2.starts_with(ancestor))?;

    // `shared` is a prefix of both paths by construction, so neither
    // `strip_prefix` can fail.
    let rest1 = path1.strip_prefix(shared).unwrap();
    let rest2 = path2.strip_prefix(shared).unwrap();

    Some(rest1.components().count() + rest2.components().count())
}
299
300pub(crate) fn guess_file<S: ::std::hash::BuildHasher>(
301 current_path: &Path,
302 include_path: &str,
303 all_files: &HashMap<String, Vec<PathBuf>, S>,
304) -> Vec<PathBuf> {
305 let include_path = if let Some(end) = include_path.strip_prefix("mozilla/") {
306 end
307 } else {
308 include_path
309 };
310 let include_path = normalize_path(include_path);
311 if let Some(possibilities) = all_files.get(include_path.file_name().unwrap().to_str().unwrap())
312 {
313 if possibilities.len() == 1 {
314 return possibilities.clone();
316 }
317
318 let mut new_possibilities = Vec::new();
319 for p in possibilities.iter() {
320 if p.ends_with(&include_path) && current_path != p {
321 new_possibilities.push(p.clone());
322 }
323 }
324 if new_possibilities.len() == 1 {
325 return new_possibilities;
327 }
328 new_possibilities.clear();
329
330 if let Some(parent) = current_path.parent() {
331 for p in possibilities.iter() {
332 if p.starts_with(parent) && current_path != p {
333 new_possibilities.push(p.clone());
334 }
335 }
336 if new_possibilities.len() == 1 {
337 return new_possibilities;
339 }
340 new_possibilities.clear();
341 }
342
343 let mut dist_min = usize::MAX;
344 let mut path_min = Vec::new();
345 for p in possibilities.iter() {
346 if current_path == p {
347 continue;
348 }
349 if let Some(dist) = get_paths_dist(current_path, p) {
350 match dist.cmp(&dist_min) {
351 Ordering::Less => {
352 dist_min = dist;
353 path_min.clear();
354 path_min.push(p);
355 }
356 Ordering::Equal => {
357 path_min.push(p);
358 }
359 Ordering::Greater => {}
360 }
361 }
362 }
363
364 let path_min: Vec<_> = path_min.drain(..).map(|p| p.to_path_buf()).collect();
365 return path_min;
366 }
367
368 vec![]
369}
370
371#[inline(always)]
372pub(crate) fn color(stdout: &mut StandardStreamLock, color: Color) -> std::io::Result<()> {
373 stdout.set_color(ColorSpec::new().set_fg(Some(color)))
374}
375
376#[inline(always)]
377pub(crate) fn intense_color(stdout: &mut StandardStreamLock, color: Color) -> std::io::Result<()> {
378 stdout.set_color(ColorSpec::new().set_fg(Some(color)).set_intense(true))
379}
380
/// Test helper: parses `source` with the parser `T`, computes its metrics
/// and passes the resulting `FuncSpace` to `check`.
///
/// Trailing whitespace and leading newlines are stripped from `source`,
/// and a single `\n` is appended, before parsing. `filename` is the path
/// handed to the parser and to the metrics computation.
///
/// # Panics
///
/// Panics when the metrics computation fails.
#[cfg(test)]
pub(crate) fn check_func_space<T: crate::ParserTrait, F: Fn(crate::FuncSpace)>(
    source: &str,
    filename: &str,
    check: F,
) {
    let path = std::path::PathBuf::from(filename);

    // Once the end is trimmed, only leading newlines are left to strip.
    let code = source.trim_end().trim_start_matches('\n');
    let mut code_bytes = Vec::with_capacity(code.len() + 1);
    code_bytes.extend_from_slice(code.as_bytes());
    code_bytes.push(b'\n');

    let parser = T::new(code_bytes, &path, None);
    let func_space = crate::metrics(&parser, &path).unwrap();

    check(func_space)
}
395
/// Test helper: same as `check_func_space`, but `check` only receives the
/// `metrics` field of the computed `FuncSpace`.
#[cfg(test)]
pub(crate) fn check_metrics<T: crate::ParserTrait>(
    source: &str,
    filename: &str,
    check: fn(crate::CodeMetrics) -> (),
) {
    let check_space = |func_space: crate::FuncSpace| check(func_space.metrics);
    check_func_space::<T, _>(source, filename, check_space)
}
404
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Writes `input` to `path`, reads it back with `read_file_with_eol`
    /// and compares the outcome with `expected`.
    fn assert_read(path: &Path, input: &[u8], expected: Option<&[u8]>) {
        write_file(path, input).unwrap();
        let actual = read_file_with_eol(path).unwrap();
        assert_eq!(actual, expected.map(|bytes| bytes.to_vec()));
    }

    /// Checks the language and the name guessed for `buf` stored under
    /// `file_name`.
    fn assert_guess(buf: &[u8], file_name: &str, lang: Option<LANG>, name: &str) {
        assert_eq!(guess_language(buf, file_name), (lang, name));
    }

    #[test]
    fn test_read() {
        let tmp_path = std::env::temp_dir().join("test_read");

        // A leading byte-order mark is skipped.
        assert_read(&tmp_path, b"\xFF\xFEabc", Some(b"abc\n"));
        assert_read(&tmp_path, b"\xFE\xFFabc", Some(b"abc\n"));
        assert_read(&tmp_path, b"\xEF\xBB\xBFabc", Some(b"abc\n"));
        assert_read(&tmp_path, b"\xEF\xBB\xBFabc\n", Some(b"abc\n"));

        // A truncated BOM is invalid UTF-8.
        assert_read(&tmp_path, b"\xEF\xBBabc\n", None);

        // The content always ends with exactly one newline.
        assert_read(&tmp_path, b"abcdef\n", Some(b"abcdef\n"));
        assert_read(&tmp_path, b"abcdef", Some(b"abcdef\n"));
    }

    #[test]
    fn test_guess_language() {
        // Long Emacs form.
        assert_guess(
            b"// -*- foo: bar; mode: c++; hello: world\n",
            "foo.cpp",
            Some(LANG::Cpp),
            "c/c++",
        );

        // Short Emacs form.
        assert_guess(b"// -*- c++ -*-\n", "foo.cpp", Some(LANG::Cpp), "c/c++");

        // `bar-mode` is not a mode declaration: only the extension counts.
        assert_guess(
            b"// -*- foo: bar; bar-mode: c++; hello: world\n",
            "foo.py",
            Some(LANG::Python),
            "python",
        );

        // No mode at all.
        assert_guess(b"/* hello world */\n", "foo.cpp", Some(LANG::Cpp), "c/c++");

        // Vim modeline among the last lines.
        assert_guess(
            b"\n\n\n\n\n\n\n\n\n// vim: set ts=4 ft=c++\n\n\n",
            "foo.c",
            Some(LANG::Cpp),
            "c/c++",
        );

        // Nothing is recognized.
        assert_guess(b"\n\n\n\n\n\n\n\n\n\n\n\n", "foo.txt", None, "");

        // The reported name may differ from the name of the language.
        assert_guess(
            b"// -*- foo: bar; mode: Objective-C++; hello: world\n",
            "foo.mm",
            Some(LANG::Cpp),
            "obj-c/c++",
        );
    }
}