use crate::langs::fake;
use crate::langs::*;
use regex::bytes::Regex;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{Read, Write};
use std::path::{Component, Path, PathBuf};
/// Loads the complete contents of the file at `path` into a byte vector.
///
/// The bytes are handed to `remove_blank_lines` before being returned, so the
/// trailing newlines of the buffer are normalised by that helper.
///
/// # Errors
///
/// Propagates any I/O error raised while opening or reading the file.
pub fn read_file(path: &PathBuf) -> std::io::Result<Vec<u8>> {
    let mut contents = Vec::new();
    File::open(path)?.read_to_end(&mut contents)?;
    remove_blank_lines(&mut contents);
    Ok(contents)
}
/// Reads the file at `path`, skipping a leading byte-order mark and rejecting
/// content whose beginning is not valid UTF-8.
///
/// Returns `Ok(None)` when:
/// * the file is at most 3 bytes long,
/// * the leading chunk (up to 64 bytes) cannot be read in full, or
/// * that chunk, once the BOM is removed, does not decode as UTF-8.
///
/// Otherwise returns `Ok(Some(bytes))`, where `bytes` is the file content
/// without the BOM, passed through `remove_blank_lines`.
///
/// # Errors
///
/// Propagates I/O errors from opening the file or reading its remainder.
pub fn read_file_with_eol(path: &PathBuf) -> std::io::Result<Option<Vec<u8>>> {
    // If the metadata cannot be queried, fall back to a 1 MiB size estimate.
    let file_size = match fs::metadata(&path) {
        Ok(meta) => meta.len() as usize,
        Err(_) => 1024 * 1024,
    };
    if file_size <= 3 {
        return Ok(None);
    }

    let mut file = File::open(path)?;

    // `file_size > 3` guarantees the probe holds at least 4 bytes.
    let mut probe = vec![0; file_size.min(64)];
    if file.read_exact(&mut probe).is_err() {
        return Ok(None);
    }

    // Two-byte marks FE FF / FF FE, or the three-byte UTF-8 mark EF BB BF.
    let bom_len = if probe.starts_with(b"\xFE\xFF") || probe.starts_with(b"\xFF\xFE") {
        2
    } else if probe.starts_with(b"\xEF\xBB\xBF") {
        3
    } else {
        0
    };
    let start = &probe[bom_len..];

    // The final decoded character is dropped before the validity check,
    // presumably because the probe window may cut a multi-byte sequence in half.
    let mut head = String::from_utf8_lossy(start).into_owned();
    head.pop();
    if head.contains('\u{FFFD}') {
        return Ok(None);
    }

    let mut data = Vec::with_capacity(file_size + 2);
    data.extend_from_slice(start);
    file.read_to_end(&mut data)?;
    remove_blank_lines(&mut data);
    Ok(Some(data))
}
/// Creates (or truncates) the file at `path` and writes all of `data` to it.
///
/// # Errors
///
/// Propagates any I/O error raised while creating or writing the file.
pub fn write_file(path: &PathBuf, data: &[u8]) -> std::io::Result<()> {
    File::create(path)?.write_all(data)
}
/// Looks up the language associated with the extension of `path`.
///
/// The extension is lowercased before being passed to `get_from_ext`.
///
/// Returns `None` when the path has no extension, when the extension is not
/// valid UTF-8 (this used to panic), or when `get_from_ext` has no match.
pub fn get_language_for_file(path: &PathBuf) -> Option<LANG> {
    let ext = path.extension()?.to_str()?.to_lowercase();
    get_from_ext(&ext)
}
/// Decodes a captured mode name as UTF-8 and lowercases it.
///
/// Returns `None` when `mode` is not valid UTF-8.
fn mode_to_str(mode: &[u8]) -> Option<String> {
    match std::str::from_utf8(mode) {
        Ok(text) => Some(text.to_lowercase()),
        Err(_) => None,
    }
}
/// Extracts an editor mode name from an Emacs or Vim modeline in `buf`.
///
/// The first four lines are checked, in order, for:
/// 1. an Emacs `-*- ... mode: NAME` declaration,
/// 2. the short Emacs form `-*- NAME -*-`,
/// 3. a Vim `vim: ... ft=NAME` declaration.
///
/// If none of them matches, the last four lines are checked for the Vim form.
/// The first hit is returned through `mode_to_str`, so the result is `None`
/// either when nothing matched or when the captured name is not valid UTF-8.
fn get_emacs_mode(buf: &[u8]) -> Option<String> {
    lazy_static! {
        static ref RE1_EMACS: Regex = Regex::new(r"(?i)-\*-.*[^-\w]mode\s*:\s*([^:;\s]+)").unwrap();
        static ref RE2_EMACS: Regex = Regex::new(r"-\*-\s*([^:;\s]+)\s*-\*-").unwrap();
        static ref RE1_VIM: Regex = Regex::new(r"(?i)vim\s*:.*[^\w]ft\s*=\s*([^:\s]+)").unwrap();
    }

    let is_newline = |c: &u8| *c == b'\n';

    // Head of the buffer: every modeline flavour is accepted.
    for line in buf.split(is_newline).take(4) {
        let found = RE1_EMACS
            .captures(line)
            .or_else(|| RE2_EMACS.captures(line))
            .or_else(|| RE1_VIM.captures(line));
        if let Some(cap) = found {
            return mode_to_str(&cap[1]);
        }
    }

    // Tail of the buffer: only the Vim flavour is accepted.
    for line in buf.rsplit(is_newline).take(4) {
        if let Some(cap) = RE1_VIM.captures(line) {
            return mode_to_str(&cap[1]);
        }
    }

    None
}
/// Guesses the language of a file from its extension and from an editor
/// modeline found in its content.
///
/// `buf` is the file content and `path` its location. The lowercased
/// extension is resolved with `get_from_ext`, and the modeline mode (see
/// `get_emacs_mode`) with `get_from_emacs_mode`.
///
/// Returns the detected language together with a display name:
/// * extension and modeline disagree: the extension wins and the name is the
///   language's own name;
/// * otherwise whichever source produced a language is used, and the name is
///   `fake::get_true(&ext, &mode)` when that returns a value, falling back to
///   the language's own name;
/// * nothing detected: `None` with the `fake::get_true` name, or an empty
///   string.
///
/// An extension that is not valid UTF-8 is treated as no extension; it used
/// to cause a panic.
pub fn guess_language<P: AsRef<Path>>(buf: &[u8], path: P) -> (Option<LANG>, String) {
    let ext = path
        .as_ref()
        .extension()
        .and_then(|e| e.to_str())
        .map(|e| e.to_lowercase())
        .unwrap_or_default();
    let from_ext = get_from_ext(&ext);

    let mode = get_emacs_mode(buf).unwrap_or_default();
    let from_mode = get_from_emacs_mode(&mode);

    match (from_ext, from_mode) {
        // Conflicting hints: trust the extension and skip the name override.
        (Some(lang_ext), Some(lang_mode)) if lang_ext != lang_mode => {
            (Some(lang_ext), lang_ext.get_name().to_string())
        }
        // Agreeing hints, or a single hint.
        (Some(lang), _) | (None, Some(lang)) => (
            Some(lang),
            fake::get_true(&ext, &mode).unwrap_or_else(|| lang.get_name().to_string()),
        ),
        (None, None) => (None, fake::get_true(&ext, &mode).unwrap_or_default()),
    }
}
/// Makes `data` end with exactly one `\n`.
///
/// A run of trailing newlines is collapsed to a single one, and a newline is
/// appended when there is none. Blank lines elsewhere in the buffer are left
/// untouched. An empty buffer becomes `"\n"`.
pub(crate) fn remove_blank_lines(data: &mut Vec<u8>) {
    // Length of the buffer once every trailing newline has been dropped.
    let content_len = data
        .iter()
        .rposition(|&byte| byte != b'\n')
        .map_or(0, |idx| idx + 1);
    data.truncate(content_len);
    data.push(b'\n');
}
/// Normalises `path` lexically, without touching the filesystem.
///
/// `.` components are dropped and each `..` component removes the last
/// component accumulated so far (it is ignored when there is nothing left to
/// remove). A leading prefix component and the root directory are preserved.
///
/// Every code path returns `Some`.
pub(crate) fn normalize_path<P: AsRef<Path>>(path: P) -> Option<PathBuf> {
    let mut remaining = path.as_ref().components().peekable();

    // A prefix component can only appear first; consume it up front.
    let mut normalized = PathBuf::new();
    if let Some(prefix @ Component::Prefix(..)) = remaining.peek().cloned() {
        remaining.next();
        normalized = PathBuf::from(prefix.as_os_str());
    }

    for part in remaining {
        match part {
            Component::Prefix(..) => unreachable!(),
            Component::CurDir => {}
            Component::ParentDir => {
                normalized.pop();
            }
            Component::RootDir => normalized.push(part.as_os_str()),
            Component::Normal(name) => normalized.push(name),
        }
    }

    Some(normalized)
}
/// Measures how far apart two paths are.
///
/// The deepest non-empty ancestor of `path1` that is also a prefix of `path2`
/// is removed from both paths; the distance is the number of components left
/// in `path1` plus the number left in `path2`.
///
/// Returns `None` when the paths share no non-empty ancestor.
pub(crate) fn get_paths_dist(path1: &PathBuf, path2: &PathBuf) -> Option<usize> {
    // `ancestors` goes from the full path towards the root, so the first hit
    // is the deepest shared ancestor.
    let common = path1
        .ancestors()
        .find(|dir| !dir.as_os_str().is_empty() && path2.starts_with(dir))?;

    let rest1 = path1.strip_prefix(common).unwrap();
    let rest2 = path2.strip_prefix(common).unwrap();
    Some(rest1.components().count() + rest2.components().count())
}
/// Resolves an include directive to the file(s) it most likely refers to.
///
/// * `current_path` - the file containing the include.
/// * `include_path` - the included path as written; a leading `mozilla/` is
///   removed and the rest is normalised with `normalize_path`.
/// * `all_files` - known files, keyed by file name.
///
/// Candidates are the entries of `all_files` stored under the file name of
/// the include. They are narrowed down in this order, stopping at the first
/// rule that gives an answer:
/// 1. a single candidate is returned as is;
/// 2. exactly one candidate (other than `current_path`) ends with the include
///    path;
/// 3. exactly one candidate (other than `current_path`) lives under the
///    parent directory of `current_path`;
/// 4. the candidates (other than `current_path`) with the smallest
///    `get_paths_dist` to `current_path`; ties are all returned.
///
/// Returns an empty vector when there is no candidate, and also when the
/// include path has no file name or one that is not valid UTF-8 (for example
/// `""` or `".."`); those inputs used to panic.
pub(crate) fn guess_file<S: ::std::hash::BuildHasher>(
    current_path: &PathBuf,
    include_path: &str,
    all_files: &HashMap<String, Vec<PathBuf>, S>,
) -> Vec<PathBuf> {
    const MOZILLA_PREFIX: &str = "mozilla/";

    let include_path = if include_path.starts_with(MOZILLA_PREFIX) {
        &include_path[MOZILLA_PREFIX.len()..]
    } else {
        include_path
    };
    let include_path = match normalize_path(include_path) {
        Some(normalized) => normalized,
        None => return Vec::new(),
    };

    // A malformed include may have no usable file name: nothing to look up.
    let file_name = match include_path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => return Vec::new(),
    };
    let possibilities = match all_files.get(file_name) {
        Some(found) => found,
        None => return Vec::new(),
    };

    if possibilities.len() == 1 {
        return possibilities.clone();
    }

    // Rule 2: the candidate path ends with the include path.
    let by_suffix: Vec<PathBuf> = possibilities
        .iter()
        .filter(|p| p.ends_with(&include_path) && current_path != *p)
        .cloned()
        .collect();
    if by_suffix.len() == 1 {
        return by_suffix;
    }

    // Rule 3: the candidate lives under the directory of the including file.
    if let Some(parent) = current_path.parent() {
        let by_parent: Vec<PathBuf> = possibilities
            .iter()
            .filter(|p| p.starts_with(parent) && current_path != *p)
            .cloned()
            .collect();
        if by_parent.len() == 1 {
            return by_parent;
        }
    }

    // Rule 4: keep every candidate at the minimal distance.
    let mut dist_min = std::usize::MAX;
    let mut closest: Vec<&PathBuf> = Vec::new();
    for p in possibilities.iter().filter(|p| current_path != *p) {
        if let Some(dist) = get_paths_dist(current_path, p) {
            match dist.cmp(&dist_min) {
                Ordering::Less => {
                    dist_min = dist;
                    closest.clear();
                    closest.push(p);
                }
                Ordering::Equal => closest.push(p),
                Ordering::Greater => {}
            }
        }
    }

    closest.into_iter().cloned().collect()
}
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Writes each fixture to a temporary file and checks what
    /// `read_file_with_eol` makes of it.
    #[test]
    fn test_read() {
        let tmp_path = std::env::temp_dir().join("test_read");

        // (raw file content, expected result)
        let cases: [(&[u8], Option<&[u8]>); 7] = [
            (&b"\xFF\xFEabc"[..], Some(&b"abc\n"[..])),
            (&b"\xFE\xFFabc"[..], Some(&b"abc\n"[..])),
            (&b"\xEF\xBB\xBFabc"[..], Some(&b"abc\n"[..])),
            (&b"\xEF\xBB\xBFabc\n"[..], Some(&b"abc\n"[..])),
            (&b"\xEF\xBBabc\n"[..], None),
            (&b"abcdef\n"[..], Some(&b"abcdef\n"[..])),
            (&b"abcdef"[..], Some(&b"abcdef\n"[..])),
        ];

        for (raw, expected) in cases.iter() {
            write_file(&tmp_path, raw).unwrap();
            let actual = read_file_with_eol(&tmp_path).unwrap();
            assert_eq!(actual, expected.map(|bytes| bytes.to_vec()));
        }
    }

    /// Checks `guess_language` against modelines and extensions that agree,
    /// disagree, or are absent.
    #[test]
    fn test_guess_language() {
        // (file content, file name, expected language, expected display name)
        let cases: [(&[u8], &str, Option<LANG>, &str); 7] = [
            (
                &b"// -*- foo: bar; mode: c++; hello: world\n"[..],
                "foo.cpp",
                Some(LANG::Cpp),
                "c/c++",
            ),
            (&b"// -*- c++ -*-\n"[..], "foo.cpp", Some(LANG::Cpp), "c/c++"),
            (
                &b"// -*- foo: bar; bar-mode: c++; hello: world\n"[..],
                "foo.py",
                Some(LANG::Python),
                "python",
            ),
            (&b"/* hello world */\n"[..], "foo.cpp", Some(LANG::Cpp), "c/c++"),
            (
                &b"\n\n\n\n\n\n\n\n\n// vim: set ts=4 ft=c++\n\n\n"[..],
                "foo.c",
                Some(LANG::Cpp),
                "c/c++",
            ),
            (&b"\n\n\n\n\n\n\n\n\n\n\n\n"[..], "foo.txt", None, ""),
            (
                &b"// -*- foo: bar; mode: Objective-C++; hello: world\n"[..],
                "foo.mm",
                Some(LANG::Cpp),
                "obj-c/c++",
            ),
        ];

        for (buf, file_name, lang, name) in cases.iter() {
            assert_eq!(
                guess_language(buf, file_name),
                (*lang, (*name).to_string())
            );
        }
    }
}