use anyhow::{bail, Context, Error, Result};
use pathdiff::diff_paths;
use polars::io::SerWriter;
use polars::prelude::{CsvReadOptions, CsvWriter, Schema};
use polars::{frame::DataFrame, io::SerReader};
use walkdir::WalkDir;
use std::fs;
use std::io::BufWriter;
use std::path::{Component, PathBuf};
use std::sync::Arc;
use std::{
fs::File,
io::{BufRead, BufReader, Lines},
path::Path,
};
/// Selects how [`open_file`] opens its target.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileMode {
    /// Open an existing file for reading.
    Read,
    /// Open for writing; the file is created if missing and truncated otherwise.
    Overwrite,
    /// Open for appending; the file is created if missing.
    Append,
}
pub fn open_file(path: impl AsRef<Path>, mode: FileMode) -> Result<File> {
if let Some(parent) = path.as_ref().parent() {
if let Some(parent_path) = parent.to_str() {
create_dir(parent_path)?;
}
}
match mode {
FileMode::Read => std::fs::File::open(&path),
FileMode::Overwrite => std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&path),
FileMode::Append => std::fs::OpenOptions::new()
.create(true)
.append(true)
.open(&path),
}
.with_context(|| format!("Could not open {}", &path.as_ref().display()))
}
pub fn check_path(path: &str) -> Result<PathBuf> {
if Path::new(path).exists() {
Ok(PathBuf::from(path))
} else {
bail!("File or directory {path} not found")
}
}
pub fn load_file(
path: impl AsRef<Path>,
memory_limit: u64,
) -> Result<core::result::Result<Vec<u8>, u64>> {
let metadata = std::fs::metadata(&path).with_context(|| {
format!(
"Could not fetch metadata for file {}",
&path.as_ref().display()
)
})?;
let file_size = metadata.len();
if file_size > memory_limit {
Ok(Err(file_size))
} else {
std::fs::read(&path)
.map(Ok)
.with_context(|| format!("Could not read file {}", &path.as_ref().display()))
}
}
/// Opens `path` for reading and returns a buffered iterator over its lines.
///
/// # Errors
/// Fails if the file cannot be opened; errors on individual lines are yielded by the iterator.
pub fn file_lines(path: impl AsRef<Path>) -> Result<Lines<BufReader<File>>, Error> {
    let handle = open_file(path, FileMode::Read)?;
    let reader = BufReader::new(handle);
    Ok(reader.lines())
}
pub fn file_lines_count(path: impl AsRef<Path>) -> Result<usize, Error> {
Ok(file_lines(path)?.count())
}
/// Creates the directory at `path` together with any missing parent directories.
///
/// Calling this on a directory that already exists succeeds, because
/// `std::fs::create_dir_all` treats that case as success by itself.
///
/// # Errors
/// Fails if the directory cannot be created. This includes the case where a non-directory
/// entry already occupies `path`: reporting success there would leave the caller without the
/// directory it asked for.
pub fn create_dir(path: impl AsRef<Path>) -> Result<(), Error> {
    let path = path.as_ref();
    if let Err(e) = std::fs::create_dir_all(path) {
        bail!("Could not create directory {}: {}", path.display(), e);
    }
    Ok(())
}
pub fn delete_dir(path: impl AsRef<Path>, silent: bool) -> Result<()> {
let path_buf = path.as_ref().to_path_buf();
match std::fs::remove_dir_all(&path_buf) {
Ok(_) => Ok(()),
Err(e) if e.kind() == std::io::ErrorKind::NotFound && silent => Ok(()),
Err(e) => Err(e.into()),
}
}
/// Deletes the file at `path`.
///
/// When `silent` is true, a missing file is not treated as an error.
///
/// # Errors
/// Fails with a message naming the path if the file cannot be removed.
pub fn delete_file(path: impl AsRef<Path>, silent: bool) -> Result<()> {
    let target = path.as_ref();
    if let Err(err) = std::fs::remove_file(target) {
        let ignorable = silent && err.kind() == std::io::ErrorKind::NotFound;
        if !ignorable {
            bail!("Could not delete file {}: {}", target.display(), err);
        }
    }
    Ok(())
}
pub fn write_file(path: impl AsRef<Path>, content: impl AsRef<[u8]>) -> Result<()> {
if let Some(parent) = path.as_ref().parent() {
create_dir(parent)?;
}
fs::write(&path, content)?;
Ok(())
}
/// Reads the CSV file at `path` (with a header row) into a [`DataFrame`].
///
/// * `schema` - optional schema passed to the reader as a schema overwrite.
/// * `columns` - optional list of column names to load; `None` applies no column selection.
///
/// # Errors
/// Fails if the file cannot be opened or the reader cannot parse it.
pub fn open_csv(
    path: &str,
    schema: Option<Schema>,
    columns: Option<Vec<&str>>,
) -> Result<DataFrame, Error> {
    let source = BufReader::new(open_file(path, FileMode::Read)?);
    let options = CsvReadOptions::default()
        .with_has_header(true)
        .with_schema_overwrite(schema.map(Arc::new))
        .with_columns(columns.map(|names| {
            Arc::from(
                names
                    .into_iter()
                    .map(|name| name.into())
                    .collect::<Vec<_>>(),
            )
        }));
    options
        .into_reader_with_file_handle(source)
        .finish()
        .with_context(|| format!("Could not read {path}"))
}
/// Writes `df` as a comma-separated CSV file with a header row to `path`.
///
/// The file is created, along with its parent directory, or truncated if it already exists.
///
/// # Errors
/// Fails if the file cannot be opened, if serialising the frame fails, or if flushing the
/// buffered output to the file fails.
pub fn write_csv(path: &str, df: &mut DataFrame) -> Result<()> {
    let mut sink = BufWriter::new(open_file(path, FileMode::Overwrite)?);
    // Lend the buffer to the CSV writer so it can be flushed explicitly afterwards.
    CsvWriter::new(&mut sink)
        .include_header(true)
        .with_separator(b',')
        .finish(df)
        .with_context(|| format!("Could not write to {path}"))?;
    // Dropping a `BufWriter` ignores flush errors; `into_inner` flushes and reports them.
    sink.into_inner()
        .map_err(|e| e.into_error())
        .with_context(|| format!("Could not write to {path}"))?;
    Ok(())
}
/// Tells whether the directory at `path` contains no entries.
///
/// # Errors
/// Fails if the directory cannot be read.
pub fn is_empty_dir(path: impl AsRef<Path>) -> Result<bool> {
    let mut entries = fs::read_dir(path)?;
    let first = entries.next();
    Ok(first.is_none())
}
/// Removes the empty directories encountered while walking `path`.
///
/// The walk is configured with `contents_first(true)`, so nested directories are checked
/// (and possibly removed) before their parent's emptiness is evaluated. Entries the walker
/// fails to read are skipped.
///
/// # Errors
/// Fails if a directory cannot be listed or an empty directory cannot be removed.
pub fn delete_empty_dirs(path: impl AsRef<Path>) -> Result<()> {
    let walker = WalkDir::new(path).contents_first(true);
    for entry in walker.into_iter().flatten() {
        if !entry.file_type().is_dir() {
            continue;
        }
        let dir = entry.path();
        if is_empty_dir(dir)? {
            fs::remove_dir(dir)?;
        }
    }
    Ok(())
}
/// Lists the files under `root_dir` whose extension is `ext`, ordered by closeness to
/// `pivot_file`.
///
/// Closeness is derived from the relative path leading from the pivot file to each
/// candidate: candidates needing fewer `..` steps come first, ties are broken by the number
/// of remaining path components, and further ties keep the directory-walk order (the sort is
/// stable). The extension comparison ignores ASCII case. Entries the walker fails to read
/// are skipped.
///
/// Both locations are compared relative to `root_dir`, so the ordering does not depend on
/// whether `root_dir` and `pivot_file` were given as absolute or relative paths.
///
/// # Errors
/// Fails if `pivot_file` does not exist, if either path cannot be canonicalized, or if the
/// pivot file is not located inside `root_dir`.
pub fn files_sorted_by_proximity(
    root_dir: impl AsRef<Path>,
    pivot_file: impl AsRef<Path>,
    ext: &str,
) -> Result<Vec<PathBuf>, Error> {
    let pivot_file = pivot_file.as_ref();
    let root_dir = root_dir.as_ref();
    if !pivot_file.exists() {
        bail!("Pivot file {pivot_file:?} does not exist");
    }
    let pivot_canon = pivot_file
        .canonicalize()
        .with_context(|| format!("Could not canonicalize pivot file {pivot_file:?}"))?;
    let root_canon = root_dir
        .canonicalize()
        .with_context(|| format!("Could not canonicalize root dir {root_dir:?}"))?;
    // Pivot location relative to the root; a failure means the pivot lies outside the root.
    let pivot_rel = match pivot_canon.strip_prefix(&root_canon) {
        Ok(rel) => rel,
        Err(_) => bail!("Pivot file {pivot_file:?} is not in root dir {root_dir:?}"),
    };
    let mut files: Vec<PathBuf> = WalkDir::new(root_dir)
        .into_iter()
        .filter_map(Result::ok)
        .filter(|e| e.file_type().is_file())
        .map(|e| e.into_path())
        .filter(|p| {
            p.extension()
                .and_then(|e| e.to_str())
                .map(|e| e.eq_ignore_ascii_case(ext))
                .unwrap_or(false)
        })
        .collect();
    // The key involves path diffing and allocation, so compute it once per file.
    files.sort_by_cached_key(|p| {
        // Walked paths start with `root_dir` exactly as the caller spelled it.
        let file_rel = p.strip_prefix(root_dir).unwrap_or(p.as_path());
        proximity_key(file_rel, pivot_rel)
    });
    Ok(files)
}

/// Computes the `(parent steps, remaining components)` sort key of `file_rel` as seen from
/// `pivot_rel`; a file that cannot be related to the pivot sorts last.
fn proximity_key(file_rel: &Path, pivot_rel: &Path) -> (usize, usize) {
    match diff_paths(file_rel, pivot_rel) {
        Some(rel) => rel
            .components()
            .fold((0, 0), |(ups, total), comp| match comp {
                Component::ParentDir => (ups + 1, total),
                Component::CurDir => (ups, total),
                _ => (ups, total + 1),
            }),
        None => (usize::MAX, usize::MAX),
    }
}
// Tests for the filesystem helpers defined in this file. They read fixture files under
// `tests/data` and create/remove scratch entries under `tests/`.
#[cfg(test)]
mod io_tests {
use std::io::Write;
use std::path::Path;
use anyhow::{ensure, Ok};
use super::*;
// Opening a missing file for reading must fail; opening an existing fixture must succeed.
#[test]
fn read_file_test() -> Result<()> {
let file = open_file("tests/data/non_existent_file.txt", FileMode::Read);
ensure!(file.is_err());
open_file("tests/data/empty.csv", FileMode::Read)?;
Ok(())
}
// `file_lines` yields the fixture's lines in order and then ends.
#[test]
fn file_lines_test() -> Result<()> {
let path = "tests/data/small_file.csv";
let mut lines = file_lines(path)?;
assert_eq!(lines.next().unwrap()?, "id,name,fork");
assert_eq!(lines.next().unwrap()?, "0,a,1");
assert_eq!(lines.next().unwrap()?, "1,b,0");
assert_eq!(lines.next().unwrap()?, "2,c,1");
assert_eq!(lines.next().unwrap()?, "3,d,0");
ensure!(lines.next().is_none());
Ok(())
}
// Round trip for `create_dir` / `delete_dir`, including the `silent` flag on a missing
// directory and recursive removal of a nested directory.
#[test]
fn create_delete_dir_test() -> Result<()> {
let test_dir = "tests";
create_dir(test_dir)?;
// Start from a clean slate: silent deletion tolerates a missing directory...
delete_dir(format!("{test_dir}/new_dir"), true)?;
// ...whereas non-silent deletion of the now-missing directory must fail.
ensure!(delete_dir(format!("{test_dir}/new_dir"), false).is_err());
let new_dir = format!("{test_dir}/new_dir/new_dir");
ensure!(!Path::new(&new_dir).exists());
create_dir(&new_dir)?;
ensure!(Path::new(&new_dir).exists());
// Deleting the outer directory removes the nested one as well.
delete_dir(format!("{test_dir}/new_dir"), false)?;
ensure!(!Path::new(&new_dir).exists());
Ok(())
}
// Round trip for file creation through `open_file` and removal through `delete_file`,
// including the `silent` flag on a missing file.
#[test]
fn create_delete_file_test() -> Result<()> {
let test_file = "tests/new_file.txt";
delete_file(test_file, true)?;
ensure!(delete_file(test_file, false).is_err());
ensure!(!Path::new(&test_file).exists());
// Opening in `Overwrite` mode creates the file.
open_file(test_file, FileMode::Overwrite)?;
ensure!(Path::new(&test_file).exists());
delete_file(test_file, false)?;
ensure!(!Path::new(&test_file).exists());
Ok(())
}
// `Overwrite` replaces the content, `Append` extends it, and a second `Overwrite`
// truncates it again.
#[test]
fn write_file_test() -> Result<()> {
let path = "tests/data/abc.txt";
// Each handle lives in its own scope so it is closed before the file is read back.
{
let file = open_file(path, FileMode::Overwrite)?;
write!(&file, "abc")?;
}
let content = std::fs::read_to_string(path)?;
let lines: Vec<&str> = content.lines().collect();
ensure!(lines.len() == 1);
assert_eq!(lines[0], "abc");
{
let file = open_file(path, FileMode::Append)?;
write!(&file, "okok")?;
}
let content = std::fs::read_to_string(path)?;
let lines: Vec<&str> = content.lines().collect();
ensure!(lines.len() == 1);
assert_eq!(lines[0], "abcokok");
{
let file = open_file(path, FileMode::Overwrite)?;
write!(&file, "abc")?;
}
let content = std::fs::read_to_string(path)?;
let lines: Vec<&str> = content.lines().collect();
ensure!(lines.len() == 1);
assert_eq!(lines[0], "abc");
Ok(())
}
// The fixture has five lines (header included); a missing file is an error.
#[test]
fn line_count_test() -> Result<()> {
let count = file_lines_count("tests/data/small_file.csv")?;
assert_eq!(count, 5);
ensure!(file_lines_count("tests/data/non_existent_file.csv").is_err());
Ok(())
}
// Expected order: the pivot itself, its sibling, a file in a subdirectory of the pivot's
// directory, then files one level further up (directly in the root, then in a sibling
// directory).
#[test]
fn files_sorted_by_proximity_test() -> Result<()> {
let root_dir = "tests/data/test_project";
let pivot_file = "tests/data/test_project/utils/foo.rs";
let files = files_sorted_by_proximity(root_dir, pivot_file, "rs")?;
let files = files
.into_iter()
.map(|p| p.to_str().unwrap().to_string())
.collect::<Vec<_>>();
let expected_files = vec![
"tests/data/test_project/utils/foo.rs",
"tests/data/test_project/utils/bar.rs",
"tests/data/test_project/utils/snippets/example.rs",
"tests/data/test_project/main.rs",
"tests/data/test_project/io/fs.rs",
];
assert_eq!(files, expected_files);
Ok(())
}
}