use std::{
fs::File,
io::{BufRead, BufReader, Read, Seek, SeekFrom},
path::Path,
};
use anyhow::{Context as _, Result};
use memmap2::Mmap;
use super::{
encoding::{self, FileEncoding},
line_counter,
stats::AnalysisResults,
};
use crate::langs::Language;
/// Minimum file size, in bytes, at which `FileSource::open` memory-maps the
/// file instead of handing back a plain `File` for buffered reading.
const MMAP_THRESHOLD: u64 = 256 * 1024;
/// Maximum number of bytes taken from each sampled region of a file
/// (see `sample_ranges`).
const SAMPLE_SIZE: usize = 4 * 1024;
/// Builds the error-context message used when opening `path` fails.
fn open_file_context(path: &Path) -> String {
    let location = path.display().to_string();
    ["Failed to open file ", location.as_str()].concat()
}
/// Builds the error-context message used when memory-mapping `path` fails.
fn mmap_file_context(path: &Path) -> String {
    let location = path.display().to_string();
    ["Failed to memory-map file ", location.as_str()].concat()
}
/// A source of raw byte lines that pushes each line to a caller-supplied
/// callback.
pub(super) trait LineSource {
    /// Calls `f` once for every line the source yields, in order.
    ///
    /// The implementations in this file pass each line including its
    /// trailing `\n` when one is present.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying source fails while producing lines.
    fn for_each_line<F: FnMut(&[u8])>(&mut self, f: &mut F) -> Result<()>;
}
/// Line source backed by any [`BufRead`] reader.
///
/// Each line is read into one reusable buffer, so a single line is held in
/// memory at a time.
pub(super) struct BufLineSource<R: BufRead> {
    reader: R,
    /// Scratch space reused for every line.
    buffer: Vec<u8>,
}

impl<R: BufRead> BufLineSource<R> {
    /// Wraps `reader` and pre-allocates a 1024-byte line buffer.
    pub(super) fn new(reader: R) -> Self {
        let buffer = Vec::with_capacity(1024);
        Self { reader, buffer }
    }
}

impl<R: BufRead> LineSource for BufLineSource<R> {
    /// Reads up to and including each `\n` and hands the bytes to `f`.
    ///
    /// A final line without a trailing newline is still delivered. Iteration
    /// stops when the reader reports zero bytes read.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error returned by the reader.
    fn for_each_line<F>(&mut self, f: &mut F) -> Result<()>
    where
        F: FnMut(&[u8]),
    {
        // Start from an empty buffer; it is emptied again after every line so
        // `read_until` never appends onto stale data.
        self.buffer.clear();
        while self.reader.read_until(b'\n', &mut self.buffer)? != 0 {
            f(&self.buffer);
            self.buffer.clear();
        }
        Ok(())
    }
}
/// Line source over an in-memory byte slice (for example a memory-mapped
/// file).
pub(super) struct MmapLineSource<'a> {
    bytes: &'a [u8],
    /// Offset of the first byte that has not been yielded yet.
    pos: usize,
}

impl<'a> MmapLineSource<'a> {
    /// Creates a source positioned at the start of `bytes`.
    pub(super) const fn new(bytes: &'a [u8]) -> Self {
        Self { pos: 0, bytes }
    }
}

impl LineSource for MmapLineSource<'_> {
    /// Splits the remaining bytes on `\n` and passes each line, newline
    /// included, to `f`.
    ///
    /// Trailing bytes with no final newline are delivered as a last line.
    /// This implementation performs no I/O and always returns `Ok(())`.
    fn for_each_line<F>(&mut self, f: &mut F) -> Result<()>
    where
        F: FnMut(&[u8]),
    {
        while self.pos < self.bytes.len() {
            let remaining = &self.bytes[self.pos..];
            // Length of the next line: up to and including the newline, or
            // everything that is left when no newline remains.
            let consumed = match memchr::memchr(b'\n', remaining) {
                Some(newline_idx) => newline_idx + 1,
                None => remaining.len(),
            };
            f(&remaining[..consumed]);
            self.pos += consumed;
        }
        Ok(())
    }
}
/// An opened input file, either read through buffered I/O or memory-mapped.
pub(super) enum FileSource {
    /// File read with ordinary buffered I/O.
    Buffered(File),
    /// File mapped into memory.
    Mapped(Mmap),
}

impl FileSource {
    /// Opens `file_path`, memory-mapping it when `file_size` is at least
    /// `MMAP_THRESHOLD` and keeping a plain `File` handle otherwise.
    ///
    /// `file_size` is taken from the caller; it is not re-checked here.
    ///
    /// # Errors
    ///
    /// Returns an error, with the path in its context, if the file cannot be
    /// opened or mapped.
    pub(super) fn open(file_path: &Path, file_size: u64) -> Result<Self> {
        let file = File::open(file_path).with_context(|| open_file_context(file_path))?;
        if file_size < MMAP_THRESHOLD {
            return Ok(Self::Buffered(file));
        }
        // SAFETY: NOTE(review) — the soundness of a file-backed mapping
        // presumably relies on the file not being truncated or modified while
        // mapped, which this code cannot enforce; confirm against the memmap2
        // documentation and the project's accepted risk.
        let mapped = unsafe { Mmap::map(&file) }.with_context(|| mmap_file_context(file_path))?;
        Ok(Self::Mapped(mapped))
    }

    /// Collects a byte sample of the file's contents.
    ///
    /// `file_size` is only consulted for the buffered variant; the mapped
    /// variant uses the length of the mapping itself.
    ///
    /// # Errors
    ///
    /// Returns an error if reading or seeking the buffered file fails.
    pub(super) fn sample(&mut self, file_size: u64) -> Result<Vec<u8>> {
        match self {
            Self::Mapped(mmap) => Ok(sample_from_slice(&mmap[..])),
            Self::Buffered(file) => sample_file(file, file_size),
        }
    }

    /// Consumes the source and runs line analysis over it, dispatching to the
    /// buffered or memory-mapped processing routine.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the underlying processing routine.
    pub(super) fn process(
        self,
        file_path: &Path,
        file_size: u64,
        results: &mut AnalysisResults,
        collect_details: bool,
        language: &'static Language,
        encoding: FileEncoding,
    ) -> Result<()> {
        match self {
            Self::Mapped(mmap) => {
                process_file_mmap(file_path, file_size, results, collect_details, language, encoding, &mmap)
            }
            Self::Buffered(file) => {
                process_file_buffered(file_path, file, file_size, results, collect_details, language, encoding)
            }
        }
    }
}
/// Computes which regions of a file of `file_len` bytes should be sampled.
///
/// Returns `(start_len, mid)`:
/// * `start_len` — number of bytes to take from the start of the file, at
///   most `SAMPLE_SIZE`.
/// * `mid` — `None` when the file fits in a single sample; otherwise
///   `Some((offset, len))` describing a second sample near the middle of the
///   file. The offset is rounded down to an even value.
fn sample_ranges(file_len: u64) -> (usize, Option<(u64, usize)>) {
    let window = SAMPLE_SIZE as u64;

    // Small file: the leading sample already covers everything.
    if file_len <= window {
        return (usize::try_from(file_len).unwrap(), None);
    }

    // Centre the second window, then clear the lowest bit so the offset is
    // even.
    let mid_offset = ((file_len - window) / 2) & !1;
    let mid_end = file_len.min(mid_offset + window);
    let mid_len = usize::try_from(mid_end - mid_offset).unwrap();

    (SAMPLE_SIZE, Some((mid_offset, mid_len)))
}
/// Reads a content sample from `file`: the leading bytes and, for files
/// larger than `SAMPLE_SIZE`, a second window near the middle, as laid out by
/// `sample_ranges(file_size)`.
///
/// The first region is read from the file's current position, so the cursor
/// is assumed to be at the start on entry (TODO confirm against callers). The
/// cursor is rewound to the start before returning.
///
/// If the file is shorter than `file_size` claims, the sample simply contains
/// fewer bytes.
///
/// # Errors
///
/// Returns an error if reading, seeking or rewinding fails.
fn sample_file(file: &mut File, file_size: u64) -> Result<Vec<u8>> {
    let (start_len, mid_range) = sample_ranges(file_size);
    let mut buffer = Vec::with_capacity(SAMPLE_SIZE * 2);

    // A single `Read::read` call may return fewer bytes than requested without
    // being at EOF. `take` + `read_to_end` keeps reading until the limit or
    // EOF is reached and retries on `Interrupted`, so the sample is never
    // silently truncated.
    file.by_ref().take(start_len as u64).read_to_end(&mut buffer)?;

    if let Some((mid_offset, mid_len)) = mid_range {
        file.seek(SeekFrom::Start(mid_offset))?;
        file.by_ref().take(mid_len as u64).read_to_end(&mut buffer)?;
    }

    // Leave the cursor at the start so the file can be processed from the top.
    file.rewind()?;
    Ok(buffer)
}
/// Builds a content sample from an in-memory copy of a file: the leading
/// bytes and, for inputs larger than `SAMPLE_SIZE`, a second window near the
/// middle, as laid out by `sample_ranges`.
fn sample_from_slice(file_bytes: &[u8]) -> Vec<u8> {
    let (head_len, middle) = sample_ranges(file_bytes.len() as u64);

    let mut samples = Vec::with_capacity(SAMPLE_SIZE * 2);
    samples.extend_from_slice(&file_bytes[..head_len]);

    if let Some((mid_offset, mid_len)) = middle {
        let start = usize::try_from(mid_offset).unwrap();
        let window = &file_bytes[start..start + mid_len];
        samples.extend_from_slice(window);
    }

    samples
}
/// Analyses `file` through a 64 KiB buffered reader.
///
/// UTF-16 input is handed to `encoding::process_utf16_stream`; everything
/// else is split into lines by `BufLineSource` and passed to
/// `line_counter::process_lines`.
///
/// # Errors
///
/// Propagates any error returned by the selected processing routine.
fn process_file_buffered(
    file_path: &Path,
    file: File,
    file_size: u64,
    results: &mut AnalysisResults,
    collect_details: bool,
    language: &'static Language,
    encoding: FileEncoding,
) -> Result<()> {
    // Both paths read through the same kind of reader, so build it once.
    let mut reader = BufReader::with_capacity(64 * 1024, file);

    if encoding::is_utf16(encoding.encoding) {
        return encoding::process_utf16_stream(
            file_path,
            file_size,
            results,
            collect_details,
            language,
            encoding,
            &mut reader,
        );
    }

    let mut source = BufLineSource::new(reader);
    line_counter::process_lines(file_path, file_size, results, collect_details, language, encoding, &mut source)
}
/// Analyses a memory-mapped file.
///
/// UTF-16 input is handed to `encoding::process_utf16_bytes`, after which
/// this function returns `Ok(())`. Everything else is split into lines by
/// `MmapLineSource` and passed to `line_counter::process_lines`.
///
/// # Errors
///
/// Propagates any error returned by `line_counter::process_lines`.
fn process_file_mmap(
    file_path: &Path,
    file_size: u64,
    results: &mut AnalysisResults,
    collect_details: bool,
    language: &'static Language,
    encoding: FileEncoding,
    mmap: &Mmap,
) -> Result<()> {
    let file_bytes: &[u8] = mmap.as_ref();

    if !encoding::is_utf16(encoding.encoding) {
        let mut source = MmapLineSource::new(file_bytes);
        return line_counter::process_lines(
            file_path,
            file_size,
            results,
            collect_details,
            language,
            encoding,
            &mut source,
        );
    }

    encoding::process_utf16_bytes(file_path, file_size, results, collect_details, language, encoding, file_bytes);
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Drains `source` and returns an owned copy of every line, in the order
    /// the lines were yielded.
    fn collect_lines<S: LineSource>(source: &mut S) -> Vec<Vec<u8>> {
        let mut lines = Vec::new();
        source.for_each_line(&mut |line: &[u8]| lines.push(line.to_vec())).unwrap();
        lines
    }

    /// Lines produced by a `MmapLineSource` over `data`.
    fn mmap_lines(data: &[u8]) -> Vec<Vec<u8>> {
        collect_lines(&mut MmapLineSource::new(data))
    }

    /// Lines produced by a `BufLineSource` reading `data` through a
    /// `BufReader`.
    fn buf_lines(data: &[u8]) -> Vec<Vec<u8>> {
        let reader = std::io::BufReader::new(std::io::Cursor::new(data));
        collect_lines(&mut BufLineSource::new(reader))
    }

    #[test]
    fn test_sample_ranges_small_file() {
        let (start_len, mid) = sample_ranges(100);
        assert_eq!(start_len, 100);
        assert!(mid.is_none());
    }

    #[test]
    fn test_sample_ranges_exact_sample_size() {
        let (start_len, mid) = sample_ranges(SAMPLE_SIZE as u64);
        assert_eq!(start_len, SAMPLE_SIZE);
        assert!(mid.is_none());
    }

    #[test]
    fn test_sample_ranges_large_file() {
        let file_size = 100_000u64;
        let (start_len, mid) = sample_ranges(file_size);
        assert_eq!(start_len, SAMPLE_SIZE);
        let (mid_offset, mid_len) = mid.expect("should have mid range");
        assert!(mid_offset > 0);
        assert!(mid_offset < file_size - SAMPLE_SIZE as u64);
        assert_eq!(mid_len, SAMPLE_SIZE);
        assert_eq!(mid_offset % 2, 0);
    }

    #[test]
    fn test_sample_ranges_odd_midpoint_is_rounded_down() {
        // (SAMPLE_SIZE + 6 - SAMPLE_SIZE) / 2 == 3, which is odd, so the
        // rounding branch has to bring the offset down to 2.
        let (start_len, mid) = sample_ranges(SAMPLE_SIZE as u64 + 6);
        assert_eq!(start_len, SAMPLE_SIZE);
        assert_eq!(mid, Some((2, SAMPLE_SIZE)));
    }

    #[test]
    fn test_sample_from_slice_small() {
        let data: Vec<u8> = (0..100).collect();
        let samples = sample_from_slice(&data);
        assert_eq!(samples.len(), 100);
        assert_eq!(&samples[..], &data[..]);
    }

    #[test]
    fn test_sample_from_slice_large() {
        let data: Vec<u8> = (0u8..=255).cycle().take(10_000).collect();
        let samples = sample_from_slice(&data);
        assert!(samples.len() > SAMPLE_SIZE);
        assert!(samples.len() <= SAMPLE_SIZE * 2);
        assert_eq!(&samples[..SAMPLE_SIZE], &data[..SAMPLE_SIZE]);

        // The remainder of the sample must be exactly the middle window.
        let (_, mid) = sample_ranges(data.len() as u64);
        let (mid_offset, mid_len) = mid.expect("should have mid range");
        let mid_offset = usize::try_from(mid_offset).unwrap();
        assert_eq!(&samples[SAMPLE_SIZE..], &data[mid_offset..mid_offset + mid_len]);
    }

    #[test]
    fn test_mmap_line_source_empty() {
        assert!(mmap_lines(b"").is_empty());
    }

    #[test]
    fn test_mmap_line_source_single_line_no_newline() {
        assert_eq!(mmap_lines(b"hello"), vec![b"hello".to_vec()]);
    }

    #[test]
    fn test_mmap_line_source_single_line_with_newline() {
        assert_eq!(mmap_lines(b"hello\n"), vec![b"hello\n".to_vec()]);
    }

    #[test]
    fn test_mmap_line_source_multiple_lines() {
        let expected = vec![b"line1\n".to_vec(), b"line2\n".to_vec(), b"line3".to_vec()];
        assert_eq!(mmap_lines(b"line1\nline2\nline3"), expected);
    }

    #[test]
    fn test_mmap_line_source_crlf() {
        let expected = vec![b"line1\r\n".to_vec(), b"line2\r\n".to_vec()];
        assert_eq!(mmap_lines(b"line1\r\nline2\r\n"), expected);
    }

    #[test]
    fn test_buf_line_source_empty() {
        assert!(buf_lines(b"").is_empty());
    }

    #[test]
    fn test_buf_line_source_multiple_lines() {
        let expected = vec![b"line1\n".to_vec(), b"line2\n".to_vec(), b"line3".to_vec()];
        assert_eq!(buf_lines(b"line1\nline2\nline3"), expected);
    }
}