use {
super::*,
memmap2::Mmap,
std::{
convert::TryInto,
fmt,
io,
path::Path,
},
};
#[derive(Clone)]
pub struct Needle {
bytes: Box<[u8]>,
max_file_size: usize,
}
impl fmt::Debug for Needle {
fn fmt(
&self,
f: &mut fmt::Formatter<'_>,
) -> fmt::Result {
f.debug_struct("Needle")
.field("bytes", &self.bytes)
.finish_non_exhaustive()
}
}
impl Needle {
pub fn new(
pat: &str,
max_file_size: usize,
) -> Self {
let bytes = pat.as_bytes().to_vec().into_boxed_slice();
Self {
bytes,
max_file_size,
}
}
pub fn is_empty(&self) -> bool {
self.bytes.is_empty()
}
pub fn as_str(&self) -> &str {
unsafe { std::str::from_utf8_unchecked(&self.bytes) }
}
fn find_naive_1(
&self,
hay: &Mmap,
) -> Option<usize> {
let n = self.bytes[0];
hay.iter().position(|&b| b == n)
}
fn find_naive_2(
&self,
mut pos: usize,
hay: &Mmap,
) -> Option<usize> {
let max_pos = hay.len() - 2;
let b0 = self.bytes[0];
let b1 = self.bytes[1];
unsafe {
while pos <= max_pos {
if *hay.get_unchecked(pos) == b0 && *hay.get_unchecked(pos + 1) == b1 {
return Some(pos);
}
pos += 1;
}
}
None
}
fn find_naive_3(
&self,
mut pos: usize,
hay: &Mmap,
) -> Option<usize> {
let max_pos = hay.len() - 3;
let b0 = self.bytes[0];
let b1 = self.bytes[1];
let b2 = self.bytes[2];
unsafe {
while pos <= max_pos {
if *hay.get_unchecked(pos) == b0
&& *hay.get_unchecked(pos + 1) == b1
&& *hay.get_unchecked(pos + 2) == b2
{
return Some(pos);
}
pos += 1;
}
}
None
}
fn find_naive_4(
&self,
mut pos: usize,
hay: &Mmap,
) -> Option<usize> {
let max_pos = hay.len() - 4;
let needle = u32::from_ne_bytes((&*self.bytes).try_into().unwrap());
while pos <= max_pos {
if u32::from_ne_bytes((&hay[pos..pos + 4]).try_into().unwrap()) == needle {
return Some(pos);
}
pos += 1;
}
None
}
fn find_naive_6(
&self,
mut pos: usize,
hay: &Mmap,
) -> Option<usize> {
let max_pos = hay.len() - 6;
let b0 = self.bytes[0];
let b1 = self.bytes[1];
let b2 = self.bytes[2];
let b3 = self.bytes[3];
let b4 = self.bytes[4];
let b5 = self.bytes[5];
unsafe {
while pos <= max_pos {
if *hay.get_unchecked(pos) == b0
&& *hay.get_unchecked(pos + 1) == b1
&& *hay.get_unchecked(pos + 2) == b2
&& *hay.get_unchecked(pos + 3) == b3
&& *hay.get_unchecked(pos + 4) == b4
&& *hay.get_unchecked(pos + 5) == b5
{
return Some(pos);
}
pos += 1;
}
}
None
}
fn is_at_pos(
&self,
hay_stack: &Mmap,
pos: usize,
) -> bool {
unsafe {
for (i, b) in self.bytes.iter().enumerate() {
if hay_stack.get_unchecked(i + pos) != b {
return false;
}
}
}
true
}
fn find_naive(
&self,
mut pos: usize,
hay: &Mmap,
) -> Option<usize> {
let max_pos = hay.len() - self.bytes.len();
while pos <= max_pos {
if self.is_at_pos(hay, pos) {
return Some(pos);
}
pos += 1;
}
None
}
fn search_mmap(
&self,
hay: &Mmap,
) -> ContentSearchResult {
if hay.len() < self.bytes.len() {
return ContentSearchResult::NotFound;
}
#[cfg(not(any(target_family = "windows", target_os = "android")))]
unsafe {
libc::posix_madvise(
hay.as_ptr() as *mut std::ffi::c_void,
hay.len(),
libc::POSIX_MADV_SEQUENTIAL,
);
}
let pos = match self.bytes.len() {
1 => self.find_naive_1(hay),
2 => self.find_naive_2(0, hay),
3 => self.find_naive_3(0, hay),
4 => self.find_naive_4(0, hay),
6 => self.find_naive_6(0, hay),
_ => self.find_naive(0, hay),
};
pos.map_or(ContentSearchResult::NotFound, |pos| {
ContentSearchResult::Found { pos }
})
}
pub fn search<P: AsRef<Path>>(
&self,
hay_path: P,
) -> io::Result<ContentSearchResult> {
super::get_mmap_if_suitable(hay_path, self.max_file_size).map(|om| {
om.map_or(ContentSearchResult::NotSuitable, |hay| {
self.search_mmap(&hay)
})
})
}
pub fn get_match<P: AsRef<Path>>(
&self,
hay_path: P,
desired_len: usize,
) -> Option<ContentMatch> {
let Ok(hay) = get_mmap(hay_path) else {
return None;
};
match self.search_mmap(&hay) {
ContentSearchResult::Found { pos } => {
Some(ContentMatch::build(&hay, pos, self.as_str(), desired_len))
}
_ => None,
}
}
}
#[cfg(test)]
mod content_search_tests {
use super::*;
#[test]
fn test_found() -> Result<(), io::Error> {
let needle = Needle::new("inception", 1_000_000);
let res = needle.search("src/content_search/needle.rs")?;
assert!(res.is_found());
Ok(())
}
}