pub mod weights;
mod covering;
pub use covering::{build_covering, build_covering_inner};
use weights::BIGRAM_WEIGHTS;
use xxhash_rust::xxh64::xxh64;
pub const BOUNDARY_THRESHOLD: u16 = 28000;
pub const MIN_GRAM_LEN: usize = 3;
pub const MAX_GRAM_LEN: usize = 128;
#[inline]
pub fn gram_hash(gram: &[u8]) -> u64 {
xxh64(gram, 0)
}
#[inline]
fn is_forced_boundary(byte: u8) -> bool {
matches!(
byte,
b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C
| b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'<' | b'>'
| b'.' | b',' | b':' | b';'
| b'=' | b'+' | b'-' | b'*' | b'/' | b'%'
| b'!' | b'&' | b'|' | b'^' | b'~'
| b'"' | b'\'' | b'`'
| b'@' | b'#' | b'$' | b'?'
| b'_'
| 0x00..=0x08 | 0x0E..=0x1F | 0x7F
)
}
fn boundary_positions(bytes: &[u8]) -> Vec<usize> {
let n = bytes.len();
let mut positions = Vec::with_capacity(n / 4);
positions.push(0);
for i in 1..n {
if is_forced_boundary(bytes[i]) || is_forced_boundary(bytes[i - 1]) {
positions.push(i);
continue;
}
if bytes[i - 1].is_ascii_lowercase() && bytes[i].is_ascii_uppercase() {
positions.push(i);
continue;
}
let left = bytes[i - 1].to_ascii_lowercase();
let right = bytes[i].to_ascii_lowercase();
let idx = (left as usize) << 8 | (right as usize);
if BIGRAM_WEIGHTS[idx] >= BOUNDARY_THRESHOLD {
positions.push(i);
}
}
if n > 0 {
positions.push(n);
}
positions.dedup();
positions
}
#[cfg(test)]
fn boundary_positions_lower(bytes: &[u8]) -> Vec<usize> {
let n = bytes.len();
let mut positions = Vec::with_capacity(n / 4);
positions.push(0);
for i in 1..n {
if is_forced_boundary(bytes[i]) || is_forced_boundary(bytes[i - 1]) {
positions.push(i);
continue;
}
let idx = (bytes[i - 1] as usize) << 8 | (bytes[i] as usize);
if BIGRAM_WEIGHTS[idx] >= BOUNDARY_THRESHOLD {
positions.push(i);
}
}
if n > 0 {
positions.push(n);
}
positions.dedup();
positions
}
fn with_boundary_positions_lower<F, R>(bytes: &[u8], f: F) -> R
where
F: FnOnce(&[usize]) -> R,
{
thread_local! {
static BUF: std::cell::RefCell<Vec<usize>> = std::cell::RefCell::new(Vec::with_capacity(256));
}
BUF.with(|buf| {
let mut buf = buf.borrow_mut();
buf.clear();
const MIN_CAPACITY: usize = 256;
let needed = bytes.len() / 4 + 16;
if buf.capacity() > MIN_CAPACITY.max(needed * 4) {
buf.shrink_to(MIN_CAPACITY.max(needed));
}
let n = bytes.len();
buf.push(0);
for i in 1..n {
if is_forced_boundary(bytes[i]) || is_forced_boundary(bytes[i - 1]) {
buf.push(i);
continue;
}
let idx = (bytes[i - 1] as usize) << 8 | (bytes[i] as usize);
if BIGRAM_WEIGHTS[idx] >= BOUNDARY_THRESHOLD {
buf.push(i);
}
}
if n > 0 {
buf.push(n);
}
buf.dedup();
f(&buf)
})
}
pub fn build_all(input: &[u8]) -> Vec<u64> {
if input.is_empty() {
return Vec::new();
}
let lower: Vec<u8> = input.iter().map(|b| b.to_ascii_lowercase()).collect();
with_boundary_positions_lower(&lower, |lower_boundaries| {
let mut hashes = Vec::new();
append_grams_for_boundaries(&mut hashes, &lower, lower_boundaries);
let camel_boundaries = camel_case_boundaries(input);
if camel_boundaries.is_empty() {
return hashes;
}
let merged_boundaries = merge_boundaries(lower_boundaries, &camel_boundaries);
append_new_grams_for_boundaries(&mut hashes, &lower, lower_boundaries, &merged_boundaries);
hashes
})
}
fn append_grams_for_boundaries(hashes: &mut Vec<u64>, lower: &[u8], boundaries: &[usize]) {
for w in boundaries.windows(2) {
let (start, end) = (w[0], w[1]);
let span = end - start;
if (MIN_GRAM_LEN..=MAX_GRAM_LEN).contains(&span) {
hashes.push(gram_hash(&lower[start..end]));
}
}
}
fn camel_case_boundaries(bytes: &[u8]) -> Vec<usize> {
let mut positions = Vec::new();
for i in 1..bytes.len() {
if bytes[i - 1].is_ascii_lowercase() && bytes[i].is_ascii_uppercase() {
positions.push(i);
}
}
positions
}
fn merge_boundaries(base: &[usize], extra: &[usize]) -> Vec<usize> {
let mut merged = Vec::with_capacity(base.len() + extra.len());
let mut base_i = 0;
let mut extra_i = 0;
while base_i < base.len() || extra_i < extra.len() {
let next = match (base.get(base_i), extra.get(extra_i)) {
(Some(&base_pos), Some(&extra_pos)) if base_pos <= extra_pos => {
base_i += 1;
if base_pos == extra_pos {
extra_i += 1;
}
base_pos
}
(Some(&_base_pos), Some(&extra_pos)) => {
extra_i += 1;
extra_pos
}
(Some(&base_pos), None) => {
base_i += 1;
base_pos
}
(None, Some(&extra_pos)) => {
extra_i += 1;
extra_pos
}
(None, None) => break,
};
if merged.last().copied() != Some(next) {
merged.push(next);
}
}
merged
}
fn append_new_grams_for_boundaries(
hashes: &mut Vec<u64>,
lower: &[u8],
base_boundaries: &[usize],
merged_boundaries: &[usize],
) {
let mut base_windows = base_boundaries.windows(2);
let mut current_base = base_windows.next();
for merged in merged_boundaries.windows(2) {
let merged_pair = (merged[0], merged[1]);
while let Some(base) = current_base {
let base_pair = (base[0], base[1]);
if base_pair < merged_pair {
current_base = base_windows.next();
} else {
break;
}
}
if current_base
.map(|base| (base[0], base[1]) == merged_pair)
.unwrap_or(false)
{
continue;
}
let span = merged_pair.1 - merged_pair.0;
if (MIN_GRAM_LEN..=MAX_GRAM_LEN).contains(&span) {
hashes.push(gram_hash(&lower[merged_pair.0..merged_pair.1]));
}
}
}
#[cfg(test)]
mod tests;