//! Tests for the `syntext` gram tokenizer: `build_all` emits every indexable
//! gram for a byte string, and `build_covering` selects a subset sufficient
//! to cover a query pattern, or `None` when the pattern is not indexable.

use std::collections::HashSet;

use syntext::tokenizer::{build_all, build_covering, gram_hash, MAX_GRAM_LEN, MIN_GRAM_LEN};

/// All gram hashes emitted for `input`, deduplicated.
fn all_hashes(input: &[u8]) -> HashSet<u64> {
    build_all(input).into_iter().collect()
}

/// Covering gram hashes for `input`, or empty when no covering set exists.
fn covering_hashes(input: &[u8]) -> Vec<u64> {
    build_covering(input).unwrap_or_default()
}
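
// A plain snake_case identifier is the baseline case: it must yield grams on
// both the indexing path (build_all) and the query path (build_covering).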
#[test]
fn parse_query_produces_some_grams() {
let all = all_hashes(b"parse_query");
assert!(
!all.is_empty(),
"parse_query must produce at least one indexable gram"
);
    assert!(
        build_covering(b"parse_query").is_some(),
        "parse_query must have a covering set (pattern must be indexable)"
    );
}
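
// Trailing punctuation such as "(" must not suppress the grams of the
// identifier that precedes it.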
#[test]
fn boundary_near_open_paren() {
let all = all_hashes(b"parse_query(");
assert!(
!all.is_empty(),
"parse_query( must produce at least one gram"
);
}

// Smoke test: digits embedded in a token must tokenize without panicking.
// Whether the digits extend or start grams is left unasserted here.
#[test]
fn digits_in_token() {
    let _ = all_hashes(b"user123");
}
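
// Gram hashes must be case-folded: matching is case-insensitive, so all
// three casings of the same identifier must produce identical gram sets.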
#[test]
fn build_all_case_insensitive() {
let lower = all_hashes(b"parse_query");
let upper = all_hashes(b"PARSE_QUERY");
let mixed = all_hashes(b"Parse_Query");
assert_eq!(
lower, upper,
"uppercase must produce same grams as lowercase"
);
assert_eq!(
lower, mixed,
"mixed case must produce same grams as lowercase"
);
}
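
// Case-insensitivity must hold on the query path too, not just in build_all.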
#[test]
fn build_covering_case_insensitive() {
    let lower: HashSet<u64> = covering_hashes(b"parse_query").into_iter().collect();
    let upper: HashSet<u64> = covering_hashes(b"PARSE_QUERY").into_iter().collect();
    assert_eq!(lower, upper, "covering grams must be case-insensitive");
}
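
// A covering set should not repeat grams: duplicate hashes would trigger
// redundant index lookups.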
#[test]
fn covering_no_duplicates() {
    let covering = covering_hashes(b"parse_query_engine");
    let unique: HashSet<u64> = covering.iter().copied().collect();
assert_eq!(
covering.len(),
unique.len(),
"build_covering emitted duplicate hashes"
);
}
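
// A single short token, when it is indexable at all, should need exactly one
// covering gram; the `if let` tolerates `None` rather than asserting
// indexability.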
#[test]
fn single_token_one_gram() {
let covering = build_covering(b"xyz");
if let Some(hashes) = covering {
assert_eq!(
hashes.len(),
1,
"single token must produce exactly 1 covering gram"
);
}
}

// Coverage invariant: every gram build_covering selects for a pattern must
// also be emitted by build_all for the same bytes; otherwise an index lookup
// for that pattern could miss a document that contains it verbatim.
#[test]
fn covering_is_subset_of_all_grams() {
    for input in &[
        b"parse_query" as &[u8],
        b"process_batch",
        b"hello_world",
        b"foo.bar.baz",
        b"x",
        b"ab",
        b"abc",
        b"function_call(",
        b"return_value",
        b"impl_trait",
    ] {
        let all = all_hashes(input);
        for h in covering_hashes(input) {
            assert!(
                all.contains(&h),
                "covering gram not in all-grams for {:?}",
                std::str::from_utf8(input).unwrap_or("<non-utf8>")
            );
        }
    }
}
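
// Degenerate inputs: empty and sub-MIN_GRAM_LEN strings must be handled
// gracefully rather than panicking.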
#[test]
fn empty_input() {
assert!(build_all(b"").is_empty());
assert!(build_covering(b"").is_none());
}

#[test]
fn one_byte() {
    // Below MIN_GRAM_LEN: build_all must not panic, and no covering set exists.
    let _ = build_all(b"x");
    assert!(build_covering(b"x").is_none());
}

#[test]
fn two_bytes() {
    let _ = build_all(b"xy");
    assert!(build_covering(b"xy").is_none());
}

#[test]
fn exactly_min_gram_len() {
    // Exactly MIN_GRAM_LEN bytes: must tokenize without panicking. Whether a
    // bare 3-byte string yields a gram is implementation-defined, so no assert.
    let _ = build_all(b"abc");
}

#[test]
fn all_same_bytes() {
    // A long run of one repeated byte must not panic or loop on either path.
    let _ = build_all(b"aaaaaaaaaaaaaaaa");
    let _ = build_covering(b"aaaaaaaaaaaaaaaa");
}

#[test]
fn input_exactly_max_gram_len() {
    // A single-byte run exactly MAX_GRAM_LEN long sits on the length boundary;
    // it must tokenize without panicking (emission itself is not asserted).
    let input = vec![b'a'; MAX_GRAM_LEN];
    let _ = build_all(&input);
}
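
// One byte past the boundary: a span longer than MAX_GRAM_LEN is not
// representable as a gram and must be dropped, not truncated.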
#[test]
fn input_just_over_max_gram_len() {
    let input = vec![b'a'; MAX_GRAM_LEN + 1];
let all = build_all(&input);
assert!(
all.is_empty(),
"no gram should be emitted for span exceeding MAX_GRAM_LEN"
);
}
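
// Multi-byte UTF-8 input is handled as raw bytes; it must never panic.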
#[test]
fn non_ascii_bytes_do_not_panic() {
let _ = build_all("café_résumé".as_bytes());
let _ = build_covering("café_résumé".as_bytes());
}

#[test]
fn bar_gram_in_foobar() {
    // Smoke test around interior substrings: "foobar" must tokenize without
    // panicking. Whether the interior gram "bar" is emitted is deliberately
    // not asserted, since gram placement is an implementation detail.
    let _ = all_hashes(b"foobar");
}
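
// Grams of different lengths over the same prefix must hash to distinct
// values: length is part of a gram's identity.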
#[test]
fn gram_hash_length_matters() {
let h3 = gram_hash(b"par");
let h4 = gram_hash(b"pars");
let h5 = gram_hash(b"parse");
assert_ne!(h3, h4);
assert_ne!(h4, h5);
assert_ne!(h3, h5);
}
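
// Pin the constant: the boundary tests above assume a trigram floor.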
#[test]
fn min_gram_len_is_three() {
assert_eq!(MIN_GRAM_LEN, 3, "MIN_GRAM_LEN must be 3 (trigram floor)");
}
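
// A snake_case identifier contains a forced boundary at '_', so its covering
// set must span at least two grams.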
#[test]
fn forced_boundary_splits_snake_case() {
let covering = build_covering(b"parse_query").unwrap();
assert!(
covering.len() >= 2,
"parse_query must produce at least 2 covering grams, got {}",
covering.len()
);
}
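
// End-to-end check: for every query that occurs (case-folded) in a document,
// each covering gram of the query must appear among the document's grams.
// A miss here means the index would fail to return a matching document.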
#[test]
fn covering_subset_in_document_context() {
let documents: &[&[u8]] = &[
b"fn parse_query(args: &str) -> Query {",
b"def process_batch(items, config):",
b"import { HashMap } from 'collections';",
b"let result = self.name.to_string();",
b"__init__(self, parse_query_engine)",
b"PARSE_QUERY_MAX_LEN = 4096",
];
    let queries: &[&[u8]] = &[
        b"parse", b"query", b"process", b"batch", b"HashMap", b"result", b"self", b"name",
        b"args", b"items",
    ];
    for doc in documents {
        let all: HashSet<u64> = build_all(doc).into_iter().collect();
        let doc_str = String::from_utf8_lossy(doc).to_ascii_lowercase();
        for q in queries {
            let q_str = std::str::from_utf8(q).unwrap();
            // Only queries that actually occur in the document (after case
            // folding) are required to be covered by its grams.
            if !doc_str.contains(&q_str.to_ascii_lowercase()) {
                continue;
            }
            if let Some(covering) = build_covering(q) {
                for h in &covering {
                    assert!(
                        all.contains(h),
                        "COVERAGE VIOLATION: query={:?} in doc={:?}, gram {:016x} not found",
                        q_str,
                        String::from_utf8_lossy(doc),
                        h
                    );
                }
            }
        }
    }
}