Struct GramDex

Source

pub struct GramDex { /* private fields */ }

Expand description

A minimal grams->docs candidate index.

Implementations§

Source §

impl GramDex

Source

pub fn new() -> Self

Create an empty index.

Source

pub fn add_document(&mut self, doc_id: DocId, grams: &[String])

Insert grams for a document id.

Source

pub fn add_document_char_kgrams( &mut self, doc_id: DocId, text: &str, k: usize, ) -> Result<(), Error>

Insert Unicode-scalar k-grams for a document id.

Source

pub fn add_document_trigrams(&mut self, doc_id: DocId, text: &str)

Insert Unicode-scalar trigrams for a document id.

§Examples

let mut ix = gramdex::GramDex::new();
ix.add_document_trigrams(0, "hello");
ix.add_document_trigrams(1, "world");
assert_eq!(ix.num_docs(), 2);

Source

pub fn num_docs(&self) -> u32

Number of indexed documents.

Source

pub fn df(&self, gram: &str) -> u32

Document frequency for a gram (number of docs containing it).

Source

pub fn document_ids(&self) -> impl Iterator<Item = DocId> + '_

Iterate all known document ids.

Source

pub fn candidates_union(&self, query_grams: &[String]) -> Vec<DocId> ⓘ

Candidate docs: union of docs that share at least one query gram.

This is intentionally permissive: it produces no false negatives with respect to the given gram set, but it may include many false positives, so callers should verify each candidate.

Source

pub fn candidates_union_char_kgrams( &self, text: &str, k: usize, ) -> Result<Vec<DocId>, Error>

Convenience: union candidates for Unicode-scalar k-grams of text.

Source

pub fn candidates_union_char_kgrams_bounded( &self, text: &str, k: usize, cfg: PlannerConfig, ) -> Result<Vec<DocId>, Error>

Convenience: bounded union candidates for Unicode-scalar k-grams of text.

Source

pub fn candidates_union_trigrams(&self, text: &str) -> Vec<DocId> ⓘ

Convenience: union candidates for Unicode-scalar trigrams of text.

§Examples

let mut ix = gramdex::GramDex::new();
ix.add_document_trigrams(0, "hello");
ix.add_document_trigrams(1, "yellow");

let cands = ix.candidates_union_trigrams("mellow");
assert!(cands.contains(&1)); // "yellow" shares trigrams with "mellow"

Source

pub fn candidates_union_trigrams_bounded( &self, text: &str, cfg: PlannerConfig, ) -> Vec<DocId> ⓘ

Convenience: bounded union candidates for Unicode-scalar trigrams of text.

Source

pub fn candidates_union_scored( &self, query_grams: &[String], ) -> Vec<(DocId, u32)>

Candidate docs with an overlap count: number of distinct query grams that appear in each document.

This is useful for cheap pruning before expensive verification: requiring an overlap count of at least 2 (i.e. min_shared = 2, as in candidates_union_min_shared) often removes many candidates compared with the plain union.

§Examples

let mut ix = gramdex::GramDex::new();
ix.add_document_trigrams(0, "abcdef");
ix.add_document_trigrams(1, "abcxyz");

let grams = gramdex::char_trigrams("abcde");
let scored = ix.candidates_union_scored(&grams);
// doc 0 shares more trigrams than doc 1
assert!(scored[0].0 == 0 && scored[0].1 >= 2);

Source

pub fn candidates_union_min_shared( &self, query_grams: &[String], min_shared: u32, ) -> Vec<DocId> ⓘ

Candidate docs that share at least min_shared distinct query grams.

Source

pub fn candidates_union_trigrams_min_shared( &self, text: &str, min_shared: u32, ) -> Vec<DocId> ⓘ

Convenience: candidates that share at least min_shared trigrams with text.

Source

pub fn plan_candidates_union( &self, query_grams: &[String], cfg: PlannerConfig, ) -> CandidatePlan

Plan candidate generation with bailout thresholds.

This uses the actual union size (not the loose upper bound \( \sum_g \mathrm{df}(g) \)), which avoids false ScanAll decisions when grams overlap heavily. It is slightly more work than a pure df-based bound, but it reuses the work needed to produce the final candidate set.

Source