/// Maps string tokens to index values for BM25 processing.
///
/// Implementors choose the strategy, e.g. hashing the token or looking it up
/// in a token-to-id dictionary.
pub trait Bm25TokenIndexer {
/// Index representation produced for a token, e.g. `u64` for a hash-based
/// indexer or `usize` for a dictionary-based one.
type Bm25TokenIndex;
// Required method
/// Converts `token` into its `Self::Bm25TokenIndex` representation.
fn index(&self, token: &str) -> Self::Bm25TokenIndex;
}Expand description
Trait for mapping tokens to unique indices for efficient BM25 processing.
This trait defines how string tokens are converted to numerical or other indexable representations.
Some indexing strategies include:
- Hash-based: Use hash functions (e.g. Murmur3) to map tokens to integers
- Dictionary-based: Maintain a mapping from tokens to sequential indices
§Type Parameters
`Bm25TokenIndex` - The type used to represent token indices. This should typically implement `Hash`, `Eq`, `Clone`, and other traits required for use as map keys.
§Examples
use bm25_vectorizer::Bm25TokenIndexer;
use std::collections::HashMap;
// Hash-based token indexer
struct HashTokenIndexer;
impl Bm25TokenIndexer for HashTokenIndexer {
type Bm25TokenIndex = u64;
fn index(&self, token: &str) -> Self::Bm25TokenIndex {
use std::hash::{Hash, Hasher};
// Note: Better hashing algorithms can be used (e.g. Murmur3)
use std::collections::hash_map::DefaultHasher;
let mut hasher = DefaultHasher::new();
token.hash(&mut hasher);
hasher.finish()
}
}
// Dictionary-based token indexer
/// Token indexer that assigns sequential ids (0, 1, 2, ...) to tokens in the
/// order they are first seen.
///
/// `Bm25TokenIndexer::index` only receives `&self`, so the vocabulary is kept
/// behind interior mutability. `RefCell`/`Cell` make this type `!Sync`; swap
/// them for a `Mutex` if the indexer has to be shared across threads.
struct DictionaryIndexer {
    /// Vocabulary: token -> id assigned when the token was first indexed.
    token_to_id: std::cell::RefCell<HashMap<String, usize>>,
    /// Id that the next previously unseen token will receive.
    next_id: std::cell::Cell<usize>,
}

impl DictionaryIndexer {
    /// Creates an indexer with an empty vocabulary; the first token gets id 0.
    fn new() -> Self {
        Self {
            token_to_id: std::cell::RefCell::new(HashMap::new()),
            next_id: std::cell::Cell::new(0),
        }
    }
}

impl Bm25TokenIndexer for DictionaryIndexer {
    type Bm25TokenIndex = usize;

    /// Returns the id already assigned to `token`, or registers the token
    /// under the next free id and returns that.
    ///
    /// Distinct tokens never share an id, and a token keeps its id for the
    /// lifetime of the indexer.
    fn index(&self, token: &str) -> Self::Bm25TokenIndex {
        let mut vocabulary = self.token_to_id.borrow_mut();

        // Look up by `&str` first so known tokens do not allocate a `String`.
        if let Some(&id) = vocabulary.get(token) {
            return id;
        }

        let id = self.next_id.get();
        self.next_id.set(id + 1);
        vocabulary.insert(token.to_owned(), id);
        id
    }
}
Required Associated Types§
Sourcetype Bm25TokenIndex
type Bm25TokenIndex
The type used to represent token indices. This associated type defines what kind of index representation is used for tokens.
Required Methods§
Sourcefn index(&self, token: &str) -> Self::Bm25TokenIndex
fn index(&self, token: &str) -> Self::Bm25TokenIndex
Maps a token string to its corresponding index representation.
This method converts a string token into the index type defined by
Bm25TokenIndex.
§Arguments
`token` - The string token to be indexed
§Returns
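An index of type `Self::Bm25TokenIndex` that identifies the token. The same token always maps to the same index; note that hash-based indexers can, in rare cases, map two distinct tokens to the same index.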
§Examples
use bm25_vectorizer::Bm25TokenIndexer;
use std::hash::{Hash, Hasher, DefaultHasher};
/// Zero-sized indexer whose index for a token is the token's `DefaultHasher` digest.
struct HashIndexer;

impl Bm25TokenIndexer for HashIndexer {
    type Bm25TokenIndex = u64;

    /// Feeds `token` into a new `DefaultHasher` and returns the finished hash.
    fn index(&self, token: &str) -> Self::Bm25TokenIndex {
        let mut state = DefaultHasher::default();
        Hash::hash(token, &mut state);
        Hasher::finish(&state)
    }
}
// Index the same token twice and compare the two results.
let indexer = HashIndexer;
let (first, second) = (indexer.index("hello"), indexer.index("hello"));
assert_eq!(first, second); // Same token, same index