use crate::ngram::vocabulary::{encode_ngram_key_lockfree, SharedConcurrentVocab};
use crate::sources::google_books::NgramStorage;
/// Dictionary view over a shared vocabulary and a borrowed n-gram storage.
///
/// The vocabulary is held by value, while the storage is borrowed for `'a`,
/// so a dictionary cannot outlive the storage it was built from.
pub struct VocabularyDictionary<'a> {
/// Vocabulary handle used for membership checks, size, and key encoding.
vocabulary: SharedConcurrentVocab,
/// Borrowed n-gram storage that is queried for per-word counts.
storage: &'a NgramStorage,
}
impl<'a> VocabularyDictionary<'a> {
pub fn new(vocabulary: SharedConcurrentVocab, storage: &'a NgramStorage) -> Self {
Self {
vocabulary,
storage,
}
}
pub fn contains(&self, word: &str) -> bool {
self.vocabulary.get_index(word).is_some()
}
pub fn frequency(&self, word: &str) -> Option<u64> {
self.vocabulary.get_index(word)?;
let key = encode_ngram_key_lockfree(&[word], &self.vocabulary);
self.storage.get(&key)
}
pub fn len(&self) -> u64 {
self.vocabulary.len() as u64
}
pub fn is_empty(&self) -> bool {
self.vocabulary.len() == 0
}
pub fn vocabulary(&self) -> &SharedConcurrentVocab {
&self.vocabulary
}
pub fn storage(&self) -> &NgramStorage {
self.storage
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ngram::vocabulary::open_or_create_concurrent_vocabulary_lockfree;
    use crate::sources::google_books::NgramStorage;
    use tempfile::TempDir;

    /// Runs `scenario` against a freshly created vocabulary and a single-trie
    /// storage wired to it, both backed by files inside a temporary directory
    /// that stays alive until the scenario returns.
    fn with_fresh_store(scenario: impl FnOnce(SharedConcurrentVocab, &NgramStorage)) {
        let dir = TempDir::new().expect("Failed to create temp dir");
        let vocab_path = dir.path().join("vocab.artrie");
        let trie_path = dir.path().join("ngrams.artrie");

        let vocabulary = open_or_create_concurrent_vocabulary_lockfree(&vocab_path)
            .expect("Failed to create vocabulary");
        let storage =
            NgramStorage::create_single_trie_with_vocabulary(&trie_path, Some(vocabulary.clone()))
                .expect("Failed to create storage");

        scenario(vocabulary, &storage);
    }

    /// Stored unigrams are visible through the dictionary with their counts.
    #[test]
    fn test_vocabulary_dictionary() {
        with_fresh_store(|vocabulary, storage| {
            for (word, count) in [("hello", 100), ("world", 50), ("test", 25)] {
                storage.store_tokens(&[word], count).expect("Failed to store");
            }
            storage
                .sync_vocabulary()
                .expect("Failed to sync vocabulary");

            let dict = VocabularyDictionary::new(vocabulary, storage);

            // Stored words: present, with the count they were stored under.
            assert!(dict.contains("hello"));
            assert_eq!(dict.frequency("hello"), Some(100));
            assert!(dict.contains("world"));
            assert_eq!(dict.frequency("world"), Some(50));
            assert!(dict.contains("test"));
            assert_eq!(dict.frequency("test"), Some(25));

            // A word that was never stored: absent, with no count.
            assert!(!dict.contains("nonexistent"));
            assert_eq!(dict.frequency("nonexistent"), None);

            assert_eq!(dict.len(), 3);
            assert!(!dict.is_empty());
        });
    }

    /// A count is reported only for words that are in the vocabulary and
    /// also have a stored unigram entry.
    #[test]
    fn vocabulary_query_frequency_requires_vocabulary_and_storage_count() {
        with_fresh_store(|vocabulary, storage| {
            storage
                .store_tokens(&["known"], 11)
                .expect("Failed to store unigram");
            storage
                .sync_vocabulary()
                .expect("Failed to sync vocabulary");

            // Inserted straight into the vocabulary, so no count was stored.
            vocabulary
                .insert("vocab_only")
                .expect("Failed to insert vocabulary-only word");

            let dict = VocabularyDictionary::new(vocabulary, storage);

            assert!(dict.contains("known"));
            assert_eq!(dict.frequency("known"), Some(11));

            assert!(dict.contains("vocab_only"));
            assert_eq!(
                dict.frequency("vocab_only"),
                None,
                "frequency requires both vocabulary membership and a unigram count"
            );

            assert!(!dict.contains("missing"));
            assert_eq!(dict.frequency("missing"), None);
        });
    }

    /// A dictionary over an untouched vocabulary and storage is empty.
    #[test]
    fn test_vocabulary_dictionary_empty() {
        with_fresh_store(|vocabulary, storage| {
            let dict = VocabularyDictionary::new(vocabulary, storage);

            assert!(dict.is_empty());
            assert!(!dict.contains("anything"));
            assert_eq!(dict.frequency("anything"), None);
        });
    }
}