libgrammstein 0.1.0

Hybrid language model (N-gram + Embeddings) for WFST text correction
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
//! N-gram trie wrapper over liblevenshtein dictionary backends.
//!
//! This module provides a high-level interface for storing and querying n-grams
//! using liblevenshtein-rust's dictionary implementations.
//!
//! # Key Encoding
//!
//! N-gram keys can be encoded in two ways:
//!
//! 1. **Legacy (pipe-separated)**: `"the|quick|brown"` - Simple but vulnerable to
//!    corruption if tokens contain the pipe character.
//!
//! 2. **Vocabulary-indexed (PUA)**: Each word maps to a Unicode Private Use Area
//!    character, and n-gram keys are sequences of these characters. This eliminates
//!    the delimiter bug entirely.
//!
//! New code should use vocabulary-indexed encoding via [`crate::ngram::vocabulary`].

use super::entry::NgramEntry;
use libdictenstein::persistent_artrie::SharedTrieAccess;
use liblevenshtein::dictionary::{MappedDictionaryNode, MutableMappedDictionary};
use std::marker::PhantomData;
use std::sync::Arc;

/// Trait for dictionaries that support iteration over (key, value) pairs.
///
/// This is used for portable serialization, allowing models to be saved
/// and loaded without requiring the dictionary to implement serde traits.
///
/// Implementors must also be a [`MutableMappedDictionary`] whose value type
/// is [`NgramEntry`] (enforced by the supertrait bound).
pub trait IterableDictionary: MutableMappedDictionary<Value = NgramEntry> {
    /// Iterate over all (key, value) pairs in the dictionary.
    ///
    /// Items are yielded as owned `(String, NgramEntry)` pairs. The boxed
    /// iterator's lifetime is tied to `&self`, so an implementation may either
    /// borrow from the dictionary or hand back an owned snapshot.
    fn iter_all(&self) -> Box<dyn Iterator<Item = (String, NgramEntry)> + '_>;
}

// Implement IterableDictionary for DynamicDawgChar.
// The backend's own `iter()` is boxed as-is; no key translation is performed here.
impl IterableDictionary
    for liblevenshtein::dictionary::dynamic_dawg_char::DynamicDawgChar<NgramEntry>
{
    fn iter_all(&self) -> Box<dyn Iterator<Item = (String, NgramEntry)> + '_> {
        // Type-erase the concrete iterator so every backend shares one signature.
        Box::new(self.iter())
    }
}

// Implement IterableDictionary for PathMapDictionary.
// The backend's own `iter()` is boxed as-is; no key translation is performed here.
impl IterableDictionary for liblevenshtein::dictionary::pathmap::PathMapDictionary<NgramEntry> {
    fn iter_all(&self) -> Box<dyn Iterator<Item = (String, NgramEntry)> + '_> {
        // Type-erase the concrete iterator so every backend shares one signature.
        Box::new(self.iter())
    }
}

// Implement IterableDictionary for the disk-backed char ARTrie (shared handle).
// `SharedCharARTrie<V> = Arc<PersistentARTrieChar<V>>` (F4 lock-collapse) already
// implements `MutableMappedDictionary<Value = NgramEntry>`, so the supertrait holds;
// this adds the portable-serialization iteration hook so the type can back
// HybridLanguageModel / NgramModel / TrainerBuilder.
impl IterableDictionary for libdictenstein::persistent_artrie_char::SharedCharARTrie<NgramEntry> {
    fn iter_all(&self) -> Box<dyn Iterator<Item = (String, NgramEntry)> + '_> {
        // `iter_with_values()` borrows the lock-free `SharedTrieAccess` read guard;
        // materialize into an owned Vec so the returned iterator does not borrow a
        // dropped guard. The collect is load-bearing (lifetime detach), not a
        // `needless_collect`.
        //
        // NOTE(review): this snapshots every entry into memory before yielding
        // the first one — presumably acceptable for serialization; confirm for
        // very large tries.
        let entries: Vec<(String, NgramEntry)> = self.read().iter_with_values().collect();
        Box::new(entries.into_iter())
    }
}

// Portable iteration for the vocabulary-indexed wrapper. Keys held by the backend
// `D` are varint-encoded latin1 strings of vocabulary indices, so each one is
// decoded to its indices, every index is reverse-looked-up in the vocabulary, and
// the resulting words are joined with the wrapper's delimiter (pgmcp pins it to '|'
// to match LEGACY_NGRAM_SEPARATOR, so the portable keys round-trip through
// NgramTrie's legacy split/join). Keys that decode to nothing, or that reference an
// index the vocabulary does not know, are dropped rather than causing a panic.
impl<D> IterableDictionary for super::vocabulary_indexed::VocabularyIndexedDictionary<D>
where
    D: IterableDictionary,
    D::Node: MappedDictionaryNode<Unit = char>,
{
    fn iter_all(&self) -> Box<dyn Iterator<Item = (String, NgramEntry)> + '_> {
        let separator = self.delimiter().to_string();
        // A single vocabulary read guard serves every reverse lookup below.
        let vocab = self.vocabulary().read();

        let mut reconstructed: Vec<(String, NgramEntry)> = Vec::new();
        for (encoded, entry) in self.backend().iter_all() {
            let indices = super::vocabulary_indexed::decode_key_to_indices(&encoded);
            if indices.is_empty() {
                continue;
            }
            // Collecting into `Option` stops at the first unknown index and
            // yields `None`, which skips the whole key.
            let terms: Option<Vec<_>> = indices
                .into_iter()
                .map(|index| vocab.get_term(index))
                .collect();
            if let Some(terms) = terms {
                reconstructed.push((terms.join(&separator), entry));
            }
        }

        // Release the guard before handing out the owned snapshot.
        drop(vocab);
        Box::new(reconstructed.into_iter())
    }
}

/// Separator used between tokens in legacy n-gram keys.
///
/// # Deprecation Notice
///
/// This encoding scheme is deprecated because it can cause silent data corruption
/// if a token contains the pipe character. For example:
///
/// ```text
/// ["foo|bar", "baz"] → "foo|bar|baz" → ["foo", "bar", "baz"]  // WRONG!
/// ```
///
/// Use [`crate::ngram::vocabulary`] for the new vocabulary-indexed encoding
/// that avoids this issue.
///
/// Crate-internal code that still needs the separator during migration should
/// use `LEGACY_NGRAM_SEPARATOR`, which has the same value but does not emit a
/// deprecation warning.
#[deprecated(
    since = "0.3.0",
    note = "Use vocabulary-indexed encoding via crate::ngram::vocabulary instead. \
            Pipe-separated keys can corrupt data if tokens contain '|'."
)]
pub const NGRAM_SEPARATOR: char = '|';

// Crate-internal, non-deprecated copy of the legacy separator (same value as
// `NGRAM_SEPARATOR`) so internal code can use it during migration without
// triggering deprecation warnings. This is a separate constant, not a re-export.
pub(crate) const LEGACY_NGRAM_SEPARATOR: char = '|';

/// N-gram trie wrapper providing high-level n-gram operations.
///
/// Wraps a dictionary backend (like `DynamicDawgChar` or `PathMapDictionary`)
/// to provide n-gram-specific operations like key encoding and batch updates.
///
/// The backend is held behind an [`Arc`], so cloning an `NgramTrie` shares the
/// same underlying dictionary rather than copying it.
///
/// # Type Parameters
///
/// * `D` - The dictionary backend type, must implement `MutableMappedDictionary<Value = NgramEntry>`
///
/// # Example
///
/// ```ignore
/// use libgrammstein::ngram::{NgramTrie, NgramEntry};
/// use liblevenshtein::dictionary::dynamic_dawg_char::DynamicDawgChar;
///
/// let dict = DynamicDawgChar::<NgramEntry>::new();
/// // The second argument is the maximum n-gram order stored in the trie.
/// let trie = NgramTrie::new(dict, 3);
///
/// trie.insert(&["the", "quick", "brown"]);
/// assert_eq!(trie.get(&["the", "quick", "brown"]).map(|e| e.count()), Some(1));
/// ```
#[derive(serde::Serialize, serde::Deserialize)]
#[serde(bound = "D: serde::Serialize + serde::de::DeserializeOwned")]
pub struct NgramTrie<D>
where
    D: MutableMappedDictionary<Value = NgramEntry>,
{
    /// The underlying dictionary backend.
    dictionary: Arc<D>,

    /// Maximum n-gram order stored in this trie.
    max_order: usize,

    /// Phantom data for type parameter.
    ///
    /// Skipped by serde; it carries no data.
    #[serde(skip)]
    _marker: PhantomData<D>,
}

impl<D> NgramTrie<D>
where
    D: MutableMappedDictionary<Value = NgramEntry>,
{
    /// Create a new n-gram trie wrapping the given dictionary.
    ///
    /// The dictionary is moved into a fresh [`Arc`]; `max_order` is stored as
    /// the maximum n-gram order reported by [`Self::max_order`].
    pub fn new(dictionary: D, max_order: usize) -> Self {
        Self::from_arc(Arc::new(dictionary), max_order)
    }

    /// Create from an existing Arc-wrapped dictionary.
    ///
    /// The resulting trie shares the dictionary with every other holder of
    /// the same `Arc`.
    pub fn from_arc(dictionary: Arc<D>, max_order: usize) -> Self {
        Self {
            dictionary,
            max_order,
            _marker: PhantomData,
        }
    }

    /// Get the maximum n-gram order.
    #[inline]
    pub fn max_order(&self) -> usize {
        self.max_order
    }

    /// Get a reference to the underlying dictionary.
    #[inline]
    pub fn dictionary(&self) -> &D {
        &self.dictionary
    }

    /// Get a clone of the Arc-wrapped dictionary.
    ///
    /// Only the reference count is bumped; the dictionary itself is shared.
    #[inline]
    pub fn dictionary_arc(&self) -> Arc<D> {
        Arc::clone(&self.dictionary)
    }

    /// Encode an n-gram as a dictionary key using legacy pipe-separated format.
    ///
    /// # Deprecation Notice
    ///
    /// This function is deprecated because pipe-separated encoding can cause
    /// silent data corruption if tokens contain the pipe character.
    ///
    /// Use [`crate::ngram::vocabulary::encode_ngram_key`] for the new
    /// vocabulary-indexed encoding.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let key = NgramTrie::<D>::encode_key(&["the", "quick", "brown"]);
    /// assert_eq!(key, "the|quick|brown");
    /// ```
    #[inline]
    #[deprecated(
        since = "0.3.0",
        note = "Use vocabulary::encode_ngram_key() instead. \
                Pipe-separated keys can corrupt data if tokens contain '|'."
    )]
    pub fn encode_key(tokens: &[&str]) -> String {
        Self::encode_key_legacy(tokens)
    }

    /// Encode an n-gram as a dictionary key using legacy pipe-separated format.
    ///
    /// This is the internal implementation used during migration. New code should
    /// use [`crate::ngram::vocabulary::encode_ngram_key`] instead.
    ///
    /// Tokens are joined with `LEGACY_NGRAM_SEPARATOR`; an empty slice yields
    /// an empty string and a single token is returned unchanged.
    #[inline]
    pub(crate) fn encode_key_legacy(tokens: &[&str]) -> String {
        // Encode the separator into a stack buffer instead of allocating a
        // temporary `String` on every call.
        let mut buf = [0u8; 4];
        let separator: &str = LEGACY_NGRAM_SEPARATOR.encode_utf8(&mut buf);
        tokens.join(separator)
    }

    /// Insert or increment an n-gram count.
    ///
    /// If the n-gram exists, increments its count. Otherwise, inserts it with count 1.
    ///
    /// # Note
    ///
    /// This method uses legacy pipe-separated encoding. For vocabulary-indexed
    /// encoding, use [`Self::insert_with_key`] with a key from
    /// [`crate::ngram::vocabulary::encode_ngram_key`].
    ///
    /// # Returns
    ///
    /// `true` if this was a new n-gram (inserted), `false` if it already existed (incremented).
    pub fn insert(&self, tokens: &[&str]) -> bool {
        self.insert_with_key(&Self::encode_key_legacy(tokens))
    }

    /// Insert or increment an n-gram using a pre-encoded key.
    ///
    /// Use this with vocabulary-indexed keys from [`crate::ngram::vocabulary::encode_ngram_key`].
    ///
    /// # Returns
    ///
    /// `true` if this was a new n-gram (inserted), `false` if it already existed (incremented).
    pub fn insert_with_key(&self, key: &str) -> bool {
        self.dictionary
            .update_or_insert(key, NgramEntry::new(1), |entry| entry.increment())
    }

    /// Insert an n-gram with a specific count.
    ///
    /// # Note
    ///
    /// This method uses legacy pipe-separated encoding. For vocabulary-indexed
    /// encoding, use [`Self::insert_with_key_and_count`].
    ///
    /// # Returns
    ///
    /// Whatever the backend's `insert_with_value` reports for the encoded key.
    pub fn insert_with_count(&self, tokens: &[&str], count: u64) -> bool {
        self.insert_with_key_and_count(&Self::encode_key_legacy(tokens), count)
    }

    /// Insert an n-gram with a specific count using a pre-encoded key.
    ///
    /// # Returns
    ///
    /// Whatever the backend's `insert_with_value` reports for `key`.
    pub fn insert_with_key_and_count(&self, key: &str, count: u64) -> bool {
        self.dictionary
            .insert_with_value(key, NgramEntry::new(count))
    }

    /// Get the entry for an n-gram, if it exists.
    ///
    /// # Note
    ///
    /// This method uses legacy pipe-separated encoding. For vocabulary-indexed
    /// encoding, use [`Self::get_by_key`].
    pub fn get(&self, tokens: &[&str]) -> Option<NgramEntry> {
        self.get_by_key(&Self::encode_key_legacy(tokens))
    }

    /// Get the entry for an n-gram using a pre-encoded key.
    pub fn get_by_key(&self, key: &str) -> Option<NgramEntry> {
        self.dictionary.get_value(key)
    }

    /// Check if an n-gram exists in the trie.
    ///
    /// # Note
    ///
    /// This method uses legacy pipe-separated encoding. For vocabulary-indexed
    /// encoding, use [`Self::contains_key`].
    pub fn contains(&self, tokens: &[&str]) -> bool {
        self.contains_key(&Self::encode_key_legacy(tokens))
    }

    /// Check if an n-gram exists in the trie using a pre-encoded key.
    pub fn contains_key(&self, key: &str) -> bool {
        self.dictionary.contains(key)
    }

    /// Get the count for an n-gram, or 0 if it doesn't exist.
    ///
    /// Uses legacy pipe-separated encoding; see [`Self::count_by_key`] for
    /// pre-encoded keys.
    #[inline]
    pub fn count(&self, tokens: &[&str]) -> u64 {
        self.count_by_key(&Self::encode_key_legacy(tokens))
    }

    /// Get the count for an n-gram using a pre-encoded key, or 0 if it doesn't exist.
    #[inline]
    pub fn count_by_key(&self, key: &str) -> u64 {
        self.get_by_key(key).map_or(0, |entry| entry.count())
    }

    /// Update continuation count for an n-gram.
    ///
    /// This is called during the second pass of training to set
    /// the number of unique preceding contexts.
    ///
    /// If the n-gram is not present yet, an entry with count 0 and the given
    /// continuation count is inserted.
    ///
    /// # Note
    ///
    /// This method uses legacy pipe-separated encoding. For vocabulary-indexed
    /// encoding, use [`Self::update_continuation_count_by_key`].
    pub fn update_continuation_count(&self, tokens: &[&str], continuation_count: u32) {
        self.update_continuation_count_by_key(
            &Self::encode_key_legacy(tokens),
            continuation_count,
        );
    }

    /// Update continuation count for an n-gram using a pre-encoded key.
    ///
    /// If `key` is not present yet, an entry with count 0 and the given
    /// continuation count is inserted.
    pub fn update_continuation_count_by_key(&self, key: &str, continuation_count: u32) {
        self.dictionary.update_or_insert(
            key,
            NgramEntry::with_stats(0, continuation_count, 0),
            |entry| entry.set_continuation_count(continuation_count),
        );
    }

    /// Update unique continuations count for an n-gram.
    ///
    /// If the n-gram is not present yet, an entry with count 0 and the given
    /// unique-continuations value is inserted.
    ///
    /// # Note
    ///
    /// This method uses legacy pipe-separated encoding. For vocabulary-indexed
    /// encoding, use [`Self::update_unique_continuations_by_key`].
    pub fn update_unique_continuations(&self, tokens: &[&str], unique_continuations: u32) {
        self.update_unique_continuations_by_key(
            &Self::encode_key_legacy(tokens),
            unique_continuations,
        );
    }

    /// Update unique continuations count for an n-gram using a pre-encoded key.
    ///
    /// If `key` is not present yet, an entry with count 0 and the given
    /// unique-continuations value is inserted.
    pub fn update_unique_continuations_by_key(&self, key: &str, unique_continuations: u32) {
        self.dictionary.update_or_insert(
            key,
            NgramEntry::with_stats(0, 0, unique_continuations),
            |entry| entry.set_unique_continuations(unique_continuations),
        );
    }

    /// Get the total number of n-grams stored.
    ///
    /// Returns `0` if the dictionary doesn't support length queries, so a
    /// result of `0` does not by itself prove the trie is empty.
    pub fn len(&self) -> usize {
        self.dictionary.len().unwrap_or(0)
    }

    /// Check if the trie is empty.
    ///
    /// A dictionary that cannot report its length is treated as empty,
    /// matching [`Self::len`] returning `0` in that case.
    pub fn is_empty(&self) -> bool {
        self.dictionary.len().map_or(true, |len| len == 0)
    }

    /// Iterate over all (key, entry) pairs in the trie.
    ///
    /// This is available when the dictionary implements `IterableDictionary`.
    /// Keys are yielded exactly as the backend's `iter_all` produces them.
    pub fn iter_entries(&self) -> impl Iterator<Item = (String, NgramEntry)> + '_
    where
        D: IterableDictionary,
    {
        self.dictionary.iter_all()
    }
}

// Hand-written so that cloning does not require `D: Clone`: only the `Arc`
// handle is duplicated and the dictionary itself stays shared.
impl<D> Clone for NgramTrie<D>
where
    D: MutableMappedDictionary<Value = NgramEntry>,
{
    fn clone(&self) -> Self {
        let Self {
            dictionary,
            max_order,
            ..
        } = self;
        Self {
            dictionary: Arc::clone(dictionary),
            max_order: *max_order,
            _marker: PhantomData,
        }
    }
}

/// Position-aware 64-bit hash of an n-gram.
///
/// Each token is hashed with its position as the seed, so the same tokens in
/// a different order (e.g., ["a", "b"] vs ["b", "a"]) feed different values
/// into the accumulator. The per-token hashes are folded in with a wrapping
/// add followed by a wrapping multiply, and the upper half of the result is
/// finally XOR-ed into the lower half.
///
/// An empty slice hashes to the folded seed alone.
///
/// Based on MeTTaTron's collision-resistant hashing pattern.
#[inline]
#[allow(dead_code)]
pub fn hash_ngram_key(tokens: &[&str]) -> u64 {
    use crate::util::hash::safe_hash_with_seed;

    const NGRAM_SEED: u64 = 0x6e6772616d5f7365; // "ngram_se"
    const GOLDEN_RATIO: u64 = 0x9e3779b97f4a7c15;

    // Pair every token with its zero-based position, already typed as u64.
    let mixed = tokens
        .iter()
        .zip(0u64..)
        .fold(NGRAM_SEED, |acc, (token, position)| {
            acc.wrapping_add(safe_hash_with_seed(token.as_bytes(), position))
                .wrapping_mul(GOLDEN_RATIO)
        });

    mixed ^ (mixed >> 32)
}

// Unit tests for legacy key encoding, the position-aware hash, and the
// `IterableDictionary` implementations backed by the persistent ARTrie.
#[cfg(test)]
mod tests {
    use super::*;

    /// Legacy encoding joins tokens with '|' and leaves a single token untouched.
    #[test]
    fn test_encode_key_legacy() {
        type Trie = NgramTrie<liblevenshtein::dictionary::pathmap::PathMapDictionary<NgramEntry>>;

        assert_eq!(Trie::encode_key_legacy(&["the"]), "the");
        assert_eq!(Trie::encode_key_legacy(&["the", "quick"]), "the|quick");
        assert_eq!(
            Trie::encode_key_legacy(&["the", "quick", "brown"]),
            "the|quick|brown"
        );
    }

    /// Pins the known defect of the legacy encoding: a token containing '|'
    /// cannot be recovered by splitting the key. The assertions describe the
    /// buggy outcome on purpose.
    #[test]
    fn test_legacy_encoding_pipe_bug() {
        // This test demonstrates the bug that vocabulary-indexed encoding fixes
        type Trie = NgramTrie<liblevenshtein::dictionary::pathmap::PathMapDictionary<NgramEntry>>;

        // A token containing a pipe character
        let tokens = ["foo|bar", "baz"];
        let encoded = Trie::encode_key_legacy(&tokens);

        // When decoded by splitting on pipe, we get the wrong number of tokens!
        let decoded: Vec<_> = encoded.split(LEGACY_NGRAM_SEPARATOR).collect();
        assert_eq!(decoded.len(), 3, "Bug: pipe in token causes wrong split");
        assert_eq!(
            decoded,
            ["foo", "bar", "baz"],
            "Bug: original tokens corrupted"
        );
    }

    /// Swapping token order must change the hash.
    #[test]
    fn test_hash_ngram_key_order_matters() {
        let hash1 = hash_ngram_key(&["a", "b"]);
        let hash2 = hash_ngram_key(&["b", "a"]);
        assert_ne!(
            hash1, hash2,
            "Different orderings should have different hashes"
        );
    }

    /// Hashing the same tokens twice must give the same value.
    #[test]
    fn test_hash_ngram_key_deterministic() {
        let hash1 = hash_ngram_key(&["the", "quick", "brown"]);
        let hash2 = hash_ngram_key(&["the", "quick", "brown"]);
        assert_eq!(hash1, hash2, "Same input should produce same hash");
    }

    // ── IterableDictionary impls for the persistent ARTrie backends ──

    /// Entries written straight into a shared char ARTrie come back from
    /// `iter_all` with their keys and counts intact.
    #[test]
    fn iter_all_shared_char_artrie_roundtrip() {
        use libdictenstein::persistent_artrie_char::{PersistentARTrieChar, SharedCharARTrie};
        use std::collections::HashMap;
        use std::sync::Arc;

        // The trie file lives in a temp dir that is removed when `dir` drops.
        let dir = tempfile::tempdir().expect("tempdir");
        let trie = PersistentARTrieChar::<NgramEntry>::create(dir.path().join("c.artrie"))
            .expect("create counts trie");
        let backend: SharedCharARTrie<NgramEntry> = Arc::new(trie);
        backend.insert_with_value("ab", NgramEntry::new(3));
        backend.insert_with_value("cd", NgramEntry::with_stats(5, 2, 1));

        let got: HashMap<String, u64> = backend.iter_all().map(|(k, v)| (k, v.count())).collect();
        assert_eq!(got.get("ab"), Some(&3));
        assert_eq!(got.get("cd"), Some(&5));
        assert_eq!(got.len(), 2);
    }

    /// The vocabulary-indexed wrapper must turn its index-encoded backend keys
    /// back into the original '|'-joined word strings.
    #[test]
    fn iter_all_vocab_indexed_reconstructs_words() {
        use crate::ngram::vocabulary::create_vocabulary;
        use crate::ngram::vocabulary_indexed::VocabularyIndexedDictionary;
        use libdictenstein::persistent_artrie_char::{PersistentARTrieChar, SharedCharARTrie};
        use std::collections::HashMap;
        use std::sync::Arc;

        let dir = tempfile::tempdir().expect("tempdir");
        let vocab = create_vocabulary(&dir.path().join("v.artrie")).expect("vocab");
        let counts: SharedCharARTrie<NgramEntry> = Arc::new(
            PersistentARTrieChar::<NgramEntry>::create(dir.path().join("c.artrie"))
                .expect("counts"),
        );
        let dict = VocabularyIndexedDictionary::with_delimiter(counts, vocab, '|');

        // Insert via the MutableMappedDictionary surface (splits on '|' →
        // assigns vocab ids → stores latin1 varint keys in the counts trie).
        dict.insert_with_value("the|quick|brown", NgramEntry::new(2));
        dict.insert_with_value("the|lazy", NgramEntry::new(5));

        // iter_all must decode the integer keys back to the exact word strings.
        let got: HashMap<String, u64> = dict.iter_all().map(|(k, v)| (k, v.count())).collect();
        assert_eq!(
            got.get("the|quick|brown"),
            Some(&2),
            "trigram reconstructed"
        );
        assert_eq!(got.get("the|lazy"), Some(&5), "bigram reconstructed");
        assert_eq!(got.len(), 2);
    }

    /// A backend key that references a vocabulary index which was never
    /// assigned is silently dropped by `iter_all`; valid keys are unaffected.
    #[test]
    fn iter_all_vocab_indexed_skips_missing_index() {
        use crate::ngram::vocabulary::{create_vocabulary, encode_varint};
        use crate::ngram::vocabulary_indexed::{
            decode_key_to_indices, VocabularyIndexedDictionary,
        };
        use libdictenstein::persistent_artrie_char::{PersistentARTrieChar, SharedCharARTrie};
        use std::sync::Arc;

        let dir = tempfile::tempdir().expect("tempdir");
        let vocab = create_vocabulary(&dir.path().join("v.artrie")).expect("vocab");
        let counts: SharedCharARTrie<NgramEntry> = Arc::new(
            PersistentARTrieChar::<NgramEntry>::create(dir.path().join("c.artrie"))
                .expect("counts"),
        );
        // Keep a second handle to the counts trie so a raw key can be injected
        // behind the wrapper's back.
        let dict = VocabularyIndexedDictionary::with_delimiter(counts.clone(), vocab, '|');

        // One valid n-gram (assigns vocab ids 1,2), …
        dict.insert_with_value("alpha|beta", NgramEntry::new(1));
        // … plus a forged backend key for index 9999 that was never assigned.
        let mut buf = Vec::new();
        encode_varint(9999, &mut buf);
        let bogus_key: String = buf.iter().map(|&b| b as char).collect(); // latin1
        // Sanity check: the forged key really decodes to the unknown index.
        assert_eq!(decode_key_to_indices(&bogus_key), vec![9999]);
        counts.insert_with_value(&bogus_key, NgramEntry::new(7));

        // The missing-index key must be skipped (no panic), the valid one kept.
        let got: Vec<String> = dict.iter_all().map(|(k, _)| k).collect();
        assert_eq!(got, vec!["alpha|beta".to_string()]);
    }
}