imdb_index/index/names.rs
1use std::cmp;
2use std::collections::{binary_heap, BinaryHeap};
3use std::convert::TryInto;
4use std::fmt;
5use std::fs::File;
6use std::io::{self, Write};
7use std::path::Path;
8use std::str::{self, FromStr};
9use std::time::Instant;
10
11use fnv::FnvHashMap;
12use fst;
13use memmap::Mmap;
14use serde::{Deserialize, Serialize};
15use serde_json;
16
17use crate::error::{Error, Result};
18use crate::index::writer::CursorWriter;
19use crate::scored::{Scored, SearchResults};
20use crate::util::{
21 fst_map_builder_file, fst_map_file, mmap_file, open_file, NiceDuration,
22};
23
24/// The name of the file containing the index configuration.
25///
26/// The index configuration is a JSON file with some metadata about this
27/// index, such as its version, ngram size and aggregate statistics about the
28/// corpus that has been indexed.
29const CONFIG: &str = "names.config.json";
30
31/// The name of the ngram term index.
32///
33/// The ngram term index maps ngrams (fixed size sequences of Unicode
34/// codepoints) to file offsets. Each file offset points to the postings for
35/// the corresponding term.
36const NGRAM: &str = "names.ngram.fst";
37
38/// The name of the postings list index.
39///
40/// The postings list contains an entry for every term in the ngram index.
41/// Each entry corresponds to a list of document/frequency pairs. Namely, each
42/// entry is a DocID and a frequency count indicating how many times the
43/// corresponding term appeared in that document. Each entry in the list is
44/// encoded as a single 32-bit little-endian integer. The high 4 bits
45/// represent the frequency (which is capped at 15, a reasonable number for
46/// indexing short name strings) while the low 28 bits represent the doc id.
47/// The `MAX_DOC_ID` constant below ensures we never use a doc id that won't
48/// fit this encoding scheme.
49///
50/// The average length of all documents represented by the ngram index is
51/// recorded in the configuration file (see `Config`) rather than in this
52/// file. The length is recorded in units of terms, which generally
53/// correspond to the total number of ngrams in a name.
54const POSTINGS: &str = "names.postings.idx";
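
// A quick sketch of the packing scheme described above, with made-up values.
// The real encoding and decoding live in `IndexWriter::finish` and
// `Posting::read` below.
//
//     let docid: u32 = 123;              // must be <= MAX_DOC_ID
//     let freq: u32 = 3;                 // capped at 15 when written
//     let packed = (freq << 28) | docid; // one 32-bit little-endian entry
//     assert_eq!(packed & MAX_DOC_ID, docid);
//     assert_eq!(packed >> 28, freq);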
55
56/// The name of the identifier map index.
57///
58/// This file maps `DocID`s to `NameID`s. It consists of a sequence of
59/// 64-bit little-endian encoded integers, where the length of the sequence
60/// corresponds to the total number of names in the index. Each entry in the
61/// sequence encodes a `NameID`. In other words, the index to this sequence is
62/// a `DocID` and the value at that index is a `NameID`.
63///
64/// The id map is used to map doc ids returned by the postings to name ids
65/// which were provided by the caller. This also permits search to deduplicate
66/// results. That is, we should never return multiple results for the same
67/// NameID, even though we may have indexed multiple names for the same name
68/// id.
69const IDMAP: &str = "names.idmap.idx";
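
// For example (hypothetical value), the `NameID` for `DocID` 7 lives in bytes
// `[7 * 8, 7 * 8 + 8)` of this file, decoded as a little-endian u64. See
// `IndexReader::docid_to_nameid` below for the actual lookup.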
70
71/// The name of the document length index.
72///
73/// This file consists of a sequence of 16-bit little-endian encoded
74/// integers, where the length of the sequence corresponds to the total number
75/// of names in the index. Each entry represents the length, in terms, of each
76/// name.
77///
78/// The lengths are used during scoring to compute a normalization term. This
79/// allows the scoring mechanism to take document length into account.
80const NORMS: &str = "names.norms.idx";
81
82/// The external identifier for every distinct record represented by this name
83/// index. There are no restrictions on name ids, and multiple names may be
84/// indexed that correspond to the same name id.
85///
86/// With respect to IMDb, there is a 1-to-1 correspondence between the records
87/// in title.basics.tsv and the set of NameIDs, even though there may be
88/// multiple names for each record.
89///
90/// For IMDb, this is represented by the byte offset of the corresponding
91/// record in title.basics.tsv. This provides constant time lookup to the full
92/// record. Note, though, that this module knows nothing about such things.
93/// To this module, name ids are opaque identifiers.
94pub type NameID = u64;
95
96/// An internal surrogate identifier for every distinct name in the index. Note
97/// that multiple distinct doc ids can map to the same name id. For example, if
98/// a name has multiple distinct forms, then they each get their own docid, but
99/// each of the docids will map to the same name id.
100///
101/// The reason why we need DocID in addition to NameID is twofold:
102///
103/// 1. Firstly, we'd like each name variant to have its own term frequency
104/// count. If every variant shared the same internal id, then names with
105/// multiple variants would behave as if they were one long name with each
106/// variant concatenated together. Our ranking scheme takes document length
107/// into account, so we don't want this.
108/// 2. Secondly, using an internal ID gives us control over the structure of
109/// those ids. For example, we can declare them to be a sorted sequence of
110/// increasing integers. This lets us traverse our postings more efficiently
111/// during search.
112type DocID = u32;
113
114/// The maximum docid allowed.
115///
116/// When writing postings, we pack docids and their term frequency counts into
117/// a single u32. We give 4 bits for frequency and 28 bits for docid. That
118/// means we can permit up to 268,435,455 = (1<<28)-1 names, which is plenty
119/// for all unique names in IMDb.
120const MAX_DOC_ID: DocID = (1 << 28) - 1;
121
122/// A query for searching the name index.
123///
124/// A query provides the name query and defines the maximum number of results
125/// returned by searching the name index.
126#[derive(Clone, Debug)]
127pub struct NameQuery {
128 name: String,
129 size: usize,
130 scorer: NameScorer,
131 stop_word_ratio: f64,
132}
133
134impl NameQuery {
135 /// Create a query that searches the given name.
136 pub fn new(name: &str) -> NameQuery {
137 NameQuery {
138 name: name.to_string(),
139 size: 30,
140 scorer: NameScorer::default(),
141 stop_word_ratio: 0.01,
142 }
143 }
144
145 /// Set this query's result set size. At most `size` results will be
146 /// returned when searching with this query.
147 pub fn with_size(self, size: usize) -> NameQuery {
148 NameQuery { size, ..self }
149 }
150
151 /// Set this query's scorer. By default, Okapi BM25 is used.
152 pub fn with_scorer(self, scorer: NameScorer) -> NameQuery {
153 NameQuery { scorer, ..self }
154 }
155
156 /// Set the ratio (in the range `0.0` to `1.0`, inclusive) at which a term
157 /// is determined to be a stop word. Set to `0.0` to disable. By default
158 /// this is set to a non-zero value.
159 ///
160 /// This ratio is used at query time to partition all of the ngrams in the
161 /// query into two bins: one bin is for "low frequency" ngrams while the
162 /// other is for "high frequency" ngrams. The partitioning is determined
163 /// by this ratio. Namely, if the proportion of documents in the corpus
164 /// containing an ngram is less than `ratio`, then that ngram is considered
165 /// a low frequency ngram.
166 ///
167 /// Once these two partitions are created, both are used to create two
168 /// disjunction queries. The low frequency query drives search results,
169 /// while the high frequency query is only used to boost scores when it
170 /// matches a result yielded by the low frequency query. Otherwise, results
171 /// from the high frequency query aren't considered.
172 pub fn with_stop_word_ratio(self, ratio: f64) -> NameQuery {
173 NameQuery { stop_word_ratio: ratio, ..self }
174 }
175}
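
// A small usage sketch (the query string and settings are arbitrary): build a
// query for "the matrix" that returns at most 10 results, scored with Okapi
// BM25, with the default 1% stop word ratio made explicit.
//
//     let query = NameQuery::new("the matrix")
//         .with_size(10)
//         .with_scorer(NameScorer::OkapiBM25)
//         .with_stop_word_ratio(0.01);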
176
177/// A reader for the name index.
178#[derive(Debug)]
179pub struct IndexReader {
180 /// The configuration of this index. This is how we determine index-time
181 /// settings automatically, such as ngram size and type.
182 config: Config,
183 /// The ngram index, also known more generally as the "term index." It maps
184 /// terms (which are ngrams for this index) to offsets into the postings
185 /// file. The offset indicates the start of a list of document ids
186 /// containing that term.
187 ngram: fst::Map<Mmap>,
188 /// The postings. This corresponds to a sequence of lists, where each list
189 /// is a list of document ID/frequency pairs. Each list corresponds to the
190 /// document ids containing a particular term. The beginning of each list
191 /// is pointed to by an offset in the term index.
192 postings: Mmap,
193 /// A sequence of 64-bit little-endian encoded integers that provide a
194 /// map from document ID to name ID. The document ID is an internal
195 /// identifier assigned to each unique name indexed, while the name ID is
196 /// an external identifier provided by users of this index.
197 ///
198 /// This map is used to return name IDs to callers. Namely, results are
199 /// natively represented by document IDs, but they are mapped to name IDs
200 /// during collection of results and subsequently deduped. In particular,
201 /// multiple document IDs can map to the same name ID.
202 ///
203 /// The number of entries in this map is equivalent to the total number of
204 /// names indexed.
205 idmap: Mmap,
206 /// A sequence of 16-bit little-endian encoded integers indicating the
207 /// document length (in terms) of the corresponding document ID.
208 ///
209 /// The number of entries in this map is equivalent to the total number of
210 /// names indexed.
211 norms: Mmap,
212}
213
214/// The configuration for this name index. It is JSON encoded to disk.
215///
216/// Note that we don't track the version here. Instead, it is tracked wholesale
217/// as part of the parent index.
218#[derive(Debug, Deserialize, Serialize)]
219struct Config {
220 ngram_type: NgramType,
221 ngram_size: usize,
222 avg_document_len: f64,
223 num_documents: u64,
224}
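
// A serialized configuration might look roughly like this (the numbers are
// hypothetical); it is written by `IndexWriter::finish` via `serde_json`:
//
//     {
//       "ngram_type": "window",
//       "ngram_size": 3,
//       "avg_document_len": 14.2,
//       "num_documents": 1500000
//     }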
225
226impl IndexReader {
227 /// Open a name index in the given directory.
228 pub fn open<P: AsRef<Path>>(dir: P) -> Result<IndexReader> {
229 let dir = dir.as_ref();
230
231 // All of the following open memory maps. We claim this is safe because
232 // we never mutate them and assume no other process does either.
233 let ngram = unsafe { fst_map_file(dir.join(NGRAM))? };
234 let postings = unsafe { mmap_file(dir.join(POSTINGS))? };
235 let idmap = unsafe { mmap_file(dir.join(IDMAP))? };
236 let norms = unsafe { mmap_file(dir.join(NORMS))? };
237
238 let config_file = open_file(dir.join(CONFIG))?;
239 let config: Config = serde_json::from_reader(config_file)
240 .map_err(|e| Error::config(e.to_string()))?;
241 Ok(IndexReader { config, ngram, postings, idmap, norms })
242 }
243
244 /// Execute a search.
245 pub fn search(&self, query: &NameQuery) -> SearchResults<NameID> {
246 let start = Instant::now();
247 let mut searcher = Searcher::new(self, query);
248 let results = CollectTopK::new(query.size).collect(&mut searcher);
249 log::debug!(
250 "search for {:?} took {}",
251 query,
252 NiceDuration::since(start)
253 );
254 results
255 }
256
257 /// Return the name ID used to index the given document id.
258 ///
259 /// This panics if the given document id does not correspond to an indexed
260 /// document.
261 fn docid_to_nameid(&self, docid: DocID) -> NameID {
262 let start = 8 * (docid as usize);
263 let buf = self.idmap[start..start + 8].try_into().unwrap();
264 u64::from_le_bytes(buf)
265 }
266
267 /// Return the length, in terms, of the given document.
268 ///
269 /// This panics if the given document id does not correspond to an indexed
270 /// document.
271 fn document_length(&self, docid: DocID) -> u64 {
272 let start = 2 * (docid as usize);
273 let buf = self.norms[start..start + 2].try_into().unwrap();
274 u16::from_le_bytes(buf) as u64
275 }
276}
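
// A small end-to-end sketch (the directory path is hypothetical): open an
// existing name index and run a query against it.
//
//     let reader = IndexReader::open("/path/to/index")?;
//     let results = reader.search(&NameQuery::new("bruce willis"));
//     for scored in results.into_vec() {
//         println!("{} {}", scored.score(), scored.value());
//     }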
277
278/// A collector for gathering the top K results from a search.
279///
280/// This maintains a min-heap of search results. When a new result is
281/// considered, it is compared against the worst result in the heap. If the
282/// candidate is worse, then it is discarded. Otherwise, it is shuffled into
283/// the heap.
284struct CollectTopK {
285 /// The total number of hits to collect.
286 k: usize,
287 /// The min-heap, according to score. Note that since BinaryHeap is a
288 /// max-heap by default, we reverse the comparison to get a min-heap.
289 queue: BinaryHeap<cmp::Reverse<Scored<NameID>>>,
290 /// A set for deduplicating results. Namely, multiple doc IDs can map to
291 /// the same name ID. This set makes sure we only collect one name ID.
292 ///
293 /// We map name IDs to scores. In this way, we always report the best
294 /// scoring match.
295 byid: FnvHashMap<NameID, f64>,
296}
297
298impl CollectTopK {
299 /// Build a new collector that collects at most `k` results.
300 fn new(k: usize) -> CollectTopK {
301 CollectTopK {
302 k,
303 queue: BinaryHeap::with_capacity(k),
304 byid: FnvHashMap::default(),
305 }
306 }
307
308 /// Collect the top K results from the given searcher using the given
309 /// index reader. Return the results with normalized scores sorted in
310 /// order of best-to-worst.
311 fn collect(mut self, searcher: &mut Searcher) -> SearchResults<NameID> {
312 if self.k == 0 {
313 return SearchResults::new();
314 }
315 let index = searcher.index();
316 let (mut count, mut push_count) = (0, 0);
317 for scored_with_docid in searcher {
318 count += 1;
319 let scored = scored_with_docid.map(|v| index.docid_to_nameid(v));
320 // Since multiple names can correspond to a single IMDb title,
321 // we dedup our results here. That is, if our result set
322 // already contains this result, then update the score if need
323 // be, and then move on.
324 if let Some(&score) = self.byid.get(scored.value()) {
325 if scored.score() > score {
326 self.byid.insert(*scored.value(), scored.score());
327 }
328 continue;
329 }
330
331 let mut dopush = self.queue.len() < self.k;
332 if !dopush {
333 // This unwrap is OK because k > 0 and queue is non-empty.
334 let worst = self.queue.peek_mut().unwrap();
335 // If our queue is full, then we should only push if this
336 // doc id has a better score than the worst one in the queue.
337 if worst.0 < scored {
338 self.byid.remove(worst.0.value());
339 binary_heap::PeekMut::pop(worst);
340 dopush = true;
341 }
342 }
343 if dopush {
344 push_count += 1;
345 self.byid.insert(*scored.value(), scored.score());
346 self.queue.push(cmp::Reverse(scored));
347 }
348 }
349 log::debug!(
350 "collect count: {:?}, collect push count: {:?}",
351 count,
352 push_count
353 );
354
355 // Pull out the results from our heap and normalize the scores.
356 let mut results = SearchResults::from_min_heap(&mut self.queue);
357 results.normalize();
358 results
359 }
360}
361
362/// A searcher for resolving fulltext queries.
363///
364/// A searcher takes a fulltext query, usually typed by an end user, along with
365/// a scoring function and produces a stream of matching results with scores
366/// computed via the provided function. Results are always yielded in
367/// ascending order with respect to document IDs, which are internal IDs
368/// assigned to each name in the index.
369///
370/// This searcher combines a bit of smarts to handle stop words, usually
371/// referred to as "dynamic stop word detection." Namely, after the searcher
372/// splits the query into ngrams, it partitions the ngrams into infrequently
373/// occurring ngrams and frequently occurring ngrams, according to the query's
374/// stop word ratio. Each group is then turned into a `Disjunction`
375/// query. The searcher then visits every doc ID that matches the infrequently
376/// occurring disjunction. When a score is computed for a doc ID, then its
377/// score is increased if the frequently occurring disjunction also contains
378/// that same doc ID. Otherwise, the frequently occurring disjunction isn't
379/// consulted at all, which permits skipping the score calculation for a
380/// potentially large number of doc IDs.
381///
382/// When two partitions cannot be created (e.g., all of the terms are
383/// infrequently occurring or all of the terms are frequently occurring), then
384/// only one disjunction query is used and no skipping logic is employed. That
385/// means that a query consisting of all high frequency terms could be quite
386/// slow.
387///
388/// This does of course sacrifice recall for a performance benefit, but so do
389/// all filtering strategies based on stop words. The benefit of this "dynamic"
390/// approach is that stop word detection is tailored exactly to the corpus, and
391/// that stop words can still influence scoring. That means queries like "the
392/// matrix" will match "The Matrix" better than "Matrix" (which is a legitimate
393/// example, try it).
394struct Searcher<'i> {
395 /// A handle to the index.
396 index: &'i IndexReader,
397 /// The primary disjunction query that drives results. Typically, this
398 /// corresponds to the infrequent terms in the query.
399 primary: Disjunction<'i>,
400 /// A disjunction of only high frequency terms. When the query consists
401 /// of exclusively high frequency terms, then this is empty (which matches
402 /// nothing) and `primary` is set to the disjunction of terms.
403 high: Disjunction<'i>,
404}
405
406impl<'i> Searcher<'i> {
407 /// Create a new searcher.
408 fn new(idx: &'i IndexReader, query: &NameQuery) -> Searcher<'i> {
409 let num_docs = idx.config.num_documents as f64;
410 let (mut low, mut high) = (vec![], vec![]);
411 let (mut low_terms, mut high_terms) = (vec![], vec![]);
412
413 let name = normalize_query(&query.name);
414 let mut query_len = 0;
415 let mut multiset = FnvHashMap::default();
416 idx.config.ngram_type.iter(idx.config.ngram_size, &name, |term| {
417 *multiset.entry(term).or_insert(0) += 1;
418 query_len += 1;
419 });
420 for (term, &count) in multiset.iter() {
421 let postings = PostingIter::new(idx, query.scorer, count, term);
422 let ratio = (postings.len() as f64) / num_docs;
423 if ratio < query.stop_word_ratio {
424 low.push(postings);
425 low_terms.push(format!("{}:{}:{:0.6}", term, count, ratio));
426 } else {
427 high.push(postings);
428 high_terms.push(format!("{}:{}:{:0.6}", term, count, ratio));
429 }
430 }
431 log::debug!("starting search for: {:?}", name);
432 log::debug!("{:?} low frequency terms: {:?}", low.len(), low_terms);
433 log::debug!("{:?} high frequency terms: {:?}", high.len(), high_terms);
434
435 if low.is_empty() {
436 Searcher {
437 index: idx,
438 primary: Disjunction::new(idx, query_len, query.scorer, high),
439 high: Disjunction::empty(idx, query.scorer),
440 }
441 } else {
442 Searcher {
443 index: idx,
444 primary: Disjunction::new(idx, query_len, query.scorer, low),
445 high: Disjunction::new(idx, query_len, query.scorer, high),
446 }
447 }
448 }
449
450 /// Return a reference to the underlying index reader.
451 fn index(&self) -> &'i IndexReader {
452 self.index
453 }
454}
455
456impl<'i> Iterator for Searcher<'i> {
457 type Item = Scored<DocID>;
458
459 fn next(&mut self) -> Option<Scored<DocID>> {
460 // This is pretty simple. We drive the iterator via the primary
461 // disjunction, which is usually a disjunction of infrequently
462 // occurring ngrams.
463 let mut scored = match self.primary.next() {
464 None => return None,
465 Some(scored) => scored,
466 };
467 // We then skip our frequently occurring disjunction to the doc ID
468 // yielded above. Any frequently occurring ngrams found then improve
469 // this score. This makes queries like 'the matrix' match 'The Matrix'
470 // better than 'Matrix'.
471 if let Some(other_scored) = self.high.skip_to(*scored.value()) {
472 scored = scored.map_score(|s| s + other_scored.score());
473 }
474 Some(scored)
475 }
476}
477
478/// A disjunction over a collection of ngrams. A disjunction yields scored
479/// document IDs for every document that contains any of the terms in this
480/// disjunction. The more ngrams that match the document in the disjunction,
481/// the better the score.
482struct Disjunction<'i> {
483 /// A handle to the underlying index that we're searching.
484 index: &'i IndexReader,
485 /// The number of ngrams in the original query.
486 ///
487 /// This is not necessarily equivalent to the number of ngrams in this
488 /// specific disjunction. Namely, this is used to compute scores, and it
489 /// is important that scores are computed using the total number of ngrams
490 /// and not the number of ngrams in a specific disjunction. For example,
491 /// if a query consisted of 8 infrequent ngrams and 1 frequent ngram, then
492 /// the disjunction containing the single frequent ngram would contribute a
493 /// disproportionately high score.
494 query_len: f64,
495 /// The scoring function to use.
496 scorer: NameScorer,
497 /// A min-heap of posting iterators. Each posting iterator corresponds to
498 /// an iterator over (doc ID, frequency) pairs for a single ngram, sorted
499 /// by doc ID in ascending order.
500 ///
501 /// A min-heap is a classic way of optimally computing a disjunction over
502 /// an arbitrary number of ordered streams.
503 queue: BinaryHeap<PostingIter<'i>>,
504 /// Whether this disjunction has been exhausted or not.
505 is_done: bool,
506}
507
508impl<'i> Disjunction<'i> {
509 /// Create a new disjunction over the given posting iterators.
510 fn new(
511 index: &'i IndexReader,
512 query_len: usize,
513 scorer: NameScorer,
514 posting_iters: Vec<PostingIter<'i>>,
515 ) -> Disjunction<'i> {
516 let mut queue = BinaryHeap::new();
517 for postings in posting_iters {
518 queue.push(postings);
519 }
520 let is_done = queue.is_empty();
521 let query_len = query_len as f64;
522 Disjunction { index, query_len, scorer, queue, is_done }
523 }
524
525 /// Create an empty disjunction that never matches anything.
526 fn empty(index: &'i IndexReader, scorer: NameScorer) -> Disjunction<'i> {
527 Disjunction {
528 index,
529 query_len: 0.0,
530 scorer,
531 queue: BinaryHeap::new(),
532 is_done: true,
533 }
534 }
535
536 /// Skip this disjunction such that all posting iterators are either
537 /// exhausted or positioned beyond the given doc ID.
538 ///
539 /// If any posting iterator contains the given doc ID, then it is scored
540 /// and returned. The score incorporates all posting iterators that contain
541 /// the given doc ID.
542 fn skip_to(&mut self, target_docid: DocID) -> Option<Scored<DocID>> {
543 if self.is_done {
544 return None;
545 }
546 let mut found = false;
547 // Loop invariant: loop until all posting iterators are either
548 // positioned directly at the target doc ID (in which case, `found`
549 // is set to true) or beyond the target doc ID. If none of the
550 // iterators contain the target doc ID, then `found` remains false.
551 loop {
552 // This unwrap is OK because we're only here if we have a
553 // non-empty queue.
554 let mut postings = self.queue.peek_mut().unwrap();
555 if postings.docid().map_or(true, |x| x >= target_docid) {
556 found = found || Some(target_docid) == postings.docid();
557 // This is the smallest posting iterator, which means all
558 // iterators are now either at or beyond target_docid.
559 break;
560 }
561 // Skip through this iterator until we're at or beyond the target
562 // doc ID.
563 while postings.docid().map_or(false, |x| x < target_docid) {
564 postings.next();
565 }
566 found = found || Some(target_docid) == postings.docid();
567 }
568 if !found {
569 return None;
570 }
571 // We're here if we found our target doc ID, which means at least one
572 // posting iterator is pointing to the doc ID and it is necessarily
573 // the minimum doc ID of all the posting iterators in this disjunction.
574 // Therefore, advance such that all posting iterators are beyond the
575 // target doc ID.
576 //
577 // (If we didn't find the target doc ID, then the loop invariant above
578 // guarantees that we are already past the target doc ID.)
579 self.next()
580 }
581}
582
583impl<'i> Iterator for Disjunction<'i> {
584 type Item = Scored<DocID>;
585
586 fn next(&mut self) -> Option<Scored<DocID>> {
587 if self.is_done {
588 return None;
589 }
590 // Find the next matching doc ID from the smallest posting iterator.
591 let mut scored1 = {
592 // This unwrap is OK because we're only here if we have a
593 // non-empty queue.
594 let mut postings = self.queue.peek_mut().unwrap();
595 match postings.score() {
596 None => {
597 self.is_done = true;
598 return None;
599 }
600 Some(scored) => {
601 postings.next();
602 scored
603 }
604 }
605 };
606 // Discover if any of the other posting iterators also contain this
607 // doc ID.
608 loop {
609 // This unwrap is OK because we're only here if we have a
610 // non-empty queue.
611 let mut postings = self.queue.peek_mut().unwrap();
612 match postings.score() {
613 None => break,
614 Some(scored2) => {
615 // If the smallest posting iterator's doc ID isn't equal to
616 // the doc ID found above, then we've found all of the
617 // matching terms for this doc ID that we'll find.
618 if scored1.value() != scored2.value() {
619 break;
620 }
621 scored1 = scored1.map_score(|s| s + scored2.score());
622 postings.next();
623 }
624 }
625 }
626 // Some of our scorers are more convenient to compute at the
627 // disjunction level rather than at the term level.
628 if let NameScorer::Jaccard = self.scorer {
629 // When using Jaccard, the score returned by the posting
630 // iterator is always 1. Thus, `scored.score` represents the
631 // total number of terms that matched this document. In other
632 // words, it is the cardinality of the intersection of terms
633 // between the query and our candidate, `|A ∩ B|`.
634 //
635 // `query_len` represents the total number of terms in our query
636 // (not just the number of terms in this disjunction!), and
637 // `doc_len` represents the total number of terms in our candidate.
638 // Thus, since `|A ∪ B| = |A| + |B| - |A ∩ B|`, we have that
639 // `|A ∪ B| = query_len + doc_len - scored.score`. And finally, the
640 // Jaccard index is `|A ∩ B| / |A ∪ B|`.
641 let doc_len = self.index.document_length(*scored1.value()) as f64;
642 let union = self.query_len + doc_len - scored1.score();
643 scored1 = scored1.map_score(|s| s / union);
644 } else if let NameScorer::QueryRatio = self.scorer {
645 // This is like Jaccard, but our score is computed purely as the
646 // ratio of query terms that matched this document.
647 scored1 = scored1.map_score(|s| s / self.query_len)
648 }
649 Some(scored1)
650 }
651}
652
653/// An iterator over a postings list for a specific ngram.
654///
655/// A postings list is a sequence of pairs, where each pair has a document
656/// ID and a frequency. The document ID indicates that the ngram is in the
657/// text indexed for that ID, and the frequency counts the number of times
658/// that ngram occurs in the document.
659///
660/// To save space, each pair is encoded using 32 bits. Frequencies are capped
661/// at a maximum of 15, which fits into the high 4 bits. The low 28 bits contain
662/// the doc ID.
663///
664/// The postings list starts with a single 32-bit little endian
665/// integer that represents the document frequency of the ngram. This in turn
666/// determines how many pairs to read. In other words, a posting list is a
667/// length prefixed array of 32-bit little-endian integer values.
668///
669/// This type is intended to be used in a max-heap, and orients its Ord
670/// definition such that the heap becomes a min-heap. The ordering criterion
671/// is derived only from the doc ID.
672#[derive(Clone)]
673struct PostingIter<'i> {
674 /// A handle to the underlying index.
675 index: &'i IndexReader,
676 /// The scoring function to use.
677 scorer: NameScorer,
678 /// The number of times the term for these postings appeared in the
679 /// original query. This increases the score proportionally.
680 count: f64,
681 /// The raw bytes of the posting list. The number of bytes is
682 /// exactly equivalent to `4 * document-frequency(ngram)`, where
683 /// `document-frequency(ngram)` is the total number of documents in which
684 /// `ngram` occurs.
685 ///
686 /// This does not include the length prefix.
687 postings: &'i [u8],
688 /// The document frequency of this term.
689 len: usize,
690 /// The current posting. This is `None` once this iterator is exhausted.
691 posting: Option<Posting>,
692 /// A docid used for sorting postings. When the iterator is exhausted,
693 /// this is greater than the maximum doc id. Otherwise, this is always
694 /// equivalent to posting.docid.
695 ///
696 /// We do this for efficiency by avoiding going through the optional
697 /// Posting.
698 docid: DocID,
699 /// The OkapiBM25 IDF score. This is invariant across all items in a
700 /// posting list, so we compute it once at construction. This saves a
701 /// call to `log` for every doc ID visited.
702 okapi_idf: f64,
703}
704
705/// A single entry in a posting list.
706#[derive(Clone, Copy, Debug)]
707struct Posting {
708 /// The document id.
709 docid: DocID,
710 /// The frequency, i.e., the number of times the ngram occurred in the
711 /// document identified by the docid.
712 frequency: u32,
713}
714
715impl Posting {
716 /// Read the next posting pair (doc ID and frequency) from the given
717 /// postings list. If the list is empty, then return `None`.
718 fn read(slice: &[u8]) -> Option<Posting> {
719 if slice.is_empty() {
720 None
721 } else {
722 let v = read_le_u32(slice);
723 Some(Posting { docid: v & MAX_DOC_ID, frequency: v >> 28 })
724 }
725 }
726}
727
728impl<'i> PostingIter<'i> {
729 /// Create a new posting iterator for the given term in the given index.
730 /// Scores will be computed with the given scoring function.
731 ///
732 /// `count` should be the number of times this term occurred in the
733 /// original query string.
734 fn new(
735 index: &'i IndexReader,
736 scorer: NameScorer,
737 count: usize,
738 term: &str,
739 ) -> PostingIter<'i> {
740 let mut postings = &*index.postings;
741 let offset = match index.ngram.get(term.as_bytes()) {
742 Some(offset) => offset as usize,
743 None => {
744 // If the term isn't in the index, then return an exhausted
745 // iterator.
746 return PostingIter {
747 index,
748 scorer,
749 count: 0.0,
750 postings: &[],
751 len: 0,
752 posting: None,
753 docid: MAX_DOC_ID + 1,
754 okapi_idf: 0.0,
755 };
756 }
757 };
758 postings = &postings[offset..];
759 let len = read_le_u32(postings) as usize;
760 postings = &postings[4..];
761
762 let corpus_count = index.config.num_documents as f64;
763 let df = len as f64;
764 let okapi_idf = (1.0 + (corpus_count - df + 0.5) / (df + 0.5)).log2();
765 let mut it = PostingIter {
766 index,
767 scorer,
768 count: count as f64,
769 postings: &postings[..4 * len],
770 len,
771 posting: None,
772 docid: 0,
773 okapi_idf,
774 };
775 // Advance to the first posting.
776 it.next();
777 it
778 }
779
780 /// Return the current posting. If this iterator has been exhausted, then
781 /// this returns `None`.
782 fn posting(&self) -> Option<Posting> {
783 self.posting
784 }
785
786 /// Returns the document frequency for the term corresponding to these
787 /// postings.
788 fn len(&self) -> usize {
789 self.len
790 }
791
792 /// Return the current document ID. If this iterator has been exhausted,
793 /// then this returns `None`.
794 fn docid(&self) -> Option<DocID> {
795 self.posting().map(|p| p.docid)
796 }
797
798 /// Return the score with the current document ID. If this iterator has
799 /// been exhausted, then this returns `None`.
800 fn score(&self) -> Option<Scored<DocID>> {
801 match self.scorer {
802 NameScorer::OkapiBM25 => self.score_okapibm25(),
803 NameScorer::TFIDF => self.score_tfidf(),
804 NameScorer::Jaccard => self.score_jaccard(),
805 NameScorer::QueryRatio => self.score_query_ratio(),
806 }
807 .map(|scored| scored.map_score(|s| s * self.count))
808 }
809
810 /// Score the current doc ID using Okapi BM25. It's similar to TF-IDF,
811 /// but uses a document length normalization term.
812 fn score_okapibm25(&self) -> Option<Scored<DocID>> {
813 let post = match self.posting() {
814 None => return None,
815 Some(post) => post,
816 };
817
818 let k1 = 1.2;
819 let b = 0.75;
820 let doc_len = self.index.document_length(post.docid);
821 let norm = (doc_len as f64) / self.index.config.avg_document_len;
822 let tf = post.frequency as f64;
823
824 let num = tf * (k1 + 1.0);
825 let den = tf + k1 * (1.0 - b + b * norm);
826 let score = (num / den) * self.okapi_idf;
827 let capped = if score < 0.0 { 0.0 } else { score };
828 Some(Scored::new(post.docid).with_score(capped))
829 }
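
    // For reference, the per-term score computed above follows the usual
    // Okapi BM25 shape (with k1 = 1.2 and b = 0.75):
    //
    //     score(term, doc) = idf * (tf * (k1 + 1))
    //                        / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    //
    // where idf = log2(1 + (N - df + 0.5) / (df + 0.5)) is precomputed in
    // `PostingIter::new` as `okapi_idf`.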
830
831 /// Score the current doc ID using the traditional TF-IDF ranking function.
832 fn score_tfidf(&self) -> Option<Scored<DocID>> {
833 let post = match self.posting() {
834 None => return None,
835 Some(post) => post,
836 };
837
838 let corpus_docs = self.index.config.num_documents as f64;
839 let term_docs = self.len as f64;
840 let tf = post.frequency as f64;
841 let idf = (corpus_docs / (1.0 + term_docs)).log2();
842 let score = tf * idf;
843 Some(Scored::new(post.docid).with_score(score))
844 }
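
    // For reference, the classical formulation computed above is:
    //
    //     score(term, doc) = tf * log2(N / (1 + df))
    //
    // where `N` is the total number of documents in the index and `df` is the
    // number of documents containing the term.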
845
846 /// Score the current doc ID using the Jaccard index, which measures the
847 /// overlap between two sets.
848 ///
849 /// Note that this always returns `1.0`. The Jaccard index itself must be
850 /// computed by the disjunction scorer.
851 fn score_jaccard(&self) -> Option<Scored<DocID>> {
852 self.posting().map(|p| Scored::new(p.docid).with_score(1.0))
853 }
854
855 /// Score the current doc ID using the ratio of terms in the query that
856 /// matched the terms in this doc ID.
857 ///
858 /// Note that this always returns `1.0`. The query ratio itself must be
859 /// computed by the disjunction scorer.
860 fn score_query_ratio(&self) -> Option<Scored<DocID>> {
861 self.posting().map(|p| Scored::new(p.docid).with_score(1.0))
862 }
863}
864
865impl<'i> Iterator for PostingIter<'i> {
866 type Item = Posting;
867
868 fn next(&mut self) -> Option<Posting> {
869 self.posting = match Posting::read(self.postings) {
870 None => {
871 self.docid = MAX_DOC_ID + 1;
872 None
873 }
874 Some(p) => {
875 self.postings = &self.postings[4..];
876 self.docid = p.docid;
877 Some(p)
878 }
879 };
880 self.posting
881 }
882}
883
884impl<'i> Eq for PostingIter<'i> {}
885
886impl<'i> PartialEq for PostingIter<'i> {
887 fn eq(&self, other: &PostingIter<'i>) -> bool {
888 self.docid == other.docid
889 }
890}
891
892impl<'i> Ord for PostingIter<'i> {
893 fn cmp(&self, other: &PostingIter<'i>) -> cmp::Ordering {
894 // std::collections::BinaryHeap is a max-heap and we need a
895 // min-heap, so write this as if it were a max-heap, then reverse it.
896 // Note that exhausted searchers should always have the lowest
897 // priority, and therefore, be considered maximal.
898 self.docid.cmp(&other.docid).reverse()
899 }
900}
901
902impl<'i> PartialOrd for PostingIter<'i> {
903 fn partial_cmp(&self, other: &PostingIter<'i>) -> Option<cmp::Ordering> {
904 Some(self.cmp(other))
905 }
906}
907
908/// A writer for indexing names to disk.
909///
910/// A writer opens and writes to several files simultaneously, which keeps the
911/// implementation simple.
912///
913/// The index writer cannot stream the postings or term index, since the term
914/// index requires its ngrams to be inserted in sorted order. Postings lists
915/// are written as length prefixed sequences, so we need to know the lengths
916/// of all our postings lists before writing them.
917pub struct IndexWriter {
918 /// A builder for the ngram term index.
919 ///
920 /// This isn't used until the caller indicates that it is done indexing
921 /// names. At which point, we insert all ngrams into the FST in sorted
922 /// order. Each ngram is mapped to the beginning of its corresponding
923 /// postings list.
924 ngram: fst::MapBuilder<io::BufWriter<File>>,
925 /// The type of ngram extraction to use.
926 ngram_type: NgramType,
927 /// The size of ngrams to generate.
928 ngram_size: usize,
929 /// A writer for postings lists.
930 ///
931 /// This isn't written to until the caller indicates that it is done
932 /// indexing names. At which point, every posting list is written as a
933 /// length prefixed array, in the same order that terms are written to the
934 /// term index.
935 postings: CursorWriter<io::BufWriter<File>>,
936 /// A map from document ID to name ID. This is written to in a streaming
937 /// fashion during indexing. The ID map consists of N 64-bit little
938 /// endian integers, where N is the total number of names indexed.
939 ///
940 /// The document ID (the position in this map) is a unique internal
941 /// identifier assigned to each name, while the name ID is an identifier
942 /// provided by the caller. Multiple document IDs may map to the same
943 /// name ID (e.g., for indexing alternate names).
944 idmap: CursorWriter<io::BufWriter<File>>,
945 /// A map from document ID to document length, where the length corresponds
946 /// to the number of ngrams in the document. The map consists of N 16-bit
947 /// little endian integers, where N is the total number of names indexed.
948 ///
949 /// The document lengths are used at query time as normalization
950 /// parameters. They are written in a streaming fashion during the indexing
951 /// process.
952 norms: CursorWriter<io::BufWriter<File>>,
953 /// A JSON formatted configuration file that includes some aggregate
954 /// statistics (such as the average document length, in ngrams) and the
955 /// ngram configuration. The ngram configuration in particular is used at
956 /// query time to make sure that query-time uses the same analysis as
957 /// index-time.
958 ///
959 /// This is written at the end of the indexing process.
960 config: CursorWriter<io::BufWriter<File>>,
961 /// An in-memory map from ngram to its corresponding postings list. Once
962 /// indexing is done, this is written to disk via the FST term index and
963 /// postings list writers documented above.
964 terms: FnvHashMap<String, Postings>,
965 /// The next document ID, starting at 0. Each name added gets assigned its
966 /// own unique document ID. Queries read document IDs from the postings
967 /// list, but are mapped back to name IDs using the `idmap` before being
968 /// returned to the caller.
969 next_docid: DocID,
970 /// The average document length, in ngrams, for every name indexed. This is
971 /// used along with document lengths to compute normalization terms for
972 /// scoring at query time.
973 avg_document_len: f64,
974}
975
976/// A single postings list.
977#[derive(Clone, Debug, Default)]
978struct Postings {
979 /// A sorted list of postings, in order of ascending document IDs.
980 list: Vec<Posting>,
981}
982
983impl IndexWriter {
984 /// Open an index for writing to the given directory. Any previous name
985 /// index in the given directory is overwritten.
986 ///
987 /// The given ngram configuration is used to transform all indexed names
988 /// into terms for the inverted index.
989 pub fn open<P: AsRef<Path>>(
990 dir: P,
991 ngram_type: NgramType,
992 ngram_size: usize,
993 ) -> Result<IndexWriter> {
994 let dir = dir.as_ref();
995
996 let ngram = fst_map_builder_file(dir.join(NGRAM))?;
997 let postings = CursorWriter::from_path(dir.join(POSTINGS))?;
998 let idmap = CursorWriter::from_path(dir.join(IDMAP))?;
999 let norms = CursorWriter::from_path(dir.join(NORMS))?;
1000 let config = CursorWriter::from_path(dir.join(CONFIG))?;
1001 Ok(IndexWriter {
1002 ngram,
1003 ngram_type,
1004 ngram_size,
1005 postings,
1006 idmap,
1007 norms,
1008 config,
1009 terms: FnvHashMap::default(),
1010 next_docid: 0,
1011 avg_document_len: 0.0,
1012 })
1013 }
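
    // A small usage sketch (the directory and names are hypothetical): index
    // two name variants for the same record and then finalize the index.
    //
    //     let mut wtr = IndexWriter::open("/path/to/index", NgramType::Window, 3)?;
    //     wtr.insert(0, "The Matrix")?;
    //     wtr.insert(0, "La matrice")?;
    //     wtr.finish()?;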
1014
1015 /// Finish writing names and serialize the index to disk.
1016 pub fn finish(mut self) -> Result<()> {
1017 let num_docs = self.num_docs();
1018 let mut ngram_to_postings: Vec<(String, Postings)> =
1019 self.terms.into_iter().collect();
1020 // We could use a BTreeMap and get out our keys in sorted order, but
1021 // the overhead of inserting into the BTreeMap dwarfs the savings we
1022 // get from pre-sorted keys.
1023 ngram_to_postings.sort_by(|&(ref t1, _), &(ref t2, _)| t1.cmp(t2));
1024
1025 for (term, postings) in ngram_to_postings {
1026 let pos = self.postings.position() as u64;
1027 self.ngram.insert(term.as_bytes(), pos).map_err(Error::fst)?;
1028 self.postings
1029 .write_u32(postings.list.len() as u32)
1030 .map_err(Error::io)?;
1031 for posting in postings.list {
1032 let freq = cmp::min(15, posting.frequency);
1033 let v = (freq << 28) | posting.docid;
1034 self.postings.write_u32(v).map_err(Error::io)?;
1035 }
1036 }
1037
1038 serde_json::to_writer_pretty(
1039 &mut self.config,
1040 &Config {
1041 ngram_type: self.ngram_type,
1042 ngram_size: self.ngram_size,
1043 avg_document_len: self.avg_document_len,
1044 num_documents: num_docs as u64,
1045 },
1046 )
1047 .map_err(|e| Error::config(e.to_string()))?;
1048 self.ngram.finish().map_err(Error::fst)?;
1049 self.idmap.flush().map_err(Error::io)?;
1050 self.postings.flush().map_err(Error::io)?;
1051 self.norms.flush().map_err(Error::io)?;
1052 self.config.flush().map_err(Error::io)?;
1053 Ok(())
1054 }
1055
1056 /// Inserts the given name into this index, and associates it with the
1057 /// provided `NameID`. Multiple names may be associated with the same
1058 /// `NameID`.
1059 pub fn insert(&mut self, name_id: NameID, name: &str) -> Result<()> {
1060 let docid = self.next_docid(name_id)?;
1061 let name = normalize_query(name);
1062 let mut count = 0u16; // document length in number of ngrams
1063 self.ngram_type.clone().iter(self.ngram_size, &name, |ngram| {
1064 self.insert_term(docid, ngram);
1065 // If a document length exceeds 2^16 - 1, then it is far too long
1066 // for a name anyway, so we cap it there via `saturating_add`.
1067 count = count.saturating_add(1);
1068 });
1069 // Update our running mean document length (in ngrams): avg += (x - avg) / n.
1070 self.avg_document_len +=
1071 (count as f64 - self.avg_document_len) / (self.num_docs() as f64);
1072 // Write the document length to disk, which is used as a normalization
1073 // term for some scorers (like Okapi-BM25).
1074 self.norms.write_u16(count).map_err(Error::io)?;
1075 Ok(())
1076 }
1077
1078 /// Add a single term that is part of a name identified by the given docid.
1079 /// This updates the postings for this term, or creates a new posting if
1080 /// this is the first time this term has been seen.
1081 fn insert_term(&mut self, docid: DocID, term: &str) {
1082 if let Some(posts) = self.terms.get_mut(term) {
1083 posts.posting(docid).frequency += 1;
1084 return;
1085 }
1086 let mut list = Postings::default();
1087 list.posting(docid).frequency = 1;
1088 self.terms.insert(term.to_string(), list);
1089 }
1090
1091 /// Retrieve a fresh doc id, and associate it with the given name id.
1092 fn next_docid(&mut self, name_id: NameID) -> Result<DocID> {
1093 let docid = self.next_docid;
1094 self.idmap.write_u64(name_id).map_err(Error::io)?;
1095 self.next_docid = match self.next_docid.checked_add(1) {
1096 None => bug!("exhausted doc ids"),
1097 Some(next_docid) => next_docid,
1098 };
1099 if self.next_docid > MAX_DOC_ID {
1100 let max = MAX_DOC_ID + 1; // docids are 0-indexed
1101 bug!("exceeded maximum number of names ({})", max);
1102 }
1103 Ok(docid)
1104 }
1105
1106 /// Return the total number of documents that have been assigned doc ids.
1107 fn num_docs(&self) -> u32 {
1108 self.next_docid
1109 }
1110}
1111
1112impl Postings {
1113 /// Return a mutable reference to the posting for the given docid. If one
1114 /// doesn't exist, then create one (with a zero frequency) and return it.
1115 fn posting(&mut self, docid: DocID) -> &mut Posting {
1116 if self.list.last().map_or(true, |x| x.docid != docid) {
1117 self.list.push(Posting { docid, frequency: 0 });
1118 }
1119 // This unwrap is OK because if the list was empty when this method was
1120 // called, then we added an element above, and is thus now non-empty.
1121 self.list.last_mut().unwrap()
1122 }
1123}
1124
1125/// The type of scorer that the name index should use.
1126///
1127/// The default is OkapiBM25. If you aren't sure which scorer to use, then
1128/// stick with the default.
1129#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
1130pub enum NameScorer {
1131 /// OkapiBM25 is a TF-IDF-like ranking function, which takes name length
1132 /// into account.
1133 OkapiBM25,
1134 /// TFIDF is the traditional TF-IDF ranking function, which does not
1135 /// incorporate document length.
1136 TFIDF,
1137 /// Jaccard is a ranking function determined by computing the similarity
1138 /// of ngrams between the query and a name in the index. The similarity
1139 /// is computed by dividing the number of ngrams in common by the total
1140 /// number of distinct ngrams in both the query and the name combined.
1141 Jaccard,
1142 /// QueryRatio is a ranking function that represents the ratio of query
1143 /// terms that matched a name. It is computed by dividing the number of
1144 /// ngrams in common by the total number of ngrams in the query only.
1145 QueryRatio,
1146}
1147
1148impl NameScorer {
1149 /// Returns a list of strings representing the possible scorer values.
1150 pub fn possible_names() -> &'static [&'static str] {
1151 &["okapibm25", "tfidf", "jaccard", "queryratio"]
1152 }
1153
1154 /// Return a string representation of this scorer.
1155 ///
1156 /// The string returned can be parsed back into a `NameScorer`.
1157 pub fn as_str(&self) -> &'static str {
1158 match *self {
1159 NameScorer::OkapiBM25 => "okapibm25",
1160 NameScorer::TFIDF => "tfidf",
1161 NameScorer::Jaccard => "jaccard",
1162 NameScorer::QueryRatio => "queryratio",
1163 }
1164 }
1165}
1166
1167impl Default for NameScorer {
1168 fn default() -> NameScorer {
1169 NameScorer::OkapiBM25
1170 }
1171}
1172
1173impl fmt::Display for NameScorer {
1174 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1175 write!(f, "{}", self.as_str())
1176 }
1177}
1178
1179impl FromStr for NameScorer {
1180 type Err = Error;
1181
1182 fn from_str(s: &str) -> Result<NameScorer> {
1183 match s {
1184 "okapibm25" => Ok(NameScorer::OkapiBM25),
1185 "tfidf" => Ok(NameScorer::TFIDF),
1186 "jaccard" => Ok(NameScorer::Jaccard),
1187 "queryratio" => Ok(NameScorer::QueryRatio),
1188 unk => Err(Error::unknown_scorer(unk)),
1189 }
1190 }
1191}
1192
1193/// The style of ngram extraction to use.
1194///
1195/// The same style of ngram extraction is always used at index time and at
1196/// query time.
1197///
1198/// Each ngram type uses the ngram size configuration differently.
1199///
1200/// All ngram styles use Unicode codepoints as the definition of a character.
1201/// For example, a 3-gram might contain up to 12 bytes, if it contains 3
1202/// Unicode codepoints that each require 4 UTF-8 code units.
1203#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
1204pub enum NgramType {
1205 /// A windowing ngram.
1206 ///
1207 /// This is the traditional style of ngram, where a sliding window of size
1208 /// `N` is moved across the entire content to be indexed. For example, the
1209 /// 3-grams for the string `homer` are hom, ome and mer.
1210 #[serde(rename = "window")]
1211 Window,
1212 /// An edge ngram.
1213 ///
1214 /// This style of ngram produces ever longer ngrams, where each ngram is
1215 /// anchored to the start of a word. Words are determined simply by
1216 /// splitting whitespace.
1217 ///
1218 /// For example, the edge ngrams of `homer simpson`, where the max ngram
1219 /// size is 5, would be: hom, home, homer, sim, simp, simps. Generally,
1220 /// for this ngram type, one wants to use a large maximum ngram size.
1221 /// Perhaps somewhere close to the maximum number of ngrams in any word
1222 /// in the corpus.
1223 ///
1224 /// Note that there is no way to set the minimum ngram size (which is 3).
1225 #[serde(rename = "edge")]
1226 Edge,
1227}
1228
1229/// The minimum size of an ngram emitted by the edge ngram iterator.
1230const MIN_EDGE_NGRAM_SIZE: usize = 3;
1231
1232impl NgramType {
1233 /// Return all possible ngram types.
1234 pub fn possible_names() -> &'static [&'static str] {
1235 &["window", "edge"]
1236 }
1237
1238 /// Return a string representation of this type.
1239 pub fn as_str(&self) -> &'static str {
1240 match *self {
1241 NgramType::Window => "window",
1242 NgramType::Edge => "edge",
1243 }
1244 }
1245
1246 /// Execute the given function over each ngram in the text provided using
1247 /// the given size configuration.
1248 ///
1249 /// We don't use normal Rust iterators here because an internal iterator
1250 /// is much easier to implement.
1251 fn iter<'t, F: FnMut(&'t str)>(&self, size: usize, text: &'t str, f: F) {
1252 match *self {
1253 NgramType::Window => NgramType::iter_window(size, text, f),
1254 NgramType::Edge => NgramType::iter_edge(size, text, f),
1255 }
1256 }
1257
1258 fn iter_window<'t, F: FnMut(&'t str)>(
1259 size: usize,
1260 text: &'t str,
1261 mut f: F,
1262 ) {
1263 if size == 0 {
1264 return;
1265 }
1266 let end_skip = text.chars().take(size).count().saturating_sub(1);
1267 let start = text.char_indices();
1268 let end = text.char_indices().skip(end_skip);
1269 for ((s, _), (e, c)) in start.zip(end) {
1270 f(&text[s..e + c.len_utf8()]);
1271 }
1272 }
1273
1274 fn iter_edge<'t, F: FnMut(&'t str)>(
1275 max_size: usize,
1276 text: &'t str,
1277 mut f: F,
1278 ) {
1279 if max_size == 0 {
1280 return;
1281 }
1282 for word in text.split_whitespace() {
1283 let end_skip = word
1284 .chars()
1285 .take(MIN_EDGE_NGRAM_SIZE)
1286 .count()
1287 .saturating_sub(1);
1288 let mut size = end_skip + 1;
1289 for (end, c) in word.char_indices().skip(end_skip) {
1290 f(&word[..end + c.len_utf8()]);
1291 size += 1;
1292 if size > max_size {
1293 break;
1294 }
1295 }
1296 }
1297 }
1298}
1299
1300impl Default for NgramType {
1301 fn default() -> NgramType {
1302 NgramType::Window
1303 }
1304}
1305
1306impl fmt::Display for NgramType {
1307 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1308 write!(f, "{}", self.as_str())
1309 }
1310}
1311
1312impl FromStr for NgramType {
1313 type Err = Error;
1314
1315 fn from_str(s: &str) -> Result<NgramType> {
1316 match s {
1317 "window" => Ok(NgramType::Window),
1318 "edge" => Ok(NgramType::Edge),
1319 unk => Err(Error::unknown_ngram_type(unk)),
1320 }
1321 }
1322}
1323
1324fn normalize_query(s: &str) -> String {
1325 // We might consider doing Unicode normalization here, but it probably
1326 // doesn't matter too much on a predominantly ASCII data set.
1327 s.to_lowercase()
1328}
1329
1330fn read_le_u32(slice: &[u8]) -> u32 {
1331 u32::from_le_bytes(slice[..4].try_into().unwrap())
1332}
1333
1334#[cfg(test)]
1335mod tests {
1336 use super::*;
1337 use crate::index::tests::TestContext;
1338
1339 // Test the actual name index.
1340
1341 /// Creates a name index, where each name provided is assigned its own
1342 /// unique ID, starting at 0.
1343 fn create_index(index_dir: &Path, names: &[&str]) -> IndexReader {
1344 let mut wtr =
1345 IndexWriter::open(index_dir, NgramType::Window, 3).unwrap();
1346 for (i, name) in names.iter().enumerate() {
1347 wtr.insert(i as u64, name).unwrap();
1348 }
1349 wtr.finish().unwrap();
1350
1351 IndexReader::open(index_dir).unwrap()
1352 }
1353
1354 /// Build a name query, and disable the dynamic stop word detection.
1355 ///
1356 /// It would be nice to test the stop word detection, but it makes writing
1357 /// unit tests very difficult unfortunately.
1358 fn name_query(name: &str) -> NameQuery {
1359 NameQuery::new(name).with_stop_word_ratio(0.0)
1360 }
1361
1362 fn ids(results: &[Scored<NameID>]) -> Vec<NameID> {
1363 let mut ids: Vec<_> = results.iter().map(|r| *r.value()).collect();
1364 ids.sort();
1365 ids
1366 }
1367
1368 /// Some names involving bruce.
1369 const BRUCES: &'static [&'static str] = &[
1370 "Bruce Springsteen", // 0
1371 "Bruce Kulick", // 1
1372 "Bruce Arians", // 2
1373 "Bruce Smith", // 3
1374 "Bruce Willis", // 4
1375 "Bruce Wayne", // 5
1376 "Bruce Banner", // 6
1377 ];
1378
1379 #[test]
1380 fn names_bruces_1() {
1381 let ctx = TestContext::new("small");
1382 let idx = create_index(ctx.index_dir(), BRUCES);
1383 let query = name_query("bruce");
1384 let results = idx.search(&query).into_vec();
1385
1386 // This query matches everything.
1387 assert_eq!(results.len(), 7);
1388 // The top two hits are the shortest documents, because of Okapi-BM25's
1389 // length normalization.
1390 assert_eq!(results[0].score(), 1.0);
1391 assert_eq!(results[1].score(), 1.0);
1392 assert_eq!(ids(&results[0..2]), vec![3, 5]);
1393 }
1394
1395 #[test]
1396 fn names_bruces_2() {
1397 let ctx = TestContext::new("small");
1398 let idx = create_index(ctx.index_dir(), BRUCES);
1399 let query = name_query("e w");
1400 let results = idx.search(&query).into_vec();
1401
1402 // The 'e w' ngram is only in two documents: Bruce Willis and
1403 // Bruce Wayne. Since Wayne is shorter than Willis, it should always
1404 // be first.
1405 assert_eq!(results.len(), 2);
1406 assert_eq!(*results[0].value(), 5);
1407 assert_eq!(*results[1].value(), 4);
1408 }
1409
1410 #[test]
1411 fn names_bruces_3() {
1412 let ctx = TestContext::new("small");
1413 let idx = create_index(ctx.index_dir(), BRUCES);
1414 let query = name_query("Springsteen");
1415 let results = idx.search(&query).into_vec();
1416
1417 assert_eq!(results.len(), 1);
1418 assert_eq!(*results[0].value(), 0);
1419 }
1420
1421 #[test]
1422 fn names_bruces_4() {
1423 let ctx = TestContext::new("small");
1424 let idx = create_index(ctx.index_dir(), BRUCES);
1425 let query =
1426 name_query("Springsteen Kulick Arians Smith Willis Wayne Banner");
1427 let results = idx.search(&query).into_vec();
1428
1429 // This query should hit everything.
1430 assert_eq!(results.len(), 7);
1431 }
1432
1433 // Test our various ngram strategies.
1434
1435 fn ngrams_window(n: usize, text: &str) -> Vec<&str> {
1436 let mut grams = vec![];
1437 NgramType::Window.iter(n, text, |gram| grams.push(gram));
1438 grams
1439 }
1440
1441 fn ngrams_edge(n: usize, text: &str) -> Vec<&str> {
1442 let mut grams = vec![];
1443 NgramType::Edge.iter(n, text, |gram| grams.push(gram));
1444 grams
1445 }
1446
1447 #[test]
1448 #[should_panic]
1449 fn ngrams_window_zero_banned() {
1450 assert_eq!(ngrams_window(0, "abc"), vec!["abc"]);
1451 }
1452
1453 #[test]
1454 fn ngrams_window_weird_sizes() {
1455 assert_eq!(
1456 ngrams_window(2, "abcdef"),
1457 vec!["ab", "bc", "cd", "de", "ef",]
1458 );
1459 assert_eq!(
1460 ngrams_window(1, "abcdef"),
1461 vec!["a", "b", "c", "d", "e", "f",]
1462 );
1463 assert_eq!(ngrams_window(2, "ab"), vec!["ab",]);
1464 assert_eq!(ngrams_window(1, "ab"), vec!["a", "b",]);
1465 assert_eq!(ngrams_window(1, "a"), vec!["a",]);
1466 assert_eq!(ngrams_window(1, ""), Vec::<&str>::new());
1467 }
1468
1469 #[test]
1470 fn ngrams_window_ascii() {
1471 assert_eq!(
1472 ngrams_window(3, "abcdef"),
1473 vec!["abc", "bcd", "cde", "def",]
1474 );
1475 assert_eq!(ngrams_window(3, "abcde"), vec!["abc", "bcd", "cde",]);
1476 assert_eq!(ngrams_window(3, "abcd"), vec!["abc", "bcd",]);
1477 assert_eq!(ngrams_window(3, "abc"), vec!["abc",]);
1478 assert_eq!(ngrams_window(3, "ab"), vec!["ab",]);
1479 assert_eq!(ngrams_window(3, "a"), vec!["a",]);
1480 assert_eq!(ngrams_window(3, ""), Vec::<&str>::new());
1481 }
1482
1483 #[test]
1484 fn ngrams_window_non_ascii() {
1485 assert_eq!(
1486 ngrams_window(3, "αβγφδε"),
1487 vec!["αβγ", "βγφ", "γφδ", "φδε",]
1488 );
1489 assert_eq!(ngrams_window(3, "αβγφδ"), vec!["αβγ", "βγφ", "γφδ",]);
1490 assert_eq!(ngrams_window(3, "αβγφ"), vec!["αβγ", "βγφ",]);
1491 assert_eq!(ngrams_window(3, "αβγ"), vec!["αβγ",]);
1492 assert_eq!(ngrams_window(3, "αβ"), vec!["αβ",]);
1493 assert_eq!(ngrams_window(3, "α"), vec!["α",]);
1494 }
1495
1496 #[test]
1497 fn ngrams_edge_ascii() {
1498 assert_eq!(
1499 ngrams_edge(5, "homer simpson"),
1500 vec!["hom", "home", "homer", "sim", "simp", "simps",]
1501 );
1502 assert_eq!(ngrams_edge(5, "h"), vec!["h",]);
1503 assert_eq!(ngrams_edge(5, "ho"), vec!["ho",]);
1504 assert_eq!(ngrams_edge(5, "hom"), vec!["hom",]);
1505 assert_eq!(ngrams_edge(5, "home"), vec!["hom", "home",]);
1506 }
1507
1508 #[test]
1509 fn ngrams_edge_non_ascii() {
1510 assert_eq!(
1511 ngrams_edge(5, "δεαβγφδε δε"),
1512 vec!["δεα", "δεαβ", "δεαβγ", "δε",]
1513 );
1514 }
1515}