imdb_index/index/names.rs
1use std::cmp;
2use std::collections::{binary_heap, BinaryHeap};
3use std::convert::TryInto;
4use std::fmt;
5use std::fs::File;
6use std::io::{self, Write};
7use std::path::Path;
8use std::str::{self, FromStr};
9use std::time::Instant;
10
11use fnv::FnvHashMap;
12use fst;
13use memmap::Mmap;
14use serde::{Deserialize, Serialize};
15use serde_json;
16
17use crate::error::{Error, Result};
18use crate::index::writer::CursorWriter;
19use crate::scored::{Scored, SearchResults};
20use crate::util::{
21 fst_map_builder_file, fst_map_file, mmap_file, open_file, NiceDuration,
22};
23
24/// The name of the file containing the index configuration.
25///
26/// The index configuration is a JSON file with some metadata about this
27/// index, such as its version, ngram size and aggregate statistics about the
28/// corpus that has been indexed.
29const CONFIG: &str = "names.config.json";
30
31/// The name of the ngram term index.
32///
33/// The ngram term index maps ngrams (fixed size sequences of Unicode
34/// codepoints) to file offsets. Each file offset points to the postings for
35/// the corresponding term.
36const NGRAM: &str = "names.ngram.fst";
37
38/// The name of the postings list index.
39///
40/// The postings list contains an entry for every term in the ngram index.
41/// Each entry corresponds to a list of document/frequency pairs. Namely, each
42/// entry is a DocID and a frequency count indicating how many times the
43/// corresponding term appeared in that document. Each entry in the list is
44/// encoded as a single 32-bit little-endian integer. The high 4 bits
45/// represent the frequency (which is capped at 15, a reasonable number for
46/// indexing short name strings) while the low 28 bits represent the doc id.
47/// The `MAX_DOC_ID` constant below ensures we never use a doc id that won't
48/// fit this encoding scheme.
49///
50/// The average length of all documents represented by the ngram index is
51/// recorded in the configuration file (see `Config`) rather than in this
52/// file. The length is recorded in units of terms, which generally
53/// correspond to the total number of ngrams in a name.
54const POSTINGS: &str = "names.postings.idx";
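
// A quick sketch of the packing scheme described above, with made-up values.
// The real encoding and decoding live in `IndexWriter::finish` and
// `Posting::read` below.
//
//     let docid: u32 = 123;              // must be <= MAX_DOC_ID
//     let freq: u32 = 3;                 // capped at 15 when written
//     let packed = (freq << 28) | docid; // one 32-bit little-endian entry
//     assert_eq!(packed & MAX_DOC_ID, docid);
//     assert_eq!(packed >> 28, freq);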
55
56/// The name of the identifier map index.
57///
58/// This file maps `DocID`s to `NameID`s. It consists of a sequence of
59/// 64-bit little-endian encoded integers, where the length of the sequence
60/// corresponds to the total number of names in the index. Each entry in the
61/// sequence encodes a `NameID`. In other words, the index to this sequence is
62/// a `DocID` and the value at that index is a `NameID`.
63///
64/// The id map is used to map doc ids returned by the postings to name ids
65/// which were provided by the caller. This also permits search to deduplicate
66/// results. That is, we should never return multiple results for the same
67/// NameID, even though we may have indexed multiple names for the same name
68/// id.
69const IDMAP: &str = "names.idmap.idx";
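
// For example (hypothetical value), the `NameID` for `DocID` 7 lives in bytes
// `[7 * 8, 7 * 8 + 8)` of this file, decoded as a little-endian u64. See
// `IndexReader::docid_to_nameid` below for the actual lookup.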
70
71/// The name of the document length index.
72///
73/// This file consists of a sequence of 16-bit little-endian encoded
74/// integers, where the length of the sequence corresponds to the total number
75/// of names in the index. Each entry represents the length, in terms, of each
76/// name.
77///
78/// The lengths are used during scoring to compute a normalization term. This
79/// allows the scoring mechanism to take document length into account.
80const NORMS: &str = "names.norms.idx";
81
82/// The external identifier for every distinct record represented by this name
83/// index. There are no restrictions on name ids, and multiple names may be
84/// indexed that correspond to the same name id.
85///
86/// With respect to IMDb, there is a 1-to-1 correspondence between the records
87/// in title.basics.tsv and the set of NameIDs, even though there may be
88/// multiple names for each record.
89///
90/// For IMDb, this is represented by the byte offset of the corresponding
91/// record in title.basics.tsv. This provides constant time lookup to the full
92/// record. Note, though, that this module knows nothing about such things.
93/// To this module, name ids are opaque identifiers.
94pub type NameID = u64;
95
96/// An internal surrogate identifier for every distinct name in the index. Note
97/// that multiple distinct doc ids can map to the same name id. For example, if
98/// a name has multiple distinct forms, then they each get their own docid, but
99/// each of the docids will map to the same name id.
100///
101/// The reason why we need DocID in addition to NameID is twofold:
102///
103/// 1. Firstly, we'd like each name variant to have its own term frequency
104/// count. If every variant shared the same internal id, then names with
105/// multiple variants would behave as if they were one long name with each
106/// variant concatenated together. Our ranking scheme takes document length
107/// into account, so we don't want this.
108/// 2. Secondly, using an internal ID gives us control over the structure of
109/// those ids. For example, we can declare them to be a sorted sequence of
110/// increasing integers. This lets us traverse our postings more efficiently
111/// during search.
112type DocID = u32;
113
114/// The maximum docid allowed.
115///
116/// When writing postings, we pack docids and their term frequency counts into
117/// a single u32. We give 4 bits for frequency and 28 bits for docid. That
118/// means we can permit up to 268,435,455 = (1<<28)-1 names, which is plenty
119/// for all unique names in IMDb.
120const MAX_DOC_ID: DocID = (1 << 28) - 1;
121
122/// A query for searching the name index.
123///
124/// A query provides the name query and defines the maximum number of results
125/// returned by searching the name index.
126#[derive(Clone, Debug)]
127pub struct NameQuery {
128 name: String,
129 size: usize,
130 scorer: NameScorer,
131 stop_word_ratio: f64,
132}
133
134impl NameQuery {
135 /// Create a query that searches the given name.
136 pub fn new(name: &str) -> NameQuery {
137 NameQuery {
138 name: name.to_string(),
139 size: 30,
140 scorer: NameScorer::default(),
141 stop_word_ratio: 0.01,
142 }
143 }
144
145 /// Set this query's result set size. At most `size` results will be
146 /// returned when searching with this query.
147 pub fn with_size(self, size: usize) -> NameQuery {
148 NameQuery { size, ..self }
149 }
150
151 /// Set this query's scorer. By default, Okapi BM25 is used.
152 pub fn with_scorer(self, scorer: NameScorer) -> NameQuery {
153 NameQuery { scorer, ..self }
154 }
155
156 /// Set the ratio (in the range `0.0` to `1.0`, inclusive) at which a term
157 /// is determined to be a stop word. Set to `0.0` to disable. By default
158 /// this is set to a non-zero value.
159 ///
160 /// This ratio is used at query time to partition all of the ngrams in the
161 /// query into two bins: one bin is for "low frequency" ngrams while the
162 /// other is for "high frequency" ngrams. The partitioning is determined
163 /// by this ratio. Namely, if the proportion of documents in the corpus
164 /// containing an ngram is less than `ratio`, then that ngram is considered
165 /// a low frequency ngram.
166 ///
167 /// Once these two partitions are created, both are used to create two
168 /// disjunction queries. The low frequency query drives search results,
169 /// while the high frequency query is only used to boost scores when it
170 /// matches a result yielded by the low frequency query. Otherwise, results
171 /// from the high frequency query aren't considered.
172 pub fn with_stop_word_ratio(self, ratio: f64) -> NameQuery {
173 NameQuery { stop_word_ratio: ratio, ..self }
174 }
175}
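
// A small usage sketch (the query string and settings are arbitrary): build a
// query for "the matrix" that returns at most 10 results, scored with Okapi
// BM25, with the default 1% stop word ratio made explicit.
//
//     let query = NameQuery::new("the matrix")
//         .with_size(10)
//         .with_scorer(NameScorer::OkapiBM25)
//         .with_stop_word_ratio(0.01);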
176
177/// A reader for the name index.
178#[derive(Debug)]
179pub struct IndexReader {
180 /// The configuration of this index. This is how we determine index-time
181 /// settings automatically, such as ngram size and type.
182 config: Config,
183 /// The ngram index, also known more generally as the "term index." It maps
184 /// terms (which are ngrams for this index) to offsets into the postings
185 /// file. The offset indicates the start of a list of document ids
186 /// containing that term.
187 ngram: fst::Map<Mmap>,
188 /// The postings. This corresponds to a sequence of lists, where each list
189 /// is a list of document ID/frequency pairs. Each list corresponds to the
190 /// document ids containing a particular term. The beginning of each list
191 /// is pointed to by an offset in the term index.
192 postings: Mmap,
193 /// A sequence of 64-bit little-endian encoded integers that provide a
194 /// map from document ID to name ID. The document ID is an internal
195 /// identifier assigned to each unique name indexed, while the name ID is
196 /// an external identifier provided by users of this index.
197 ///
198 /// This map is used to return name IDs to callers. Namely, results are
199 /// natively represented by document IDs, but they are mapped to name IDs
200 /// during collection of results and subsequently deduped. In particular,
201 /// multiple document IDs can map to the same name ID.
202 ///
203 /// The number of entries in this map is equivalent to the total number of
204 /// names indexed.
205 idmap: Mmap,
206 /// A sequence of 16-bit little-endian encoded integers indicating the
207 /// document length (in terms) of the corresponding document ID.
208 ///
209 /// The number of entries in this map is equivalent to the total number of
210 /// names indexed.
211 norms: Mmap,
212}
213
214/// The configuration for this name index. It is JSON encoded to disk.
215///
216/// Note that we don't track the version here. Instead, it is tracked wholesale
217/// as part of the parent index.
218#[derive(Debug, Deserialize, Serialize)]
219struct Config {
220 ngram_type: NgramType,
221 ngram_size: usize,
222 avg_document_len: f64,
223 num_documents: u64,
224}
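
// A serialized configuration might look roughly like this (the numbers are
// hypothetical); it is written by `IndexWriter::finish` via `serde_json`:
//
//     {
//       "ngram_type": "window",
//       "ngram_size": 3,
//       "avg_document_len": 14.2,
//       "num_documents": 1500000
//     }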
225
226impl IndexReader {
227 /// Open a name index in the given directory.
228 pub fn open<P: AsRef<Path>>(dir: P) -> Result<IndexReader> {
229 let dir = dir.as_ref();
230
231 // All of the following open memory maps. We claim this is safe because
232 // we never mutate them and assume no other process does either.
233 let ngram = unsafe { fst_map_file(dir.join(NGRAM))? };
234 let postings = unsafe { mmap_file(dir.join(POSTINGS))? };
235 let idmap = unsafe { mmap_file(dir.join(IDMAP))? };
236 let norms = unsafe { mmap_file(dir.join(NORMS))? };
237
238 let config_file = open_file(dir.join(CONFIG))?;
239 let config: Config = serde_json::from_reader(config_file)
240 .map_err(|e| Error::config(e.to_string()))?;
241 Ok(IndexReader { config, ngram, postings, idmap, norms })
242 }
243
244 /// Execute a search.
245 pub fn search(&self, query: &NameQuery) -> SearchResults<NameID> {
246 let start = Instant::now();
247 let mut searcher = Searcher::new(self, query);
248 let results = CollectTopK::new(query.size).collect(&mut searcher);
249 log::debug!(
250 "search for {:?} took {}",
251 query,
252 NiceDuration::since(start)
253 );
254 results
255 }
256
257 /// Return the name ID used to index the given document id.
258 ///
259 /// This panics if the given document id does not correspond to an indexed
260 /// document.
261 fn docid_to_nameid(&self, docid: DocID) -> NameID {
262 let start = 8 * (docid as usize);
263 let buf = self.idmap[start..start + 8].try_into().unwrap();
264 u64::from_le_bytes(buf)
265 }
266
267 /// Return the length, in terms, of the given document.
268 ///
269 /// This panics if the given document id does not correspond to an indexed
270 /// document.
271 fn document_length(&self, docid: DocID) -> u64 {
272 let start = 2 * (docid as usize);
273 let buf = self.norms[start..start + 2].try_into().unwrap();
274 u16::from_le_bytes(buf) as u64
275 }
276}
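
// A small end-to-end sketch (the directory path is hypothetical): open an
// existing name index and run a query against it.
//
//     let reader = IndexReader::open("/path/to/index")?;
//     let results = reader.search(&NameQuery::new("bruce willis"));
//     for scored in results.into_vec() {
//         println!("{} {}", scored.score(), scored.value());
//     }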
277
278/// A collector for gathering the top K results from a search.
279///
280/// This maintains a min-heap of search results. When a new result is
281/// considered, it is compared against the worst result in the heap. If the
282/// candidate is worse, then it is discarded. Otherwise, it is shuffled into
283/// the heap.
284struct CollectTopK {
285 /// The total number of hits to collect.
286 k: usize,
287 /// The min-heap, according to score. Note that since BinaryHeap is a
288 /// max-heap by default, we reverse the comparison to get a min-heap.
289 queue: BinaryHeap<cmp::Reverse<Scored<NameID>>>,
290 /// A set for deduplicating results. Namely, multiple doc IDs can map to
291 /// the same name ID. This set makes sure we only collect one name ID.
292 ///
293 /// We map name IDs to scores. In this way, we always report the best
294 /// scoring match.
295 byid: FnvHashMap<NameID, f64>,
296}
297
298impl CollectTopK {
299 /// Build a new collector that collects at most `k` results.
300 fn new(k: usize) -> CollectTopK {
301 CollectTopK {
302 k,
303 queue: BinaryHeap::with_capacity(k),
304 byid: FnvHashMap::default(),
305 }
306 }
307
308 /// Collect the top K results from the given searcher using the given
309 /// index reader. Return the results with normalized scores sorted in
310 /// order of best-to-worst.
311 fn collect(mut self, searcher: &mut Searcher) -> SearchResults<NameID> {
312 if self.k == 0 {
313 return SearchResults::new();
314 }
315 let index = searcher.index();
316 let (mut count, mut push_count) = (0, 0);
317 for scored_with_docid in searcher {
318 count += 1;
319 let scored = scored_with_docid.map(|v| index.docid_to_nameid(v));
320 // Since multiple names can correspond to a single IMDb title,
321 // we dedup our results here. That is, if our result set
322 // already contains this result, then update the score if need
323 // be, and then move on.
324 if let Some(&score) = self.byid.get(scored.value()) {
325 if scored.score() > score {
326 self.byid.insert(*scored.value(), scored.score());
327 }
328 continue;
329 }
330
331 let mut dopush = self.queue.len() < self.k;
332 if !dopush {
333 // This unwrap is OK because k > 0 and queue is non-empty.
334 let worst = self.queue.peek_mut().unwrap();
335 // If our queue is full, then we should only push if this
336 // doc id has a better score than the worst one in the queue.
337 if worst.0 < scored {
338 self.byid.remove(worst.0.value());
339 binary_heap::PeekMut::pop(worst);
340 dopush = true;
341 }
342 }
343 if dopush {
344 push_count += 1;
345 self.byid.insert(*scored.value(), scored.score());
346 self.queue.push(cmp::Reverse(scored));
347 }
348 }
349 log::debug!(
350 "collect count: {:?}, collect push count: {:?}",
351 count,
352 push_count
353 );
354
355 // Pull out the results from our heap and normalize the scores.
356 let mut results = SearchResults::from_min_heap(&mut self.queue);
357 results.normalize();
358 results
359 }
360}
361
362/// A searcher for resolving fulltext queries.
363///
364/// A searcher takes a fulltext query, usually typed by an end user, along with
365/// a scoring function and produces a stream of matching results with scores
366/// computed via the provided function. Results are always yielded in
367/// ascending order with respect to document IDs, which are internal IDs
368/// assigned to each name in the index.
369///
370/// This searcher combines a bit of smarts to handle stop words, usually
371/// referred to as "dynamic stop word detection." Namely, after the searcher
372/// splits the query into ngrams, it partitions the ngrams into infrequently
373/// occurring ngrams and frequently occurring ngrams, according to the query's
374/// stop word ratio. Each group is then turned into a `Disjunction`
375/// query. The searcher then visits every doc ID that matches the infrequently
376/// occurring disjunction. When a score is computed for a doc ID, then its
377/// score is increased if the frequently occurring disjunction also contains
378/// that same doc ID. Otherwise, the frequently occurring disjunction isn't
379/// consulted at all, which permits skipping the score calculation for a
380/// potentially large number of doc IDs.
381///
382/// When two partitions cannot be created (e.g., all of the terms are
383/// infrequently occurring or all of the terms are frequently occurring), then
384/// only one disjunction query is used and no skipping logic is employed. That
385/// means that a query consisting of all high frequency terms could be quite
386/// slow.
387///
388/// This does of course sacrifice recall for a performance benefit, but so do
389/// all filtering strategies based on stop words. The benefit of this "dynamic"
390/// approach is that stop word detection is tailored exactly to the corpus, and
391/// that stop words can still influence scoring. That means queries like "the
392/// matrix" will match "The Matrix" better than "Matrix" (which is a legitimate
393/// example, try it).
394struct Searcher<'i> {
395 /// A handle to the index.
396 index: &'i IndexReader,
397 /// The primary disjunction query that drives results. Typically, this
398 /// corresponds to the infrequent terms in the query.
399 primary: Disjunction<'i>,
400 /// A disjunction of only high frequency terms. When the query consists
401 /// of exclusively high frequency terms, then this is empty (which matches
402 /// nothing) and `primary` is set to the disjunction of terms.
403 high: Disjunction<'i>,
404}
405
406impl<'i> Searcher<'i> {
407 /// Create a new searcher.
408 fn new(idx: &'i IndexReader, query: &NameQuery) -> Searcher<'i> {
409 let num_docs = idx.config.num_documents as f64;
410 let (mut low, mut high) = (vec![], vec![]);
411 let (mut low_terms, mut high_terms) = (vec![], vec![]);
412
413 let name = normalize_query(&query.name);
414 let mut query_len = 0;
415 let mut multiset = FnvHashMap::default();
416 idx.config.ngram_type.iter(idx.config.ngram_size, &name, |term| {
417 *multiset.entry(term).or_insert(0) += 1;
418 query_len += 1;
419 });
420 for (term, &count) in multiset.iter() {
421 let postings = PostingIter::new(idx, query.scorer, count, term);
422 let ratio = (postings.len() as f64) / num_docs;
423 if ratio < query.stop_word_ratio {
424 low.push(postings);
425 low_terms.push(format!("{}:{}:{:0.6}", term, count, ratio));
426 } else {
427 high.push(postings);
428 high_terms.push(format!("{}:{}:{:0.6}", term, count, ratio));
429 }
430 }
431 log::debug!("starting search for: {:?}", name);
432 log::debug!("{:?} low frequency terms: {:?}", low.len(), low_terms);
433 log::debug!("{:?} high frequency terms: {:?}", high.len(), high_terms);
434
435 if low.is_empty() {
436 Searcher {
437 index: idx,
438 primary: Disjunction::new(idx, query_len, query.scorer, high),
439 high: Disjunction::empty(idx, query.scorer),
440 }
441 } else {
442 Searcher {
443 index: idx,
444 primary: Disjunction::new(idx, query_len, query.scorer, low),
445 high: Disjunction::new(idx, query_len, query.scorer, high),
446 }
447 }
448 }
449
450 /// Return a reference to the underlying index reader.
451 fn index(&self) -> &'i IndexReader {
452 self.index
453 }
454}
455
456impl<'i> Iterator for Searcher<'i> {
457 type Item = Scored<DocID>;
458
459 fn next(&mut self) -> Option<Scored<DocID>> {
460 // This is pretty simple. We drive the iterator via the primary
461 // disjunction, which is usually a disjunction of infrequently
462 // occurring ngrams.
463 let mut scored = match self.primary.next() {
464 None => return None,
465 Some(scored) => scored,
466 };
467 // We then skip our frequently occurring disjunction to the doc ID
468 // yielded above. Any frequently occurring ngrams found then improve
469 // this score. This makes queries like 'the matrix' match 'The Matrix'
470 // better than 'Matrix'.
471 if let Some(other_scored) = self.high.skip_to(*scored.value()) {
472 scored = scored.map_score(|s| s + other_scored.score());
473 }
474 Some(scored)
475 }
476}
477
478/// A disjunction over a collection of ngrams. A disjunction yields scored
479/// document IDs for every document that contains any of the terms in this
480/// disjunction. The more ngrams that match the document in the disjunction,
481/// the better the score.
482struct Disjunction<'i> {
483 /// A handle to the underlying index that we're searching.
484 index: &'i IndexReader,
485 /// The number of ngrams in the original query.
486 ///
487 /// This is not necessarily equivalent to the number of ngrams in this
488 /// specific disjunction. Namely, this is used to compute scores, and it
489 /// is important that scores are computed using the total number of ngrams
490 /// and not the number of ngrams in a specific disjunction. For example,
491 /// if a query consisted of 8 infrequent ngrams and 1 frequent ngram, then
492 /// the disjunction containing the single frequent ngram would contribute a
493 /// disproportionately high score.
494 query_len: f64,
495 /// The scoring function to use.
496 scorer: NameScorer,
497 /// A min-heap of posting iterators. Each posting iterator corresponds to
498 /// an iterator over (doc ID, frequency) pairs for a single ngram, sorted
499 /// by doc ID in ascending order.
500 ///
501 /// A min-heap is a classic way of optimally computing a disjunction over
502 /// an arbitrary number of ordered streams.
503 queue: BinaryHeap<PostingIter<'i>>,
504 /// Whether this disjunction has been exhausted or not.
505 is_done: bool,
506}
507
508impl<'i> Disjunction<'i> {
509 /// Create a new disjunction over the given posting iterators.
510 fn new(
511 index: &'i IndexReader,
512 query_len: usize,
513 scorer: NameScorer,
514 posting_iters: Vec<PostingIter<'i>>,
515 ) -> Disjunction<'i> {
516 let mut queue = BinaryHeap::new();
517 for postings in posting_iters {
518 queue.push(postings);
519 }
520 let is_done = queue.is_empty();
521 let query_len = query_len as f64;
522 Disjunction { index, query_len, scorer, queue, is_done }
523 }
524
525 /// Create an empty disjunction that never matches anything.
526 fn empty(index: &'i IndexReader, scorer: NameScorer) -> Disjunction<'i> {
527 Disjunction {
528 index,
529 query_len: 0.0,
530 scorer,
531 queue: BinaryHeap::new(),
532 is_done: true,
533 }
534 }
535
536 /// Skip this disjunction such that all posting iterators are either
537 /// exhausted or positioned beyond the given doc ID.
538 ///
539 /// If any posting iterator contains the given doc ID, then it is scored
540 /// and returned. The score incorporates all posting iterators that contain
541 /// the given doc ID.
542 fn skip_to(&mut self, target_docid: DocID) -> Option<Scored<DocID>> {
543 if self.is_done {
544 return None;
545 }
546 let mut found = false;
547 // Loop invariant: loop until all posting iterators are either
548 // positioned directly at the target doc ID (in which case, `found`
549 // is set to true) or beyond the target doc ID. If none of the
550 // iterators contain the target doc ID, then `found` remains false.
551 loop {
552 // This unwrap is OK because we're only here if we have a
553 // non-empty queue.
554 let mut postings = self.queue.peek_mut().unwrap();
555 if postings.docid().map_or(true, |x| x >= target_docid) {
556 found = found || Some(target_docid) == postings.docid();
557 // This is the smallest posting iterator, which means all
558 // iterators are now either at or beyond target_docid.
559 break;
560 }
561 // Skip through this iterator until we're at or beyond the target
562 // doc ID.
563 while postings.docid().map_or(false, |x| x < target_docid) {
564 postings.next();
565 }
566 found = found || Some(target_docid) == postings.docid();
567 }
568 if !found {
569 return None;
570 }
571 // We're here if we found our target doc ID, which means at least one
572 // posting iterator is pointing to the doc ID and it is necessarily
573 // the minimum doc ID of all the posting iterators in this disjunction.
574 // Therefore, advance such that all posting iterators are beyond the
575 // target doc ID.
576 //
577 // (If we didn't find the target doc ID, then the loop invariant above
578 // guarantees that we are already past the target doc ID.)
579 self.next()
580 }
581}
582
583impl<'i> Iterator for Disjunction<'i> {
584 type Item = Scored<DocID>;
585
586 fn next(&mut self) -> Option<Scored<DocID>> {
587 if self.is_done {
588 return None;
589 }
590 // Find the next matching doc ID from the smallest posting iterator.
591 let mut scored1 = {
592 // This unwrap is OK because we're only here if we have a
593 // non-empty queue.
594 let mut postings = self.queue.peek_mut().unwrap();
595 match postings.score() {
596 None => {
597 self.is_done = true;
598 return None;
599 }
600 Some(scored) => {
601 postings.next();
602 scored
603 }
604 }
605 };
606 // Discover if any of the other posting iterators also contain this
607 // doc ID.
608 loop {
609 // This unwrap is OK because we're only here if we have a
610 // non-empty queue.
611 let mut postings = self.queue.peek_mut().unwrap();
612 match postings.score() {
613 None => break,
614 Some(scored2) => {
615 // If the smallest posting iterator's doc ID isn't equal to
616 // the doc ID found above, then we've found all of the
617 // matching terms for this doc ID that we'll find.
618 if scored1.value() != scored2.value() {
619 break;
620 }
621 scored1 = scored1.map_score(|s| s + scored2.score());
622 postings.next();
623 }
624 }
625 }
626 // Some of our scorers are more convenient to compute at the
627 // disjunction level rather than at the term level.
628 if let NameScorer::Jaccard = self.scorer {
629 // When using Jaccard, the score returned by the posting
630 // iterator is always 1. Thus, `scored.score` represents the
631 // total number of terms that matched this document. In other
632 // words, it is the cardinality of the intersection of terms
633 // between the query and our candidate, `|A ∩ B|`.
634 //
635 // `query_len` represents the total number of terms in our query
636 // (not just the number of terms in this disjunction!), and
637 // `doc_len` represents the total number of terms in our candidate.
638 // Thus, since `|A ∪ B| = |A| + |B| - |A ∩ B|`, we have that
639 // `|A ∪ B| = query_len + doc_len - scored.score`. And finally, the
640 // Jaccard index is `|A ∩ B| / |A ∪ B|`.
641 let doc_len = self.index.document_length(*scored1.value()) as f64;
642 let union = self.query_len + doc_len - scored1.score();
643 scored1 = scored1.map_score(|s| s / union);
644 } else if let NameScorer::QueryRatio = self.scorer {
645 // This is like Jaccard, but our score is computed purely as the
646 // ratio of query terms that matched this document.
647 scored1 = scored1.map_score(|s| s / self.query_len)
648 }
649 Some(scored1)
650 }
651}
652
653/// An iterator over a postings list for a specific ngram.
654///
655/// A postings list is a sequence of pairs, where each pair has a document
656/// ID and a frequency. The document ID indicates that the ngram is in the
657/// text indexed for that ID, and the frequency counts the number of times
658/// that ngram occurs in the document.
659///
660/// To save space, each pair is encoded using 32 bits. Frequencies are capped
661/// at a maximum of 15, which fits into the high 4 bits. The low 28 bits contain
662/// the doc ID.
663///
664/// The postings list starts with a single 32-bit little endian
665/// integer that represents the document frequency of the ngram. This in turn
666/// determines how many pairs to read. In other words, a posting list is a
667/// length prefixed array of 32-bit little-endian integer values.
668///
669/// This type is intended to be used in a max-heap, and orients its Ord
670/// definition such that the heap becomes a min-heap. The ordering criterion
671/// is derived only from the doc ID.
672#[derive(Clone)]
673struct PostingIter<'i> {
674 /// A handle to the underlying index.
675 index: &'i IndexReader,
676 /// The scoring function to use.
677 scorer: NameScorer,
678 /// The number of times the term for these postings appeared in the
679 /// original query. This increases the score proportionally.
680 count: f64,
681 /// The raw bytes of the posting list. The number of bytes is
682 /// exactly equivalent to `4 * document-frequency(ngram)`, where
683 /// `document-frequency(ngram)` is the total number of documents in which
684 /// `ngram` occurs.
685 ///
686 /// This does not include the length prefix.
687 postings: &'i [u8],
688 /// The document frequency of this term.
689 len: usize,
690 /// The current posting. This is `None` once this iterator is exhausted.
691 posting: Option<Posting>,
692 /// A docid used for sorting postings. When the iterator is exhausted,
693 /// this is greater than the maximum doc id. Otherwise, this is always
694 /// equivalent to posting.docid.
695 ///
696 /// We do this for efficiency by avoiding going through the optional
697 /// Posting.
698 docid: DocID,
699 /// The OkapiBM25 IDF score. This is invariant across all items in a
700 /// posting list, so we compute it once at construction. This saves a
701 /// call to `log` for every doc ID visited.
702 okapi_idf: f64,
703}
704
705/// A single entry in a posting list.
706#[derive(Clone, Copy, Debug)]
707struct Posting {
708 /// The document id.
709 docid: DocID,
710 /// The frequency, i.e., the number of times the ngram occurred in the
711 /// document identified by the docid.
712 frequency: u32,
713}
714
715impl Posting {
716 /// Read the next posting pair (doc ID and frequency) from the given
717 /// postings list. If the list is empty, then return `None`.
718 fn read(slice: &[u8]) -> Option<Posting> {
719 if slice.is_empty() {
720 None
721 } else {
722 let v = read_le_u32(slice);
723 Some(Posting { docid: v & MAX_DOC_ID, frequency: v >> 28 })
724 }
725 }
726}
727
728impl<'i> PostingIter<'i> {
729 /// Create a new posting iterator for the given term in the given index.
730 /// Scores will be computed with the given scoring function.
731 ///
732 /// `count` should be the number of times this term occurred in the
733 /// original query string.
734 fn new(
735 index: &'i IndexReader,
736 scorer: NameScorer,
737 count: usize,
738 term: &str,
739 ) -> PostingIter<'i> {
740 let mut postings = &*index.postings;
741 let offset = match index.ngram.get(term.as_bytes()) {
742 Some(offset) => offset as usize,
743 None => {
744 // If the term isn't in the index, then return an exhausted
745 // iterator.
746 return PostingIter {
747 index,
748 scorer,
749 count: 0.0,
750 postings: &[],
751 len: 0,
752 posting: None,
753 docid: MAX_DOC_ID + 1,
754 okapi_idf: 0.0,
755 };
756 }
757 };
758 postings = &postings[offset..];
759 let len = read_le_u32(postings) as usize;
760 postings = &postings[4..];
761
762 let corpus_count = index.config.num_documents as f64;
763 let df = len as f64;
764 let okapi_idf = (1.0 + (corpus_count - df + 0.5) / (df + 0.5)).log2();
765 let mut it = PostingIter {
766 index,
767 scorer,
768 count: count as f64,
769 postings: &postings[..4 * len],
770 len,
771 posting: None,
772 docid: 0,
773 okapi_idf,
774 };
775 // Advance to the first posting.
776 it.next();
777 it
778 }
779
780 /// Return the current posting. If this iterator has been exhausted, then
781 /// this returns `None`.
782 fn posting(&self) -> Option<Posting> {
783 self.posting
784 }
785
786 /// Returns the document frequency for the term corresponding to these
787 /// postings.
788 fn len(&self) -> usize {
789 self.len
790 }
791
792 /// Return the current document ID. If this iterator has been exhausted,
793 /// then this returns `None`.
794 fn docid(&self) -> Option<DocID> {
795 self.posting().map(|p| p.docid)
796 }
797
798 /// Return the score with the current document ID. If this iterator has
799 /// been exhausted, then this returns `None`.
800 fn score(&self) -> Option<Scored<DocID>> {
801 match self.scorer {
802 NameScorer::OkapiBM25 => self.score_okapibm25(),
803 NameScorer::TFIDF => self.score_tfidf(),
804 NameScorer::Jaccard => self.score_jaccard(),
805 NameScorer::QueryRatio => self.score_query_ratio(),
806 }
807 .map(|scored| scored.map_score(|s| s * self.count))
808 }
809
810 /// Score the current doc ID using Okapi BM25. It's similar to TF-IDF,
811 /// but uses a document length normalization term.
812 fn score_okapibm25(&self) -> Option<Scored<DocID>> {
813 let post = match self.posting() {
814 None => return None,
815 Some(post) => post,
816 };
817
818 let k1 = 1.2;
819 let b = 0.75;
820 let doc_len = self.index.document_length(post.docid);
821 let norm = (doc_len as f64) / self.index.config.avg_document_len;
822 let tf = post.frequency as f64;
823
824 let num = tf * (k1 + 1.0);
825 let den = tf + k1 * (1.0 - b + b * norm);
826 let score = (num / den) * self.okapi_idf;
827 let capped = if score < 0.0 { 0.0 } else { score };
828 Some(Scored::new(post.docid).with_score(capped))
829 }
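
    // For reference, the per-term score computed above follows the usual
    // Okapi BM25 shape (with k1 = 1.2 and b = 0.75):
    //
    //     score(term, doc) = idf * (tf * (k1 + 1))
    //                        / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    //
    // where idf = log2(1 + (N - df + 0.5) / (df + 0.5)) is precomputed in
    // `PostingIter::new` as `okapi_idf`.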
830
831 /// Score the current doc ID using the traditional TF-IDF ranking function.
832 fn score_tfidf(&self) -> Option<Scored<DocID>> {
833 let post = match self.posting() {
834 None => return None,
835 Some(post) => post,
836 };
837
838 let corpus_docs = self.index.config.num_documents as f64;
839 let term_docs = self.len as f64;
840 let tf = post.frequency as f64;
841 let idf = (corpus_docs / (1.0 + term_docs)).log2();
842 let score = tf * idf;
843 Some(Scored::new(post.docid).with_score(score))
844 }
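
    // For reference, the classical formulation computed above is:
    //
    //     score(term, doc) = tf * log2(N / (1 + df))
    //
    // where `N` is the total number of documents in the index and `df` is the
    // number of documents containing the term.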
845
846 /// Score the current doc ID using the Jaccard index, which measures the
847 /// overlap between two sets.
848 ///
849 /// Note that this always returns `1.0`. The Jaccard index itself must be
850 /// computed by the disjunction scorer.
851 fn score_jaccard(&self) -> Option<Scored<DocID>> {
852 self.posting().map(|p| Scored::new(p.docid).with_score(1.0))
853 }
854
855 /// Score the current doc ID using the ratio of terms in the query that
856 /// matched the terms in this doc ID.
857 ///
858 /// Note that this always returns `1.0`. The query ratio itself must be
859 /// computed by the disjunction scorer.
860 fn score_query_ratio(&self) -> Option<Scored<DocID>> {
861 self.posting().map(|p| Scored::new(p.docid).with_score(1.0))
862 }
863}
864
865impl<'i> Iterator for PostingIter<'i> {
866 type Item = Posting;
867
868 fn next(&mut self) -> Option<Posting> {
869 self.posting = match Posting::read(self.postings) {
870 None => {
871 self.docid = MAX_DOC_ID + 1;
872 None
873 }
874 Some(p) => {
875 self.postings = &self.postings[4..];
876 self.docid = p.docid;
877 Some(p)
878 }
879 };
880 self.posting
881 }
882}
883
884impl<'i> Eq for PostingIter<'i> {}
885
886impl<'i> PartialEq for PostingIter<'i> {
887 fn eq(&self, other: &PostingIter<'i>) -> bool {
888 self.docid == other.docid
889 }
890}
891
892impl<'i> Ord for PostingIter<'i> {
893 fn cmp(&self, other: &PostingIter<'i>) -> cmp::Ordering {
894 // std::collections::BinaryHeap is a max-heap and we need a
895 // min-heap, so write this as if it were a max-heap, then reverse it.
896 // Note that exhausted searchers should always have the lowest
897 // priority, and therefore, be considered maximal.
898 self.docid.cmp(&other.docid).reverse()
899 }
900}
901
902impl<'i> PartialOrd for PostingIter<'i> {
903 fn partial_cmp(&self, other: &PostingIter<'i>) -> Option<cmp::Ordering> {
904 Some(self.cmp(other))
905 }
906}
907
908/// A writer for indexing names to disk.
909///
910/// A writer opens and writes to several files simultaneously, which keeps the
911/// implementation simple.
912///
913/// The index writer cannot stream the postings or term index, since the term
914/// index requires its ngrams to be inserted in sorted order. Postings lists
915/// are written as length prefixed sequences, so we need to know the lengths
916/// of all our postings lists before writing them.
917pub struct IndexWriter {
918 /// A builder for the ngram term index.
919 ///
920 /// This isn't used until the caller indicates that it is done indexing
921 /// names. At which point, we insert all ngrams into the FST in sorted
922 /// order. Each ngram is mapped to the beginning of its corresponding
923 /// postings list.
924 ngram: fst::MapBuilder<io::BufWriter<File>>,
925 /// The type of ngram extraction to use.
926 ngram_type: NgramType,
927 /// The size of ngrams to generate.
928 ngram_size: usize,
929 /// A writer for postings lists.
930 ///
931 /// This isn't written to until the caller indicates that it is done
932 /// indexing names. At which point, every posting list is written as a
933 /// length prefixed array, in the same order that terms are written to the
934 /// term index.
935 postings: CursorWriter<io::BufWriter<File>>,
936 /// A map from document ID to name ID. This is written to in a streaming
937 /// fashion during indexing. The ID map consists of N 64-bit little
938 /// endian integers, where N is the total number of names indexed.
939 ///
940 /// The document ID (the position in this map) is a unique internal
941 /// identifier assigned to each name, while the name ID is an identifier
942 /// provided by the caller. Multiple document IDs may map to the same
943 /// name ID (e.g., for indexing alternate names).
944 idmap: CursorWriter<io::BufWriter<File>>,
945 /// A map from document ID to document length, where the length corresponds
946 /// to the number of ngrams in the document. The map consists of N 16-bit
947 /// little endian integers, where N is the total number of names indexed.
948 ///
949 /// The document lengths are used at query time as normalization
950 /// parameters. They are written in a streaming fashion during the indexing
951 /// process.
952 norms: CursorWriter<io::BufWriter<File>>,
953 /// A JSON formatted configuration file that includes some aggregate
954 /// statistics (such as the average document length, in ngrams) and the
955 /// ngram configuration. The ngram configuration in particular is used at
956 /// query time to make sure that query-time uses the same analysis as
957 /// index-time.
958 ///
959 /// This is written at the end of the indexing process.
960 config: CursorWriter<io::BufWriter<File>>,
961 /// An in-memory map from ngram to its corresponding postings list. Once
962 /// indexing is done, this is written to disk via the FST term index and
963 /// postings list writers documented above.
964 terms: FnvHashMap<String, Postings>,
965 /// The next document ID, starting at 0. Each name added gets assigned its
966 /// own unique document ID. Queries read document IDs from the postings
967 /// list, but are mapped back to name IDs using the `idmap` before being
968 /// returned to the caller.
969 next_docid: DocID,
970 /// The average document length, in ngrams, for every name indexed. This is
971 /// used along with document lengths to compute normalization terms for
972 /// scoring at query time.
973 avg_document_len: f64,
974}
975
976/// A single postings list.
977#[derive(Clone, Debug, Default)]
978struct Postings {
979 /// A sorted list of postings, in order of ascending document IDs.
980 list: Vec<Posting>,
981}
982
983impl IndexWriter {
984 /// Open an index for writing to the given directory. Any previous name
985 /// index in the given directory is overwritten.
986 ///
987 /// The given ngram configuration is used to transform all indexed names
988 /// into terms for the inverted index.
989 pub fn open<P: AsRef<Path>>(
990 dir: P,
991 ngram_type: NgramType,
992 ngram_size: usize,
993 ) -> Result<IndexWriter> {
994 let dir = dir.as_ref();
995
996 let ngram = fst_map_builder_file(dir.join(NGRAM))?;
997 let postings = CursorWriter::from_path(dir.join(POSTINGS))?;
998 let idmap = CursorWriter::from_path(dir.join(IDMAP))?;
999 let norms = CursorWriter::from_path(dir.join(NORMS))?;
1000 let config = CursorWriter::from_path(dir.join(CONFIG))?;
1001 Ok(IndexWriter {
1002 ngram,
1003 ngram_type,
1004 ngram_size,
1005 postings,
1006 idmap,
1007 norms,
1008 config,
1009 terms: FnvHashMap::default(),
1010 next_docid: 0,
1011 avg_document_len: 0.0,
1012 })
1013 }
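
    // A small usage sketch (the directory and names are hypothetical): index
    // two name variants for the same record and then finalize the index.
    //
    //     let mut wtr = IndexWriter::open("/path/to/index", NgramType::Window, 3)?;
    //     wtr.insert(0, "The Matrix")?;
    //     wtr.insert(0, "La matrice")?;
    //     wtr.finish()?;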
1014
1015 /// Finish writing names and serialize the index to disk.
1016 pub fn finish(mut self) -> Result<()> {
1017 let num_docs = self.num_docs();
1018 let mut ngram_to_postings: Vec<(String, Postings)> =
1019 self.terms.into_iter().collect();
1020 // We could use a BTreeMap and get out our keys in sorted order, but
1021 // the overhead of inserting into the BTreeMap dwarfs the savings we
1022 // get from pre-sorted keys.
1023 ngram_to_postings.sort_by(|&(ref t1, _), &(ref t2, _)| t1.cmp(t2));
1024
1025 for (term, postings) in ngram_to_postings {
1026 let pos = self.postings.position() as u64;
1027 self.ngram.insert(term.as_bytes(), pos).map_err(Error::fst)?;
1028 self.postings
1029 .write_u32(postings.list.len() as u32)
1030 .map_err(Error::io)?;
1031 for posting in postings.list {
1032 let freq = cmp::min(15, posting.frequency);
1033 let v = (freq << 28) | posting.docid;
1034 self.postings.write_u32(v).map_err(Error::io)?;
1035 }
1036 }
1037
1038 serde_json::to_writer_pretty(
1039 &mut self.config,
1040 &Config {
1041 ngram_type: self.ngram_type,
1042 ngram_size: self.ngram_size,
1043 avg_document_len: self.avg_document_len,
1044 num_documents: num_docs as u64,
1045 },
1046 )
1047 .map_err(|e| Error::config(e.to_string()))?;
1048 self.ngram.finish().map_err(Error::fst)?;
1049 self.idmap.flush().map_err(Error::io)?;
1050 self.postings.flush().map_err(Error::io)?;
1051 self.norms.flush().map_err(Error::io)?;
1052 self.config.flush().map_err(Error::io)?;
1053 Ok(())
1054 }
1055
1056 /// Inserts the given name into this index, and associates it with the
1057 /// provided `NameID`. Multiple names may be associated with the same
1058 /// `NameID`.
1059 pub fn insert(&mut self, name_id: NameID, name: &str) -> Result<()> {
1060 let docid = self.next_docid(name_id)?;
1061 let name = normalize_query(name);
1062 let mut count = 0u16; // document length in number of ngrams
1063 self.ngram_type.clone().iter(self.ngram_size, &name, |ngram| {
1064 self.insert_term(docid, ngram);
1065 // If a document length exceeds 2^16 - 1, then it is far too long
1066 // for a name anyway, so we cap it there via `saturating_add`.
1067 count = count.saturating_add(1);
1068 });
1069 // Update our running mean document length (in ngrams): avg += (x - avg) / n.
1070 self.avg_document_len +=
1071 (count as f64 - self.avg_document_len) / (self.num_docs() as f64);
1072 // Write the document length to disk, which is used as a normalization
1073 // term for some scorers (like Okapi-BM25).
1074 self.norms.write_u16(count).map_err(Error::io)?;
1075 Ok(())
1076 }
1077
1078 /// Add a single term that is part of a name identified by the given docid.
1079 /// This updates the postings for this term, or creates a new posting if
1080 /// this is the first time this term has been seen.
1081 fn insert_term(&mut self, docid: DocID, term: &str) {
1082 if let Some(posts) = self.terms.get_mut(term) {
1083 posts.posting(docid).frequency += 1;
1084 return;
1085 }
1086 let mut list = Postings::default();
1087 list.posting(docid).frequency = 1;
1088 self.terms.insert(term.to_string(), list);
1089 }
1090
1091 /// Retrieve a fresh doc id, and associate it with the given name id.
1092 fn next_docid(&mut self, name_id: NameID) -> Result<DocID> {
1093 let docid = self.next_docid;
1094 self.idmap.write_u64(name_id).map_err(Error::io)?;
1095 self.next_docid = match self.next_docid.checked_add(1) {
1096 None => bug!("exhausted doc ids"),
1097 Some(next_docid) => next_docid,
1098 };
1099 if self.next_docid > MAX_DOC_ID {
1100 let max = MAX_DOC_ID + 1; // docids are 0-indexed
1101 bug!("exceeded maximum number of names ({})", max);
1102 }
1103 Ok(docid)
1104 }
1105
1106 /// Return the total number of documents that have been assigned doc ids.
1107 fn num_docs(&self) -> u32 {
1108 self.next_docid
1109 }
1110}
1111
1112impl Postings {
1113 /// Return a mutable reference to the posting for the given docid. If one
1114 /// doesn't exist, then create one (with a zero frequency) and return it.
1115 fn posting(&mut self, docid: DocID) -> &mut Posting {
1116 if self.list.last().map_or(true, |x| x.docid != docid) {
1117 self.list.push(Posting { docid, frequency: 0 });
1118 }
1119 // This unwrap is OK because if the list was empty when this method was
1120 // called, then we added an element above, and is thus now non-empty.
1121 self.list.last_mut().unwrap()
1122 }
1123}
1124
1125/// The type of scorer that the name index should use.
1126///
1127/// The default is OkapiBM25. If you aren't sure which scorer to use, then
1128/// stick with the default.
1129#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
1130pub enum NameScorer {
1131 /// OkapiBM25 is a TF-IDF-like ranking function, which takes name length
1132 /// into account.
1133 OkapiBM25,
1134 /// TFIDF is the traditional TF-IDF ranking function, which does not
1135 /// incorporate document length.
1136 TFIDF,
1137 /// Jaccard is a ranking function determined by computing the similarity
1138 /// of ngrams between the query and a name in the index. The similarity
1139 /// is computed by dividing the number of ngrams in common by the total
1140 /// number of distinct ngrams in both the query and the name combined.
1141 Jaccard,
1142 /// QueryRatio is a ranking function that represents the ratio of query
1143 /// terms that matched a name. It is computed by dividing the number of
1144 /// ngrams in common by the total number of ngrams in the query only.
1145 QueryRatio,
1146}
1147
1148impl NameScorer {
1149 /// Returns a list of strings representing the possible scorer values.
1150 pub fn possible_names() -> &'static [&'static str] {
1151 &["okapibm25", "tfidf", "jaccard", "queryratio"]
1152 }
1153
1154 /// Return a string representation of this scorer.
1155 ///
1156 /// The string returned can be parsed back into a `NameScorer`.
1157 pub fn as_str(&self) -> &'static str {
1158 match *self {
1159 NameScorer::OkapiBM25 => "okapibm25",
1160 NameScorer::TFIDF => "tfidf",
1161 NameScorer::Jaccard => "jaccard",
1162 NameScorer::QueryRatio => "queryratio",
1163 }
1164 }
1165}
1166
1167impl Default for NameScorer {
1168 fn default() -> NameScorer {
1169 NameScorer::OkapiBM25
1170 }
1171}
1172
1173impl fmt::Display for NameScorer {
1174 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1175 write!(f, "{}", self.as_str())
1176 }
1177}
1178
1179impl FromStr for NameScorer {
1180 type Err = Error;
1181
1182 fn from_str(s: &str) -> Result<NameScorer> {
1183 match s {
1184 "okapibm25" => Ok(NameScorer::OkapiBM25),
1185 "tfidf" => Ok(NameScorer::TFIDF),
1186 "jaccard" => Ok(NameScorer::Jaccard),
1187 "queryratio" => Ok(NameScorer::QueryRatio),
1188 unk => Err(Error::unknown_scorer(unk)),
1189 }
1190 }
1191}
1192
1193/// The style of ngram extraction to use.
1194///
1195/// The same style of ngram extraction is always used at index time and at
1196/// query time.
1197///
1198/// Each ngram type uses the ngram size configuration differently.
1199///
1200/// All ngram styles use Unicode codepoints as the definition of a character.
1201/// For example, a 3-gram might contain up to 12 bytes, if it contains 3
1202/// Unicode codepoints that each require 4 UTF-8 code units.
1203#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
1204pub enum NgramType {
1205 /// A windowing ngram.
1206 ///
1207 /// This is the traditional style of ngram, where a sliding window of size
1208 /// `N` is moved across the entire content to be indexed. For example, the
1209 /// 3-grams for the string `homer` are hom, ome and mer.
1210 #[serde(rename = "window")]
1211 Window,
1212 /// An edge ngram.
1213 ///
1214 /// This style of ngram produces ever longer ngrams, where each ngram is
1215 /// anchored to the start of a word. Words are determined simply by
1216 /// splitting whitespace.
1217 ///
1218 /// For example, the edge ngrams of `homer simpson`, where the max ngram
1219 /// size is 5, would be: hom, home, homer, sim, simp, simps. Generally,
1220 /// for this ngram type, one wants to use a large maximum ngram size.
1221 /// Perhaps somewhere close to the maximum number of ngrams in any word
1222 /// in the corpus.
1223 ///
1224 /// Note that there is no way to set the minimum ngram size (which is 3).
1225 #[serde(rename = "edge")]
1226 Edge,
1227}
1228
1229/// The minimum size of an ngram emitted by the edge ngram iterator.
1230const MIN_EDGE_NGRAM_SIZE: usize = 3;
1231
1232impl NgramType {
1233 /// Return all possible ngram types.
1234 pub fn possible_names() -> &'static [&'static str] {
1235 &["window", "edge"]
1236 }
1237
1238 /// Return a string representation of this type.
1239 pub fn as_str(&self) -> &'static str {
1240 match *self {
1241 NgramType::Window => "window",
1242 NgramType::Edge => "edge",
1243 }
1244 }
1245
1246 /// Execute the given function over each ngram in the text provided using
1247 /// the given size configuration.
1248 ///
1249 /// We don't use normal Rust iterators here because an internal iterator
1250 /// is much easier to implement.
1251 fn iter<'t, F: FnMut(&'t str)>(&self, size: usize, text: &'t str, f: F) {
1252 match *self {
1253 NgramType::Window => NgramType::iter_window(size, text, f),
1254 NgramType::Edge => NgramType::iter_edge(size, text, f),
1255 }
1256 }
1257
1258 fn iter_window<'t, F: FnMut(&'t str)>(
1259 size: usize,
1260 text: &'t str,
1261 mut f: F,
1262 ) {
1263 if size == 0 {
1264 return;
1265 }
1266 let end_skip = text.chars().take(size).count().saturating_sub(1);
1267 let start = text.char_indices();
1268 let end = text.char_indices().skip(end_skip);
1269 for ((s, _), (e, c)) in start.zip(end) {
1270 f(&text[s..e + c.len_utf8()]);
1271 }
1272 }
1273
1274 fn iter_edge<'t, F: FnMut(&'t str)>(
1275 max_size: usize,
1276 text: &'t str,
1277 mut f: F,
1278 ) {
1279 if max_size == 0 {
1280 return;
1281 }
1282 for word in text.split_whitespace() {
1283 let end_skip = word
1284 .chars()
1285 .take(MIN_EDGE_NGRAM_SIZE)
1286 .count()
1287 .saturating_sub(1);
1288 let mut size = end_skip + 1;
1289 for (end, c) in word.char_indices().skip(end_skip) {
1290 f(&word[..end + c.len_utf8()]);
1291 size += 1;
1292 if size > max_size {
1293 break;
1294 }
1295 }
1296 }
1297 }
1298}
1299
1300impl Default for NgramType {
1301 fn default() -> NgramType {
1302 NgramType::Window
1303 }
1304}
1305
1306impl fmt::Display for NgramType {
1307 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1308 write!(f, "{}", self.as_str())
1309 }
1310}
1311
1312impl FromStr for NgramType {
1313 type Err = Error;
1314
1315 fn from_str(s: &str) -> Result<NgramType> {
1316 match s {
1317 "window" => Ok(NgramType::Window),
1318 "edge" => Ok(NgramType::Edge),
1319 unk => Err(Error::unknown_ngram_type(unk)),
1320 }
1321 }
1322}
1323
1324fn normalize_query(s: &str) -> String {
1325 // We might consider doing Unicode normalization here, but it probably
1326 // doesn't matter too much on a predominantly ASCII data set.
1327 s.to_lowercase()
1328}
1329
1330fn read_le_u32(slice: &[u8]) -> u32 {
1331 u32::from_le_bytes(slice[..4].try_into().unwrap())
1332}
1333
1334#[cfg(test)]
1335mod tests {
1336 use super::*;
1337 use crate::index::tests::TestContext;
1338
1339 // Test the actual name index.
1340
1341 /// Creates a name index, where each name provided is assigned its own
1342 /// unique ID, starting at 0.
1343 fn create_index(index_dir: &Path, names: &[&str]) -> IndexReader {
1344 let mut wtr =
1345 IndexWriter::open(index_dir, NgramType::Window, 3).unwrap();
1346 for (i, name) in names.iter().enumerate() {
1347 wtr.insert(i as u64, name).unwrap();
1348 }
1349 wtr.finish().unwrap();
1350
1351 IndexReader::open(index_dir).unwrap()
1352 }
1353
1354 /// Build a name query, and disable the dynamic stop word detection.
1355 ///
1356 /// It would be nice to test the stop word detection, but it makes writing
1357 /// unit tests very difficult unfortunately.
1358 fn name_query(name: &str) -> NameQuery {
1359 NameQuery::new(name).with_stop_word_ratio(0.0)
1360 }
1361
1362 fn ids(results: &[Scored<NameID>]) -> Vec<NameID> {
1363 let mut ids: Vec<_> = results.iter().map(|r| *r.value()).collect();
1364 ids.sort();
1365 ids
1366 }
1367
1368 /// Some names involving bruce.
1369 const BRUCES: &'static [&'static str] = &[
1370 "Bruce Springsteen", // 0
1371 "Bruce Kulick", // 1
1372 "Bruce Arians", // 2
1373 "Bruce Smith", // 3
1374 "Bruce Willis", // 4
1375 "Bruce Wayne", // 5
1376 "Bruce Banner", // 6
1377 ];
1378
1379 #[test]
1380 fn names_bruces_1() {
1381 let ctx = TestContext::new("small");
1382 let idx = create_index(ctx.index_dir(), BRUCES);
1383 let query = name_query("bruce");
1384 let results = idx.search(&query).into_vec();
1385
1386 // This query matches everything.
1387 assert_eq!(results.len(), 7);
1388 // The top two hits are the shortest documents, because of Okapi-BM25's
1389 // length normalization.
1390 assert_eq!(results[0].score(), 1.0);
1391 assert_eq!(results[1].score(), 1.0);
1392 assert_eq!(ids(&results[0..2]), vec![3, 5]);
1393 }
1394
1395 #[test]
1396 fn names_bruces_2() {
1397 let ctx = TestContext::new("small");
1398 let idx = create_index(ctx.index_dir(), BRUCES);
1399 let query = name_query("e w");
1400 let results = idx.search(&query).into_vec();
1401
1402 // The 'e w' ngram is only in two documents: Bruce Willis and
1403 // Bruce Wayne. Since Wayne is shorter than Willis, it should always
1404 // be first.
1405 assert_eq!(results.len(), 2);
1406 assert_eq!(*results[0].value(), 5);
1407 assert_eq!(*results[1].value(), 4);
1408 }
1409
1410 #[test]
1411 fn names_bruces_3() {
1412 let ctx = TestContext::new("small");
1413 let idx = create_index(ctx.index_dir(), BRUCES);
1414 let query = name_query("Springsteen");
1415 let results = idx.search(&query).into_vec();
1416
1417 assert_eq!(results.len(), 1);
1418 assert_eq!(*results[0].value(), 0);
1419 }
1420
1421 #[test]
1422 fn names_bruces_4() {
1423 let ctx = TestContext::new("small");
1424 let idx = create_index(ctx.index_dir(), BRUCES);
1425 let query =
1426 name_query("Springsteen Kulick Arians Smith Willis Wayne Banner");
1427 let results = idx.search(&query).into_vec();
1428
1429 // This query should hit everything.
1430 assert_eq!(results.len(), 7);
1431 }
1432
1433 // Test our various ngram strategies.
1434
1435 fn ngrams_window(n: usize, text: &str) -> Vec<&str> {
1436 let mut grams = vec![];
1437 NgramType::Window.iter(n, text, |gram| grams.push(gram));
1438 grams
1439 }
1440
1441 fn ngrams_edge(n: usize, text: &str) -> Vec<&str> {
1442 let mut grams = vec![];
1443 NgramType::Edge.iter(n, text, |gram| grams.push(gram));
1444 grams
1445 }
1446
1447 #[test]
1448 #[should_panic]
1449 fn ngrams_window_zero_banned() {
1450 assert_eq!(ngrams_window(0, "abc"), vec!["abc"]);
1451 }
1452
1453 #[test]
1454 fn ngrams_window_weird_sizes() {
1455 assert_eq!(
1456 ngrams_window(2, "abcdef"),
1457 vec!["ab", "bc", "cd", "de", "ef",]
1458 );
1459 assert_eq!(
1460 ngrams_window(1, "abcdef"),
1461 vec!["a", "b", "c", "d", "e", "f",]
1462 );
1463 assert_eq!(ngrams_window(2, "ab"), vec!["ab",]);
1464 assert_eq!(ngrams_window(1, "ab"), vec!["a", "b",]);
1465 assert_eq!(ngrams_window(1, "a"), vec!["a",]);
1466 assert_eq!(ngrams_window(1, ""), Vec::<&str>::new());
1467 }
1468
1469 #[test]
1470 fn ngrams_window_ascii() {
1471 assert_eq!(
1472 ngrams_window(3, "abcdef"),
1473 vec!["abc", "bcd", "cde", "def",]
1474 );
1475 assert_eq!(ngrams_window(3, "abcde"), vec!["abc", "bcd", "cde",]);
1476 assert_eq!(ngrams_window(3, "abcd"), vec!["abc", "bcd",]);
1477 assert_eq!(ngrams_window(3, "abc"), vec!["abc",]);
1478 assert_eq!(ngrams_window(3, "ab"), vec!["ab",]);
1479 assert_eq!(ngrams_window(3, "a"), vec!["a",]);
1480 assert_eq!(ngrams_window(3, ""), Vec::<&str>::new());
1481 }
1482
1483 #[test]
1484 fn ngrams_window_non_ascii() {
1485 assert_eq!(
1486 ngrams_window(3, "αβγφδε"),
1487 vec!["αβγ", "βγφ", "γφδ", "φδε",]
1488 );
1489 assert_eq!(ngrams_window(3, "αβγφδ"), vec!["αβγ", "βγφ", "γφδ",]);
1490 assert_eq!(ngrams_window(3, "αβγφ"), vec!["αβγ", "βγφ",]);
1491 assert_eq!(ngrams_window(3, "αβγ"), vec!["αβγ",]);
1492 assert_eq!(ngrams_window(3, "αβ"), vec!["αβ",]);
1493 assert_eq!(ngrams_window(3, "α"), vec!["α",]);
1494 }
1495
1496 #[test]
1497 fn ngrams_edge_ascii() {
1498 assert_eq!(
1499 ngrams_edge(5, "homer simpson"),
1500 vec!["hom", "home", "homer", "sim", "simp", "simps",]
1501 );
1502 assert_eq!(ngrams_edge(5, "h"), vec!["h",]);
1503 assert_eq!(ngrams_edge(5, "ho"), vec!["ho",]);
1504 assert_eq!(ngrams_edge(5, "hom"), vec!["hom",]);
1505 assert_eq!(ngrams_edge(5, "home"), vec!["hom", "home",]);
1506 }
1507
1508 #[test]
1509 fn ngrams_edge_non_ascii() {
1510 assert_eq!(
1511 ngrams_edge(5, "δεαβγφδε δε"),
1512 vec!["δεα", "δεαβ", "δεαβγ", "δε",]
1513 );
1514 }
1515}