imdb_index/index/
mod.rs

1use std::fs;
2use std::io;
3use std::path::{Path, PathBuf};
4use std::thread;
5use std::time::Instant;
6
7use csv;
8use memmap::Mmap;
9use serde::{Deserialize, Serialize};
10use serde_json;
11
12use crate::error::{Error, Result};
13use crate::record::{Episode, Rating, Title, TitleKind};
14use crate::scored::SearchResults;
15use crate::util::{
16    create_file, csv_file, csv_mmap, open_file, NiceDuration, IMDB_BASICS,
17};
18
19pub use self::aka::AKARecordIter;
20pub use self::names::{NameQuery, NameScorer, NgramType};
21
22mod aka;
23mod episode;
24mod id;
25mod names;
26mod rating;
27#[cfg(test)]
28mod tests;
29mod writer;
30
/// On-disk format version shared by every index structure in this module.
///
/// When the version recorded alongside an index is not exactly equal to this
/// value, the index is refused at open time. The caller is then expected to
/// re-generate the index.
const VERSION: u64 = 1;

/// File name of the title ID index.
///
/// The index maps an IMDb title ID to the file offset of the corresponding
/// record in title.basics.tsv.
const TITLE: &str = "title.fst";

/// File name of the index configuration.
///
/// The configuration is a JSON document carrying meta data about the index,
/// such as its version.
const CONFIG: &str = "config.json";
51
52/// A media entity is a title with optional episode and rating records.
53///
54/// A media entity makes it convenient to deal with the complete information
55/// of an IMDb media record. This is the default value returned by search
56/// routines such as what the [`Searcher`](struct.Searcher.html) provides, and
57/// can also be cheaply constructed by an [`Index`](struct.Index.html) given a
58/// [`Title`](struct.Title.html) or an IMDb ID.
59#[derive(Clone, Debug)]
60pub struct MediaEntity {
61    title: Title,
62    episode: Option<Episode>,
63    rating: Option<Rating>,
64}
65
66impl MediaEntity {
67    /// Return a reference to the underlying `Title`.
68    pub fn title(&self) -> &Title {
69        &self.title
70    }
71
72    /// Return a reference to the underlying `Episode`, if it exists.
73    pub fn episode(&self) -> Option<&Episode> {
74        self.episode.as_ref()
75    }
76
77    /// Return a reference to the underlying `Rating`, if it exists.
78    pub fn rating(&self) -> Option<&Rating> {
79        self.rating.as_ref()
80    }
81}
82
83/// An index into IMDb titles and their associated data.
84///
85/// This index consists of a set of on disk index data structures in addition
86/// to the uncompressed IMDb `tsv` files. The on disk index structures are used
87/// to provide access to the records in the `tsv` files efficiently.
88///
89/// With this index, one can do the following things:
90///
91/// * Return a ranked list
92///   [`Title`](struct.Title.html)
93///   records matching a fuzzy name query.
94/// * Access any `Title` record by ID in constant time.
95/// * Access all
96///   [`AKA`](struct.AKA.html)
97///   records for any `Title` in constant time.
98/// * Access the
99///   [`Rating`](struct.Rating.html)
100///   for any `Title` in constant time.
101/// * Access the complete set of
102///   [`Episode`](struct.Episode.html)
103///   records for any TV show in constant time.
104/// * Access the specific `Episode` given its ID in constant time.
105#[derive(Debug)]
106pub struct Index {
107    /// The directory containing the IMDb tsv files.
108    data_dir: PathBuf,
109    /// The directory containing this crate's index structures.
110    index_dir: PathBuf,
111    /// A seekable reader for `title.basics.tsv`. The index structures
112    /// typically return offsets that can be used to seek this reader to the
113    /// beginning of any `Title` record.
114    csv_basic: csv::Reader<io::Cursor<Mmap>>,
115    /// The name index. This is what provides fuzzy queries.
116    idx_names: names::IndexReader,
117    /// The AKA index.
118    idx_aka: aka::Index,
119    /// The episode index.
120    idx_episode: episode::Index,
121    /// The rating index.
122    idx_rating: rating::Index,
123    /// The title index.
124    idx_title: id::IndexReader,
125}
126
127#[derive(Debug, Deserialize, Serialize)]
128struct Config {
129    version: u64,
130}
131
132impl Index {
133    /// Open an existing index using default settings. If the index does not
134    /// exist, or if there was a problem opening it, then this returns an
135    /// error.
136    ///
137    /// Generally, this method is cheap to call. It opens some file
138    /// descriptors, but otherwise does no work.
139    ///
140    /// `data_dir` should be the directory containing decompressed IMDb
141    /// `tsv` files. See: https://www.imdb.com/interfaces/
142    ///
143    /// `index_dir` should be the directory containing a previously created
144    /// index using `Index::create`.
145    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(
146        data_dir: P1,
147        index_dir: P2,
148    ) -> Result<Index> {
149        IndexBuilder::new().open(data_dir, index_dir)
150    }
151
152    /// Create a new index using default settings.
153    ///
154    /// Calling this method is expensive, and one should expect this to take
155    /// dozens of seconds or more to complete.
156    ///
157    /// `data_dir` should be the directory containing decompressed IMDb tsv`
158    /// `files. See: https://www.imdb.com/interfaces/
159    ///
160    /// `index_dir` should be the directory containing a previously created
161    /// index using `Index::create`.
162    ///
163    /// This will overwrite any previous index that may have existed in
164    /// `index_dir`.
165    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(
166        data_dir: P1,
167        index_dir: P2,
168    ) -> Result<Index> {
169        IndexBuilder::new().create(data_dir, index_dir)
170    }
171
172    /// Attempt to clone this index, returning a distinct `Index`.
173    ///
174    /// This is as cheap to call as `Index::open` and returns an error if there
175    /// was a problem reading the underlying index.
176    ///
177    /// This is useful when one wants to query the same `Index` on disk from
178    /// multiple threads.
179    pub fn try_clone(&self) -> Result<Index> {
180        Index::open(&self.data_dir, &self.index_dir)
181    }
182
183    /// Search this index for `Title` records whose name matches the given
184    /// query.
185    ///
186    /// The query controls the following things:
187    ///
188    /// * The name to search for.
189    /// * The maximum number of results returned.
190    /// * The scorer to use to rank results.
191    ///
192    /// The name can be any string. It is normalized and broken down into
193    /// component pieces, which are then used to quickly search all existing
194    /// titles quickly and fuzzily.
195    ///
196    /// This returns an error if there was a problem reading the index or the
197    /// underlying CSV data.
198    pub fn search(
199        &mut self,
200        query: &names::NameQuery,
201    ) -> Result<SearchResults<Title>> {
202        let mut results = SearchResults::new();
203        // The name index gives us back scores with offsets. The offset can be
204        // used to seek our `Title` CSV reader to the corresponding record and
205        // read it in constant time.
206        for result in self.idx_names.search(query) {
207            let title = match self.read_record(*result.value())? {
208                None => continue,
209                Some(title) => title,
210            };
211            results.push(result.map(|_| title));
212        }
213        Ok(results)
214    }
215
216    /// Returns the `MediaEntity` for the given IMDb ID.
217    ///
218    /// An entity includes an [`Episode`](struct.Episode.html) and
219    /// [`Rating`](struct.Rating.html) records if they exist for the title.
220    ///
221    /// This returns an error if there was a problem reading the underlying
222    /// index. If no such title exists for the given ID, then `None` is
223    /// returned.
224    pub fn entity(&mut self, id: &str) -> Result<Option<MediaEntity>> {
225        match self.title(id)? {
226            None => Ok(None),
227            Some(title) => self.entity_from_title(title).map(Some),
228        }
229    }
230
231    /// Returns the `MediaEntity` for the given `Title`.
232    ///
233    /// This is like the `entity` method, except it takes a `Title` record as
234    /// given.
235    pub fn entity_from_title(&mut self, title: Title) -> Result<MediaEntity> {
236        let episode = match title.kind {
237            TitleKind::TVEpisode => self.episode(&title.id)?,
238            _ => None,
239        };
240        let rating = self.rating(&title.id)?;
241        Ok(MediaEntity { title, episode, rating })
242    }
243
244    /// Returns the `Title` record for the given IMDb ID.
245    ///
246    /// This returns an error if there was a problem reading the underlying
247    /// index. If no such title exists for the given ID, then `None` is
248    /// returned.
249    pub fn title(&mut self, id: &str) -> Result<Option<Title>> {
250        match self.idx_title.get(id.as_bytes()) {
251            None => Ok(None),
252            Some(offset) => self.read_record(offset),
253        }
254    }
255
256    /// Returns an iterator over all `AKA` records for the given IMDb ID.
257    ///
258    /// If no AKA records exist for the given ID, then an empty iterator is
259    /// returned.
260    ///
261    /// If there was a problem reading the index, then an error is returned.
262    pub fn aka_records(&mut self, id: &str) -> Result<AKARecordIter> {
263        self.idx_aka.find(id.as_bytes())
264    }
265
266    /// Returns the `Rating` associated with the given IMDb ID.
267    ///
268    /// If no rating exists for the given ID, then this returns `None`.
269    ///
270    /// If there was a problem reading the index, then an error is returned.
271    pub fn rating(&mut self, id: &str) -> Result<Option<Rating>> {
272        self.idx_rating.rating(id.as_bytes())
273    }
274
275    /// Returns all of the episodes for the given TV show. The TV show should
276    /// be identified by its IMDb ID.
277    ///
278    /// If the given ID isn't a TV show or if the TV show doesn't have any
279    /// episodes, then an empty list is returned.
280    ///
281    /// The episodes returned are sorted in order of their season and episode
282    /// numbers. Episodes without a season or episode number are sorted after
283    /// episodes with a season or episode number.
284    ///
285    /// If there was a problem reading the index, then an error is returned.
286    pub fn seasons(&mut self, tvshow_id: &str) -> Result<Vec<Episode>> {
287        self.idx_episode.seasons(tvshow_id.as_bytes())
288    }
289
290    /// Returns all of the episodes for the given TV show and season number.
291    /// The TV show should be identified by its IMDb ID, and the season should
292    /// be identified by its number. (Season numbers generally start at `1`.)
293    ///
294    /// If the given ID isn't a TV show or if the TV show doesn't have any
295    /// episodes for the given season, then an empty list is returned.
296    ///
297    /// The episodes returned are sorted in order of their episode numbers.
298    /// Episodes without an episode number are sorted after episodes with an
299    /// episode number.
300    ///
301    /// If there was a problem reading the index, then an error is returned.
302    pub fn episodes(
303        &mut self,
304        tvshow_id: &str,
305        season: u32,
306    ) -> Result<Vec<Episode>> {
307        self.idx_episode.episodes(tvshow_id.as_bytes(), season)
308    }
309
310    /// Return the episode corresponding to the given IMDb ID.
311    ///
312    /// If the ID doesn't correspond to an episode, then `None` is returned.
313    ///
314    /// If there was a problem reading the index, then an error is returned.
315    pub fn episode(&mut self, episode_id: &str) -> Result<Option<Episode>> {
316        self.idx_episode.episode(episode_id.as_bytes())
317    }
318
319    /// Returns the data directory that this index returns results for.
320    pub fn data_dir(&self) -> &Path {
321        &self.data_dir
322    }
323
324    /// Returns the directory containing this index's files.
325    pub fn index_dir(&self) -> &Path {
326        &self.index_dir
327    }
328
329    /// Read the CSV `Title` record beginning at the given file offset.
330    ///
331    /// If no such record exists, then this returns `None`.
332    ///
333    /// If there was a problem reading the underlying CSV data, then an error
334    /// is returned.
335    ///
336    /// If the given offset does not point to the start of a record in the CSV
337    /// data, then the behavior of this method is unspecified.
338    fn read_record(&mut self, offset: u64) -> Result<Option<Title>> {
339        let mut pos = csv::Position::new();
340        pos.set_byte(offset);
341        self.csv_basic.seek(pos).map_err(Error::csv)?;
342
343        let mut record = csv::StringRecord::new();
344        if !self.csv_basic.read_record(&mut record).map_err(Error::csv)? {
345            Ok(None)
346        } else {
347            let headers = self.csv_basic.headers().map_err(Error::csv)?;
348            Ok(record.deserialize(Some(headers)).map_err(Error::csv)?)
349        }
350    }
351}
352
353/// A builder for opening or creating an `Index`.
354#[derive(Debug)]
355pub struct IndexBuilder {
356    ngram_type: NgramType,
357    ngram_size: usize,
358}
359
360impl IndexBuilder {
361    /// Create a new builder with a default configuration.
362    pub fn new() -> IndexBuilder {
363        IndexBuilder { ngram_type: NgramType::default(), ngram_size: 3 }
364    }
365
366    /// Use the current configuration to open an existing index. If the index
367    /// does not exist, or if there was a problem opening it, then this returns
368    /// an error.
369    ///
370    /// Generally, this method is cheap to call. It opens some file
371    /// descriptors, but otherwise does no work.
372    ///
373    /// `data_dir` should be the directory containing decompressed IMDb tsv`
374    /// `files. See: https://www.imdb.com/interfaces/
375    ///
376    /// `index_dir` should be the directory containing a previously created
377    /// index using `Index::create`.
378    ///
379    /// Note that settings for index creation are ignored.
380    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(
381        &self,
382        data_dir: P1,
383        index_dir: P2,
384    ) -> Result<Index> {
385        let data_dir = data_dir.as_ref();
386        let index_dir = index_dir.as_ref();
387        log::debug!("opening index {}", index_dir.display());
388
389        let config_file = open_file(index_dir.join(CONFIG))?;
390        let config: Config = serde_json::from_reader(config_file)
391            .map_err(|e| Error::config(e.to_string()))?;
392        if config.version != VERSION {
393            return Err(Error::version(VERSION, config.version));
394        }
395
396        Ok(Index {
397            data_dir: data_dir.to_path_buf(),
398            index_dir: index_dir.to_path_buf(),
399            // We claim it is safe to open the following memory map because we
400            // don't mutate them and no other process (should) either.
401            csv_basic: unsafe { csv_mmap(data_dir.join(IMDB_BASICS))? },
402            idx_names: names::IndexReader::open(index_dir)?,
403            idx_aka: aka::Index::open(data_dir, index_dir)?,
404            idx_episode: episode::Index::open(index_dir)?,
405            idx_rating: rating::Index::open(index_dir)?,
406            idx_title: id::IndexReader::from_path(index_dir.join(TITLE))?,
407        })
408    }
409
410    /// Use the current configuration to create a new index.
411    ///
412    /// Calling this method is expensive, and one should expect this to take
413    /// dozens of seconds or more to complete.
414    ///
415    /// `data_dir` should be the directory containing decompressed IMDb tsv`
416    /// `files. See: https://www.imdb.com/interfaces/
417    ///
418    /// `index_dir` should be the directory containing a previously created
419    /// index using `Index::create`.
420    ///
421    /// This will overwrite any previous index that may have existed in
422    /// `index_dir`.
423    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(
424        &self,
425        data_dir: P1,
426        index_dir: P2,
427    ) -> Result<Index> {
428        let data_dir = data_dir.as_ref();
429        let index_dir = index_dir.as_ref();
430        fs::create_dir_all(index_dir)
431            .map_err(|e| Error::io_path(e, index_dir))?;
432        log::info!("creating index at {}", index_dir.display());
433
434        // Creating the rating and episode indices are completely independent
435        // from the name/AKA indexes, so do them in a background thread. The
436        // episode index takes long enough to build to justify this.
437        let job = {
438            let data_dir = data_dir.to_path_buf();
439            let index_dir = index_dir.to_path_buf();
440            thread::spawn(move || -> Result<()> {
441                let start = Instant::now();
442                rating::Index::create(&data_dir, &index_dir)?;
443                log::info!(
444                    "created rating index (took {})",
445                    NiceDuration::since(start)
446                );
447
448                let start = Instant::now();
449                episode::Index::create(&data_dir, &index_dir)?;
450                log::info!(
451                    "created episode index (took {})",
452                    NiceDuration::since(start)
453                );
454                Ok(())
455            })
456        };
457
458        let start = Instant::now();
459        let mut aka_index = aka::Index::create(data_dir, index_dir)?;
460        log::info!("created AKA index (took {})", NiceDuration::since(start));
461
462        let start = Instant::now();
463        create_name_index(
464            &mut aka_index,
465            data_dir,
466            index_dir,
467            self.ngram_type,
468            self.ngram_size,
469        )?;
470        log::info!(
471            "created name index, ngram type: {}, ngram size: {} (took {})",
472            self.ngram_type,
473            self.ngram_size,
474            NiceDuration::since(start)
475        );
476
477        job.join().unwrap()?;
478
479        // Write out our config.
480        let config_file = create_file(index_dir.join(CONFIG))?;
481        serde_json::to_writer_pretty(
482            config_file,
483            &Config { version: VERSION },
484        )
485        .map_err(|e| Error::config(e.to_string()))?;
486
487        self.open(data_dir, index_dir)
488    }
489
490    /// Set the type of ngram generation to use.
491    ///
492    /// The default type is `Window`.
493    pub fn ngram_type(&mut self, ngram_type: NgramType) -> &mut IndexBuilder {
494        self.ngram_type = ngram_type;
495        self
496    }
497
498    /// Set the ngram size on this index.
499    ///
500    /// When creating an index, ngrams with this size will be used.
501    pub fn ngram_size(&mut self, ngram_size: usize) -> &mut IndexBuilder {
502        self.ngram_size = ngram_size;
503        self
504    }
505}
506
507impl Default for IndexBuilder {
508    fn default() -> IndexBuilder {
509        IndexBuilder::new()
510    }
511}
512
513/// Creates the name index from the title tsv data and an AKA index. The AKA
514/// index is used to index additional names for each title record to improve
515/// recall during search.
516///
517/// To avoid a second pass through the title records, this also creates the
518/// title ID index, which provides an index for looking up a `Title` by its
519/// ID in constant time.
520fn create_name_index(
521    aka_index: &mut aka::Index,
522    data_dir: &Path,
523    index_dir: &Path,
524    ngram_type: NgramType,
525    ngram_size: usize,
526) -> Result<()> {
527    // For logging.
528    let (mut count, mut title_count) = (0u64, 0u64);
529
530    let mut wtr = names::IndexWriter::open(index_dir, ngram_type, ngram_size)?;
531    let mut twtr = id::IndexSortedWriter::from_path(index_dir.join(TITLE))?;
532
533    let mut rdr = csv_file(data_dir.join(IMDB_BASICS))?;
534    let mut record = csv::StringRecord::new();
535    while rdr.read_record(&mut record).map_err(Error::csv)? {
536        let pos = record.position().expect("position on row");
537        let id = &record[0];
538        let title = &record[2];
539        let original_title = &record[3];
540        let is_adult = &record[4] == "1";
541        if is_adult {
542            // TODO: Expose an option to permit this.
543            continue;
544        }
545        count += 1;
546        title_count += 1;
547
548        twtr.insert(id.as_bytes(), pos.byte())?;
549        // Index the primary name.
550        wtr.insert(pos.byte(), title)?;
551        if title != original_title {
552            // Index the "original" name.
553            wtr.insert(pos.byte(), original_title)?;
554            count += 1;
555        }
556        // Now index all of the alternate names, if they exist.
557        for result in aka_index.find(id.as_bytes())? {
558            let akarecord = result?;
559            if title != akarecord.title {
560                wtr.insert(pos.byte(), &akarecord.title)?;
561                count += 1;
562            }
563        }
564    }
565    wtr.finish()?;
566    twtr.finish()?;
567
568    log::info!("{} titles indexed", title_count);
569    log::info!("{} total names indexed", count);
570    Ok(())
571}