imdb-index 0.1.4

A library for indexing and searching IMDb using information retrieval.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Instant;

use csv;
use memmap::Mmap;
use serde::{Deserialize, Serialize};
use serde_json;

use crate::error::{Error, Result};
use crate::record::{Episode, Rating, Title, TitleKind};
use crate::scored::SearchResults;
use crate::util::{
    create_file, csv_file, csv_mmap, open_file, NiceDuration, IMDB_BASICS,
};

pub use self::aka::AKARecordIter;
pub use self::names::{NameQuery, NameScorer, NgramType};

mod aka;
mod episode;
mod id;
mod names;
mod rating;
#[cfg(test)]
mod tests;
mod writer;

/// The version of the index format on disk.
///
/// Generally speaking, if the version of the index on disk doesn't exactly
/// match the version expected by this code, then the index won't be read.
/// The caller must then re-generate the index.
///
/// This version represents all indexing structures on disk in this module.
const VERSION: u64 = 1;

/// The name of the title file index.
///
/// This index represents a map from the IMDb title ID to the file offset
/// corresponding to that record in title.basics.tsv.
const TITLE: &str = "title.fst";

/// The name of the file containing the index configuration.
///
/// The index configuration is a JSON file with some meta data about this
/// index, such as its version.
const CONFIG: &str = "config.json";

/// A media entity is a title with optional episode and rating records.
///
/// A media entity makes it convenient to deal with the complete information
/// of an IMDb media record. This is the default value returned by search
/// routines such as what the [`Searcher`](struct.Searcher.html) provides, and
/// can also be cheaply constructed by an [`Index`](struct.Index.html) given a
/// [`Title`](struct.Title.html) or an IMDb ID.
#[derive(Clone, Debug)]
pub struct MediaEntity {
    title: Title,
    episode: Option<Episode>,
    rating: Option<Rating>,
}

impl MediaEntity {
    /// Return a reference to the underlying `Title`.
    pub fn title(&self) -> &Title {
        &self.title
    }

    /// Return a reference to the underlying `Episode`, if it exists.
    pub fn episode(&self) -> Option<&Episode> {
        self.episode.as_ref()
    }

    /// Return a reference to the underlying `Rating`, if it exists.
    pub fn rating(&self) -> Option<&Rating> {
        self.rating.as_ref()
    }
}

/// An index into IMDb titles and their associated data.
///
/// This index consists of a set of on disk index data structures in addition
/// to the uncompressed IMDb `tsv` files. The on disk index structures are used
/// to provide access to the records in the `tsv` files efficiently.
///
/// With this index, one can do the following things:
///
/// * Return a ranked list
///   [`Title`](struct.Title.html)
///   records matching a fuzzy name query.
/// * Access any `Title` record by ID in constant time.
/// * Access all
///   [`AKA`](struct.AKA.html)
///   records for any `Title` in constant time.
/// * Access the
///   [`Rating`](struct.Rating.html)
///   for any `Title` in constant time.
/// * Access the complete set of
///   [`Episode`](struct.Episode.html)
///   records for any TV show in constant time.
/// * Access the specific `Episode` given its ID in constant time.
#[derive(Debug)]
pub struct Index {
    /// The directory containing the IMDb tsv files.
    data_dir: PathBuf,
    /// The directory containing this crate's index structures.
    index_dir: PathBuf,
    /// A seekable reader for `title.basics.tsv`. The index structures
    /// typically return offsets that can be used to seek this reader to the
    /// beginning of any `Title` record.
    csv_basic: csv::Reader<io::Cursor<Mmap>>,
    /// The name index. This is what provides fuzzy queries.
    idx_names: names::IndexReader,
    /// The AKA index.
    idx_aka: aka::Index,
    /// The episode index.
    idx_episode: episode::Index,
    /// The rating index.
    idx_rating: rating::Index,
    /// The title index.
    idx_title: id::IndexReader,
}

#[derive(Debug, Deserialize, Serialize)]
struct Config {
    version: u64,
}

impl Index {
    /// Open an existing index using default settings. If the index does not
    /// exist, or if there was a problem opening it, then this returns an
    /// error.
    ///
    /// Generally, this method is cheap to call. It opens some file
    /// descriptors, but otherwise does no work.
    ///
    /// `data_dir` should be the directory containing decompressed IMDb
    /// `tsv` files. See: https://www.imdb.com/interfaces/
    ///
    /// `index_dir` should be the directory containing a previously created
    /// index using `Index::create`.
    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(
        data_dir: P1,
        index_dir: P2,
    ) -> Result<Index> {
        IndexBuilder::new().open(data_dir, index_dir)
    }

    /// Create a new index using default settings.
    ///
    /// Calling this method is expensive, and one should expect this to take
    /// dozens of seconds or more to complete.
    ///
    /// `data_dir` should be the directory containing decompressed IMDb tsv`
    /// `files. See: https://www.imdb.com/interfaces/
    ///
    /// `index_dir` should be the directory containing a previously created
    /// index using `Index::create`.
    ///
    /// This will overwrite any previous index that may have existed in
    /// `index_dir`.
    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(
        data_dir: P1,
        index_dir: P2,
    ) -> Result<Index> {
        IndexBuilder::new().create(data_dir, index_dir)
    }

    /// Attempt to clone this index, returning a distinct `Index`.
    ///
    /// This is as cheap to call as `Index::open` and returns an error if there
    /// was a problem reading the underlying index.
    ///
    /// This is useful when one wants to query the same `Index` on disk from
    /// multiple threads.
    pub fn try_clone(&self) -> Result<Index> {
        Index::open(&self.data_dir, &self.index_dir)
    }

    /// Search this index for `Title` records whose name matches the given
    /// query.
    ///
    /// The query controls the following things:
    ///
    /// * The name to search for.
    /// * The maximum number of results returned.
    /// * The scorer to use to rank results.
    ///
    /// The name can be any string. It is normalized and broken down into
    /// component pieces, which are then used to quickly search all existing
    /// titles quickly and fuzzily.
    ///
    /// This returns an error if there was a problem reading the index or the
    /// underlying CSV data.
    pub fn search(
        &mut self,
        query: &names::NameQuery,
    ) -> Result<SearchResults<Title>> {
        let mut results = SearchResults::new();
        // The name index gives us back scores with offsets. The offset can be
        // used to seek our `Title` CSV reader to the corresponding record and
        // read it in constant time.
        for result in self.idx_names.search(query) {
            let title = match self.read_record(*result.value())? {
                None => continue,
                Some(title) => title,
            };
            results.push(result.map(|_| title));
        }
        Ok(results)
    }

    /// Returns the `MediaEntity` for the given IMDb ID.
    ///
    /// An entity includes an [`Episode`](struct.Episode.html) and
    /// [`Rating`](struct.Rating.html) records if they exist for the title.
    ///
    /// This returns an error if there was a problem reading the underlying
    /// index. If no such title exists for the given ID, then `None` is
    /// returned.
    pub fn entity(&mut self, id: &str) -> Result<Option<MediaEntity>> {
        match self.title(id)? {
            None => Ok(None),
            Some(title) => self.entity_from_title(title).map(Some),
        }
    }

    /// Returns the `MediaEntity` for the given `Title`.
    ///
    /// This is like the `entity` method, except it takes a `Title` record as
    /// given.
    pub fn entity_from_title(&mut self, title: Title) -> Result<MediaEntity> {
        let episode = match title.kind {
            TitleKind::TVEpisode => self.episode(&title.id)?,
            _ => None,
        };
        let rating = self.rating(&title.id)?;
        Ok(MediaEntity { title, episode, rating })
    }

    /// Returns the `Title` record for the given IMDb ID.
    ///
    /// This returns an error if there was a problem reading the underlying
    /// index. If no such title exists for the given ID, then `None` is
    /// returned.
    pub fn title(&mut self, id: &str) -> Result<Option<Title>> {
        match self.idx_title.get(id.as_bytes()) {
            None => Ok(None),
            Some(offset) => self.read_record(offset),
        }
    }

    /// Returns an iterator over all `AKA` records for the given IMDb ID.
    ///
    /// If no AKA records exist for the given ID, then an empty iterator is
    /// returned.
    ///
    /// If there was a problem reading the index, then an error is returned.
    pub fn aka_records(&mut self, id: &str) -> Result<AKARecordIter> {
        self.idx_aka.find(id.as_bytes())
    }

    /// Returns the `Rating` associated with the given IMDb ID.
    ///
    /// If no rating exists for the given ID, then this returns `None`.
    ///
    /// If there was a problem reading the index, then an error is returned.
    pub fn rating(&mut self, id: &str) -> Result<Option<Rating>> {
        self.idx_rating.rating(id.as_bytes())
    }

    /// Returns all of the episodes for the given TV show. The TV show should
    /// be identified by its IMDb ID.
    ///
    /// If the given ID isn't a TV show or if the TV show doesn't have any
    /// episodes, then an empty list is returned.
    ///
    /// The episodes returned are sorted in order of their season and episode
    /// numbers. Episodes without a season or episode number are sorted after
    /// episodes with a season or episode number.
    ///
    /// If there was a problem reading the index, then an error is returned.
    pub fn seasons(&mut self, tvshow_id: &str) -> Result<Vec<Episode>> {
        self.idx_episode.seasons(tvshow_id.as_bytes())
    }

    /// Returns all of the episodes for the given TV show and season number.
    /// The TV show should be identified by its IMDb ID, and the season should
    /// be identified by its number. (Season numbers generally start at `1`.)
    ///
    /// If the given ID isn't a TV show or if the TV show doesn't have any
    /// episodes for the given season, then an empty list is returned.
    ///
    /// The episodes returned are sorted in order of their episode numbers.
    /// Episodes without an episode number are sorted after episodes with an
    /// episode number.
    ///
    /// If there was a problem reading the index, then an error is returned.
    pub fn episodes(
        &mut self,
        tvshow_id: &str,
        season: u32,
    ) -> Result<Vec<Episode>> {
        self.idx_episode.episodes(tvshow_id.as_bytes(), season)
    }

    /// Return the episode corresponding to the given IMDb ID.
    ///
    /// If the ID doesn't correspond to an episode, then `None` is returned.
    ///
    /// If there was a problem reading the index, then an error is returned.
    pub fn episode(&mut self, episode_id: &str) -> Result<Option<Episode>> {
        self.idx_episode.episode(episode_id.as_bytes())
    }

    /// Returns the data directory that this index returns results for.
    pub fn data_dir(&self) -> &Path {
        &self.data_dir
    }

    /// Returns the directory containing this index's files.
    pub fn index_dir(&self) -> &Path {
        &self.index_dir
    }

    /// Read the CSV `Title` record beginning at the given file offset.
    ///
    /// If no such record exists, then this returns `None`.
    ///
    /// If there was a problem reading the underlying CSV data, then an error
    /// is returned.
    ///
    /// If the given offset does not point to the start of a record in the CSV
    /// data, then the behavior of this method is unspecified.
    fn read_record(&mut self, offset: u64) -> Result<Option<Title>> {
        let mut pos = csv::Position::new();
        pos.set_byte(offset);
        self.csv_basic.seek(pos).map_err(Error::csv)?;

        let mut record = csv::StringRecord::new();
        if !self.csv_basic.read_record(&mut record).map_err(Error::csv)? {
            Ok(None)
        } else {
            let headers = self.csv_basic.headers().map_err(Error::csv)?;
            Ok(record.deserialize(Some(headers)).map_err(Error::csv)?)
        }
    }
}

/// A builder for opening or creating an `Index`.
#[derive(Debug)]
pub struct IndexBuilder {
    ngram_type: NgramType,
    ngram_size: usize,
}

impl IndexBuilder {
    /// Create a new builder with a default configuration.
    pub fn new() -> IndexBuilder {
        IndexBuilder { ngram_type: NgramType::default(), ngram_size: 3 }
    }

    /// Use the current configuration to open an existing index. If the index
    /// does not exist, or if there was a problem opening it, then this returns
    /// an error.
    ///
    /// Generally, this method is cheap to call. It opens some file
    /// descriptors, but otherwise does no work.
    ///
    /// `data_dir` should be the directory containing decompressed IMDb tsv`
    /// `files. See: https://www.imdb.com/interfaces/
    ///
    /// `index_dir` should be the directory containing a previously created
    /// index using `Index::create`.
    ///
    /// Note that settings for index creation are ignored.
    pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(
        &self,
        data_dir: P1,
        index_dir: P2,
    ) -> Result<Index> {
        let data_dir = data_dir.as_ref();
        let index_dir = index_dir.as_ref();
        log::debug!("opening index {}", index_dir.display());

        let config_file = open_file(index_dir.join(CONFIG))?;
        let config: Config = serde_json::from_reader(config_file)
            .map_err(|e| Error::config(e.to_string()))?;
        if config.version != VERSION {
            return Err(Error::version(VERSION, config.version));
        }

        Ok(Index {
            data_dir: data_dir.to_path_buf(),
            index_dir: index_dir.to_path_buf(),
            // We claim it is safe to open the following memory map because we
            // don't mutate them and no other process (should) either.
            csv_basic: unsafe { csv_mmap(data_dir.join(IMDB_BASICS))? },
            idx_names: names::IndexReader::open(index_dir)?,
            idx_aka: aka::Index::open(data_dir, index_dir)?,
            idx_episode: episode::Index::open(index_dir)?,
            idx_rating: rating::Index::open(index_dir)?,
            idx_title: id::IndexReader::from_path(index_dir.join(TITLE))?,
        })
    }

    /// Use the current configuration to create a new index.
    ///
    /// Calling this method is expensive, and one should expect this to take
    /// dozens of seconds or more to complete.
    ///
    /// `data_dir` should be the directory containing decompressed IMDb tsv`
    /// `files. See: https://www.imdb.com/interfaces/
    ///
    /// `index_dir` should be the directory containing a previously created
    /// index using `Index::create`.
    ///
    /// This will overwrite any previous index that may have existed in
    /// `index_dir`.
    pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(
        &self,
        data_dir: P1,
        index_dir: P2,
    ) -> Result<Index> {
        let data_dir = data_dir.as_ref();
        let index_dir = index_dir.as_ref();
        fs::create_dir_all(index_dir)
            .map_err(|e| Error::io_path(e, index_dir))?;
        log::info!("creating index at {}", index_dir.display());

        // Creating the rating and episode indices are completely independent
        // from the name/AKA indexes, so do them in a background thread. The
        // episode index takes long enough to build to justify this.
        let job = {
            let data_dir = data_dir.to_path_buf();
            let index_dir = index_dir.to_path_buf();
            thread::spawn(move || -> Result<()> {
                let start = Instant::now();
                rating::Index::create(&data_dir, &index_dir)?;
                log::info!(
                    "created rating index (took {})",
                    NiceDuration::since(start)
                );

                let start = Instant::now();
                episode::Index::create(&data_dir, &index_dir)?;
                log::info!(
                    "created episode index (took {})",
                    NiceDuration::since(start)
                );
                Ok(())
            })
        };

        let start = Instant::now();
        let mut aka_index = aka::Index::create(data_dir, index_dir)?;
        log::info!("created AKA index (took {})", NiceDuration::since(start));

        let start = Instant::now();
        create_name_index(
            &mut aka_index,
            data_dir,
            index_dir,
            self.ngram_type,
            self.ngram_size,
        )?;
        log::info!(
            "created name index, ngram type: {}, ngram size: {} (took {})",
            self.ngram_type,
            self.ngram_size,
            NiceDuration::since(start)
        );

        job.join().unwrap()?;

        // Write out our config.
        let config_file = create_file(index_dir.join(CONFIG))?;
        serde_json::to_writer_pretty(
            config_file,
            &Config { version: VERSION },
        )
        .map_err(|e| Error::config(e.to_string()))?;

        self.open(data_dir, index_dir)
    }

    /// Set the type of ngram generation to use.
    ///
    /// The default type is `Window`.
    pub fn ngram_type(&mut self, ngram_type: NgramType) -> &mut IndexBuilder {
        self.ngram_type = ngram_type;
        self
    }

    /// Set the ngram size on this index.
    ///
    /// When creating an index, ngrams with this size will be used.
    pub fn ngram_size(&mut self, ngram_size: usize) -> &mut IndexBuilder {
        self.ngram_size = ngram_size;
        self
    }
}

impl Default for IndexBuilder {
    fn default() -> IndexBuilder {
        IndexBuilder::new()
    }
}

/// Creates the name index from the title tsv data and an AKA index. The AKA
/// index is used to index additional names for each title record to improve
/// recall during search.
///
/// To avoid a second pass through the title records, this also creates the
/// title ID index, which provides an index for looking up a `Title` by its
/// ID in constant time.
fn create_name_index(
    aka_index: &mut aka::Index,
    data_dir: &Path,
    index_dir: &Path,
    ngram_type: NgramType,
    ngram_size: usize,
) -> Result<()> {
    // For logging.
    let (mut count, mut title_count) = (0u64, 0u64);

    let mut wtr = names::IndexWriter::open(index_dir, ngram_type, ngram_size)?;
    let mut twtr = id::IndexSortedWriter::from_path(index_dir.join(TITLE))?;

    let mut rdr = csv_file(data_dir.join(IMDB_BASICS))?;
    let mut record = csv::StringRecord::new();
    while rdr.read_record(&mut record).map_err(Error::csv)? {
        let pos = record.position().expect("position on row");
        let id = &record[0];
        let title = &record[2];
        let original_title = &record[3];
        let is_adult = &record[4] == "1";
        if is_adult {
            // TODO: Expose an option to permit this.
            continue;
        }
        count += 1;
        title_count += 1;

        twtr.insert(id.as_bytes(), pos.byte())?;
        // Index the primary name.
        wtr.insert(pos.byte(), title)?;
        if title != original_title {
            // Index the "original" name.
            wtr.insert(pos.byte(), original_title)?;
            count += 1;
        }
        // Now index all of the alternate names, if they exist.
        for result in aka_index.find(id.as_bytes())? {
            let akarecord = result?;
            if title != akarecord.title {
                wtr.insert(pos.byte(), &akarecord.title)?;
                count += 1;
            }
        }
    }
    wtr.finish()?;
    twtr.finish()?;

    log::info!("{} titles indexed", title_count);
    log::info!("{} total names indexed", count);
    Ok(())
}