imdb_index/index/mod.rs
1use std::fs;
2use std::io;
3use std::path::{Path, PathBuf};
4use std::thread;
5use std::time::Instant;
6
7use csv;
8use memmap::Mmap;
9use serde::{Deserialize, Serialize};
10use serde_json;
11
12use crate::error::{Error, Result};
13use crate::record::{Episode, Rating, Title, TitleKind};
14use crate::scored::SearchResults;
15use crate::util::{
16 create_file, csv_file, csv_mmap, open_file, NiceDuration, IMDB_BASICS,
17};
18
19pub use self::aka::AKARecordIter;
20pub use self::names::{NameQuery, NameScorer, NgramType};
21
22mod aka;
23mod episode;
24mod id;
25mod names;
26mod rating;
27#[cfg(test)]
28mod tests;
29mod writer;
30
/// The version of the index format on disk.
///
/// Generally speaking, if the version of the index on disk doesn't exactly
/// match the version expected by this code, then the index won't be read.
/// The caller must then re-generate the index.
///
/// This version represents all indexing structures on disk in this module.
/// It is written to the config file by `IndexBuilder::create` and compared
/// against the stored value by `IndexBuilder::open`.
const VERSION: u64 = 1;

/// The name of the title file index.
///
/// This index represents a map from the IMDb title ID to the file offset
/// corresponding to that record in title.basics.tsv. The file name is joined
/// onto the index directory both when the index is written and when it is
/// opened.
const TITLE: &str = "title.fst";

/// The name of the file containing the index configuration.
///
/// The index configuration is a JSON file with some meta data about this
/// index, such as its version. The file name is joined onto the index
/// directory.
const CONFIG: &str = "config.json";
51
52/// A media entity is a title with optional episode and rating records.
53///
54/// A media entity makes it convenient to deal with the complete information
55/// of an IMDb media record. This is the default value returned by search
56/// routines such as what the [`Searcher`](struct.Searcher.html) provides, and
57/// can also be cheaply constructed by an [`Index`](struct.Index.html) given a
58/// [`Title`](struct.Title.html) or an IMDb ID.
59#[derive(Clone, Debug)]
60pub struct MediaEntity {
61 title: Title,
62 episode: Option<Episode>,
63 rating: Option<Rating>,
64}
65
66impl MediaEntity {
67 /// Return a reference to the underlying `Title`.
68 pub fn title(&self) -> &Title {
69 &self.title
70 }
71
72 /// Return a reference to the underlying `Episode`, if it exists.
73 pub fn episode(&self) -> Option<&Episode> {
74 self.episode.as_ref()
75 }
76
77 /// Return a reference to the underlying `Rating`, if it exists.
78 pub fn rating(&self) -> Option<&Rating> {
79 self.rating.as_ref()
80 }
81}
82
/// An index into IMDb titles and their associated data.
///
/// This index consists of a set of on disk index data structures in addition
/// to the uncompressed IMDb `tsv` files. The on disk index structures are used
/// to provide access to the records in the `tsv` files efficiently.
///
/// With this index, one can do the following things:
///
/// * Return a ranked list of
///   [`Title`](struct.Title.html)
///   records matching a fuzzy name query.
/// * Access any `Title` record by ID in constant time.
/// * Access all
///   [`AKA`](struct.AKA.html)
///   records for any `Title` in constant time.
/// * Access the
///   [`Rating`](struct.Rating.html)
///   for any `Title` in constant time.
/// * Access the complete set of
///   [`Episode`](struct.Episode.html)
///   records for any TV show in constant time.
/// * Access the specific `Episode` given its ID in constant time.
///
/// Every lookup method takes `&mut self`. To query the same on-disk index
/// from several threads, give each thread its own handle via `try_clone`.
#[derive(Debug)]
pub struct Index {
    /// The directory containing the IMDb tsv files.
    data_dir: PathBuf,
    /// The directory containing this crate's index structures.
    index_dir: PathBuf,
    /// A seekable reader for `title.basics.tsv`. The index structures
    /// typically return offsets that can be used to seek this reader to the
    /// beginning of any `Title` record.
    csv_basic: csv::Reader<io::Cursor<Mmap>>,
    /// The name index. This is what provides fuzzy queries.
    idx_names: names::IndexReader,
    /// The AKA index. Maps an IMDb title ID to its alternate-name records.
    idx_aka: aka::Index,
    /// The episode index. Answers episode and season lookups.
    idx_episode: episode::Index,
    /// The rating index. Maps an IMDb title ID to its rating.
    idx_rating: rating::Index,
    /// The title index. Maps an IMDb title ID to the byte offset of its
    /// record in `title.basics.tsv`.
    idx_title: id::IndexReader,
}
126
/// The contents of the index configuration file.
///
/// This is serialized to JSON when an index is created and deserialized
/// again when an index is opened.
#[derive(Debug, Deserialize, Serialize)]
struct Config {
    /// The on-disk format version the index was written with. An index is
    /// only opened when this equals `VERSION`.
    version: u64,
}
131
132impl Index {
133 /// Open an existing index using default settings. If the index does not
134 /// exist, or if there was a problem opening it, then this returns an
135 /// error.
136 ///
137 /// Generally, this method is cheap to call. It opens some file
138 /// descriptors, but otherwise does no work.
139 ///
140 /// `data_dir` should be the directory containing decompressed IMDb
141 /// `tsv` files. See: https://www.imdb.com/interfaces/
142 ///
143 /// `index_dir` should be the directory containing a previously created
144 /// index using `Index::create`.
145 pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(
146 data_dir: P1,
147 index_dir: P2,
148 ) -> Result<Index> {
149 IndexBuilder::new().open(data_dir, index_dir)
150 }
151
152 /// Create a new index using default settings.
153 ///
154 /// Calling this method is expensive, and one should expect this to take
155 /// dozens of seconds or more to complete.
156 ///
157 /// `data_dir` should be the directory containing decompressed IMDb tsv`
158 /// `files. See: https://www.imdb.com/interfaces/
159 ///
160 /// `index_dir` should be the directory containing a previously created
161 /// index using `Index::create`.
162 ///
163 /// This will overwrite any previous index that may have existed in
164 /// `index_dir`.
165 pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(
166 data_dir: P1,
167 index_dir: P2,
168 ) -> Result<Index> {
169 IndexBuilder::new().create(data_dir, index_dir)
170 }
171
172 /// Attempt to clone this index, returning a distinct `Index`.
173 ///
174 /// This is as cheap to call as `Index::open` and returns an error if there
175 /// was a problem reading the underlying index.
176 ///
177 /// This is useful when one wants to query the same `Index` on disk from
178 /// multiple threads.
179 pub fn try_clone(&self) -> Result<Index> {
180 Index::open(&self.data_dir, &self.index_dir)
181 }
182
183 /// Search this index for `Title` records whose name matches the given
184 /// query.
185 ///
186 /// The query controls the following things:
187 ///
188 /// * The name to search for.
189 /// * The maximum number of results returned.
190 /// * The scorer to use to rank results.
191 ///
192 /// The name can be any string. It is normalized and broken down into
193 /// component pieces, which are then used to quickly search all existing
194 /// titles quickly and fuzzily.
195 ///
196 /// This returns an error if there was a problem reading the index or the
197 /// underlying CSV data.
198 pub fn search(
199 &mut self,
200 query: &names::NameQuery,
201 ) -> Result<SearchResults<Title>> {
202 let mut results = SearchResults::new();
203 // The name index gives us back scores with offsets. The offset can be
204 // used to seek our `Title` CSV reader to the corresponding record and
205 // read it in constant time.
206 for result in self.idx_names.search(query) {
207 let title = match self.read_record(*result.value())? {
208 None => continue,
209 Some(title) => title,
210 };
211 results.push(result.map(|_| title));
212 }
213 Ok(results)
214 }
215
216 /// Returns the `MediaEntity` for the given IMDb ID.
217 ///
218 /// An entity includes an [`Episode`](struct.Episode.html) and
219 /// [`Rating`](struct.Rating.html) records if they exist for the title.
220 ///
221 /// This returns an error if there was a problem reading the underlying
222 /// index. If no such title exists for the given ID, then `None` is
223 /// returned.
224 pub fn entity(&mut self, id: &str) -> Result<Option<MediaEntity>> {
225 match self.title(id)? {
226 None => Ok(None),
227 Some(title) => self.entity_from_title(title).map(Some),
228 }
229 }
230
231 /// Returns the `MediaEntity` for the given `Title`.
232 ///
233 /// This is like the `entity` method, except it takes a `Title` record as
234 /// given.
235 pub fn entity_from_title(&mut self, title: Title) -> Result<MediaEntity> {
236 let episode = match title.kind {
237 TitleKind::TVEpisode => self.episode(&title.id)?,
238 _ => None,
239 };
240 let rating = self.rating(&title.id)?;
241 Ok(MediaEntity { title, episode, rating })
242 }
243
244 /// Returns the `Title` record for the given IMDb ID.
245 ///
246 /// This returns an error if there was a problem reading the underlying
247 /// index. If no such title exists for the given ID, then `None` is
248 /// returned.
249 pub fn title(&mut self, id: &str) -> Result<Option<Title>> {
250 match self.idx_title.get(id.as_bytes()) {
251 None => Ok(None),
252 Some(offset) => self.read_record(offset),
253 }
254 }
255
256 /// Returns an iterator over all `AKA` records for the given IMDb ID.
257 ///
258 /// If no AKA records exist for the given ID, then an empty iterator is
259 /// returned.
260 ///
261 /// If there was a problem reading the index, then an error is returned.
262 pub fn aka_records(&mut self, id: &str) -> Result<AKARecordIter> {
263 self.idx_aka.find(id.as_bytes())
264 }
265
266 /// Returns the `Rating` associated with the given IMDb ID.
267 ///
268 /// If no rating exists for the given ID, then this returns `None`.
269 ///
270 /// If there was a problem reading the index, then an error is returned.
271 pub fn rating(&mut self, id: &str) -> Result<Option<Rating>> {
272 self.idx_rating.rating(id.as_bytes())
273 }
274
275 /// Returns all of the episodes for the given TV show. The TV show should
276 /// be identified by its IMDb ID.
277 ///
278 /// If the given ID isn't a TV show or if the TV show doesn't have any
279 /// episodes, then an empty list is returned.
280 ///
281 /// The episodes returned are sorted in order of their season and episode
282 /// numbers. Episodes without a season or episode number are sorted after
283 /// episodes with a season or episode number.
284 ///
285 /// If there was a problem reading the index, then an error is returned.
286 pub fn seasons(&mut self, tvshow_id: &str) -> Result<Vec<Episode>> {
287 self.idx_episode.seasons(tvshow_id.as_bytes())
288 }
289
290 /// Returns all of the episodes for the given TV show and season number.
291 /// The TV show should be identified by its IMDb ID, and the season should
292 /// be identified by its number. (Season numbers generally start at `1`.)
293 ///
294 /// If the given ID isn't a TV show or if the TV show doesn't have any
295 /// episodes for the given season, then an empty list is returned.
296 ///
297 /// The episodes returned are sorted in order of their episode numbers.
298 /// Episodes without an episode number are sorted after episodes with an
299 /// episode number.
300 ///
301 /// If there was a problem reading the index, then an error is returned.
302 pub fn episodes(
303 &mut self,
304 tvshow_id: &str,
305 season: u32,
306 ) -> Result<Vec<Episode>> {
307 self.idx_episode.episodes(tvshow_id.as_bytes(), season)
308 }
309
310 /// Return the episode corresponding to the given IMDb ID.
311 ///
312 /// If the ID doesn't correspond to an episode, then `None` is returned.
313 ///
314 /// If there was a problem reading the index, then an error is returned.
315 pub fn episode(&mut self, episode_id: &str) -> Result<Option<Episode>> {
316 self.idx_episode.episode(episode_id.as_bytes())
317 }
318
319 /// Returns the data directory that this index returns results for.
320 pub fn data_dir(&self) -> &Path {
321 &self.data_dir
322 }
323
324 /// Returns the directory containing this index's files.
325 pub fn index_dir(&self) -> &Path {
326 &self.index_dir
327 }
328
329 /// Read the CSV `Title` record beginning at the given file offset.
330 ///
331 /// If no such record exists, then this returns `None`.
332 ///
333 /// If there was a problem reading the underlying CSV data, then an error
334 /// is returned.
335 ///
336 /// If the given offset does not point to the start of a record in the CSV
337 /// data, then the behavior of this method is unspecified.
338 fn read_record(&mut self, offset: u64) -> Result<Option<Title>> {
339 let mut pos = csv::Position::new();
340 pos.set_byte(offset);
341 self.csv_basic.seek(pos).map_err(Error::csv)?;
342
343 let mut record = csv::StringRecord::new();
344 if !self.csv_basic.read_record(&mut record).map_err(Error::csv)? {
345 Ok(None)
346 } else {
347 let headers = self.csv_basic.headers().map_err(Error::csv)?;
348 Ok(record.deserialize(Some(headers)).map_err(Error::csv)?)
349 }
350 }
351}
352
353/// A builder for opening or creating an `Index`.
354#[derive(Debug)]
355pub struct IndexBuilder {
356 ngram_type: NgramType,
357 ngram_size: usize,
358}
359
360impl IndexBuilder {
361 /// Create a new builder with a default configuration.
362 pub fn new() -> IndexBuilder {
363 IndexBuilder { ngram_type: NgramType::default(), ngram_size: 3 }
364 }
365
366 /// Use the current configuration to open an existing index. If the index
367 /// does not exist, or if there was a problem opening it, then this returns
368 /// an error.
369 ///
370 /// Generally, this method is cheap to call. It opens some file
371 /// descriptors, but otherwise does no work.
372 ///
373 /// `data_dir` should be the directory containing decompressed IMDb tsv`
374 /// `files. See: https://www.imdb.com/interfaces/
375 ///
376 /// `index_dir` should be the directory containing a previously created
377 /// index using `Index::create`.
378 ///
379 /// Note that settings for index creation are ignored.
380 pub fn open<P1: AsRef<Path>, P2: AsRef<Path>>(
381 &self,
382 data_dir: P1,
383 index_dir: P2,
384 ) -> Result<Index> {
385 let data_dir = data_dir.as_ref();
386 let index_dir = index_dir.as_ref();
387 log::debug!("opening index {}", index_dir.display());
388
389 let config_file = open_file(index_dir.join(CONFIG))?;
390 let config: Config = serde_json::from_reader(config_file)
391 .map_err(|e| Error::config(e.to_string()))?;
392 if config.version != VERSION {
393 return Err(Error::version(VERSION, config.version));
394 }
395
396 Ok(Index {
397 data_dir: data_dir.to_path_buf(),
398 index_dir: index_dir.to_path_buf(),
399 // We claim it is safe to open the following memory map because we
400 // don't mutate them and no other process (should) either.
401 csv_basic: unsafe { csv_mmap(data_dir.join(IMDB_BASICS))? },
402 idx_names: names::IndexReader::open(index_dir)?,
403 idx_aka: aka::Index::open(data_dir, index_dir)?,
404 idx_episode: episode::Index::open(index_dir)?,
405 idx_rating: rating::Index::open(index_dir)?,
406 idx_title: id::IndexReader::from_path(index_dir.join(TITLE))?,
407 })
408 }
409
410 /// Use the current configuration to create a new index.
411 ///
412 /// Calling this method is expensive, and one should expect this to take
413 /// dozens of seconds or more to complete.
414 ///
415 /// `data_dir` should be the directory containing decompressed IMDb tsv`
416 /// `files. See: https://www.imdb.com/interfaces/
417 ///
418 /// `index_dir` should be the directory containing a previously created
419 /// index using `Index::create`.
420 ///
421 /// This will overwrite any previous index that may have existed in
422 /// `index_dir`.
423 pub fn create<P1: AsRef<Path>, P2: AsRef<Path>>(
424 &self,
425 data_dir: P1,
426 index_dir: P2,
427 ) -> Result<Index> {
428 let data_dir = data_dir.as_ref();
429 let index_dir = index_dir.as_ref();
430 fs::create_dir_all(index_dir)
431 .map_err(|e| Error::io_path(e, index_dir))?;
432 log::info!("creating index at {}", index_dir.display());
433
434 // Creating the rating and episode indices are completely independent
435 // from the name/AKA indexes, so do them in a background thread. The
436 // episode index takes long enough to build to justify this.
437 let job = {
438 let data_dir = data_dir.to_path_buf();
439 let index_dir = index_dir.to_path_buf();
440 thread::spawn(move || -> Result<()> {
441 let start = Instant::now();
442 rating::Index::create(&data_dir, &index_dir)?;
443 log::info!(
444 "created rating index (took {})",
445 NiceDuration::since(start)
446 );
447
448 let start = Instant::now();
449 episode::Index::create(&data_dir, &index_dir)?;
450 log::info!(
451 "created episode index (took {})",
452 NiceDuration::since(start)
453 );
454 Ok(())
455 })
456 };
457
458 let start = Instant::now();
459 let mut aka_index = aka::Index::create(data_dir, index_dir)?;
460 log::info!("created AKA index (took {})", NiceDuration::since(start));
461
462 let start = Instant::now();
463 create_name_index(
464 &mut aka_index,
465 data_dir,
466 index_dir,
467 self.ngram_type,
468 self.ngram_size,
469 )?;
470 log::info!(
471 "created name index, ngram type: {}, ngram size: {} (took {})",
472 self.ngram_type,
473 self.ngram_size,
474 NiceDuration::since(start)
475 );
476
477 job.join().unwrap()?;
478
479 // Write out our config.
480 let config_file = create_file(index_dir.join(CONFIG))?;
481 serde_json::to_writer_pretty(
482 config_file,
483 &Config { version: VERSION },
484 )
485 .map_err(|e| Error::config(e.to_string()))?;
486
487 self.open(data_dir, index_dir)
488 }
489
490 /// Set the type of ngram generation to use.
491 ///
492 /// The default type is `Window`.
493 pub fn ngram_type(&mut self, ngram_type: NgramType) -> &mut IndexBuilder {
494 self.ngram_type = ngram_type;
495 self
496 }
497
498 /// Set the ngram size on this index.
499 ///
500 /// When creating an index, ngrams with this size will be used.
501 pub fn ngram_size(&mut self, ngram_size: usize) -> &mut IndexBuilder {
502 self.ngram_size = ngram_size;
503 self
504 }
505}
506
507impl Default for IndexBuilder {
508 fn default() -> IndexBuilder {
509 IndexBuilder::new()
510 }
511}
512
513/// Creates the name index from the title tsv data and an AKA index. The AKA
514/// index is used to index additional names for each title record to improve
515/// recall during search.
516///
517/// To avoid a second pass through the title records, this also creates the
518/// title ID index, which provides an index for looking up a `Title` by its
519/// ID in constant time.
520fn create_name_index(
521 aka_index: &mut aka::Index,
522 data_dir: &Path,
523 index_dir: &Path,
524 ngram_type: NgramType,
525 ngram_size: usize,
526) -> Result<()> {
527 // For logging.
528 let (mut count, mut title_count) = (0u64, 0u64);
529
530 let mut wtr = names::IndexWriter::open(index_dir, ngram_type, ngram_size)?;
531 let mut twtr = id::IndexSortedWriter::from_path(index_dir.join(TITLE))?;
532
533 let mut rdr = csv_file(data_dir.join(IMDB_BASICS))?;
534 let mut record = csv::StringRecord::new();
535 while rdr.read_record(&mut record).map_err(Error::csv)? {
536 let pos = record.position().expect("position on row");
537 let id = &record[0];
538 let title = &record[2];
539 let original_title = &record[3];
540 let is_adult = &record[4] == "1";
541 if is_adult {
542 // TODO: Expose an option to permit this.
543 continue;
544 }
545 count += 1;
546 title_count += 1;
547
548 twtr.insert(id.as_bytes(), pos.byte())?;
549 // Index the primary name.
550 wtr.insert(pos.byte(), title)?;
551 if title != original_title {
552 // Index the "original" name.
553 wtr.insert(pos.byte(), original_title)?;
554 count += 1;
555 }
556 // Now index all of the alternate names, if they exist.
557 for result in aka_index.find(id.as_bytes())? {
558 let akarecord = result?;
559 if title != akarecord.title {
560 wtr.insert(pos.byte(), &akarecord.title)?;
561 count += 1;
562 }
563 }
564 }
565 wtr.finish()?;
566 twtr.finish()?;
567
568 log::info!("{} titles indexed", title_count);
569 log::info!("{} total names indexed", count);
570 Ok(())
571}