// ciff/lib.rs

1//! Library supporting converting CIFF to PISA uncompressed collection format.
2//! Refer to [`osirrc/ciff`](https://github.com/osirrc/ciff) on Github
3//! for more detailed information about the format.
4//!
5//! For more information about PISA's internal storage formats, see the
6//! [documentation](https://pisa.readthedocs.io/en/latest/index.html).
7//!
8//! # Examples
9//!
10//! Use [`PisaToCiff`] and [`CiffToPisa`] builders to convert from one format
11//! to another.
12//!
13//! ```
14//! # use std::path::PathBuf;
15//! # use tempfile::TempDir;
16//! # use ciff::{PisaToCiff, CiffToPisa};
17//! # fn main() -> anyhow::Result<()> {
18//! # let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
19//! # let ciff_file = dir.join("tests").join("test_data").join("toy-complete-20200309.ciff");
20//! # let temp = TempDir::new()?;
21//! # let pisa_base_path = temp.path().join("pisa");
22//! # let output = temp.path().join("output");
23//! CiffToPisa::default()
24//!     .input_path(ciff_file)
25//!     .output_paths(&pisa_base_path)
26//!     .convert()?;
27//! PisaToCiff::default()
28//!     .description("Hello, CIFF!")
29//!     .pisa_paths(&pisa_base_path)
30//!     .output_path(output)
31//!     .convert()?;
32//! # Ok(())
33//! # }
34//! ```
35
36#![doc(html_root_url = "https://docs.rs/ciff/0.3.1")]
37#![warn(
38    missing_docs,
39    trivial_casts,
40    trivial_numeric_casts,
41    unused_import_braces,
42    unused_qualifications
43)]
44#![warn(clippy::all, clippy::pedantic)]
45#![allow(
46    clippy::module_name_repetitions,
47    clippy::default_trait_access,
48    clippy::cast_possible_wrap,
49    clippy::cast_possible_truncation,
50    clippy::copy_iterator
51)]
52
53use anyhow::{anyhow, Context};
54use indicatif::ProgressIterator;
55use indicatif::{ProgressBar, ProgressStyle};
56use memmap::Mmap;
57use num_traits::ToPrimitive;
58use protobuf::{CodedInputStream, CodedOutputStream};
59use std::borrow::Borrow;
60use std::convert::TryFrom;
61use std::ffi::{OsStr, OsString};
62use std::fmt;
63use std::fs::File;
64use std::io::{self, BufRead, BufReader, BufWriter, Write};
65use std::path::{Path, PathBuf};
66use tempfile::TempDir;
67
68mod proto;
69pub use proto::{DocRecord, Posting, PostingsList};
70
71mod binary_collection;
72pub use binary_collection::{
73    BinaryCollection, BinarySequence, InvalidFormat, RandomAccessBinaryCollection,
74};
75
76mod payload_vector;
77pub use payload_vector::{build_lexicon, PayloadIter, PayloadSlice, PayloadVector};
78
79type Result<T> = anyhow::Result<T>;
80
81const DEFAULT_PROGRESS_TEMPLATE: &str =
82    "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {count}/{total} ({eta})";
83
/// Wraps [`proto::Header`] and additionally provides some important counts that are already cast
/// to an unsigned type.
#[derive(PartialEq, Clone, Default)]
struct Header {
    /// Number of posting-list messages that follow the header in the CIFF stream.
    num_postings_lists: u32,
    /// Number of document records that follow the posting lists.
    num_documents: u32,
    /// Used for printing.
    protobuf_header: proto::Header,
}
93
94impl Header {
95    /// Reads the protobuf header, and converts to a proper-typed header to fail fast if the protobuf
96    /// header contains any negative values.
97    ///
98    /// # Errors
99    ///
100    /// Returns an error if the protobuf header contains negative counts.
101    fn from_stream(input: &mut CodedInputStream<'_>) -> Result<Self> {
102        let header = input.read_message::<proto::Header>()?;
103        let num_documents = u32::try_from(header.get_num_docs())
104            .context("Number of documents must be non-negative.")?;
105        let num_postings_lists = u32::try_from(header.get_num_postings_lists())
106            .context("Number of documents must be non-negative.")?;
107        Ok(Self {
108            protobuf_header: header,
109            num_documents,
110            num_postings_lists,
111        })
112    }
113}
114
115impl fmt::Display for Header {
116    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
117        write!(f, "{}", self.protobuf_header)
118    }
119}
120
121/// Returns default progress style.
122fn pb_style() -> ProgressStyle {
123    ProgressStyle::default_bar()
124        .template(DEFAULT_PROGRESS_TEMPLATE)
125        .progress_chars("=> ")
126}
127
/// Encodes a sequence of 4-byte unsigned integers into `writer` in little-endian byte
/// order, preceded by the sequence length `len` (also a 4-byte little-endian integer).
///
/// The output is little-endian on every platform (the previous documentation
/// incorrectly said "native-endianness"), so the bytes below are portable.
///
/// # Examples
///
/// ```
/// # use ciff::encode_u32_sequence;
/// # fn main() -> anyhow::Result<()> {
/// let mut buf: Vec<u8> = vec![];
/// let input = vec![4_u32, 98765];
/// encode_u32_sequence(&mut buf, 2, input)?;
///
/// assert_eq!(buf, &[
///     2_u8, 0, 0, 0,  // Sequence length
///     4, 0, 0, 0,     // First element
///     205, 129, 1, 0, // Second element
///     ]);
/// # Ok(())
/// # }
///
/// ```
///
/// # Errors
///
/// Passes along any IO errors.
pub fn encode_u32_sequence<N, S, W>(writer: &mut W, len: u32, sequence: S) -> io::Result<()>
where
    N: Borrow<u32>,
    S: IntoIterator<Item = N>,
    W: Write,
{
    // Length prefix first, then each element, all little-endian.
    let size: [u8; 4] = len.to_le_bytes();
    writer.write_all(&size)?;
    for element in sequence {
        writer.write_all(&element.borrow().to_le_bytes())?;
    }
    Ok(())
}
166
/// Writes a single CIFF posting list to the PISA document, frequency, and term writers.
///
/// CIFF stores document IDs gap-encoded (each docid is a delta from the previous one),
/// while PISA stores absolute IDs, so the deltas are prefix-summed while writing.
fn write_posting_list<DW, FW, TW>(
    posting_list: &PostingsList,
    documents: &mut DW,
    frequencies: &mut FW,
    terms: &mut TW,
) -> Result<()>
where
    DW: Write,
    FW: Write,
    TW: Write,
{
    // Document frequency doubles as the length of both output sequences.
    let length = posting_list
        .get_df()
        .to_u32()
        .ok_or_else(|| anyhow!("Cannot cast to u32: {}", posting_list.get_df()))?;

    let postings = posting_list.get_postings();

    // Accumulate gaps into absolute document IDs.
    // NOTE(review): a negative docid panics via `expect` instead of returning Err.
    encode_u32_sequence(
        documents,
        length,
        postings.iter().scan(0_u32, |prev, p| {
            *prev += u32::try_from(p.get_docid()).expect("Negative ID");
            Some(*prev)
        }),
    )?;

    encode_u32_sequence(
        frequencies,
        length,
        postings
            .iter()
            .map(|p| u32::try_from(p.get_tf()).expect("Negative frequency")),
    )?;

    // One term per line, in the same order as the posting lists.
    writeln!(terms, "{}", posting_list.get_term())?;
    Ok(())
}
205
/// Checks whether the lines produced by `reader` are in non-decreasing
/// lexicographic order. Empty input counts as sorted.
fn check_lines_sorted<R: BufRead>(reader: R) -> io::Result<bool> {
    let mut lines = reader.lines();
    let mut previous = match lines.next() {
        Some(first) => first?,
        None => return Ok(true),
    };
    for line in lines {
        let current = line?;
        if current < previous {
            return Ok(false);
        }
        previous = current;
    }
    Ok(true)
}
217
/// Concatenate two [`OsStr`]ings.
///
/// Takes two arguments that can be used as a reference to [`OsStr`], and returns
/// a new [`OsString`] instance by concatenating them.
pub fn concat<S1, S2>(path: S1, suffix: S2) -> OsString
where
    S1: AsRef<OsStr>,
    S2: AsRef<OsStr>,
{
    let mut joined = OsString::from(path.as_ref());
    joined.push(suffix.as_ref());
    joined
}
231
/// Paths to an inverted index in an uncompressed PISA format.
#[derive(Debug, Clone, Default)]
struct PisaIndexPaths {
    /// Document postings file (`.docs`).
    documents: PathBuf,
    /// Frequency postings file (`.freqs`).
    frequencies: PathBuf,
    /// Document sizes file (`.sizes`).
    sizes: PathBuf,
}
239
240impl PisaIndexPaths {
241    #[must_use]
242    fn from_base_path<P: AsRef<OsStr>>(path: P) -> Self {
243        Self {
244            documents: PathBuf::from(concat(path.as_ref(), ".docs")),
245            frequencies: PathBuf::from(concat(path.as_ref(), ".freqs")),
246            sizes: PathBuf::from(concat(path.as_ref(), ".sizes")),
247        }
248    }
249}
250
/// All file paths that make up a PISA index on disk.
#[derive(Debug, Clone, Default)]
struct PisaPaths {
    /// Core binary-collection paths (`.docs`, `.freqs`, `.sizes`).
    index: PisaIndexPaths,
    /// Newline-delimited terms file (`.terms`).
    terms: PathBuf,
    /// Newline-delimited document titles file (`.documents`).
    titles: PathBuf,
    /// Optional term lexicon path (`.termlex`); `None` skips building it.
    termlex: Option<PathBuf>,
    /// Optional document lexicon path (`.doclex`); `None` skips building it.
    doclex: Option<PathBuf>,
}
259
260impl PisaPaths {
261    #[must_use]
262    fn from_base_path<P: AsRef<OsStr>>(path: P) -> Self {
263        Self {
264            index: PisaIndexPaths::from_base_path(&path),
265            terms: PathBuf::from(concat(&path, ".terms")),
266            titles: PathBuf::from(concat(&path, ".documents")),
267            termlex: Some(PathBuf::from(concat(&path, ".termlex"))),
268            doclex: Some(PathBuf::from(concat(&path, ".doclex"))),
269        }
270    }
271}
272
/// Rewrites the binary collection at `path` with its sequences permuted by `order`.
///
/// The original file is first moved into a private temporary directory and
/// memory-mapped; a reordered copy is then written back to `path`. When
/// `skip_first` is set, the first sequence (the document-count header of a
/// `.docs` file) stays in place and `order` applies to the remaining sequences.
fn reorder_postings(path: &Path, order: &[usize], skip_first: bool) -> Result<()> {
    let temp = TempDir::new()?;
    let tmp_path = temp.path().join("coll");
    std::fs::rename(path, &tmp_path)?;
    // SAFETY: the file was just moved into a fresh temp dir, so no other writer
    // is expected while the map is alive — TODO confirm no concurrent access.
    let mmap = unsafe { Mmap::map(&File::open(tmp_path)?)? };
    let coll = RandomAccessBinaryCollection::try_from(mmap.as_ref())?;
    let mut writer = BufWriter::new(File::create(path)?);
    if skip_first {
        // Keep sequence 0 first and shift every ordered index past it.
        let order: Vec<_> = std::iter::once(0)
            .chain(order.iter().map(|&i| i + 1))
            .collect();
        binary_collection::reorder(&coll, &order, &mut writer)?;
    } else {
        binary_collection::reorder(&coll, order, &mut writer)?;
    }
    writer.flush()?;
    Ok(())
}
291
292fn reorder_pisa_index(paths: &PisaPaths) -> Result<()> {
293    let terms = BufReader::new(File::open(&paths.terms)?)
294        .lines()
295        .collect::<io::Result<Vec<_>>>()?;
296    let mut order: Vec<_> = (0..terms.len()).collect();
297    order.sort_by_key(|&i| &terms[i]);
298    reorder_postings(&paths.index.documents, &order, true)?;
299    reorder_postings(&paths.index.frequencies, &order, false)?;
300    let mut term_writer = BufWriter::new(File::create(&paths.terms)?);
301    for index in order {
302        writeln!(&mut term_writer, "{}", terms[index])?;
303    }
304    Ok(())
305}
306
/// CIFF to PISA converter.
#[derive(Debug, Default, Clone)]
pub struct CiffToPisa {
    /// Path to the input CIFF file.
    input: Option<PathBuf>,
    /// Output path for document postings (`.docs`).
    documents_path: Option<PathBuf>,
    /// Output path for frequency postings (`.freqs`).
    frequencies_path: Option<PathBuf>,
    /// Output path for document sizes (`.sizes`).
    sizes_path: Option<PathBuf>,
    /// Output path for the newline-delimited terms file (`.terms`).
    terms_path: Option<PathBuf>,
    /// Output path for the newline-delimited document titles file (`.documents`).
    titles_path: Option<PathBuf>,
    /// Optional output path for the term lexicon; `None` skips building it.
    termlex_path: Option<PathBuf>,
    /// Optional output path for the document lexicon; `None` skips building it.
    doclex_path: Option<PathBuf>,
}
319
320impl CiffToPisa {
321    /// Sets the CIFF path. Required.
322    pub fn input_path<P: Into<PathBuf>>(&mut self, path: P) -> &mut Self {
323        self.input = Some(path.into());
324        self
325    }
326
327    /// Sets PISA (uncompressed) inverted index paths. Required.
328    ///
329    /// Paths are constructed by appending file extensions to the base path:
330    ///  - `.docs` for document postings,
331    ///  - `.freqs` for frequency postings,
332    ///  - `.sizes` for document sizes,
333    ///  - `.terms` for terms text file,
334    ///  - `.documents` for document titles text file,
335    ///  - `.termlex` for term lexicon,
336    ///  - `.doclex` for document lexicon.
337    pub fn output_paths<P: AsRef<OsStr>>(&mut self, base_path: P) -> &mut Self {
338        let paths = PisaPaths::from_base_path(base_path);
339        self.documents_path = Some(paths.index.documents);
340        self.frequencies_path = Some(paths.index.frequencies);
341        self.sizes_path = Some(paths.index.sizes);
342        self.terms_path = Some(paths.terms);
343        self.titles_path = Some(paths.titles);
344        self.termlex_path = paths.termlex;
345        self.doclex_path = paths.doclex;
346        self
347    }
348
349    /// Do not construct document and term lexicons.
350    pub fn skip_lexicons(&mut self) -> &mut Self {
351        self.termlex_path = None;
352        self.doclex_path = None;
353        self
354    }
355
356    /// Builds a PISA index using the previously defined parameters.
357    ///
358    /// # Errors
359    ///
360    /// Error will be returned if:
361    ///  - some required parameters are not defined,
362    ///  - any I/O error occurs during reading input files or writing to the output file,
363    ///  - any input file is in an incorrect format.
364    pub fn convert(&self) -> Result<()> {
365        let input = self
366            .input
367            .as_ref()
368            .ok_or_else(|| anyhow!("input path undefined"))?;
369        let index_output = PisaIndexPaths {
370            documents: self
371                .documents_path
372                .clone()
373                .ok_or_else(|| anyhow!("document postings path undefined"))?,
374            frequencies: self
375                .frequencies_path
376                .clone()
377                .ok_or_else(|| anyhow!("frequency postings path undefined"))?,
378            sizes: self
379                .sizes_path
380                .clone()
381                .ok_or_else(|| anyhow!("document sizes path undefined"))?,
382        };
383        let output = PisaPaths {
384            index: index_output,
385            terms: self
386                .terms_path
387                .clone()
388                .ok_or_else(|| anyhow!("terms path undefined"))?,
389            titles: self
390                .titles_path
391                .clone()
392                .ok_or_else(|| anyhow!("terms path undefined"))?,
393            termlex: self.termlex_path.clone(),
394            doclex: self.doclex_path.clone(),
395        };
396        convert_to_pisa(input, &output)
397    }
398}
399
400/// Converts a CIFF index stored in `path` to a PISA "binary collection" (uncompressed inverted
401/// index) with a basename `output`.
402///
403/// # Errors
404///
405/// Returns an error when:
406/// - an IO error occurs,
407/// - reading protobuf format fails,
408/// - data format is valid but any ID, frequency, or a count is negative,
409/// - document records is out of order.
410#[deprecated = "use CiffToPisa instead"]
411pub fn ciff_to_pisa(input: &Path, output: &Path, generate_lexicons: bool) -> Result<()> {
412    let mut converter = CiffToPisa::default();
413    converter.input_path(input).output_paths(output);
414    if !generate_lexicons {
415        converter.skip_lexicons();
416    }
417    converter.convert()
418}
419
420fn convert_to_pisa(input: &Path, output: &PisaPaths) -> Result<()> {
421    println!("{:?}", output);
422    let mut ciff_reader =
423        File::open(input).with_context(|| format!("Unable to open {}", input.display()))?;
424    let mut input = CodedInputStream::new(&mut ciff_reader);
425    let mut documents = BufWriter::new(File::create(&output.index.documents)?);
426    let mut frequencies = BufWriter::new(File::create(&output.index.frequencies)?);
427    let mut terms = BufWriter::new(File::create(&output.terms)?);
428
429    let header = Header::from_stream(&mut input)?;
430    println!("{}", header);
431
432    eprintln!("Processing postings");
433    encode_u32_sequence(&mut documents, 1, [header.num_documents].iter())?;
434    let progress = ProgressBar::new(u64::try_from(header.num_postings_lists)?);
435    progress.set_style(pb_style());
436    progress.set_draw_delta(10);
437    for _ in 0..header.num_postings_lists {
438        write_posting_list(
439            &input.read_message::<PostingsList>()?,
440            &mut documents,
441            &mut frequencies,
442            &mut terms,
443        )?;
444        progress.inc(1);
445    }
446    progress.finish();
447
448    documents.flush()?;
449    frequencies.flush()?;
450    terms.flush()?;
451
452    eprintln!("Processing document lengths");
453    let mut sizes = BufWriter::new(File::create(&output.index.sizes)?);
454    let mut trecids = BufWriter::new(File::create(&output.titles)?);
455
456    let progress = ProgressBar::new(u64::from(header.num_documents));
457    progress.set_style(pb_style());
458    progress.set_draw_delta(u64::from(header.num_documents) / 100);
459    sizes.write_all(&header.num_documents.to_le_bytes())?;
460    sizes.flush()?;
461
462    for docs_seen in 0..header.num_documents {
463        let doc_record = input.read_message::<DocRecord>()?;
464
465        let docid: u32 = doc_record
466            .get_docid()
467            .to_u32()
468            .ok_or_else(|| anyhow!("Cannot cast docid to u32: {}", doc_record.get_docid()))?;
469
470        let trecid = doc_record.get_collection_docid();
471        let length: u32 = doc_record.get_doclength().to_u32().ok_or_else(|| {
472            anyhow!(
473                "Cannot cast doc length to u32: {}",
474                doc_record.get_doclength()
475            )
476        })?;
477
478        if docid != docs_seen {
479            anyhow::bail!("Document sizes must come in order");
480        }
481
482        sizes.write_all(&length.to_le_bytes())?;
483        writeln!(trecids, "{}", trecid)?;
484        progress.inc(1);
485    }
486    trecids.flush()?;
487    progress.finish();
488
489    if !check_lines_sorted(BufReader::new(File::open(&output.terms)?))? {
490        reorder_pisa_index(output)?;
491    }
492
493    eprintln!("Generating the document and term lexicons...");
494    drop(trecids);
495    if let Some(termlex) = output.termlex.as_ref() {
496        build_lexicon(&output.terms, termlex)?;
497    }
498    if let Some(doclex) = output.doclex.as_ref() {
499        build_lexicon(&output.titles, doclex)?;
500    }
501
502    Ok(())
503}
504
505fn read_document_count(
506    documents: &mut BinaryCollection,
507) -> std::result::Result<u32, InvalidFormat> {
508    let invalid = || InvalidFormat::new("Unable to read document count");
509    documents
510        .next()
511        .ok_or_else(invalid)??
512        .get(0)
513        .ok_or_else(invalid)
514}
515
/// Builds the CIFF protobuf header from the raw bytes of the `.docs` and `.sizes`
/// files: counts posting lists, reads the document count, and computes the
/// average document length.
fn header(documents_bytes: &[u8], sizes_bytes: &[u8], description: &str) -> Result<proto::Header> {
    let mut num_postings_lists = 0;

    eprintln!("Collecting posting lists statistics");
    // Progress is tracked in bytes consumed, not in posting lists.
    let progress = ProgressBar::new(documents_bytes.len() as u64);
    progress.set_style(pb_style());
    progress.set_draw_delta(100_000);
    let mut collection = BinaryCollection::try_from(documents_bytes)?;
    // The first sequence is the document count, not a posting list.
    let num_documents = read_document_count(&mut collection)?;
    for sequence in collection {
        num_postings_lists += 1;
        let sequence = sequence?;
        // +4 accounts for each sequence's 4-byte length prefix.
        progress.inc((sequence.bytes().len() + 4) as u64);
    }
    progress.finish();

    eprintln!("Computing average document length");
    let progress = ProgressBar::new(u64::from(num_documents));
    progress.set_style(pb_style());
    let doclen_sum: i64 = sizes(sizes_bytes)?
        .iter()
        .map(i64::from)
        .progress_with(progress)
        .sum();

    let mut header = proto::Header::default();
    header.set_version(1);
    header.set_description(description.into());
    header.set_num_postings_lists(num_postings_lists);
    header.set_total_postings_lists(num_postings_lists);
    header.set_total_terms_in_collection(doclen_sum);
    // The protobuf fields are signed; `as i32` wraps for counts > i32::MAX
    // (allowed by the crate-level cast lints).
    header.set_num_docs(num_documents as i32);
    header.set_total_docs(num_documents as i32);
    #[allow(clippy::cast_precision_loss)]
    header.set_average_doclength(doclen_sum as f64 / f64::from(num_documents));
    Ok(header)
}
553
554fn sizes(memory: &[u8]) -> std::result::Result<BinarySequence<'_>, InvalidFormat> {
555    BinaryCollection::try_from(memory)?
556        .next()
557        .ok_or_else(|| InvalidFormat::new("sizes collection is empty"))?
558}
559
/// Writes one CIFF `DocRecord` per document, zipping sizes from the memory-mapped
/// `.sizes` file with titles read line-by-line from `titles_file`.
fn write_sizes(sizes_mmap: &Mmap, titles_file: &File, out: &mut CodedOutputStream) -> Result<()> {
    let titles = BufReader::new(titles_file);
    for ((docid, size), title) in sizes(sizes_mmap)?.iter().enumerate().zip(titles.lines()) {
        let mut document = DocRecord::default();
        // Documents are implicitly numbered 0..n in file order.
        document.set_docid(docid as i32);
        document.set_collection_docid(title?);
        document.set_doclength(size as i32);
        out.write_message_no_tag(&document)?;
    }
    Ok(())
}
571
/// Writes one CIFF posting list per term, re-encoding PISA's absolute document
/// IDs into the gap (delta) encoding CIFF expects.
fn write_postings(
    documents_mmap: &Mmap,
    frequencies_mmap: &Mmap,
    terms_file: &File,
    out: &mut CodedOutputStream,
) -> Result<()> {
    let mut documents = BinaryCollection::try_from(&documents_mmap[..])?;
    // Consumes the leading document-count sequence so the zip below starts at
    // the first real posting list.
    let num_documents = u64::from(read_document_count(&mut documents)?);
    let frequencies = BinaryCollection::try_from(&frequencies_mmap[..])?;
    let terms = BufReader::new(terms_file);

    eprintln!("Writing postings");
    // NOTE(review): the bar length is the document count, but the loop iterates
    // over posting lists — the total displayed may not match; verify intent.
    let progress = ProgressBar::new(num_documents);
    progress.set_style(pb_style());
    progress.set_draw_delta(num_documents / 100);
    for ((term_documents, term_frequencies), term) in documents
        .zip(frequencies)
        .zip(terms.lines())
        .progress_with(progress)
    {
        let mut posting_list = PostingsList::default();
        posting_list.set_term(term?);
        let mut count = 0;
        let mut sum = 0;
        let mut last_doc = 0;
        for (docid, frequency) in term_documents?.iter().zip(term_frequencies?.iter()) {
            let mut posting = Posting::default();
            // CIFF docids are deltas from the previous posting's docid.
            posting.set_docid(docid as i32 - last_doc);
            posting.set_tf(frequency as i32);
            posting_list.postings.push(posting);
            count += 1;
            sum += i64::from(frequency);
            last_doc = docid as i32;
        }
        // df = posting count; cf = sum of term frequencies.
        posting_list.set_df(count);
        posting_list.set_cf(sum);
        out.write_message_no_tag(&posting_list)?;
    }
    Ok(())
}
612
/// PISA to CIFF converter.
#[derive(Debug, Default, Clone)]
pub struct PisaToCiff {
    /// Path to document postings (`.docs`).
    documents_path: Option<PathBuf>,
    /// Path to frequency postings (`.freqs`).
    frequencies_path: Option<PathBuf>,
    /// Path to document sizes (`.sizes`).
    sizes_path: Option<PathBuf>,
    /// Path to the newline-delimited terms file.
    terms_path: Option<PathBuf>,
    /// Path to the newline-delimited document titles file.
    titles_path: Option<PathBuf>,
    /// Path of the CIFF file to write.
    output_path: Option<PathBuf>,
    /// Description stored in the CIFF header.
    description: String,
}
624
impl PisaToCiff {
    /// Sets CIFF index description.
    pub fn description<S: Into<String>>(&mut self, description: S) -> &mut Self {
        self.description = description.into();
        self
    }

    /// Sets PISA paths. Required.
    ///
    /// Paths are constructed by appending file extensions to the base path:
    ///  - `.docs` for document postings,
    ///  - `.freqs` for frequency postings,
    ///  - `.sizes` for document sizes,
    ///  - `.terms` for terms text file,
    ///  - `.documents` for document titles text file,
    pub fn pisa_paths<P: AsRef<OsStr>>(&mut self, base_path: P) -> &mut Self {
        // Lexicon paths derived by `PisaPaths` are intentionally unused here:
        // CIFF output does not need them.
        let paths = PisaPaths::from_base_path(base_path);
        self.documents_path = Some(paths.index.documents);
        self.frequencies_path = Some(paths.index.frequencies);
        self.sizes_path = Some(paths.index.sizes);
        self.terms_path = Some(paths.terms);
        self.titles_path = Some(paths.titles);
        self
    }

    /// Sets PISA (uncompressed) inverted index paths. Required.
    ///
    /// Constructs paths using the given base path, appending suffixes:
    /// `.docs`, `.freqs`, and `.sizes`.
    pub fn index_paths<P: AsRef<OsStr>>(&mut self, base_path: P) -> &mut Self {
        let PisaIndexPaths {
            documents,
            frequencies,
            sizes,
        } = PisaIndexPaths::from_base_path(base_path);
        self.documents_path = Some(documents);
        self.frequencies_path = Some(frequencies);
        self.sizes_path = Some(sizes);
        self
    }

    /// Sets the path of the term file (newline-delimited text format). Required.
    pub fn terms_path<P: Into<PathBuf>>(&mut self, path: P) -> &mut Self {
        self.terms_path = Some(path.into());
        self
    }

    /// Sets the path of the document titles file (newline-delimited text format). Required.
    pub fn titles_path<P: Into<PathBuf>>(&mut self, path: P) -> &mut Self {
        self.titles_path = Some(path.into());
        self
    }

    /// Set the output file path. Required.
    pub fn output_path<P: Into<PathBuf>>(&mut self, path: P) -> &mut Self {
        self.output_path = Some(path.into());
        self
    }

    /// Builds a CIFF index using the previously defined parameters.
    ///
    /// # Errors
    ///
    /// Error will be returned if:
    ///  - some required parameters are not defined,
    ///  - any I/O error occurs during reading input files or writing to the output file,
    ///  - any input file is in an incorrect format.
    pub fn convert(&self) -> Result<()> {
        // Validate that every required path was set before touching the filesystem.
        pisa_to_ciff_from_paths(
            self.documents_path
                .as_ref()
                .ok_or_else(|| anyhow!("undefined document postings path"))?,
            self.frequencies_path
                .as_ref()
                .ok_or_else(|| anyhow!("undefined frequency postings path"))?,
            self.sizes_path
                .as_ref()
                .ok_or_else(|| anyhow!("undefined document sizes path"))?,
            self.terms_path
                .as_ref()
                .ok_or_else(|| anyhow!("undefined terms path"))?,
            self.titles_path
                .as_ref()
                .ok_or_else(|| anyhow!("undefined titles path"))?,
            self.output_path
                .as_ref()
                .ok_or_else(|| anyhow!("undefined output path"))?,
            &self.description,
        )
    }
}
716
/// Converts a PISA "binary collection" (uncompressed inverted index) with a basename `input`
/// to a CIFF index stored in `output`.
///
/// # Errors
///
/// Returns an error when:
/// - an IO error occurs,
/// - writing protobuf format fails.
#[deprecated = "use PisaToCiff instead"]
pub fn pisa_to_ciff(
    collection_input: &Path,
    terms_input: &Path,
    titles_input: &Path,
    output: &Path,
    description: &str,
) -> Result<()> {
    PisaToCiff::default()
        .description(description)
        .index_paths(collection_input)
        .terms_path(terms_input)
        .titles_path(titles_input)
        .output_path(output)
        .convert()
}
741
/// Opens all PISA input files, memory-maps the binary collections, and streams
/// the CIFF header, posting lists, and document records into `output`.
fn pisa_to_ciff_from_paths(
    documents_path: &Path,
    frequencies_path: &Path,
    sizes_path: &Path,
    terms_path: &Path,
    titles_path: &Path,
    output: &Path,
    description: &str,
) -> Result<()> {
    let documents_file = File::open(documents_path)?;
    let frequencies_file = File::open(frequencies_path)?;
    let sizes_file = File::open(sizes_path)?;
    let terms_file = File::open(terms_path)?;
    let titles_file = File::open(titles_path)?;

    // SAFETY: the maps are only read and stay alive until the end of this
    // function; soundness assumes no other process modifies the files while
    // the conversion runs — TODO confirm for concurrent callers.
    let documents_mmap = unsafe { Mmap::map(&documents_file)? };
    let frequencies_mmap = unsafe { Mmap::map(&frequencies_file)? };
    let sizes_mmap = unsafe { Mmap::map(&sizes_file)? };

    let mut writer = BufWriter::new(File::create(output)?);
    let mut out = CodedOutputStream::new(&mut writer);

    // CIFF layout: header first, then posting lists, then document records.
    let header = header(&documents_mmap[..], &sizes_mmap[..], description)?;
    out.write_message_no_tag(&header)?;

    write_postings(&documents_mmap, &frequencies_mmap, &terms_file, &mut out)?;
    write_sizes(&sizes_mmap, &titles_file, &mut out)?;

    out.flush()?;

    Ok(())
}
774
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_size_sequence() {
        // An empty buffer is not a valid sizes collection.
        let empty_memory = Vec::<u8>::new();
        let sizes = sizes(&empty_memory);
        assert!(sizes.is_err());
        assert_eq!(
            "Invalid binary collection format: sizes collection is empty",
            &format!("{}", sizes.err().unwrap())
        );

        // A single sequence: length prefix (5) followed by five elements.
        let valid_memory: Vec<u8> = [
            5_u32.to_le_bytes(),
            1_u32.to_le_bytes(),
            2_u32.to_le_bytes(),
            3_u32.to_le_bytes(),
            4_u32.to_le_bytes(),
            5_u32.to_le_bytes(),
        ]
        .iter()
        .flatten()
        .copied()
        .collect();
        let sizes = super::sizes(&valid_memory);
        assert!(sizes.is_ok());
        assert_eq!(
            sizes.unwrap().iter().collect::<Vec<u32>>(),
            vec![1_u32, 2, 3, 4, 5]
        );
    }

    /// Serializes `header` into a protobuf buffer, as written to a CIFF stream.
    fn header_to_buf(header: &proto::Header) -> Result<Vec<u8>> {
        let mut buffer = Vec::<u8>::new();
        let mut out = CodedOutputStream::vec(&mut buffer);
        out.write_message_no_tag(header)?;
        out.flush()?;
        Ok(buffer)
    }

    #[test]
    fn test_read_default_header() -> Result<()> {
        let mut proto_header = proto::Header::default();
        proto_header.set_num_docs(17);
        proto_header.set_num_postings_lists(1234);

        let buffer = header_to_buf(&proto_header)?;

        // Round-trip: the typed header must preserve the protobuf counts.
        let mut input = CodedInputStream::from_bytes(&buffer);
        let header = Header::from_stream(&mut input)?;
        assert_eq!(header.protobuf_header, proto_header);
        assert_eq!(header.num_documents, 17);
        assert_eq!(header.num_postings_lists, 1234);
        Ok(())
    }

    #[test]
    fn test_read_negative_num_documents() -> Result<()> {
        // Negative counts must be rejected while reading the header.
        let mut proto_header = proto::Header::default();
        proto_header.set_num_docs(-17);

        let buffer = header_to_buf(&proto_header)?;

        let mut input = CodedInputStream::from_bytes(&buffer);
        assert!(Header::from_stream(&mut input).is_err());
        Ok(())
    }

    #[test]
    fn test_read_negative_num_posting_lists() -> Result<()> {
        // Negative counts must be rejected while reading the header.
        let mut proto_header = proto::Header::default();
        proto_header.set_num_postings_lists(-1234);

        let buffer = header_to_buf(&proto_header)?;

        let mut input = CodedInputStream::from_bytes(&buffer);
        assert!(Header::from_stream(&mut input).is_err());
        Ok(())
    }
}