use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::{create_dir, File, remove_dir_all};
use std::io::Write;
use std::path::PathBuf;
use clap::arg_enum;
use croaring::Bitmap;
use crate::{Term, TermId};
use crate::build::chunks::{ChunkStrKey, ChunkU32Key};
use crate::build::chunks::bst::BstChunks;
use crate::build::js::generate_worker_js;
use crate::build::wasm::generate_and_compile_runner_wasm;
use crate::data::document_terms::DocumentTermsReader;
use crate::data::documents::DocumentsReader;
use crate::util::format::{number, percent};
use crate::util::log::status_log_interval;
mod js;
mod chunks;
mod wasm;
arg_enum! {
pub enum DataStore {
URL,
KV,
}
}
pub struct BuildConfig {
pub chunk_size: usize,
pub data_store: DataStore,
pub data_store_url_prefix: Option<String>,
pub document_terms_source: File,
pub documents_source: File,
pub maximum_query_results: usize,
pub maximum_query_terms: usize,
pub output_dir: PathBuf,
}
pub fn build(BuildConfig {
chunk_size,
data_store,
data_store_url_prefix,
document_terms_source,
documents_source,
maximum_query_results,
maximum_query_terms,
output_dir,
}: BuildConfig) -> () {
let mut terms = Vec::<Term>::new();
let mut term_ids = HashMap::<Term, TermId>::new();
let mut terms_by_document = Vec::<Vec<TermId>>::new();
let mut inverted_index = Vec::<Bitmap>::new();
let mut term_frequency = HashMap::<TermId, usize>::new();
for (document_id, term) in DocumentTermsReader::new(document_terms_source) {
while terms_by_document.len() <= document_id {
terms_by_document.push(Vec::<TermId>::new());
};
let document_terms = &mut terms_by_document[document_id];
let term_id = match term_ids.get(&term) {
Some(term_id) => *term_id,
None => {
assert_eq!(terms.len(), inverted_index.len());
let term_id = terms.len() as TermId;
term_ids.insert(term.clone(), term_id);
terms.push(term);
inverted_index.push(Bitmap::create());
term_id
}
};
document_terms.push(term_id);
term_frequency.insert(term_id, term_frequency.get(&term_id).unwrap_or(&0) + 1);
};
let document_count = terms_by_document.len();
let hash_log_interval = status_log_interval(document_count, 10);
for (document_id, doc_terms) in terms_by_document.iter().enumerate() {
interval_log!(hash_log_interval, document_id, document_count, "Processing documents ({})...");
for term_id in doc_terms {
inverted_index[*term_id].add(document_id.try_into().expect("too many documents"));
};
};
println!("There are {} documents with {} terms", number(terms_by_document.len()), number(terms.len()));
let mut terms_index_builder = BstChunks::<ChunkStrKey>::new(chunk_size);
let mut terms_sorted = (0..terms.len()).collect::<Vec<TermId>>();
terms_sorted.sort_by(|a, b| terms[*a].cmp(&terms[*b]));
for term_id in terms_sorted.iter() {
let postings_list_entry = &mut inverted_index[*term_id];
postings_list_entry.run_optimize();
let serialised = postings_list_entry.serialize();
terms_index_builder.insert(ChunkStrKey::new(&terms[*term_id]), serialised);
};
let (terms_index_raw_lookup, terms_index_serialised_entries) = terms_index_builder.serialise();
println!("{} chunks contain terms", number(terms_index_builder.chunk_count()));
let _ = remove_dir_all(output_dir.join("terms"));
create_dir(output_dir.join("terms")).expect("create terms chunks folder");
for (i, chunk) in terms_index_serialised_entries.iter().enumerate() {
let mut f = File::create(
output_dir.join("terms").join(format!("{}", i))
).expect("open terms chunk file for writing");
f.write_all(chunk).expect("write terms chunk");
};
let mut documents_builder = BstChunks::<ChunkU32Key>::new(chunk_size);
for (document_id, document) in DocumentsReader::new(documents_source) {
documents_builder.insert(ChunkU32Key::new(document_id.try_into().expect("too many documents")), document.as_bytes().to_vec());
};
let (documents_raw_lookup, documents_serialised_entries) = documents_builder.serialise();
println!("{} chunks contain documents", number(documents_builder.chunk_count()));
let _ = remove_dir_all(output_dir.join("documents"));
create_dir(output_dir.join("documents")).expect("create documents chunks folder");
for (i, chunk) in documents_serialised_entries.iter().enumerate() {
let mut f = File::create(
output_dir.join("documents").join(format!("{}", i))
).expect("open documents chunk file for writing");
f.write_all(chunk).expect("write documents chunk");
};
generate_worker_js(
&output_dir,
data_store,
data_store_url_prefix,
terms_by_document.len(),
maximum_query_terms,
maximum_query_results,
);
generate_and_compile_runner_wasm(
&output_dir,
maximum_query_results,
maximum_query_terms,
terms_index_raw_lookup.as_str(),
terms_index_serialised_entries.len(),
documents_raw_lookup.as_str(),
documents_serialised_entries.len(),
);
println!("Build complete")
}