stork-lib 1.6.0

Impossibly fast web search, made for static sites.
Documentation
pub mod intermediate_excerpt;
use std::collections::BTreeMap;
use std::collections::HashMap;

use intermediate_excerpt::IntermediateExcerpt;

mod entry_and_intermediate_excerpts;
use entry_and_intermediate_excerpts::EntryAndIntermediateExcerpts;

use crate::stopwords;
use crate::Output;
use crate::Result;
use crate::V3Index as Index;

use super::scores::STOPWORD_SCORE;
use super::AliasTarget;
use super::Container;
use super::Entry;
use super::EntryIndex;
use super::Score;
use super::SearchResult;

/// Runs `query` against `index` and returns the matching entries, best first.
///
/// The query is lowercased and split on spaces and hyphens. Every resulting
/// word is looked up verbatim in `index.containers`; words without a container
/// contribute nothing. Excerpts whose query word appears in the stopword list
/// have their score replaced with `STOPWORD_SCORE`.
///
/// Excerpts are grouped per entry, each group is converted into a `Result`,
/// and the results are ordered by descending score, with ties broken by
/// ascending title. At most `index.config.displayed_results_count` results are
/// returned; `total_hit_count` is the number of matching entries before that
/// truncation.
pub fn search(index: &Index, query: &str) -> Output {
    let normalized_query = query.to_lowercase();

    // Get the containers for each word in the query, and separate them
    // into intermediate excerpts.
    // NOTE(review): the container is cloned because `ContainerWithQuery::new`
    // takes it by value.
    let mut intermediate_excerpts: Vec<IntermediateExcerpt> = normalized_query
        .split(|c| c == ' ' || c == '-')
        .filter_map(|word| index.containers.get_key_value(word))
        .map(|(word, ctr)| ContainerWithQuery::new(ctr.clone(), word))
        .flat_map(|ctr_query| ctr_query.get_intermediate_excerpts(index))
        .collect();

    // Stopword matches get a fixed score instead of the one stored in the index.
    for excerpt in &mut intermediate_excerpts {
        if stopwords.contains(&excerpt.query.as_str()) {
            excerpt.score = STOPWORD_SCORE;
        }
    }

    // Group the excerpts by the entry they belong to.
    let mut excerpts_by_index: BTreeMap<EntryIndex, Vec<IntermediateExcerpt>> = BTreeMap::new();
    for excerpt in intermediate_excerpts {
        excerpts_by_index
            .entry(excerpt.entry_index)
            .or_default()
            .push(excerpt);
    }

    // Consume the map so each group of excerpts is moved, not cloned, into its
    // result. A group whose entry index is not present in `index.entries` is
    // skipped rather than panicking on an out-of-bounds access.
    let mut output_results: Vec<Result> = excerpts_by_index
        .into_iter()
        .filter_map(|(entry_index, intermediate_excerpts)| {
            let entry = index.entries.get(entry_index)?.clone();
            Some(Result::from(EntryAndIntermediateExcerpts {
                entry,
                config: index.config.clone(),
                intermediate_excerpts,
            }))
        })
        .collect();

    // Counted before truncation so callers can tell how many entries matched.
    let total_hit_count = output_results.len();

    // Highest score first; equal scores are ordered by title. The sort is
    // stable, so entries equal on both keep their entry-index order.
    output_results.sort_by(|a, b| {
        (b.score as i64)
            .cmp(&(a.score as i64))
            .then_with(|| a.entry.title.cmp(&b.entry.title))
    });
    output_results.truncate(index.config.displayed_results_count as usize);

    Output {
        results: output_results,
        total_hit_count,
        url_prefix: index.config.url_prefix.clone(),
    }
}

/// The contents of an index container, paired with the query word that was
/// used to look the container up.
#[derive(Debug)]
struct ContainerWithQuery {
    /// The container's own search results, keyed by entry index.
    results: BTreeMap<EntryIndex, SearchResult>,
    /// Keys of other containers whose results should also be reported, each
    /// with the score given to excerpts taken from that container.
    aliases: BTreeMap<AliasTarget, Score>,
    /// The query word this container was found under.
    query: String,
}

impl ContainerWithQuery {
    fn new(container: Container, query: &str) -> Self {
        ContainerWithQuery {
            query: query.to_string(),
            results: container.results,
            aliases: container.aliases,
        }
    }

    fn get_intermediate_excerpts(&self, index: &Index) -> Vec<IntermediateExcerpt> {
        let mut output = vec![];
        // Put container's results in output
        for (entry_index, result) in &self.results {
            if result.excerpts.is_empty() {
                output.push(IntermediateExcerpt {
                    query: self.query.to_string(),
                    entry_index: *entry_index,
                    score: result.score,
                    source: super::WordListSource::Contents,
                    word_index: 0,
                    internal_annotations: vec![],
                    fields: HashMap::new(),
                });
            }

            for excerpt in &result.excerpts {
                output.push(IntermediateExcerpt {
                    query: self.query.to_string(),
                    entry_index: *entry_index,
                    score: result.score,
                    source: excerpt.source,
                    word_index: excerpt.word_index,
                    internal_annotations: excerpt.internal_annotations.clone(),
                    fields: excerpt.fields.clone(),
                });
            }
        }

        // Put alias containers' results in output
        for (alias_target, alias_score) in &self.aliases {
            if let Some(target_container) = index.containers.get(alias_target) {
                for (entry_index, result) in target_container.results.clone() {
                    if result.excerpts.is_empty() {
                        output.push(IntermediateExcerpt {
                            query: self.query.to_string(),
                            entry_index,
                            score: result.score,
                            source: super::WordListSource::Contents,
                            word_index: 0,
                            internal_annotations: vec![],
                            fields: HashMap::new(),
                        });
                    }

                    for excerpt in result.excerpts.clone() {
                        output.push(IntermediateExcerpt {
                            query: alias_target.to_string(),
                            entry_index,
                            score: *alias_score,
                            internal_annotations: excerpt.internal_annotations,
                            source: excerpt.source,
                            word_index: excerpt.word_index,
                            fields: excerpt.fields,
                        });
                    }
                }
            }
        }

        output
    }
}

/// Converts an index entry into the crate-level `Entry`, carrying over its
/// URL, title, and fields.
impl From<Entry> for crate::Entry {
    fn from(entry: Entry) -> Self {
        // `entry` is owned and not used afterwards, so every field is moved
        // into the new value instead of being cloned.
        crate::Entry {
            url: entry.url,
            title: entry.title,
            fields: entry.fields,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use std::convert::TryFrom;
    use std::fs;

    /// End-to-end check: loads a prebuilt index from the test assets, runs a
    /// multi-word query against it, and compares the complete output with a
    /// known-good JSON snapshot.
    #[test]
    fn e2e_v3_search_works() {
        // Fail loudly if the fixture cannot be read, instead of carrying on
        // with an empty buffer and failing later while parsing the index.
        let index_bytes = fs::read("../test-assets/federalist-min-0.7.0.st")
            .expect("test index fixture should be readable");

        let index = Index::try_from(index_bytes.as_slice()).unwrap();
        let generated = search(&index, "liber old world");
        let expected = serde_json::from_str("{\"results\":[{\"entry\":{\"url\":\"https://www.congress.gov/resources/display/content/The+Federalist+Papers#TheFederalistPapers-1\",\"title\":\"Introduction\",\"fields\":{}},\"excerpts\":[{\"text\":\"in many respects the most interesting in the world. It has been frequently remarked that it\",\"highlight_ranges\":[{\"beginning\":45,\"end\":51}],\"score\":128,\"internal_annotations\":[],\"fields\":{}},{\"text\":\"despotic power and hostile to the principles of liberty. An over-scrupulous jealousy of danger to the\",\"highlight_ranges\":[{\"beginning\":48,\"end\":56}],\"score\":125,\"internal_annotations\":[],\"fields\":{}},{\"text\":\"of love, and that the noble enthusiasm of liberty is apt to be infected with a\",\"highlight_ranges\":[{\"beginning\":42,\"end\":49}],\"score\":125,\"internal_annotations\":[],\"fields\":{}},{\"text\":\"of government is essential to the security of liberty; that, in the contemplation of a sound\",\"highlight_ranges\":[{\"beginning\":46,\"end\":54}],\"score\":125,\"internal_annotations\":[],\"fields\":{}},{\"text\":\"that this is the safest course for your liberty, your dignity, and your happiness. I affect\",\"highlight_ranges\":[{\"beginning\":40,\"end\":48}],\"score\":125,\"internal_annotations\":[],\"fields\":{}}],\"title_highlight_ranges\":[],\"score\":878}],\"total_hit_count\":1,\"url_prefix\":\"\"}").unwrap();

        assert_eq!(generated, expected, "{:?}", generated);
    }
}