summa-core 0.22.6

Summa Core library
Documentation
use std::collections::HashMap;

use regex::Regex;
use tantivy::query::{BooleanQuery, Occur, Query};
use tantivy::schema::{Field, FieldType, Schema};
use tantivy::tokenizer::TokenizerManager;
use tantivy::Term;

use crate::components::query_parser::utils::cast_field_to_term;

pub trait TermFieldMapper {
    fn map(&self, value: &str, fields: &[String]) -> Option<Box<dyn Query>>;
}

fn tokenize_value(schema: &Schema, field: &Field, full_path: &str, value: &str, tokenizer_manager: &TokenizerManager) -> Vec<Term> {
    let field_entry = schema.get_field_entry(*field);
    let mut terms = vec![];
    match field_entry.field_type() {
        FieldType::Str(ref str_options) => {
            let option = str_options.get_indexing_options().expect("no options");
            let mut text_analyzer = tokenizer_manager.get(option.tokenizer()).expect("unknown tokenizer");
            let mut token_stream = text_analyzer.token_stream(value);
            token_stream.process(&mut |token| {
                let term = cast_field_to_term(field, full_path, schema.get_field_entry(*field).field_type(), &token.text, true);
                terms.push(term);
            });
        }
        _ => terms.push(cast_field_to_term(field, full_path, schema.get_field_entry(*field).field_type(), value, true)),
    };
    terms
}

pub struct DoiMapper {
    schema: Schema,
    tokenizer_manager: TokenizerManager,
}

impl DoiMapper {
    pub fn new(schema: Schema, tokenizer_manager: TokenizerManager) -> Self {
        DoiMapper { schema, tokenizer_manager }
    }
}

impl TermFieldMapper for DoiMapper {
    fn map(&self, value: &str, fields: &[String]) -> Option<Box<dyn Query>> {
        let terms = fields
            .iter()
            .flat_map(|field_name| {
                let (field, full_path) = self.schema.find_field(field_name).expect("inconsistent state");
                tokenize_value(&self.schema, &field, full_path, value, &self.tokenizer_manager)
            })
            .collect();
        Some(Box::new(BooleanQuery::new_multiterms_query(terms)) as Box<dyn Query>)
    }
}

pub struct DoiIsbnMapper {
    schema: Schema,
    tokenizer_manager: TokenizerManager,
}

impl DoiIsbnMapper {
    pub fn new(schema: Schema, tokenizer_manager: TokenizerManager) -> Self {
        DoiIsbnMapper { schema, tokenizer_manager }
    }
}

impl TermFieldMapper for DoiIsbnMapper {
    fn map(&self, value: &str, fields: &[String]) -> Option<Box<dyn Query>> {
        thread_local! {
            static MATCHER: Regex = Regex::new(r"(10.[0-9]+)/((?:cbo)?(?:(?:978[0-9]{10})|(?:978[0-9]{13})|(?:978[-0-9]+)))(.*)").expect("cannot compile regex");
        }
        MATCHER.with(|matcher| {
            let lowercased_doi = value.to_lowercase();
            for cap in matcher.captures_iter(&lowercased_doi) {
                let (_, isbn, _) = (&cap[1], &cap[2], &cap[3]);
                let corrected_isbn = isbn.replace('-', "").replace("cbo", "");
                if corrected_isbn.len() == 10 || corrected_isbn.len() == 13 {
                    let subqueries: Vec<_> = fields
                        .iter()
                        .map(|field_name| {
                            let (field, full_path) = self.schema.find_field(field_name).expect("inconsistent state");
                            let terms = tokenize_value(&self.schema, &field, full_path, &corrected_isbn, &self.tokenizer_manager);
                            let query = Box::new(BooleanQuery::new_multiterms_query(terms)) as Box<dyn Query>;
                            (Occur::Should, query)
                        })
                        .collect();
                    return Some(Box::new(BooleanQuery::new(subqueries)) as Box<dyn Query>);
                }
            }
            None
        })
    }
}

pub struct IsbnMapper {
    schema: Schema,
    tokenizer_manager: TokenizerManager,
}

impl IsbnMapper {
    pub fn new(schema: Schema, tokenizer_manager: TokenizerManager) -> Self {
        IsbnMapper { schema, tokenizer_manager }
    }
}

impl TermFieldMapper for IsbnMapper {
    fn map(&self, value: &str, fields: &[String]) -> Option<Box<dyn Query>> {
        let subqueries: Vec<_> = fields
            .iter()
            .map(|field_name| {
                let (field, full_path) = self.schema.find_field(field_name).expect("inconsistent state");
                let terms = tokenize_value(&self.schema, &field, full_path, &value.replace('-', ""), &self.tokenizer_manager);
                (Occur::Should, Box::new(BooleanQuery::new_multiterms_query(terms)) as Box<dyn Query>)
            })
            .collect();
        Some(Box::new(BooleanQuery::new(subqueries)) as Box<dyn Query>)
    }
}

pub struct TermFieldMappersManager {
    term_field_mappers: HashMap<String, Box<dyn TermFieldMapper>>,
}

impl TermFieldMappersManager {
    pub fn new(schema: &Schema, tokenizer_manager: &TokenizerManager) -> Self {
        let mut term_field_mappers = HashMap::new();
        term_field_mappers.insert(
            "doi".to_string(),
            Box::new(DoiMapper::new(schema.clone(), tokenizer_manager.clone())) as Box<dyn TermFieldMapper>,
        );
        term_field_mappers.insert(
            "doi_isbn".to_string(),
            Box::new(DoiIsbnMapper::new(schema.clone(), tokenizer_manager.clone())) as Box<dyn TermFieldMapper>,
        );
        term_field_mappers.insert(
            "isbn".to_string(),
            Box::new(IsbnMapper::new(schema.clone(), tokenizer_manager.clone())) as Box<dyn TermFieldMapper>,
        );
        TermFieldMappersManager { term_field_mappers }
    }

    pub fn get(&self, name: &str) -> Option<&Box<dyn TermFieldMapper>> {
        self.term_field_mappers.get(name)
    }
}