use crate::error::Result;
/// Statistics reported for a single term by a [`TermsEnum`].
#[derive(Debug, Clone)]
pub struct TermStats {
/// Text of the term.
pub term: String,
/// Document frequency of the term (presumably the number of documents
/// containing it — confirm against the dictionary that produces it).
pub doc_freq: u64,
/// Total term frequency (presumably the number of occurrences across all
/// documents — confirm against the dictionary that produces it).
pub total_term_freq: u64,
}
/// Cursor over the terms of a single field, yielding [`TermStats`] per term.
///
/// Implementors must be `Send + Sync`.
pub trait TermsEnum: Send + Sync {
/// Moves the cursor forward and returns the statistics of the term it lands
/// on, or `Ok(None)` once there are no more terms.
fn next(&mut self) -> Result<Option<TermStats>>;
/// Positions the cursor relative to `target` and returns whether `target`
/// itself is present.
///
/// NOTE(review): the implementations in this file land on the first term
/// ordered after `target` when there is no exact match — confirm that this
/// is the intended contract for every implementor.
fn seek(&mut self, target: &str) -> Result<bool>;
/// Positions the cursor on `term` if it is present and returns whether it
/// was found.
fn seek_exact(&mut self, term: &str) -> Result<bool>;
/// Borrows the statistics of the term the cursor currently rests on, if any.
fn current(&self) -> Option<&TermStats>;
/// Returns an owned copy of whatever [`TermsEnum::current`] reports.
fn term_stats(&self) -> Option<TermStats> {
self.current().cloned()
}
}
/// Read-only view of the terms of a single field, plus aggregate statistics.
///
/// Implementors must be `Send + Sync`.
pub trait Terms: Send + Sync {
/// Creates a new [`TermsEnum`] over the terms of this view.
fn iterator(&self) -> Result<Box<dyn TermsEnum>>;
/// Number of terms in this view. `None` presumably means "unknown"; the
/// implementations in this file always return `Some`.
fn size(&self) -> Option<u64>;
/// Sum of the document frequencies of all terms in this view, if known.
fn sum_doc_freq(&self) -> Option<u64>;
/// Sum of the total term frequencies of all terms in this view, if known.
fn sum_total_term_freq(&self) -> Option<u64>;
/// Reports whether term frequencies are available; the default is `true`.
fn has_freqs(&self) -> bool {
true
}
/// Reports whether positions are available; the default is `false`.
fn has_positions(&self) -> bool {
false
}
/// Reports whether offsets are available; the default is `false`.
fn has_offsets(&self) -> bool {
false
}
/// Reports whether payloads are available; the default is `false`.
fn has_payloads(&self) -> bool {
false
}
}
/// Access to per-field term views.
pub trait TermDictionaryAccess {
    /// Returns the [`Terms`] view for `field`, or `Ok(None)` when the
    /// implementor has no terms to offer for it.
    fn terms(&self, field: &str) -> Result<Option<Box<dyn Terms>>>;

    /// Reports whether `term` is present in `field`.
    ///
    /// Returns `Ok(false)` when [`TermDictionaryAccess::terms`] yields no view
    /// for the field. Errors from `terms`, from creating the iterator, or from
    /// the exact seek are propagated unchanged.
    fn term_exists(&self, field: &str, term: &str) -> Result<bool> {
        match self.terms(field)? {
            None => Ok(false),
            Some(field_terms) => {
                let mut cursor = field_terms.iterator()?;
                cursor.seek_exact(term)
            }
        }
    }
}
/// Lets a boxed [`TermsEnum`] trait object be used wherever a `TermsEnum`
/// is expected by forwarding every required method to the boxed value.
impl TermsEnum for Box<dyn TermsEnum> {
    /// Forwards to the boxed enumerator's `next`.
    fn next(&mut self) -> Result<Option<TermStats>> {
        self.as_mut().next()
    }

    /// Forwards to the boxed enumerator's `seek`.
    fn seek(&mut self, target: &str) -> Result<bool> {
        self.as_mut().seek(target)
    }

    /// Forwards to the boxed enumerator's `seek_exact`.
    fn seek_exact(&mut self, term: &str) -> Result<bool> {
        self.as_mut().seek_exact(term)
    }

    /// Forwards to the boxed enumerator's `current`.
    fn current(&self) -> Option<&TermStats> {
        self.as_ref().current()
    }
}
use std::collections::BTreeMap;
use std::sync::Arc;
use crate::lexical::index::structures::dictionary::{HybridTermDictionary, TermInfo};
/// [`TermsEnum`] over the terms of one field, backed by a snapshot copied out
/// of a [`HybridTermDictionary`] at construction time.
pub struct InvertedIndexTermsEnum {
/// Field-local term texts (field prefix removed) paired with their
/// dictionary entries. The seek methods binary-search this vector, so it
/// has to be ordered by term text.
terms: Vec<(String, TermInfo)>,
/// Index into `terms` of the entry the next call to `next()` will return.
position: usize,
/// Statistics of the term most recently landed on, if any.
current: Option<TermStats>,
}
impl InvertedIndexTermsEnum {
pub fn new(field: &str, dict: &Arc<HybridTermDictionary>) -> Self {
let field_prefix = format!("{}:", field);
let mut terms = Vec::new();
for (term, info) in dict.iter() {
if let Some(term_text) = term.strip_prefix(&field_prefix) {
terms.push((term_text.to_string(), info.clone()));
}
}
InvertedIndexTermsEnum {
terms,
position: 0,
current: None,
}
}
}
impl InvertedIndexTermsEnum {
    /// Builds the statistics record for the entry at `index`, or `None` when
    /// `index` is past the end of the term list.
    fn stats_at(&self, index: usize) -> Option<TermStats> {
        self.terms.get(index).map(|(text, info)| TermStats {
            term: text.clone(),
            doc_freq: info.doc_frequency,
            total_term_freq: info.total_frequency,
        })
    }
}

impl TermsEnum for InvertedIndexTermsEnum {
    /// Returns the entry at the cursor and then advances the cursor by one.
    ///
    /// Once the list is exhausted the current term is cleared and `Ok(None)`
    /// is returned; the cursor is not moved any further.
    fn next(&mut self) -> Result<Option<TermStats>> {
        let upcoming = self.stats_at(self.position);
        if upcoming.is_some() {
            self.position += 1;
        }
        self.current = upcoming.clone();
        Ok(upcoming)
    }

    /// Binary-searches for `target` and moves the cursor to the match, or to
    /// the insertion point when there is no match.
    ///
    /// The current term becomes the entry at that index, or `None` when the
    /// index is past the end. Returns `Ok(true)` only on an exact match.
    /// Because the cursor rests *on* the located entry, a following `next()`
    /// returns that same entry.
    fn seek(&mut self, target: &str) -> Result<bool> {
        let lookup = self
            .terms
            .binary_search_by(|(text, _)| text.as_str().cmp(target));
        let (index, found) = match lookup {
            Ok(hit) => (hit, true),
            Err(insertion_point) => (insertion_point, false),
        };
        self.position = index;
        self.current = self.stats_at(index);
        Ok(found)
    }

    /// Binary-searches for `term`; on a hit the cursor and current term move
    /// to it and `Ok(true)` is returned.
    ///
    /// On a miss the current term is cleared, the cursor index is left where
    /// it was, and `Ok(false)` is returned.
    fn seek_exact(&mut self, term: &str) -> Result<bool> {
        let lookup = self
            .terms
            .binary_search_by(|(text, _)| text.as_str().cmp(term));
        if let Ok(index) = lookup {
            self.position = index;
            self.current = self.stats_at(index);
            return Ok(true);
        }
        self.current = None;
        Ok(false)
    }

    /// Borrows the statistics of the term most recently landed on, if any.
    fn current(&self) -> Option<&TermStats> {
        self.current.as_ref()
    }
}
/// [`Terms`] view over one field of a single [`HybridTermDictionary`], with
/// aggregate statistics computed once at construction.
pub struct InvertedIndexTerms {
/// Name of the field this view covers.
field: String,
/// Dictionary the terms are read from.
dict: Arc<HybridTermDictionary>,
/// Number of dictionary entries belonging to the field; the constructor
/// always stores `Some`.
size: Option<u64>,
/// Sum of `doc_frequency` over the field's entries; the constructor always
/// stores `Some`.
sum_doc_freq: Option<u64>,
/// Sum of `total_frequency` over the field's entries; the constructor
/// always stores `Some`.
sum_total_term_freq: Option<u64>,
}
impl InvertedIndexTerms {
pub fn new(field: &str, dict: Arc<HybridTermDictionary>) -> Self {
let field_prefix = format!("{}:", field);
let mut size = 0u64;
let mut sum_doc_freq = 0u64;
let mut sum_total_term_freq = 0u64;
for (term, info) in dict.iter() {
if term.starts_with(&field_prefix) {
size += 1;
sum_doc_freq += info.doc_frequency;
sum_total_term_freq += info.total_frequency;
}
}
InvertedIndexTerms {
field: field.to_string(),
dict,
size: Some(size),
sum_doc_freq: Some(sum_doc_freq),
sum_total_term_freq: Some(sum_total_term_freq),
}
}
}
impl Terms for InvertedIndexTerms {
    /// Builds a fresh [`InvertedIndexTermsEnum`] for this view's field.
    ///
    /// Each call re-reads the dictionary via `InvertedIndexTermsEnum::new`.
    fn iterator(&self) -> Result<Box<dyn TermsEnum>> {
        let cursor = InvertedIndexTermsEnum::new(self.field.as_str(), &self.dict);
        Ok(Box::new(cursor))
    }

    /// Number of terms counted for the field at construction.
    fn size(&self) -> Option<u64> {
        self.size
    }

    /// Sum of document frequencies computed at construction.
    fn sum_doc_freq(&self) -> Option<u64> {
        self.sum_doc_freq
    }

    /// Sum of total term frequencies computed at construction.
    fn sum_total_term_freq(&self) -> Option<u64> {
        self.sum_total_term_freq
    }
}
/// [`Terms`] view over one field merged across several dictionaries.
pub struct MergedInvertedIndexTerms {
/// `(term text, summed doc frequency, summed total frequency)` per distinct
/// term, in ascending term order (built from a `BTreeMap`).
merged: Vec<(String, u64, u64)>,
/// Number of distinct terms in `merged`.
size: u64,
/// Sum of the second tuple element over `merged`.
sum_doc_freq: u64,
/// Sum of the third tuple element over `merged`.
sum_total_term_freq: u64,
}
impl MergedInvertedIndexTerms {
pub fn new(field: &str, dicts: &[Arc<HybridTermDictionary>]) -> Self {
let field_prefix = format!("{}:", field);
let mut map: BTreeMap<String, (u64, u64)> = BTreeMap::new();
for dict in dicts {
for (term, info) in dict.iter() {
if let Some(term_text) = term.strip_prefix(&field_prefix) {
let entry = map.entry(term_text.to_string()).or_insert((0, 0));
entry.0 += info.doc_frequency;
entry.1 += info.total_frequency;
}
}
}
let mut merged = Vec::with_capacity(map.len());
let mut sum_doc_freq = 0u64;
let mut sum_total_term_freq = 0u64;
for (term, (df, ttf)) in map {
sum_doc_freq += df;
sum_total_term_freq += ttf;
merged.push((term, df, ttf));
}
MergedInvertedIndexTerms {
size: merged.len() as u64,
merged,
sum_doc_freq,
sum_total_term_freq,
}
}
}
impl Terms for MergedInvertedIndexTerms {
    /// Builds a fresh enumerator positioned before the first merged term.
    ///
    /// The enumerator owns its own copy of the merged term list, so every
    /// call clones that list.
    fn iterator(&self) -> Result<Box<dyn TermsEnum>> {
        let cursor = MergedTermsEnum {
            terms: self.merged.to_vec(),
            position: 0,
            current: None,
        };
        Ok(Box::new(cursor))
    }

    /// Number of distinct merged terms; always `Some`.
    fn size(&self) -> Option<u64> {
        Some(self.size)
    }

    /// Sum of the merged document frequencies; always `Some`.
    fn sum_doc_freq(&self) -> Option<u64> {
        Some(self.sum_doc_freq)
    }

    /// Sum of the merged total term frequencies; always `Some`.
    fn sum_total_term_freq(&self) -> Option<u64> {
        Some(self.sum_total_term_freq)
    }
}
/// [`TermsEnum`] over an owned list of merged `(term, doc freq, total freq)`
/// tuples.
struct MergedTermsEnum {
/// Merged entries; the seek methods binary-search this vector by term text,
/// so it has to be ordered by term text.
terms: Vec<(String, u64, u64)>,
/// Index into `terms` of the entry the next call to `next()` will return.
position: usize,
/// Statistics of the term most recently landed on, if any.
current: Option<TermStats>,
}
impl MergedTermsEnum {
    /// Builds the statistics record for the entry at `index`, or `None` when
    /// `index` is past the end of the merged term list.
    fn stats_at(&self, index: usize) -> Option<TermStats> {
        self.terms.get(index).map(|(text, df, ttf)| TermStats {
            term: text.clone(),
            doc_freq: *df,
            total_term_freq: *ttf,
        })
    }
}

impl TermsEnum for MergedTermsEnum {
    /// Returns the entry at the cursor and then advances the cursor by one.
    ///
    /// Once the list is exhausted the current term is cleared and `Ok(None)`
    /// is returned; the cursor is not moved any further.
    fn next(&mut self) -> Result<Option<TermStats>> {
        let upcoming = self.stats_at(self.position);
        if upcoming.is_some() {
            self.position += 1;
        }
        self.current = upcoming.clone();
        Ok(upcoming)
    }

    /// Binary-searches for `target` and moves the cursor to the match, or to
    /// the insertion point when there is no match.
    ///
    /// The current term becomes the entry at that index. When `target` sorts
    /// after every term, the current term is cleared rather than left
    /// pointing at whatever was visited before. Returns `Ok(true)` only on
    /// an exact match. Because the cursor rests *on* the located entry, a
    /// following `next()` returns that same entry.
    fn seek(&mut self, target: &str) -> Result<bool> {
        let lookup = self
            .terms
            .binary_search_by(|(text, _, _)| text.as_str().cmp(target));
        let (index, found) = match lookup {
            Ok(hit) => (hit, true),
            Err(insertion_point) => (insertion_point, false),
        };
        self.position = index;
        // `stats_at` is `None` past the end, which also drops any stale term.
        self.current = self.stats_at(index);
        Ok(found)
    }

    /// Binary-searches for `term`; on a hit the cursor and current term move
    /// to it and `Ok(true)` is returned.
    ///
    /// On a miss the current term is cleared, the cursor index is left where
    /// it was, and `Ok(false)` is returned.
    fn seek_exact(&mut self, term: &str) -> Result<bool> {
        let lookup = self
            .terms
            .binary_search_by(|(text, _, _)| text.as_str().cmp(term));
        if let Ok(index) = lookup {
            self.position = index;
            self.current = self.stats_at(index);
            return Ok(true);
        }
        self.current = None;
        Ok(false)
    }

    /// Borrows the statistics of the term most recently landed on, if any.
    fn current(&self) -> Option<&TermStats> {
        self.current.as_ref()
    }
}