use crate::INDEX_RUNTIME;
use crate::geo_search::{decode_morton_2_d, point_distance_to_morton_range};
use crate::index::{
DOCUMENT_LENGTH_COMPRESSION, DistanceUnit, Facet, FieldType, NgramType, ResultFacet, Shard,
ShardArc,
};
use crate::iterator::{search_iterator_index, search_iterator_shard};
use crate::min_heap::{Result, result_ordering_root};
use crate::tokenizer::{tokenizer, tokenizer_lite};
use crate::union::{union_docid_2, union_docid_3};
use crate::utils::{
read_f32, read_f64, read_i8, read_i16, read_i32, read_i64, read_u8, read_u16, read_u32,
read_u64,
};
#[cfg(feature = "vb")]
use crate::vector::ResultSource;
use crate::vector::{Embedding, Inference, Quantization, SearchVectorShard};
use crate::vector_similarity::{
AnnMode, QuantizedVector, VectorSimilarity, normalize_f32, normalize_f32_simd,
quantize_f32_to_i8, quantize_f32_to_i8_simd,
};
use crate::{
index::{
AccessType, BlockObjectIndex, DUMMY_VEC, DUMMY_VEC_8, Index, IndexArc, LexicalSimilarity,
MAX_POSITIONS_PER_TERM, NonUniquePostingListObjectQuery, NonUniqueTermObject,
PostingListObjectIndex, PostingListObjectQuery, QueueObject, SPEEDUP_FLAG, SegmentIndex,
TermObject, get_max_score,
},
intersection::intersection_blockid,
min_heap::MinHeap,
single::single_blockid,
union::union_blockid,
};
use ahash::{AHashMap, AHashSet};
use itertools::Itertools;
use num::FromPrimitive;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::mem::discriminant;
use std::ops::Range;
use std::sync::{
Arc,
atomic::{AtomicUsize, Ordering},
};
use std::{cmp, mem};
use utoipa::ToSchema;
use symspell_complete_rs::Suggestion;
/// Query type handed to `Search::search` as `query_type_default`.
///
/// `Union` is the `Default`. In the part of `search` visible in this file, a `Phrase` type on
/// the first tokenized query term causes generated suggestions to be wrapped in double quotes.
/// NOTE(review): the actual matching semantics of each variant are implemented outside this
/// excerpt; the per-variant notes below are inferred from the variant names — confirm.
#[derive(Default, PartialEq, Clone, Debug, Serialize, Deserialize, ToSchema)]
pub enum QueryType {
/// Default variant. Presumably: a document matches if it contains any query term.
#[default]
Union = 0,
/// Presumably: a document matches only if it contains all query terms.
Intersection = 1,
/// Phrase query; suggestions for such queries are quoted (`"…"`) by `search`.
Phrase = 2,
/// Presumably: excludes documents containing the term — confirm.
Not = 3,
}
/// Selects which retrieval path(s) `Search::search` runs.
///
/// `Lexical` is the `Default`.
#[derive(Default, PartialEq, Clone, Debug, Serialize, Deserialize, ToSchema)]
pub enum SearchMode {
/// Only the lexical shard search (`search_lexical_shard`) is run.
#[default]
Lexical,
/// Only the vector shard search (`search_vector_shard`) is run.
Vector {
/// Forwarded unchanged to `search_vector_shard`.
/// NOTE(review): presumably a minimum similarity for a hit; `None` = no threshold — confirm.
similarity_threshold: Option<f32>,
/// Forwarded unchanged to `search_vector_shard`.
ann_mode: AnnMode,
},
/// Both the lexical and the vector shard search are run; `search` then fuses the two
/// result lists by rank, scoring each hit `1.0 / (k + rank)` per list (with `k = 0.6`)
/// and summing the scores of documents found by both.
Hybrid {
/// Forwarded unchanged to `search_vector_shard`.
/// NOTE(review): presumably a minimum similarity for a hit; `None` = no threshold — confirm.
similarity_threshold: Option<f32>,
/// Forwarded unchanged to `search_vector_shard`.
ann_mode: AnnMode,
},
}
/// Controls query spelling correction / completion in `Search::search`.
///
/// `SearchOnly` is the `Default` and disables both. The three other variants carry the same
/// five settings; they differ in what `search` does with the suggestions it produced.
#[derive(Default, PartialEq, Clone, Debug, Serialize, Deserialize, ToSchema)]
pub enum QueryRewriting {
/// No correction and no completion: the query is searched as given.
#[default]
SearchOnly,
/// The search is run and the suggestions are returned alongside the results; the top
/// completion is not substituted for the query.
/// NOTE(review): when spelling correction fires, the visible code continues with the
/// corrected string in every mode — confirm this is intended for `SearchSuggest`.
SearchSuggest {
/// `Some(n)` enables spelling correction once the query string is at least `n` bytes
/// long (and only if no completion was found); `None` disables correction.
correct: Option<usize>,
/// Maximum edit distance handed to the spelling corrector (`lookup_compound_vec`).
distance: usize,
/// Handed to the spelling corrector; additionally, if present and non-empty, correction
/// is skipped while the query string is shorter than the first element (in bytes).
term_length_threshold: Option<Vec<usize>>,
/// `Some(n)` enables query completion once the query string is at least `n` bytes long;
/// `None` disables completion.
complete: Option<usize>,
/// Handed to the completion lookup and used as the cap when extra completions are
/// merged into the suggestion list.
length: Option<usize>,
},
/// Like `SearchSuggest`, but the term of the top suggestion replaces the query string
/// that is searched.
SearchRewrite {
/// Same meaning as `SearchSuggest::correct`.
correct: Option<usize>,
/// Same meaning as `SearchSuggest::distance`.
distance: usize,
/// Same meaning as `SearchSuggest::term_length_threshold`.
term_length_threshold: Option<Vec<usize>>,
/// Same meaning as `SearchSuggest::complete`.
complete: Option<usize>,
/// Same meaning as `SearchSuggest::length`.
length: Option<usize>,
},
/// No search is run: `search` returns a `ResultObject` holding only `original_query`,
/// `query` and the suggestion terms.
SuggestOnly {
/// Same meaning as `SearchSuggest::correct`.
correct: Option<usize>,
/// Same meaning as `SearchSuggest::distance`.
distance: usize,
/// Same meaning as `SearchSuggest::term_length_threshold`.
term_length_threshold: Option<Vec<usize>>,
/// Same meaning as `SearchSuggest::complete`.
complete: Option<usize>,
/// Same meaning as `SearchSuggest::length`.
length: Option<usize>,
},
}
/// Selects what a search has to deliver. `TopkCount` is the `Default`.
#[derive(Default, PartialEq, Clone, Debug, Serialize, Deserialize, ToSchema)]
pub enum ResultType {
/// Count only: in the multi-shard path of `search`, per-shard result lists are not merged
/// into `ResultObject::results` for this variant.
Count = 0,
/// Top-k results: in the multi-shard path of `search`, no facet map is prepared for this
/// variant, so shard facets are not merged.
/// NOTE(review): presumably the total count is not guaranteed to be exact here — confirm.
Topk = 1,
/// Top-k results and counts (default).
#[default]
TopkCount = 2,
}
/// Crate-internal accumulator for a running search.
/// NOTE(review): not used in the visible part of this file; field notes are inferred from
/// names and types — confirm against the shard search code.
pub(crate) struct SearchResult<'a> {
/// Min-heap of result candidates (presumably the current top-k).
pub topk_candidates: MinHeap<'a>,
/// Per-query facet state, one `ResultFacet` per requested facet (presumably).
pub query_facets: Vec<ResultFacet>,
/// Presumably: when `true`, facet counting is skipped — confirm.
pub skip_facet_count: bool,
}
/// Result of `Search::search`.
#[derive(Default, Debug, Deserialize, Serialize, Clone)]
pub struct ResultObject {
/// The query string exactly as passed to `search`, before any rewriting.
pub original_query: String,
/// The query string that was actually used (may differ from `original_query` when query
/// rewriting is active).
pub query: String,
/// Query terms. Taken from the lexical shard result in lexical/hybrid mode; in vector
/// mode derived from the lowercased, whitespace-split query with `"`, `+`, `-` trimmed.
pub query_terms: Vec<String>,
/// NOTE(review): not assigned in the visible part of `search`; presumably the number of
/// entries in `results` — confirm.
pub result_count: usize,
/// Total hit count; in the multi-shard path this is the sum over all shards (in hybrid
/// mode the per-shard maximum of lexical and vector totals is summed).
pub result_count_total: usize,
/// Sum over all shards of the vector search's `observed_vector_count`.
pub observed_vector_count: usize,
/// Sum over all shards of the vector search's `observed_cluster_count`.
pub observed_cluster_count: usize,
/// The returned results.
pub results: Vec<Result>,
/// Facet results keyed by facet field name; in the multi-shard path each entry holds the
/// merged `(value, count)` pairs ordered by descending count and truncated to the
/// per-field limit.
pub facets: AHashMap<String, Facet>,
/// Terms of the generated query suggestions (empty when none were produced).
pub suggestions: Vec<String>,
}
/// Counting mode for range facets (used by the range-style variants of `QueryFacet` and
/// `Ranges`). `CountWithinRange` is the `Default`.
/// NOTE(review): the counting logic is outside this excerpt; the variant notes below are
/// inferred from the names — confirm.
#[derive(Debug, Clone, Deserialize, Serialize, Default, PartialEq, ToSchema)]
pub enum RangeType {
/// Presumably: count values falling inside each range (default).
#[default]
CountWithinRange,
/// Presumably: count values above each range bound.
CountAboveRange,
/// Presumably: count values below each range bound.
CountBelowRange,
}
/// Facet request for one facet field, passed to `Search::search` in `query_facets`.
///
/// Numeric, timestamp and point variants carry a `range_type` plus labelled `ranges`
/// (`(label, bound)` pairs). NOTE(review): presumably each bound is the start of a range;
/// the counting logic is outside this excerpt — confirm.
/// String variants carry a `prefix` and a `length`. When per-shard facets are merged in
/// `search`, `length` caps how many values are kept for a string facet field, while
/// range-style facets are capped at `u16::MAX` entries; `prefix` is not used by that merge.
/// `None` is the `Default` and is ignored by the merge.
#[derive(Debug, Clone, Deserialize, Serialize, Default, PartialEq, ToSchema)]
pub enum QueryFacet {
/// Range facet over a `u8` field.
U8 {
/// Name of the facet field.
field: String,
/// How values are counted against `ranges`.
range_type: RangeType,
/// Labelled range bounds.
ranges: Vec<(String, u8)>,
},
/// Range facet over a `u16` field (fields as in `U8`).
U16 {
field: String,
range_type: RangeType,
ranges: Vec<(String, u16)>,
},
/// Range facet over a `u32` field (fields as in `U8`).
U32 {
field: String,
range_type: RangeType,
ranges: Vec<(String, u32)>,
},
/// Range facet over a `u64` field (fields as in `U8`).
U64 {
field: String,
range_type: RangeType,
ranges: Vec<(String, u64)>,
},
/// Range facet over an `i8` field (fields as in `U8`).
I8 {
field: String,
range_type: RangeType,
ranges: Vec<(String, i8)>,
},
/// Range facet over an `i16` field (fields as in `U8`).
I16 {
field: String,
range_type: RangeType,
ranges: Vec<(String, i16)>,
},
/// Range facet over an `i32` field (fields as in `U8`).
I32 {
field: String,
range_type: RangeType,
ranges: Vec<(String, i32)>,
},
/// Range facet over an `i64` field (fields as in `U8`).
I64 {
field: String,
range_type: RangeType,
ranges: Vec<(String, i64)>,
},
/// Range facet over a timestamp field; bounds are `i64`.
/// NOTE(review): the timestamp unit/epoch is not visible here — confirm.
Timestamp {
field: String,
range_type: RangeType,
ranges: Vec<(String, i64)>,
},
/// Range facet over an `f32` field (fields as in `U8`).
F32 {
field: String,
range_type: RangeType,
ranges: Vec<(String, f32)>,
},
/// Range facet over an `f64` field (fields as in `U8`).
F64 {
field: String,
range_type: RangeType,
ranges: Vec<(String, f64)>,
},
/// String facet whose values are stored as 16-bit ids.
String16 {
/// Name of the facet field.
field: String,
/// NOTE(review): presumably restricts returned facet values to this prefix — confirm.
prefix: String,
/// Maximum number of facet values kept for this field when shard results are merged.
length: u16,
},
/// String facet whose values are stored as 32-bit ids (fields as in `String16`).
String32 {
field: String,
prefix: String,
length: u32,
},
/// String-set facet whose values are stored as 16-bit ids (fields as in `String16`).
StringSet16 {
field: String,
prefix: String,
length: u16,
},
/// String-set facet whose values are stored as 32-bit ids (fields as in `String16`).
StringSet32 {
field: String,
prefix: String,
length: u32,
},
/// Range facet over a point field.
Point {
/// Name of the facet field.
field: String,
/// How values are counted against `ranges`.
range_type: RangeType,
/// Labelled `f64` bounds. NOTE(review): presumably distances from `base` in `unit`.
ranges: Vec<(String, f64)>,
/// Reference point.
base: Point,
/// Distance unit.
unit: DistanceUnit,
},
/// No facet (default).
#[default]
None,
}
/// Range definition of a facet without the field name; the variants mirror the range-style
/// variants of `QueryFacet` (range type, labelled bounds, and for `Point` additionally the
/// base point and distance unit). `None` is the `Default`.
/// NOTE(review): not used in the visible part of this file; presumably the per-field form
/// that `QueryFacet` is converted into — confirm.
#[derive(Debug, Clone, Deserialize, Serialize, Default, PartialEq)]
pub enum Ranges {
U8(RangeType, Vec<(String, u8)>),
U16(RangeType, Vec<(String, u16)>),
U32(RangeType, Vec<(String, u32)>),
U64(RangeType, Vec<(String, u64)>),
I8(RangeType, Vec<(String, i8)>),
I16(RangeType, Vec<(String, i16)>),
I32(RangeType, Vec<(String, i32)>),
I64(RangeType, Vec<(String, i64)>),
Timestamp(RangeType, Vec<(String, i64)>),
F32(RangeType, Vec<(String, f32)>),
F64(RangeType, Vec<(String, f64)>),
Point(RangeType, Vec<(String, f64)>, Point, DistanceUnit),
/// No ranges (default).
#[default]
None,
}
/// A single facet value of a document, as returned by `Index::get_facet_value` /
/// `Shard::get_facet_value_shard`; also used as `ResultSort::base`.
#[derive(Clone, PartialEq, Serialize, Deserialize, ToSchema, Debug)]
pub enum FacetValue {
/// Boolean value. Not produced by the `get_facet_value_shard` visible in this file.
Bool(bool),
U8(u8),
U16(u16),
U32(u32),
U64(u64),
I8(i8),
I16(i16),
I32(i32),
I64(i64),
/// Timestamp stored as `i64`.
Timestamp(i64),
F32(f32),
F64(f64),
/// Single string value (first string of the stored facet entry).
String(String),
/// All strings of the stored facet entry.
StringSet(Vec<String>),
/// Point decoded from its stored 64-bit Morton code.
Point(Point),
/// No value: the field is not a known facet, or its field type is not handled.
None,
}
impl Index {
    /// Returns the stored facet value of `field` for the index-wide document id `doc_id`.
    ///
    /// The global id is split into the owning shard (`doc_id % shard_number`) and the
    /// shard-local document id (`doc_id / shard_number`); the shard is read-locked and the
    /// lookup is delegated to `Shard::get_facet_value_shard`.
    #[inline]
    pub async fn get_facet_value(&self, field: &str, doc_id: usize) -> FacetValue {
        let shard_count = self.shard_number;
        let owning_shard = doc_id % shard_count;
        let local_doc_id = doc_id / shard_count;
        let shard_guard = self.shard_vec[owning_shard].read().await;
        shard_guard.get_facet_value_shard(field, local_doc_id)
    }
}
impl Shard {
    /// Reads the facet value stored for `field` of the shard-local document `doc_id`.
    ///
    /// Each document owns a record of `facets_size_sum` bytes in `facets_file_mmap`; the
    /// value of a facet field starts at that field's `offset` inside the record. Numeric
    /// types are read directly, string types are stored as an id that is resolved through
    /// the field's `values` table, and points are stored as a 64-bit Morton code.
    ///
    /// Returns `FacetValue::None` if `field` is not a registered facet or if its field type
    /// has no arm in the match below.
    ///
    /// # Panics
    /// Panics if a stored string id has no entry in `values`, if a `String16`/`String32`
    /// entry holds no string, or if the `U8` read indexes past the end of the mapped data.
    #[inline]
    pub(crate) fn get_facet_value_shard(&self, field: &str, doc_id: usize) -> FacetValue {
        let Some(field_idx) = self.facets_map.get(field) else {
            return FacetValue::None;
        };
        let facet = &self.facets[*field_idx];
        let bytes = &self.facets_file_mmap;
        // Byte position of this field's value inside the document's facet record.
        // Kept lazy so that field types without an arm never evaluate the arithmetic.
        let position = || (self.facets_size_sum * doc_id) + facet.offset;

        match &facet.field_type {
            FieldType::U8 => FacetValue::U8(bytes[position()]),
            FieldType::U16 => FacetValue::U16(read_u16(bytes, position())),
            FieldType::U32 => FacetValue::U32(read_u32(bytes, position())),
            FieldType::U64 => FacetValue::U64(read_u64(bytes, position())),
            FieldType::I8 => FacetValue::I8(read_i8(bytes, position())),
            FieldType::I16 => FacetValue::I16(read_i16(bytes, position())),
            FieldType::I32 => FacetValue::I32(read_i32(bytes, position())),
            FieldType::I64 => FacetValue::I64(read_i64(bytes, position())),
            FieldType::Timestamp => FacetValue::Timestamp(read_i64(bytes, position())),
            FieldType::F32 => FacetValue::F32(read_f32(bytes, position())),
            FieldType::F64 => FacetValue::F64(read_f64(bytes, position())),
            FieldType::String16 => {
                let value_id = usize::from(read_u16(bytes, position()));
                let entry = facet.values.get_index(value_id).unwrap().1;
                // A single-string facet exposes the first string of the entry.
                FacetValue::String(entry.0[0].clone())
            }
            FieldType::StringSet16 => {
                let value_id = usize::from(read_u16(bytes, position()));
                let entry = facet.values.get_index(value_id).unwrap().1;
                FacetValue::StringSet(entry.0.clone())
            }
            FieldType::String32 => {
                let value_id = read_u32(bytes, position()) as usize;
                let entry = facet.values.get_index(value_id).unwrap().1;
                // A single-string facet exposes the first string of the entry.
                FacetValue::String(entry.0[0].clone())
            }
            FieldType::StringSet32 => {
                let value_id = read_u32(bytes, position()) as usize;
                let entry = facet.values.get_index(value_id).unwrap().1;
                FacetValue::StringSet(entry.0.clone())
            }
            FieldType::Point => {
                let morton_code = read_u64(bytes, position());
                let point = decode_morton_2_d(morton_code);
                FacetValue::Point(point.clone())
            }
            _ => FacetValue::None,
        }
    }
}
/// OpenAPI schema stand-in for `Range<u8>`; named by `#[schema(value_type=RangeU8)]` on
/// `FacetFilter::U8`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeU8 {
/// Range start.
pub start: u8,
/// Range end.
pub end: u8,
}
/// OpenAPI schema stand-in for `Range<u16>`; named by `#[schema(value_type=RangeU16)]` on
/// `FacetFilter::U16`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeU16 {
/// Range start.
pub start: u16,
/// Range end.
pub end: u16,
}
/// OpenAPI schema stand-in for `Range<u32>`; named by `#[schema(value_type=RangeU32)]` on
/// `FacetFilter::U32`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeU32 {
/// Range start.
pub start: u32,
/// Range end.
pub end: u32,
}
/// OpenAPI schema stand-in for `Range<u64>`; named by `#[schema(value_type=RangeU64)]` on
/// `FacetFilter::U64`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeU64 {
/// Range start.
pub start: u64,
/// Range end.
pub end: u64,
}
/// OpenAPI schema stand-in for `Range<i8>`; named by `#[schema(value_type=RangeI8)]` on
/// `FacetFilter::I8`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeI8 {
/// Range start.
pub start: i8,
/// Range end.
pub end: i8,
}
/// OpenAPI schema stand-in for `Range<i16>`; named by `#[schema(value_type=RangeI16)]` on
/// `FacetFilter::I16`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeI16 {
/// Range start.
pub start: i16,
/// Range end.
pub end: i16,
}
/// OpenAPI schema stand-in for `Range<i32>`; named by `#[schema(value_type=RangeI32)]` on
/// `FacetFilter::I32`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeI32 {
/// Range start.
pub start: i32,
/// Range end.
pub end: i32,
}
/// OpenAPI schema stand-in for `Range<i64>`; named by `#[schema(value_type=RangeI64)]` on
/// `FacetFilter::I64` and `FacetFilter::Timestamp`. Mirrors the `start`/`end` fields of
/// `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attributes.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeI64 {
/// Range start.
pub start: i64,
/// Range end.
pub end: i64,
}
/// OpenAPI schema stand-in for `Range<f32>`; named by `#[schema(value_type=RangeF32)]` on
/// `FacetFilter::F32`. Mirrors the `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attribute.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeF32 {
/// Range start.
pub start: f32,
/// Range end.
pub end: f32,
}
/// OpenAPI schema stand-in for `Range<f64>`; named by the `#[schema(value_type=…)]`
/// attributes on `FacetFilter::F64` and (inside a tuple) `FacetFilter::Point`. Mirrors the
/// `start`/`end` fields of `std::ops::Range`.
// `dead_code` is allowed because nothing visible in this file constructs the struct;
// it is only referenced from the schema attributes.
#[allow(dead_code)]
#[derive(ToSchema)]
pub struct RangeF64 {
/// Range start.
pub start: f64,
/// Range end.
pub end: f64,
}
/// Filter on one facet field, passed to `Search::search` in `facet_filter`.
///
/// Numeric and timestamp variants filter by a `Range`, string variants by a list of
/// strings, and `Point` by a base point with a distance range and unit. The
/// `#[schema(value_type=…)]` attributes make the OpenAPI schema describe each `Range<T>`
/// through the matching `Range*` helper struct.
/// NOTE(review): filter evaluation is outside this excerpt — whether the range end is
/// treated as exclusive (as in `std::ops::Range`) should be confirmed there.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, ToSchema)]
pub enum FacetFilter {
/// Range filter on a `u8` facet field.
U8 {
/// Name of the facet field.
field: String,
/// Accepted value range.
#[schema(value_type=RangeU8)]
filter: Range<u8>,
},
/// Range filter on a `u16` facet field (fields as in `U8`).
U16 {
field: String,
#[schema(value_type=RangeU16)]
filter: Range<u16>,
},
/// Range filter on a `u32` facet field (fields as in `U8`).
U32 {
field: String,
#[schema(value_type=RangeU32)]
filter: Range<u32>,
},
/// Range filter on a `u64` facet field (fields as in `U8`).
U64 {
field: String,
#[schema(value_type=RangeU64)]
filter: Range<u64>,
},
/// Range filter on an `i8` facet field (fields as in `U8`).
I8 {
field: String,
#[schema(value_type=RangeI8)]
filter: Range<i8>,
},
/// Range filter on an `i16` facet field (fields as in `U8`).
I16 {
field: String,
#[schema(value_type=RangeI16)]
filter: Range<i16>,
},
/// Range filter on an `i32` facet field (fields as in `U8`).
I32 {
field: String,
#[schema(value_type=RangeI32)]
filter: Range<i32>,
},
/// Range filter on an `i64` facet field (fields as in `U8`).
I64 {
field: String,
#[schema(value_type=RangeI64)]
filter: Range<i64>,
},
/// Range filter on a timestamp facet field; bounds are `i64`.
Timestamp {
field: String,
#[schema(value_type=RangeI64)]
filter: Range<i64>,
},
/// Range filter on an `f32` facet field (fields as in `U8`).
F32 {
field: String,
#[schema(value_type=RangeF32)]
filter: Range<f32>,
},
/// Range filter on an `f64` facet field (fields as in `U8`).
F64 {
field: String,
#[schema(value_type=RangeF64)]
filter: Range<f64>,
},
/// String filter on a `String16` facet field.
String16 {
/// Name of the facet field.
field: String,
/// Strings to filter by. NOTE(review): presumably a document passes if its value
/// equals any of them — confirm.
filter: Vec<String>,
},
/// String filter on a `StringSet16` facet field (fields as in `String16`).
StringSet16 {
field: String,
filter: Vec<String>,
},
/// String filter on a `String32` facet field (fields as in `String16`).
String32 {
field: String,
filter: Vec<String>,
},
/// String filter on a `StringSet32` facet field (fields as in `String16`).
StringSet32 {
field: String,
filter: Vec<String>,
},
/// Filter on a point facet field.
Point {
/// Name of the facet field.
field: String,
/// `(base point, distance range, distance unit)`.
/// NOTE(review): presumably keeps documents whose distance to the base point lies in
/// the range — confirm.
#[schema(value_type=(Point, RangeF64, DistanceUnit))]
filter: (Point, Range<f64>, DistanceUnit),
},
}
/// Crate-internal filter representation without the field name. `None` is the `Default`.
///
/// Compared with `FacetFilter`, the string variants hold numeric ids (`u16` / `u32`)
/// instead of strings, and `Point` carries an additional `Range<u64>`.
/// NOTE(review): not used in the visible part of this file. Presumably one entry per facet
/// field, with string filters resolved to facet value ids and the `Range<u64>` being a
/// Morton code range for the point filter — confirm against the code that builds it.
#[derive(Debug, Clone, Deserialize, Serialize, Default, PartialEq)]
pub(crate) enum FilterSparse {
U8(Range<u8>),
U16(Range<u16>),
U32(Range<u32>),
U64(Range<u64>),
I8(Range<i8>),
I16(Range<i16>),
I32(Range<i32>),
I64(Range<i64>),
Timestamp(Range<i64>),
F32(Range<f32>),
F64(Range<f64>),
String16(Vec<u16>),
String32(Vec<u32>),
Point(Point, Range<f64>, DistanceUnit, Range<u64>),
/// No filter for this field (default).
#[default]
None,
}
/// Sort direction used by `ResultSort` and `ResultSortIndex`.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Eq, ToSchema)]
pub enum SortOrder {
/// Ascending order (discriminant 0).
Ascending = 0,
/// Descending order (discriminant 1).
Descending = 1,
}
/// One sort criterion, passed to `Search::search` in `result_sort`.
#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
pub struct ResultSort {
/// Name of the field to sort by. `search` treats the names `"_id"` and `"_score"`
/// specially; other names are presumably facet field names — confirm.
pub field: String,
/// Sort direction.
pub order: SortOrder,
/// NOTE(review): presumably a reference value the sort is relative to (e.g. the base
/// point for distance sorting on a point field) — confirm against the sorting code.
pub base: FacetValue,
}
/// Crate-internal, resolved form of a `ResultSort` criterion.
#[derive(Clone, Serialize)]
pub(crate) struct ResultSortIndex<'a> {
/// Index identifying the sort field; `search` uses `usize::MAX` for the `"_id"` field.
/// NOTE(review): for other fields presumably the facet index — confirm.
pub idx: usize,
/// Sort direction, cloned from the originating `ResultSort`.
pub order: SortOrder,
/// Borrowed base value. NOTE(review): presumably `ResultSort::base` of the originating
/// criterion — the assignment is outside the visible excerpt.
pub base: &'a FacetValue,
}
/// A point, stored as a vector of `f64` coordinates; used by `FacetValue::Point`, the
/// `Point` facet/filter variants and the result of `decode_morton_2_d`.
/// NOTE(review): presumably two coordinates (latitude, longitude); the order is not
/// visible in this file — confirm.
pub type Point = Vec<f64>;
/// Search interface of an index; implemented for `IndexArc` in this file.
#[allow(clippy::too_many_arguments)]
#[allow(async_fn_in_trait)]
pub trait Search {
/// Runs a query and returns a `ResultObject`.
///
/// The parameter notes describe what the `IndexArc` implementation visible in this file
/// does with each argument.
///
/// # Arguments
/// * `query_string` - The query text. It may be corrected/completed according to
///   `query_rewriting`; the unmodified text is reported as `ResultObject::original_query`.
/// * `query_vector` - Optional query embedding. For a vector-indexed index without a
///   supplied vector, the query string is encoded with the index's embedding model; if
///   there is no such model an empty (default) `ResultObject` is returned.
/// * `query_type_default` - Query type forwarded to the lexical shard search.
/// * `search_mode` - Lexical, vector or hybrid retrieval.
/// * `enable_empty_query` - When `true` and the query string is empty, no query vector,
///   facets or filters are given, and `result_sort` is empty or a single `"_id"` /
///   `"_score"` criterion, the call is delegated to `search_iterator_index`.
/// * `offset` - Number of leading results to skip. In the multi-shard path every shard
///   is asked for `offset + length` results starting at 0.
/// * `length` - Number of results requested.
/// * `result_type` - Whether results, counts, or both are wanted.
/// * `include_uncommitted` - Forwarded to the shard searches.
///   NOTE(review): presumably includes documents not yet committed — confirm.
/// * `field_filter` - Forwarded to the shard searches.
///   NOTE(review): presumably restricts the search to the named fields — confirm.
/// * `query_facets` - Facets to compute.
/// * `facet_filter` - Facet-based filters.
/// * `result_sort` - Sort criteria.
/// * `query_rewriting` - Spelling correction / completion behaviour.
async fn search(
&self,
query_string: String,
query_vector: Option<Embedding>,
query_type_default: QueryType,
search_mode: SearchMode,
enable_empty_query: bool,
offset: usize,
length: usize,
result_type: ResultType,
include_uncommitted: bool,
field_filter: Vec<String>,
query_facets: Vec<QueryFacet>,
facet_filter: Vec<FacetFilter>,
result_sort: Vec<ResultSort>,
query_rewriting: QueryRewriting,
) -> ResultObject;
}
impl Search for IndexArc {
async fn search(
&self,
query_string: String,
query_vector: Option<Embedding>,
query_type_default: QueryType,
search_mode: SearchMode,
enable_empty_query: bool,
offset: usize,
length: usize,
result_type: ResultType,
include_uncommitted: bool,
field_filter: Vec<String>,
query_facets: Vec<QueryFacet>,
facet_filter: Vec<FacetFilter>,
result_sort: Vec<ResultSort>,
query_rewriting: QueryRewriting,
) -> ResultObject {
let index_ref = self.read().await;
let original_query = query_string.clone();
let (edit_distance_max, term_length_threshold, correct, complete, suggestion_length) =
match &query_rewriting {
QueryRewriting::SearchSuggest {
distance,
term_length_threshold,
correct,
complete,
length,
} => (distance, term_length_threshold, correct, complete, length),
QueryRewriting::SuggestOnly {
distance,
term_length_threshold,
correct,
complete,
length,
} => (distance, term_length_threshold, correct, complete, length),
QueryRewriting::SearchRewrite {
distance,
term_length_threshold,
correct,
complete,
length,
} => (distance, term_length_threshold, correct, complete, length),
_ => (&0, &None, &None, &None, &None),
};
let (query_string, suggestions) = if correct.is_some() || complete.is_some() {
let mut query_string = query_string;
let mut allow_loop = true;
let mut previous_qac: Option<(String, Vec<Suggestion>)> = None;
loop {
let shard = index_ref.shard_vec[0].read().await;
let query_terms = tokenizer_lite(&query_string, &index_ref.meta.tokenizer, &shard);
drop(shard);
let qac = if !query_terms.is_empty() {
let query_terms_vec: Vec<String> =
query_terms.iter().map(|s| s.0.to_string()).collect();
let suffix = if query_string.ends_with(" ") { " " } else { "" };
let (query_terms_prefix, query_terms_str) = if query_terms.len() + suffix.len()
> 3
{
(
query_terms_vec[..query_terms.len() - 3 + suffix.len()].join(" ") + " ",
query_terms_vec[query_terms.len() - 3 + suffix.len()..].join(" ")
+ suffix,
)
} else {
(String::new(), query_terms_vec.join(" ") + suffix)
};
let is_phrase =
!query_terms.is_empty() && query_terms[0].1 == QueryType::Phrase;
let qac: Option<(String, Vec<Suggestion>)> = if let Some(completion_option) =
index_ref.completion_option.as_ref()
&& complete.is_some()
&& query_string.len() >= complete.unwrap()
&& query_rewriting != QueryRewriting::SearchOnly
{
let trie = completion_option.read().await;
let completions =
trie.lookup_completions(&query_terms_str, suggestion_length.to_owned());
if completions.is_empty() {
previous_qac.clone()
} else {
let mut suggestions: Vec<Suggestion> = Vec::new();
for qc in completions.iter() {
suggestions.push(Suggestion {
term: if is_phrase {
["\"", &query_terms_prefix, &qc.term, "\""].join("")
} else {
[query_terms_prefix.as_str(), &qc.term].join("")
},
distance: qc.term.len() - query_string.len(),
count: *qc.count,
});
}
if let Some(suggestion_length) = suggestion_length.as_ref()
&& suggestions.len() < *suggestion_length
&& query_terms.len() >= 2
{
let mut position = 0;
let mut completion_term_vec = Vec::new();
for (i, completion) in completions.iter().enumerate() {
completion_term_vec =
completion.term.split(" ").collect::<Vec<_>>();
if completion_term_vec.len() >= 3 {
position = i + 1;
break;
}
}
if completion_term_vec.len() >= 3 {
let completion_term_str =
completion_term_vec[1..].join(" ") + " ";
let additional_completions = trie.lookup_completions(
&completion_term_str,
Some(suggestion_length - suggestions.len() + 5),
);
let query_terms_prefix = query_terms_vec[..query_terms.len()
- if query_terms.len() == 2 { 1 } else { 2 }]
.join(" ")
+ " ";
let mut j = 0;
for qc in additional_completions.iter() {
if let Some(p) = qc.term.rfind(' ')
&& p + 1 < qc.term.len()
{
let suffix = qc.term[p + 1..].to_string();
if index_ref.frequent_hashset.contains(&suffix) {
continue;
}
};
suggestions.insert(
position + j,
Suggestion {
term: if is_phrase {
["\"", &query_terms_prefix, &qc.term, "\""]
.join("")
} else {
[query_terms_prefix.as_str(), &qc.term].join("")
},
distance: qc.term.len() - query_string.len(),
count: *qc.count,
},
);
j += 1;
if suggestions.len() >= *suggestion_length {
break;
}
}
}
}
let completed_query = suggestions[0].term.to_string();
Some((completed_query, suggestions))
}
} else {
previous_qac.clone()
};
let qac: Option<(String, Vec<Suggestion>)> = if let Some(symspell) =
&index_ref.symspell_option
&& correct.is_some()
&& query_string.len() >= correct.unwrap()
&& query_rewriting != QueryRewriting::SearchOnly
&& qac.is_none()
&& allow_loop
{
if let Ok(symspell) = symspell.try_read()
&& (term_length_threshold.is_none()
|| term_length_threshold.as_ref().unwrap().is_empty()
|| query_string.len() >= term_length_threshold.as_ref().unwrap()[0])
{
let mut corrections = symspell.lookup_compound_vec(
&query_terms_vec,
edit_distance_max.to_owned(),
term_length_threshold,
false,
);
if corrections.is_empty() {
None
} else {
if is_phrase {
for suggestion in corrections.iter_mut() {
suggestion.term = ["\"", &suggestion.term, "\""].join("");
}
}
query_string = corrections[0].term.clone();
allow_loop = false;
previous_qac = Some((corrections[0].term.clone(), corrections));
continue;
}
} else {
None
}
} else {
qac
};
if let Some((corrected_query, suggestions)) = qac {
if discriminant(&query_rewriting)
== discriminant(&QueryRewriting::SearchRewrite {
distance: 0,
term_length_threshold: None,
correct: None,
complete: None,
length: None,
})
{
(corrected_query, Some(suggestions))
} else {
(query_string, Some(suggestions))
}
} else {
(query_string, None)
}
} else {
(query_string, None)
};
break qac;
}
} else {
(query_string, None)
};
if discriminant(&query_rewriting)
== discriminant(&QueryRewriting::SuggestOnly {
distance: 0,
term_length_threshold: None,
correct: None,
complete: None,
length: None,
})
{
let mut result_object = ResultObject {
original_query,
query: query_string.clone(),
..Default::default()
};
if let Some(suggestions) = suggestions.as_ref() {
result_object.suggestions = suggestions.iter().map(|s| s.term.clone()).collect();
}
return result_object;
}
if enable_empty_query
&& query_string.is_empty()
&& query_vector.is_none()
&& query_facets.is_empty()
&& facet_filter.is_empty()
&& (result_sort.is_empty()
|| (result_sort.len() == 1
&& (result_sort.first().unwrap().field == "_id"
|| result_sort.first().unwrap().field == "_score")))
{
return search_iterator_index(
self,
offset,
length,
result_type,
include_uncommitted,
result_sort,
)
.await;
}
if index_ref.shard_number == 1 && matches!(search_mode, SearchMode::Lexical) {
let mut result_object = index_ref.shard_vec[0]
.search_lexical_shard(
query_string.clone(),
query_type_default,
enable_empty_query,
offset,
length,
result_type,
include_uncommitted,
field_filter,
query_facets,
facet_filter,
result_sort,
)
.await;
result_object.original_query = original_query;
result_object.query = query_string.clone();
if let Some(suggestions) = suggestions.as_ref() {
result_object.suggestions = suggestions.iter().map(|s| s.term.clone()).collect();
}
return result_object;
}
let mut result_object_list = Vec::new();
let shard_number = index_ref.shard_number;
let aggregate_results = result_type != ResultType::Count;
let query_vector = if index_ref.is_vector_indexing {
Some(if let Some(mut qv) = query_vector {
if index_ref.vector_similarity == VectorSimilarity::Cosine
&& matches!(index_ref.meta.inference, Inference::External { .. })
&& let Embedding::F32(ref mut fvecs) = qv
{
if index_ref.is_simd {
unsafe {
normalize_f32_simd(fvecs);
}
} else {
normalize_f32(fvecs);
}
};
if (index_ref.quantization == Quantization::ScalarQuantizationI8
|| index_ref.quantization == Quantization::TurboQuantI8)
&& let Embedding::F32(ref fvecs) = qv
{
match (
index_ref.vector_similarity,
index_ref.quantization,
index_ref.is_simd,
) {
(VectorSimilarity::Cosine, Quantization::ScalarQuantizationI8, true) => {
(unsafe { quantize_f32_to_i8_simd(fvecs) }, 1.0, 0.0, 0, 0)
}
(VectorSimilarity::Cosine, Quantization::ScalarQuantizationI8, false) => {
(quantize_f32_to_i8(fvecs), 1.0, 0.0, 0, 0)
}
(VectorSimilarity::Dot, Quantization::ScalarQuantizationI8, true) => {
let quantized_vector = QuantizedVector::new_scale_simd(fvecs);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
0,
0,
)
}
(VectorSimilarity::Dot, Quantization::ScalarQuantizationI8, false) => {
let quantized_vector = QuantizedVector::new_scale(fvecs);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
0,
0,
)
}
(VectorSimilarity::Euclidean, Quantization::ScalarQuantizationI8, true) => {
let non_affine =
index_ref.shard_vec[0].read().await.max_vector_value == f32::MIN;
if non_affine {
let quantized_vector = QuantizedVector::new_scale_norm_simd(fvecs);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
0,
0,
)
} else {
let mut min_vector_value =
index_ref.shard_vec[0].read().await.min_vector_value;
let mut max_vector_value =
index_ref.shard_vec[0].read().await.max_vector_value;
let quantized_vector = QuantizedVector::new_scale_norm_affine_simd(
&mut min_vector_value,
&mut max_vector_value,
fvecs,
);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
quantized_vector.zero_point,
quantized_vector.sum_q,
)
}
}
(_, Quantization::TurboQuantI8, true) => {
let quantized_vector =
index_ref.turbo_quant.quantize_f32_i8_simd(fvecs);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
quantized_vector.zero_point,
quantized_vector.sum_q,
)
}
(
VectorSimilarity::Euclidean,
Quantization::ScalarQuantizationI8,
false,
) => {
let non_affine =
index_ref.shard_vec[0].read().await.max_vector_value == f32::MIN;
if non_affine {
let quantized_vector = QuantizedVector::new_scale_norm(fvecs);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
0,
0,
)
} else {
let mut min_vector_value =
index_ref.shard_vec[0].read().await.min_vector_value;
let mut max_vector_value =
index_ref.shard_vec[0].read().await.max_vector_value;
let quantized_vector = QuantizedVector::new_scale_norm_affine(
&mut min_vector_value,
&mut max_vector_value,
fvecs,
);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
quantized_vector.zero_point,
quantized_vector.sum_q,
)
}
}
(_, Quantization::TurboQuantI8, false) => {
let quantized_vector = index_ref.turbo_quant.quantize_f32_i8(fvecs);
(
Embedding::I8(quantized_vector.data),
quantized_vector.scale,
quantized_vector.norm,
quantized_vector.zero_point,
quantized_vector.sum_q,
)
}
(_, Quantization::None, _) => (qv, 0.0, 0.0, 0, 0),
}
} else {
(qv, 0.0, 0.0, 0, 0)
}
} else if let Some(embedding_model) = index_ref.embedding_model_option.as_ref() {
let fvecs = embedding_model
.encode(&[query_string.to_string()])
.remove(0);
if index_ref.quantization == Quantization::ScalarQuantizationI8
|| index_ref.quantization == Quantization::TurboQuantI8
{
(
if index_ref.is_simd {
unsafe { quantize_f32_to_i8_simd(&fvecs) }
} else {
quantize_f32_to_i8(&fvecs)
},
1.0,
0.0,
0,
0,
)
} else {
(Embedding::F32(fvecs), 0.0, 0.0, 0, 0)
}
} else {
let result_object: ResultObject = Default::default();
return result_object;
})
} else {
None
};
for shard in index_ref.shard_vec.iter() {
let query_string_clone = query_string.clone();
let query_vector_clone = query_vector.clone();
let shard_clone = shard.clone();
let query_type_clone = query_type_default.clone();
let query_mode_clone = search_mode.clone();
let result_type_clone = result_type.clone();
let field_filter_clone = field_filter.clone();
let query_facets_clone = query_facets.clone();
let facet_filter_clone = facet_filter.clone();
let result_sort_clone = result_sort.clone();
let shard_id = shard.read().await.meta.id;
result_object_list.push(INDEX_RUNTIME.handle().spawn(async move {
match query_mode_clone {
SearchMode::Lexical => {
let mut rlo_lexical = shard_clone
.search_lexical_shard(
query_string_clone,
query_type_clone,
enable_empty_query,
0,
offset + length,
result_type_clone,
include_uncommitted,
field_filter_clone,
query_facets_clone,
facet_filter_clone,
result_sort_clone,
)
.await;
if aggregate_results {
for result in rlo_lexical.results.iter_mut() {
result.doc_id = (result.doc_id * shard_number) + shard_id as usize;
}
}
(Some(rlo_lexical), None)
}
SearchMode::Vector {
similarity_threshold,
ann_mode: cluster_search,
} => {
let mut rlo_vector = shard_clone
.search_vector_shard(
query_vector_clone,
offset + length,
include_uncommitted,
similarity_threshold,
cluster_search,
field_filter_clone,
)
.await;
if aggregate_results {
for result in rlo_vector.results.iter_mut() {
result.doc_id = (result.doc_id * shard_number) + shard_id as usize;
}
}
(None, Some(rlo_vector))
}
SearchMode::Hybrid {
similarity_threshold,
ann_mode,
} => {
let mut rlo_lexical = shard_clone
.search_lexical_shard(
query_string_clone.clone(),
query_type_clone,
enable_empty_query,
0,
offset + length,
result_type_clone,
include_uncommitted,
field_filter_clone.clone(),
query_facets_clone,
facet_filter_clone,
result_sort_clone,
)
.await;
if aggregate_results {
for result in rlo_lexical.results.iter_mut() {
result.doc_id = (result.doc_id * shard_number) + shard_id as usize;
}
}
let mut rlo_vector = shard_clone
.search_vector_shard(
query_vector_clone,
offset + length,
include_uncommitted,
similarity_threshold,
ann_mode,
field_filter_clone,
)
.await;
if aggregate_results {
for result in rlo_vector.results.iter_mut() {
result.doc_id = (result.doc_id * shard_number) + shard_id as usize;
}
}
(Some(rlo_lexical), Some(rlo_vector))
}
}
}));
}
let mut result_object: ResultObject = Default::default();
let mut result_facets: AHashMap<String, (AHashMap<String, usize>, u32)> = AHashMap::new();
if result_type != ResultType::Topk {
for query_facet in query_facets.iter() {
match query_facet {
QueryFacet::String16 {
field,
prefix: _,
length,
} => {
result_facets.insert(field.into(), (AHashMap::new(), *length as u32));
}
QueryFacet::StringSet16 {
field,
prefix: _,
length,
} => {
result_facets.insert(field.into(), (AHashMap::new(), *length as u32));
}
QueryFacet::String32 {
field,
prefix: _,
length,
} => {
result_facets.insert(field.into(), (AHashMap::new(), *length));
}
QueryFacet::StringSet32 {
field,
prefix: _,
length,
} => {
result_facets.insert(field.into(), (AHashMap::new(), *length));
}
QueryFacet::Timestamp {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::U8 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::U16 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::U32 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::U64 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::I8 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::I16 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::I32 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::I64 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::F32 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::F64 {
field,
range_type: _,
ranges: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
QueryFacet::Point {
field,
range_type: _,
ranges: _,
base: _,
unit: _,
} => {
result_facets.insert(field.into(), (AHashMap::new(), u16::MAX as u32));
}
_ => {}
}
}
}
let mut result_object_results_lexical: Vec<Result> = Vec::new();
let mut result_object_results_vector: Vec<Result> = Vec::new();
for result_object_shard in result_object_list {
let mut rlo_shard_hybrid_options = result_object_shard.await.unwrap();
match search_mode {
SearchMode::Lexical => {
let rlo_shard_lexical = rlo_shard_hybrid_options.0.as_mut().unwrap();
if aggregate_results {
result_object_results_lexical.append(&mut rlo_shard_lexical.results)
};
result_object.result_count_total += rlo_shard_lexical.result_count_total;
if result_object.query_terms.is_empty() {
result_object.query_terms = rlo_shard_lexical.query_terms.clone()
};
}
SearchMode::Vector {
similarity_threshold: _,
ann_mode: _,
} => {
let rlo_shard_vector = rlo_shard_hybrid_options.1.as_mut().unwrap();
if aggregate_results {
result_object_results_vector.append(&mut rlo_shard_vector.results)
};
result_object.observed_vector_count += rlo_shard_vector.observed_vector_count;
result_object.observed_cluster_count += rlo_shard_vector.observed_cluster_count;
result_object.result_count_total += rlo_shard_vector.result_count_total;
if result_object.query_terms.is_empty() {
result_object.query_terms = query_string
.to_lowercase()
.split_whitespace()
.map(|s| s.trim_matches(['\"', '+', '-']).to_string())
.filter(|s| !s.is_empty())
.collect::<Vec<String>>();
};
}
SearchMode::Hybrid {
similarity_threshold: _,
ann_mode: _,
} => {
let rlo_shard_lexical = rlo_shard_hybrid_options.0.as_mut().unwrap();
let rlo_shard_vector = rlo_shard_hybrid_options.1.as_mut().unwrap();
if aggregate_results {
result_object_results_lexical.append(&mut rlo_shard_lexical.results);
result_object_results_vector.append(&mut rlo_shard_vector.results);
}
result_object.result_count_total += rlo_shard_lexical
.result_count_total
.max(rlo_shard_vector.result_count_total);
result_object.observed_vector_count += rlo_shard_vector.observed_vector_count;
result_object.observed_cluster_count += rlo_shard_vector.observed_cluster_count;
if result_object.query_terms.is_empty() {
result_object.query_terms = rlo_shard_lexical.query_terms.clone()
};
}
};
if let Some(rlo_shard_lexical) = rlo_shard_hybrid_options.0
&& !rlo_shard_lexical.facets.is_empty()
{
for facet in rlo_shard_lexical.facets.iter() {
if let Some(existing) = result_facets.get_mut(facet.0) {
for (key, value) in facet.1.iter() {
*existing.0.entry(key.clone()).or_insert(0) += value;
}
};
}
}
}
if aggregate_results {
match search_mode {
SearchMode::Lexical => {
#[cfg(feature = "vb")]
result_object_results_lexical
.iter_mut()
.for_each(|r| r.lexical_score = r.score);
#[cfg(feature = "vb")]
result_object_results_lexical
.iter_mut()
.for_each(|r| r.source = ResultSource::Lexical);
result_object.results = result_object_results_lexical;
}
SearchMode::Vector {
similarity_threshold: _,
ann_mode: _,
} => {
result_object.results = result_object_results_vector;
}
SearchMode::Hybrid {
similarity_threshold: _,
ann_mode: _,
} => {
let k = 0.6;
let mut rrf_results: AHashMap<usize, Result> = AHashMap::new();
for (i, result) in result_object_results_lexical
.iter()
.sorted_by(|a, b| b.score.partial_cmp(&a.score).unwrap())
.enumerate()
{
rrf_results.insert(
result.doc_id,
Result {
doc_id: result.doc_id,
score: 1.0 / (k + i as f32),
#[cfg(feature = "vb")]
lexical_score: result.score,
#[cfg(feature = "vb")]
source: ResultSource::Lexical,
..Default::default()
},
);
}
for (i, result) in result_object_results_vector
.iter()
.sorted_by(|a, b| b.score.partial_cmp(&a.score).unwrap())
.enumerate()
{
let rrf_score = 1.0 / (k + i as f32);
#[cfg(feature = "vb")]
rrf_results
.entry(result.doc_id)
.and_modify(|e| {
e.score += rrf_score;
e.field_id = result.field_id;
e.chunk_id = result.chunk_id;
e.level_id = result.level_id;
e.shard_id = result.shard_id;
e.cluster_id = result.cluster_id;
e.cluster_score = result.cluster_score;
e.vector_score = result.vector_score;
e.source = ResultSource::Hybrid;
})
.or_insert(Result {
doc_id: result.doc_id,
score: rrf_score,
field_id: result.field_id,
chunk_id: result.chunk_id,
level_id: result.level_id,
shard_id: result.shard_id,
cluster_id: result.cluster_id,
cluster_score: result.cluster_score,
vector_score: result.vector_score,
lexical_score: 0.0,
source: ResultSource::Vector,
});
#[cfg(not(feature = "vb"))]
rrf_results
.entry(result.doc_id)
.and_modify(|e| {
e.score += rrf_score;
})
.or_insert(Result {
doc_id: result.doc_id,
score: rrf_score,
});
}
result_object.results = rrf_results.into_values().collect();
}
};
}
for (key, value) in result_facets.iter_mut() {
let sum = value
.0
.iter()
.sorted_unstable_by(|a, b| b.1.cmp(a.1))
.map(|(a, c)| (a.clone(), *c))
.take(value.1 as usize)
.collect::<Vec<_>>();
result_object.facets.insert(key.clone(), sum);
}
if aggregate_results {
let mut result_sort_index: Vec<ResultSortIndex> = Vec::new();
if !result_sort.is_empty() {
for rs in result_sort.iter() {
if rs.field == "_id" {
result_sort_index.push(ResultSortIndex {
idx: usize::MAX,
order: rs.order.clone(),
base: &rs.base,
});
continue;
}
if rs.field == "_score" {
result_sort_index.push(ResultSortIndex {
idx: usize::MAX - 1,
order: rs.order.clone(),
base: &rs.base,
});
continue;
}
if let Some(idx) = index_ref.shard_vec[0]
.read()
.await
.facets_map
.get(&rs.field)
{
result_sort_index.push(ResultSortIndex {
idx: *idx,
order: rs.order.clone(),
base: &rs.base,
});
}
}
let shard_vec =
futures::future::join_all(index_ref.shard_vec.iter().map(|s| s.read())).await;
result_object.results.sort_by(|a, b| {
result_ordering_root(
&shard_vec,
shard_number,
query_string.is_empty() && query_vector.is_none(),
&result_sort_index,
*b,
*a,
)
});
} else {
if query_string.is_empty() && query_vector.is_none() {
result_object
.results
.sort_by_key(|b| cmp::Reverse(b.doc_id));
} else {
result_object
.results
.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
}
}
if offset > 0 {
result_object.results = if offset >= result_object.results.len() {
Vec::new()
} else {
result_object.results.split_off(offset)
};
}
if result_object.results.len() > length {
result_object.results.truncate(length);
}
result_object.result_count = result_object.results.len();
}
result_object.original_query = original_query;
result_object.query = query_string.clone();
if let Some(suggestions) = suggestions {
result_object.suggestions = suggestions.into_iter().map(|s| s.term.clone()).collect();
}
result_object
}
}
/// Binary-searches a packed table of fixed-size key-head records for `key_hash`.
///
/// `byte_array` is treated as `len` consecutive records of `key_head_size`
/// bytes each. The `u64` that `read_u64` decodes at the start of a record is
/// that record's sort key; the search expects the records to be ordered
/// ascending by that key.
///
/// Returns the zero-based record index of a matching key, or `-1` when no
/// record matches (which includes `len == 0`).
#[inline(never)]
pub(crate) fn binary_search(
    byte_array: &[u8],
    len: usize,
    key_hash: u64,
    key_head_size: usize,
) -> i64 {
    // Half-open window `[lo, hi)` of record indices that may still hold the key.
    let mut lo: i64 = 0;
    let mut hi: i64 = len as i64;
    while lo < hi {
        // Probe the lower-middle record of the current window.
        let mid = lo + (hi - lo - 1) / 2;
        let probe = read_u64(byte_array, mid as usize * key_head_size);
        match probe.cmp(&key_hash) {
            cmp::Ordering::Less => lo = mid + 1,
            cmp::Ordering::Greater => hi = mid,
            cmp::Ordering::Equal => return mid,
        }
    }
    -1
}
/// Sums the posting count stored for `key_hash1` across the blocks of one segment.
///
/// Every entry of `segment.byte_array_blocks_pointer` is visited, except the
/// final one when `previous` is `true`. For each entry the block's key-head
/// table is sliced out of `index.index_file_mmap`: it ends at `pointer.0` and
/// spans `pointer.2` records of `index.key_head_size` bytes. The table is
/// searched with `binary_search`; on a hit, the `u16` at byte offset 8 of the
/// matching record, plus one, is added to the running total.
///
/// Returns `Some(total)` when the key was found in at least one visited block
/// and `None` otherwise (including when there is no block to visit).
#[inline(always)]
pub(crate) fn decode_posting_list_count(
    segment: &SegmentIndex,
    index: &Shard,
    key_hash1: u64,
    previous: bool,
) -> Option<u32> {
    let block_pointers = &segment.byte_array_blocks_pointer;
    // With `previous` set, the last block is left out of the scan.
    let skipped_tail = usize::from(previous);
    if block_pointers.len() <= skipped_tail {
        return None;
    }
    let visited = block_pointers.len() - skipped_tail;

    let mut total = 0u32;
    let mut hit = false;
    for pointer in block_pointers.iter().take(visited) {
        let key_count = pointer.2 as usize;
        let table_start = pointer.0 - (key_count * index.key_head_size);
        let key_table = &index.index_file_mmap[table_start..pointer.0];
        let slot = binary_search(key_table, key_count, key_hash1, index.key_head_size);
        if slot < 0 {
            continue;
        }
        hit = true;
        let entry = slot as usize * index.key_head_size;
        // The stored value is incremented by one before it is accumulated.
        let stored_count = read_u16(key_table, entry + 8);
        total += stored_count as u32 + 1;
    }
    hit.then_some(total)
}
/// Collects the posting count and the n-gram component counts stored for `key_hash1`.
///
/// All entries of `segment.byte_array_blocks_pointer` are scanned. For each,
/// the key-head table ending at `pointer.0` (`pointer.2` records of
/// `index.key_head_size` bytes inside `index.index_file_mmap`) is searched with
/// `binary_search`. On a hit the `u16` at record offset 8, plus one, is added
/// to the posting count.
///
/// Compressed component counts are only read when the low three bits of
/// `key_hash1` are non-zero, and how many are read depends on the key-head
/// size: none for 20-byte heads, two (record offsets 14 and 15) for 22-byte
/// heads, three (offsets 14, 15 and 16) for any other size. Each byte is
/// expanded through `DOCUMENT_LENGTH_COMPRESSION`; counts that are not read
/// are reported as `0`. When several blocks match, the bytes of the last
/// matching block are the ones expanded.
///
/// Returns `Some((posting_count, ngram_1, ngram_2, ngram_3))`, or `None` when
/// the key is in no block of the segment.
#[inline(always)]
pub(crate) fn decode_posting_list_counts(
    segment: &SegmentIndex,
    index: &Shard,
    key_hash1: u64,
) -> Option<(u32, u32, u32, u32)> {
    let read_flag = key_hash1 & 0b111 > 0;
    // Which compressed component counts this key-head layout provides.
    let reads_pair = read_flag && index.key_head_size != 20;
    let reads_third = reads_pair && index.key_head_size != 22;

    let mut total = 0u32;
    let mut hit = false;
    let mut compressed_1 = 0;
    let mut compressed_2 = 0;
    let mut compressed_3 = 0;
    for pointer in segment.byte_array_blocks_pointer.iter() {
        let key_count = pointer.2 as usize;
        let table_start = pointer.0 - (key_count * index.key_head_size);
        let key_table = &index.index_file_mmap[table_start..pointer.0];
        let slot = binary_search(key_table, key_count, key_hash1, index.key_head_size);
        if slot < 0 {
            continue;
        }
        hit = true;
        let entry = slot as usize * index.key_head_size;
        let stored_count = read_u16(key_table, entry + 8);
        if reads_pair {
            compressed_1 = read_u8(key_table, entry + 14);
            compressed_2 = read_u8(key_table, entry + 15);
        }
        if reads_third {
            compressed_3 = read_u8(key_table, entry + 16);
        }
        total += stored_count as u32 + 1;
    }
    if !hit {
        return None;
    }

    // Expand the compressed bytes only for the components that were read.
    let (ngram_1, ngram_2) = if reads_pair {
        (
            DOCUMENT_LENGTH_COMPRESSION[compressed_1 as usize],
            DOCUMENT_LENGTH_COMPRESSION[compressed_2 as usize],
        )
    } else {
        (0, 0)
    };
    let ngram_3 = if reads_third {
        DOCUMENT_LENGTH_COMPRESSION[compressed_3 as usize]
    } else {
        0
    };
    Some((total, ngram_1, ngram_2, ngram_3))
}
/// Builds the `PostingListObjectIndex` for `key_hash1` from a segment's key-head tables.
///
/// All entries of `segment.byte_array_blocks_pointer` are scanned. For each,
/// the key-head table ending at `pointer.0` (`pointer.2` records of
/// `shard.key_head_size` bytes inside `shard.index_file_mmap`) is searched with
/// `binary_search`. Every hit yields one `BlockObjectIndex` filled from the
/// matching record:
///
/// * offset 8: `posting_count` (`u16`; the list total accumulates this plus one)
/// * offset 10: `max_docid` (`u16`)
/// * offset 12: `max_p_docid` (`u16`)
/// * offsets 14/15/16: compressed n-gram component counts, read only when the
///   low three bits of `key_hash1` are non-zero (none for 20-byte heads, two
///   for 22-byte heads, three for any other size)
/// * last 6 bytes: `pointer_pivot_p_docid` (`u16`) then
///   `compression_type_pointer` (`u32`)
///
/// When `calculate_score` is `true`, the compressed component counts are
/// expanded through `DOCUMENT_LENGTH_COMPRESSION`, `get_max_score` is evaluated
/// for every block, and the largest block score becomes `max_list_score`.
/// When it is `false`, block scores, `max_list_score` and the n-gram counts
/// all stay at zero.
///
/// Returns `None` when the key is in no block of the segment.
#[inline(always)]
pub(crate) fn decode_posting_list_object(
    segment: &SegmentIndex,
    shard: &Shard,
    key_hash1: u64,
    calculate_score: bool,
) -> Option<PostingListObjectIndex> {
    let read_flag = key_hash1 & 0b111 > 0;
    // Which compressed component counts this key-head layout provides.
    let reads_pair = read_flag && shard.key_head_size != 20;
    let reads_third = reads_pair && shard.key_head_size != 22;

    let mut total = 0u32;
    let mut matched_blocks: Vec<BlockObjectIndex> = Vec::new();
    let mut compressed_1 = 0;
    let mut compressed_2 = 0;
    let mut compressed_3 = 0;
    for (block_id, pointer) in segment.byte_array_blocks_pointer.iter().enumerate() {
        let key_count = pointer.2 as usize;
        let table_start = pointer.0 - (key_count * shard.key_head_size);
        let key_table = &shard.index_file_mmap[table_start..pointer.0];
        let slot = binary_search(key_table, key_count, key_hash1, shard.key_head_size);
        if slot < 0 {
            continue;
        }
        let entry = slot as usize * shard.key_head_size;
        let posting_count = read_u16(key_table, entry + 8);
        let max_docid = read_u16(key_table, entry + 10);
        let max_p_docid = read_u16(key_table, entry + 12);
        if reads_pair {
            compressed_1 = read_u8(key_table, entry + 14);
            compressed_2 = read_u8(key_table, entry + 15);
        }
        if reads_third {
            compressed_3 = read_u8(key_table, entry + 16);
        }
        // The two trailing fields are addressed from the end of the record.
        let record_end = entry + shard.key_head_size;
        let pointer_pivot_p_docid = read_u16(key_table, record_end - 6);
        let compression_type_pointer = read_u32(key_table, record_end - 4);
        total += posting_count as u32 + 1;
        matched_blocks.push(BlockObjectIndex {
            max_block_score: 0.0,
            block_id: block_id as u32,
            posting_count,
            max_docid,
            max_p_docid,
            pointer_pivot_p_docid,
            compression_type_pointer,
        });
    }
    if matched_blocks.is_empty() {
        return None;
    }

    let mut ngram_1 = 0;
    let mut ngram_2 = 0;
    let mut ngram_3 = 0;
    let mut best_score = 0.0;
    if calculate_score {
        if reads_pair {
            ngram_1 = DOCUMENT_LENGTH_COMPRESSION[compressed_1 as usize];
            ngram_2 = DOCUMENT_LENGTH_COMPRESSION[compressed_2 as usize];
        }
        if reads_third {
            ngram_3 = DOCUMENT_LENGTH_COMPRESSION[compressed_3 as usize];
        }
        // The low three bits of the key select the n-gram kind; unknown values
        // fall back to a single term.
        let ngram_type =
            FromPrimitive::from_u64(key_hash1 & 0b111).unwrap_or(NgramType::SingleTerm);
        for block in matched_blocks.iter_mut() {
            block.max_block_score = get_max_score(
                shard,
                segment,
                ngram_1,
                ngram_2,
                ngram_3,
                total,
                block.block_id as usize,
                block.max_docid as usize,
                block.max_p_docid as usize,
                block.pointer_pivot_p_docid as usize,
                block.compression_type_pointer,
                &ngram_type,
            );
            if block.max_block_score > best_score {
                best_score = block.max_block_score
            }
        }
    }

    Some(PostingListObjectIndex {
        posting_count: total,
        posting_count_ngram_1: ngram_1,
        posting_count_ngram_2: ngram_2,
        posting_count_ngram_3: ngram_3,
        max_list_score: best_score,
        blocks: matched_blocks,
        position_range_previous: 0,
        ..Default::default()
    })
}
/// Lexical (keyword) search over a single shard.
///
/// Implemented in this file for `ShardArc`.
#[allow(clippy::too_many_arguments)]
#[allow(async_fn_in_trait)]
pub(crate) trait SearchLexicalShard {
/// Runs a lexical query against this shard and returns its `ResultObject`.
///
/// The notes below describe what the `ShardArc` implementation in this file
/// does with each argument:
///
/// * `query_string` - raw query text; it is split into terms by `tokenizer`.
/// * `query_type_default` - query type the search starts with; it is handed
///   to the tokenizer by mutable reference.
/// * `enable_empty_query` - when no query term resolves to a posting list and
///   `query_string` is empty, the shard is iterated via
///   `search_iterator_shard` instead of returning no results.
/// * `offset`, `length` - requested result window; the top-k heap is sized
///   to `min(offset + length, indexed_doc_count)`. With `length == 0` a
///   `ResultType::Topk` request returns an empty result immediately and any
///   other non-count request is treated as `ResultType::Count`.
/// * `result_type` - kind of result requested (e.g. `Count` or `Topk`).
/// * `include_uncommitted` - also search uncommitted documents when the
///   shard has any.
/// * `field_filter` - names of schema fields to restrict the search to; only
///   fields indexed lexically are kept and unknown names are reported on
///   stdout.
/// * `query_facets` - facets to compute; entries whose field is unknown or
///   whose type does not match the shard's facet definition are ignored.
/// * `facet_filter` - facet filters to apply; entries whose field is unknown
///   or whose type does not match are ignored.
/// * `result_sort` - sort criteria: `"_id"`, `"_score"` or the name of a facet
///   field; unknown field names are ignored. Not evaluated for
///   `ResultType::Count`.
///
/// An empty default `ResultObject` is returned when the shard has lexical
/// indexing disabled or has no segments.
async fn search_lexical_shard(
&self,
query_string: String,
query_type_default: QueryType,
enable_empty_query: bool,
offset: usize,
length: usize,
result_type: ResultType,
include_uncommitted: bool,
field_filter: Vec<String>,
query_facets: Vec<QueryFacet>,
facet_filter: Vec<FacetFilter>,
result_sort: Vec<ResultSort>,
) -> ResultObject;
}
impl SearchLexicalShard for ShardArc {
async fn search_lexical_shard(
&self,
query_string: String,
query_type_default: QueryType,
enable_empty_query: bool,
offset: usize,
length: usize,
result_type: ResultType,
include_uncommitted: bool,
field_filter: Vec<String>,
query_facets: Vec<QueryFacet>,
facet_filter: Vec<FacetFilter>,
result_sort: Vec<ResultSort>,
) -> ResultObject {
let mut result_object: ResultObject = Default::default();
let shard_ref = self.read().await;
if !shard_ref.is_lexical_indexing {
return result_object;
}
let mut query_type_mut = query_type_default;
let facet_cap = if shard_ref.shard_number == 1 {
0
} else {
u32::MAX
};
let mut result_type = result_type;
if length == 0 && result_type != ResultType::Count {
if result_type == ResultType::Topk {
return result_object;
}
result_type = ResultType::Count;
}
if shard_ref.segments_index.is_empty() {
return result_object;
}
let mut field_filter_set: AHashSet<u16> = AHashSet::new();
for item in field_filter.iter() {
match shard_ref.schema_map.get(item) {
Some(value) => {
if value.index_lexical {
field_filter_set.insert(value.indexed_field_id as u16);
}
}
None => {
println!("field not found: {}", item)
}
}
}
let mut result_sort_index: Vec<ResultSortIndex> = Vec::new();
if !result_sort.is_empty() && result_type != ResultType::Count {
for rs in result_sort.iter() {
if rs.field == "_id" {
result_sort_index.push(ResultSortIndex {
idx: usize::MAX,
order: rs.order.clone(),
base: &rs.base,
});
continue;
}
if rs.field == "_score" {
result_sort_index.push(ResultSortIndex {
idx: usize::MAX - 1,
order: rs.order.clone(),
base: &rs.base,
});
continue;
}
if let Some(idx) = shard_ref.facets_map.get(&rs.field) {
result_sort_index.push(ResultSortIndex {
idx: *idx,
order: rs.order.clone(),
base: &rs.base,
});
}
}
}
let heap_size = if result_type != ResultType::Count {
cmp::min(offset + length, shard_ref.indexed_doc_count)
} else {
0
};
let mut search_result = SearchResult {
topk_candidates: MinHeap::new(
heap_size,
&shard_ref,
query_string.is_empty(),
&result_sort_index,
),
query_facets: Vec::new(),
skip_facet_count: false,
};
let mut facet_filter_sparse: Vec<FilterSparse> = Vec::new();
if !facet_filter.is_empty() {
facet_filter_sparse = vec![FilterSparse::None; shard_ref.facets.len()];
for facet_filter_item in facet_filter.iter() {
match &facet_filter_item {
FacetFilter::U8 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U8
{
facet_filter_sparse[*idx] = FilterSparse::U8(filter.clone())
}
}
FacetFilter::U16 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U16
{
facet_filter_sparse[*idx] = FilterSparse::U16(filter.clone())
}
}
FacetFilter::U32 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U32
{
facet_filter_sparse[*idx] = FilterSparse::U32(filter.clone())
}
}
FacetFilter::U64 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U64
{
facet_filter_sparse[*idx] = FilterSparse::U64(filter.clone())
}
}
FacetFilter::I8 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I8
{
facet_filter_sparse[*idx] = FilterSparse::I8(filter.clone())
}
}
FacetFilter::I16 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I16
{
facet_filter_sparse[*idx] = FilterSparse::I16(filter.clone())
}
}
FacetFilter::I32 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I32
{
facet_filter_sparse[*idx] = FilterSparse::I32(filter.clone())
}
}
FacetFilter::I64 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I64
{
facet_filter_sparse[*idx] = FilterSparse::I64(filter.clone())
}
}
FacetFilter::Timestamp { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::Timestamp
{
facet_filter_sparse[*idx] = FilterSparse::Timestamp(filter.clone())
}
}
FacetFilter::F32 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::F32
{
facet_filter_sparse[*idx] = FilterSparse::F32(filter.clone())
}
}
FacetFilter::F64 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::F64
{
facet_filter_sparse[*idx] = FilterSparse::F64(filter.clone())
}
}
FacetFilter::String16 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field) {
let facet = &shard_ref.facets[*idx];
if shard_ref.facets[*idx].field_type == FieldType::String16 {
let mut string_id_vec = Vec::new();
for value in filter.iter() {
let key = [value.clone()];
if let Some(facet_value_id) = facet.values.get_index_of(&key[0])
{
string_id_vec.push(facet_value_id as u16);
}
}
facet_filter_sparse[*idx] = FilterSparse::String16(string_id_vec);
}
}
}
FacetFilter::StringSet16 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field) {
let facet = &shard_ref.facets[*idx];
if shard_ref.facets[*idx].field_type == FieldType::StringSet16 {
let mut string_id_vec = Vec::new();
for value in filter.iter() {
let key = [value.clone()];
if let Some(facet_value_id) =
facet.values.get_index_of(&key.join("_"))
{
string_id_vec.push(facet_value_id as u16);
}
if let Some(facet_value_ids) = shard_ref
.string_set_to_single_term_id_vec[*idx]
.get(&value.clone())
{
for code in facet_value_ids.iter() {
string_id_vec.push(*code as u16);
}
}
}
facet_filter_sparse[*idx] = FilterSparse::String16(string_id_vec);
}
}
}
FacetFilter::String32 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field) {
let facet = &shard_ref.facets[*idx];
if shard_ref.facets[*idx].field_type == FieldType::String32 {
let mut string_id_vec = Vec::new();
for value in filter.iter() {
let key = [value.clone()];
if let Some(facet_value_id) = facet.values.get_index_of(&key[0])
{
string_id_vec.push(facet_value_id as u32);
}
}
facet_filter_sparse[*idx] = FilterSparse::String32(string_id_vec);
}
}
}
FacetFilter::StringSet32 { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field) {
let facet = &shard_ref.facets[*idx];
if shard_ref.facets[*idx].field_type == FieldType::StringSet32 {
let mut string_id_vec = Vec::new();
for value in filter.iter() {
let key = [value.clone()];
if let Some(facet_value_id) =
facet.values.get_index_of(&key.join("_"))
{
string_id_vec.push(facet_value_id as u32);
}
if let Some(facet_value_ids) = shard_ref
.string_set_to_single_term_id_vec[*idx]
.get(&value.clone())
{
for code in facet_value_ids.iter() {
string_id_vec.push(*code);
}
}
}
facet_filter_sparse[*idx] = FilterSparse::String32(string_id_vec);
}
}
}
FacetFilter::Point { field, filter } => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::Point
{
facet_filter_sparse[*idx] = FilterSparse::Point(
filter.0.clone(),
filter.1.clone(),
filter.2.clone(),
point_distance_to_morton_range(&filter.0, filter.1.end, &filter.2),
);
}
}
}
}
}
let mut is_range_facet = false;
if !query_facets.is_empty() {
search_result.query_facets = vec![ResultFacet::default(); shard_ref.facets.len()];
for query_facet in query_facets.iter() {
match &query_facet {
QueryFacet::U8 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U8
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::U8(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::U16 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U16
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::U16(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::U32 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U32
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::U32(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::U64 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::U64
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::U64(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::I8 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I8
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::I8(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::I16 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I16
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::I16(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::I32 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I32
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::I32(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::I64 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::I64
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::I64(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::Timestamp {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::Timestamp
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::Timestamp(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::F32 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::F32
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::F32(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::F64 {
field,
range_type,
ranges,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::F64
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::F64(range_type.clone(), ranges.clone()),
..Default::default()
};
}
}
QueryFacet::String16 {
field,
prefix,
length,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::String16
{
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
prefix: prefix.clone(),
length: *length as u32,
..Default::default()
}
}
}
QueryFacet::StringSet16 {
field,
prefix,
length,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::StringSet16
{
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
prefix: prefix.clone(),
length: *length as u32,
..Default::default()
}
}
}
QueryFacet::String32 {
field,
prefix,
length,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::String32
{
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
prefix: prefix.clone(),
length: *length,
..Default::default()
}
}
}
QueryFacet::StringSet32 {
field,
prefix,
length,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::StringSet32
{
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
prefix: prefix.clone(),
length: *length,
..Default::default()
}
}
}
QueryFacet::Point {
field,
range_type,
ranges,
base,
unit,
} => {
if let Some(idx) = shard_ref.facets_map.get(field)
&& shard_ref.facets[*idx].field_type == FieldType::Point
{
is_range_facet = true;
search_result.query_facets[*idx] = ResultFacet {
field: field.clone(),
length: u16::MAX as u32,
ranges: Ranges::Point(
range_type.clone(),
ranges.clone(),
base.clone(),
unit.clone(),
),
..Default::default()
};
}
}
QueryFacet::None => {}
};
}
}
let result_count_arc = Arc::new(AtomicUsize::new(0));
let result_count_uncommitted_arc = Arc::new(AtomicUsize::new(0));
'fallback: loop {
let mut unique_terms: AHashMap<String, TermObject> = AHashMap::new();
let mut non_unique_terms: Vec<NonUniqueTermObject> = Vec::new();
let mut nonunique_terms_count = 0u32;
tokenizer(
&shard_ref,
&query_string,
&mut unique_terms,
&mut non_unique_terms,
shard_ref.meta.tokenizer,
shard_ref.segment_number_mask1,
&mut nonunique_terms_count,
u16::MAX as u32,
MAX_POSITIONS_PER_TERM,
true,
&mut query_type_mut,
shard_ref.meta.ngram_indexing,
0,
1,
)
.await;
if include_uncommitted && shard_ref.uncommitted {
shard_ref.search_lexical_shard_uncommitted(
&unique_terms,
&non_unique_terms,
&mut query_type_mut,
&result_type,
&field_filter_set,
&facet_filter_sparse,
&mut search_result,
&result_count_uncommitted_arc,
offset + length,
);
}
let mut query_list_map: AHashMap<u64, PostingListObjectQuery> = AHashMap::new();
let mut query_list: Vec<PostingListObjectQuery>;
let mut not_query_list_map: AHashMap<u64, PostingListObjectQuery> = AHashMap::new();
let mut not_query_list: Vec<PostingListObjectQuery>;
let mut non_unique_query_list: Vec<NonUniquePostingListObjectQuery> = Vec::new();
let mut preceding_ngram_count = 0;
let mut blocks_vec: Vec<Vec<BlockObjectIndex>> = Vec::new();
let mut not_found_terms_hashset: AHashSet<u64> = AHashSet::new();
for non_unique_term in non_unique_terms.iter() {
let term = unique_terms.get(&non_unique_term.term).unwrap();
let key0: u32 = term.key0;
let key_hash: u64 = term.key_hash;
let term_no_diacritics_umlaut_case = &non_unique_term.term;
let mut idf = 0.0;
let mut idf_ngram1 = 0.0;
let mut idf_ngram2 = 0.0;
let mut idf_ngram3 = 0.0;
let mut term_index_unique = 0;
if non_unique_term.op == QueryType::Not {
let query_list_map_len = not_query_list_map.len();
let not_query_list_option = not_query_list_map.get(&key_hash);
if not_query_list_option.is_none()
&& !not_found_terms_hashset.contains(&key_hash)
{
let posting_count;
let max_list_score;
let blocks;
let blocks_len;
let found_plo = if shard_ref.meta.access_type == AccessType::Mmap {
let posting_list_object_index_option = decode_posting_list_object(
&shard_ref.segments_index[key0 as usize],
&shard_ref,
key_hash,
false,
);
if let Some(plo) = posting_list_object_index_option {
posting_count = plo.posting_count;
max_list_score = plo.max_list_score;
blocks = &DUMMY_VEC;
blocks_len = plo.blocks.len();
blocks_vec.push(plo.blocks);
true
} else {
posting_count = 0;
max_list_score = 0.0;
blocks = &DUMMY_VEC;
blocks_len = 0;
false
}
} else {
let posting_list_object_index_option = shard_ref.segments_index
[key0 as usize]
.segment
.get(&key_hash);
if let Some(plo) = posting_list_object_index_option {
posting_count = plo.posting_count;
max_list_score = plo.max_list_score;
blocks_len = plo.blocks.len();
blocks = &plo.blocks;
true
} else {
posting_count = 0;
max_list_score = 0.0;
blocks = &DUMMY_VEC;
blocks_len = 0;
false
}
};
if found_plo {
let value_new = PostingListObjectQuery {
posting_count,
max_list_score,
blocks,
blocks_index: blocks_vec.len(),
p_block_max: blocks_len as i32,
term: term_no_diacritics_umlaut_case.clone(),
key0,
term_index_unique: query_list_map_len,
idf,
idf_ngram1,
idf_ngram2,
idf_ngram3,
ngram_type: non_unique_term.ngram_type.clone(),
..Default::default()
};
not_query_list_map.insert(key_hash, value_new);
} else {
not_found_terms_hashset.insert(key_hash);
}
}
} else {
let query_list_map_len = query_list_map.len();
let mut found = true;
let query_list_option = query_list_map.get(&key_hash);
match query_list_option {
None => {
if !not_found_terms_hashset.contains(&key_hash) {
let posting_count;
let posting_count_ngram_1;
let posting_count_ngram_2;
let posting_count_ngram_3;
let max_list_score;
let blocks;
let blocks_len;
let found_plo = if shard_ref.meta.access_type == AccessType::Mmap {
let posting_list_object_index_option =
decode_posting_list_object(
&shard_ref.segments_index[key0 as usize],
&shard_ref,
key_hash,
true,
);
if let Some(plo) = posting_list_object_index_option {
posting_count = plo.posting_count;
posting_count_ngram_1 = plo.posting_count_ngram_1;
posting_count_ngram_2 = plo.posting_count_ngram_2;
posting_count_ngram_3 = plo.posting_count_ngram_3;
max_list_score = plo.max_list_score;
blocks = &DUMMY_VEC;
blocks_len = plo.blocks.len();
blocks_vec.push(plo.blocks);
true
} else {
posting_count = 0;
posting_count_ngram_1 = 0;
posting_count_ngram_2 = 0;
posting_count_ngram_3 = 0;
max_list_score = 0.0;
blocks = &DUMMY_VEC;
blocks_len = 0;
false
}
} else {
let posting_list_object_index_option = shard_ref.segments_index
[key0 as usize]
.segment
.get(&key_hash);
if let Some(plo) = posting_list_object_index_option {
posting_count = plo.posting_count;
posting_count_ngram_1 = plo.posting_count_ngram_1;
posting_count_ngram_2 = plo.posting_count_ngram_2;
posting_count_ngram_3 = plo.posting_count_ngram_3;
max_list_score = plo.max_list_score;
blocks_len = plo.blocks.len();
blocks = &plo.blocks;
true
} else {
posting_count = 0;
posting_count_ngram_1 = 0;
posting_count_ngram_2 = 0;
posting_count_ngram_3 = 0;
max_list_score = 0.0;
blocks = &DUMMY_VEC;
blocks_len = 0;
false
}
};
if found_plo {
if result_type != ResultType::Count {
if non_unique_term.ngram_type == NgramType::SingleTerm
|| shard_ref.meta.lexical_similarity
== LexicalSimilarity::Bm25fProximity
{
idf = (((shard_ref.indexed_doc_count as f32
- posting_count as f32
+ 0.5)
/ (posting_count as f32 + 0.5))
+ 1.0)
.ln();
} else if non_unique_term.ngram_type == NgramType::NgramFF
|| non_unique_term.ngram_type == NgramType::NgramRF
|| non_unique_term.ngram_type == NgramType::NgramFR
{
idf_ngram1 = (((shard_ref.indexed_doc_count as f32
- posting_count_ngram_1 as f32
+ 0.5)
/ (posting_count_ngram_1 as f32 + 0.5))
+ 1.0)
.ln();
idf_ngram2 = (((shard_ref.indexed_doc_count as f32
- posting_count_ngram_2 as f32
+ 0.5)
/ (posting_count_ngram_2 as f32 + 0.5))
+ 1.0)
.ln();
} else {
idf_ngram1 = (((shard_ref.indexed_doc_count as f32
- posting_count_ngram_1 as f32
+ 0.5)
/ (posting_count_ngram_1 as f32 + 0.5))
+ 1.0)
.ln();
idf_ngram2 = (((shard_ref.indexed_doc_count as f32
- posting_count_ngram_2 as f32
+ 0.5)
/ (posting_count_ngram_2 as f32 + 0.5))
+ 1.0)
.ln();
idf_ngram3 = (((shard_ref.indexed_doc_count as f32
- posting_count_ngram_3 as f32
+ 0.5)
/ (posting_count_ngram_3 as f32 + 0.5))
+ 1.0)
.ln();
}
}
let value_new = PostingListObjectQuery {
posting_count,
max_list_score,
blocks,
blocks_index: blocks_vec.len(),
p_block_max: blocks_len as i32,
term: term_no_diacritics_umlaut_case.clone(),
key0,
term_index_unique: query_list_map_len,
idf,
idf_ngram1,
idf_ngram2,
idf_ngram3,
ngram_type: non_unique_term.ngram_type.clone(),
..Default::default()
};
term_index_unique = value_new.term_index_unique;
query_list_map.insert(key_hash, value_new);
} else {
if non_unique_term.op == QueryType::Intersection
|| non_unique_term.op == QueryType::Phrase
{
break 'fallback;
}
not_found_terms_hashset.insert(key_hash);
found = false;
}
}
}
Some(value) => {
term_index_unique = value.term_index_unique;
}
}
if found && non_unique_term.op == QueryType::Phrase {
let nu_plo = NonUniquePostingListObjectQuery {
term_index_unique,
term_index_nonunique: non_unique_query_list.len()
+ preceding_ngram_count,
pos: 0,
p_pos: 0,
positions_pointer: 0,
positions_count: 0,
byte_array: &DUMMY_VEC_8,
field_vec: SmallVec::new(),
p_field: 0,
key0,
is_embedded: false,
embedded_positions: [0; 4],
};
match non_unique_term.ngram_type {
NgramType::SingleTerm => {}
NgramType::NgramFF | NgramType::NgramRF | NgramType::NgramFR => {
preceding_ngram_count += 1
}
_ => preceding_ngram_count += 2,
};
non_unique_query_list.push(nu_plo);
}
}
match term.ngram_type {
NgramType::SingleTerm => {}
NgramType::NgramFF | NgramType::NgramRF | NgramType::NgramFR => {
result_object
.query_terms
.push(term.term_ngram_1.to_string());
result_object
.query_terms
.push(term.term_ngram_0.to_string());
}
_ => {
result_object
.query_terms
.push(term.term_ngram_2.to_string());
result_object
.query_terms
.push(term.term_ngram_1.to_string());
result_object
.query_terms
.push(term.term_ngram_0.to_string());
}
};
{
result_object.query_terms.push(term.term.to_string());
}
}
not_query_list = not_query_list_map.into_values().collect();
query_list = query_list_map.into_values().collect();
if shard_ref.meta.access_type == AccessType::Mmap {
for plo in query_list.iter_mut() {
plo.blocks = &blocks_vec[plo.blocks_index - 1]
}
for plo in not_query_list.iter_mut() {
plo.blocks = &blocks_vec[plo.blocks_index - 1]
}
}
let query_list_len = query_list.len();
let non_unique_query_list_len = non_unique_query_list.len();
let mut matching_blocks: i32 = 0;
let query_term_count = non_unique_terms.len();
if query_list_len == 0 {
if enable_empty_query && query_string.is_empty() {
search_iterator_shard(
&shard_ref,
result_type,
include_uncommitted,
&result_count_arc,
&mut search_result,
offset + length,
&facet_filter_sparse,
)
.await;
}
} else if query_list_len == 1 {
if !(shard_ref.uncommitted && include_uncommitted)
&& offset + length <= 1000
&& not_query_list.is_empty()
&& field_filter_set.is_empty()
&& shard_ref.delete_hashset.is_empty()
&& facet_filter_sparse.is_empty()
&& !is_range_facet
&& result_sort_index.is_empty()
&& let Some(stopword_result_object) = shard_ref
.frequentword_results
.get(&non_unique_terms[0].term)
{
result_object.query = stopword_result_object.query.clone();
result_object
.query_terms
.clone_from(&stopword_result_object.query_terms);
result_object.result_count = stopword_result_object.result_count;
result_object.result_count_total = stopword_result_object.result_count_total;
if result_type != ResultType::Count {
result_object
.results
.clone_from(&stopword_result_object.results);
if offset > 0 {
result_object.results.drain(..offset);
}
if length < 1000 {
result_object.results.truncate(length);
}
}
if !search_result.query_facets.is_empty() && result_type != ResultType::Topk {
let mut facets: AHashMap<String, Facet> = AHashMap::new();
for facet in search_result.query_facets.iter() {
if facet.length == 0
|| stopword_result_object.facets[&facet.field].is_empty()
{
continue;
}
let v = stopword_result_object.facets[&facet.field]
.iter()
.sorted_unstable_by(|a, b| b.1.cmp(&a.1))
.map(|(a, c)| (a.clone(), *c))
.filter(|(a, _c)| {
facet.prefix.is_empty() || a.starts_with(&facet.prefix)
})
.take(facet.length.max(facet_cap) as usize)
.collect::<Vec<_>>();
if !v.is_empty() {
facets.insert(facet.field.clone(), v);
}
}
result_object.facets = facets;
};
return result_object;
}
single_blockid(
&shard_ref,
&mut non_unique_query_list,
&mut query_list,
&mut not_query_list,
&result_count_arc,
&mut search_result,
offset + length,
&result_type,
&field_filter_set,
&facet_filter_sparse,
&mut matching_blocks,
)
.await;
} else if query_type_mut == QueryType::Union {
search_result.skip_facet_count = true;
if result_type == ResultType::Count && query_list_len != 2 {
union_blockid(
&shard_ref,
&mut non_unique_query_list,
&mut query_list,
&mut not_query_list,
&result_count_arc,
&mut search_result,
offset + length,
&result_type,
&field_filter_set,
&facet_filter_sparse,
)
.await;
} else if SPEEDUP_FLAG
&& query_list_len == 2
&& search_result.query_facets.is_empty()
&& facet_filter_sparse.is_empty()
&& search_result.topk_candidates.result_sort.is_empty()
{
union_docid_2(
&shard_ref,
&mut non_unique_query_list,
&mut query_list,
&mut not_query_list,
&result_count_arc,
&mut search_result,
offset + length,
&result_type,
&field_filter_set,
&facet_filter_sparse,
&mut matching_blocks,
query_term_count,
)
.await;
} else if SPEEDUP_FLAG
&& search_result.topk_candidates.result_sort.is_empty()
&& query_list_len <= 10
{
union_docid_3(
&shard_ref,
&mut non_unique_query_list,
&mut Vec::from([QueueObject {
query_list: query_list.clone(),
query_index: 0,
max_score: f32::MAX,
}]),
&mut not_query_list,
&result_count_arc,
&mut search_result,
offset + length,
&result_type,
&field_filter_set,
&facet_filter_sparse,
&mut matching_blocks,
0,
query_term_count,
)
.await;
} else {
union_blockid(
&shard_ref,
&mut non_unique_query_list,
&mut query_list,
&mut not_query_list,
&result_count_arc,
&mut search_result,
offset + length,
&result_type,
&field_filter_set,
&facet_filter_sparse,
)
.await;
}
} else {
intersection_blockid(
&shard_ref,
&mut non_unique_query_list,
&mut query_list,
&mut not_query_list,
&result_count_arc,
&mut search_result,
offset + length,
&result_type,
&field_filter_set,
&facet_filter_sparse,
&mut matching_blocks,
query_type_mut == QueryType::Phrase && non_unique_query_list_len >= 2,
query_term_count,
)
.await;
if shard_ref.enable_fallback
&& (result_count_arc.load(Ordering::Relaxed) < offset + length)
{
continue 'fallback;
}
}
break;
}
result_object.result_count = search_result.topk_candidates.current_heap_size;
if search_result.topk_candidates.current_heap_size > offset {
result_object.results = mem::take(&mut search_result.topk_candidates._elements);
if search_result.topk_candidates.current_heap_size < offset + length {
result_object
.results
.truncate(search_result.topk_candidates.current_heap_size);
}
if result_sort.is_empty() {
if query_string.is_empty() {
result_object
.results
.sort_by_key(|b| cmp::Reverse(b.doc_id));
} else {
result_object
.results
.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
}
} else {
result_object
.results
.sort_by(|a, b| search_result.topk_candidates.result_ordering_shard(*b, *a));
}
if offset > 0 {
result_object.results.drain(..offset);
}
}
result_object.result_count_total = result_count_uncommitted_arc.load(Ordering::Relaxed)
+ result_count_arc.load(Ordering::Relaxed);
if !search_result.query_facets.is_empty() {
result_object.facets = if result_object.query_terms.is_empty() {
shard_ref
.get_index_string_facets_shard(query_facets)
.unwrap_or_default()
} else {
let mut facets: AHashMap<String, Facet> = AHashMap::new();
for (i, facet) in search_result.query_facets.iter_mut().enumerate() {
if facet.length == 0 || facet.values.is_empty() {
continue;
}
let v = if facet.ranges == Ranges::None {
if shard_ref.facets[i].values.is_empty() {
continue;
}
if shard_ref.facets[i].field_type == FieldType::StringSet16
|| shard_ref.facets[i].field_type == FieldType::StringSet32
{
let mut hash_map: AHashMap<String, usize> = AHashMap::new();
for value in facet.values.iter() {
let value2 = shard_ref.facets[i]
.values
.get_index(*value.0 as usize)
.unwrap();
for term in value2.1.0.iter() {
*hash_map.entry(term.clone()).or_insert(0) += value.1;
}
}
hash_map
.iter()
.sorted_unstable_by(|a, b| b.1.cmp(a.1))
.map(|(a, c)| (a.clone(), *c))
.filter(|(a, _c)| {
facet.prefix.is_empty() || a.starts_with(&facet.prefix)
})
.take(facet.length.max(facet_cap) as usize)
.collect::<Vec<_>>()
} else {
facet
.values
.iter()
.sorted_unstable_by(|a, b| b.1.cmp(a.1))
.map(|(a, c)| {
(
shard_ref.facets[i]
.values
.get_index(*a as usize)
.unwrap()
.0
.clone(),
*c,
)
})
.filter(|(a, _c)| {
facet.prefix.is_empty() || a.starts_with(&facet.prefix)
})
.take(facet.length.max(facet_cap) as usize)
.collect::<Vec<_>>()
}
} else {
let range_type = match &facet.ranges {
Ranges::U8(range_type, _ranges) => range_type.clone(),
Ranges::U16(range_type, _ranges) => range_type.clone(),
Ranges::U32(range_type, _ranges) => range_type.clone(),
Ranges::U64(range_type, _ranges) => range_type.clone(),
Ranges::I8(range_type, _ranges) => range_type.clone(),
Ranges::I16(range_type, _ranges) => range_type.clone(),
Ranges::I32(range_type, _ranges) => range_type.clone(),
Ranges::I64(range_type, _ranges) => range_type.clone(),
Ranges::Timestamp(range_type, _ranges) => range_type.clone(),
Ranges::F32(range_type, _ranges) => range_type.clone(),
Ranges::F64(range_type, _ranges) => range_type.clone(),
Ranges::Point(range_type, _ranges, _base, _unit) => range_type.clone(),
_ => RangeType::CountWithinRange,
};
match range_type {
RangeType::CountAboveRange => {
let mut sum = 0usize;
for value in facet
.values
.iter_mut()
.sorted_unstable_by(|a, b| b.0.cmp(a.0))
{
sum += *value.1;
*value.1 = sum;
}
}
RangeType::CountBelowRange => {
let mut sum = 0usize;
for value in facet
.values
.iter_mut()
.sorted_unstable_by(|a, b| a.0.cmp(b.0))
{
sum += *value.1;
*value.1 = sum;
}
}
RangeType::CountWithinRange => {}
}
facet
.values
.iter()
.sorted_unstable_by(|a, b| a.0.cmp(b.0))
.map(|(a, c)| {
(
match &facet.ranges {
Ranges::U8(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::U16(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::U32(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::U64(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::I8(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::I16(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::I32(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::I64(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::Timestamp(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::F32(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::F64(_range_type, ranges) => {
ranges[*a as usize].0.clone()
}
Ranges::Point(_range_type, ranges, _base, _unit) => {
ranges[*a as usize].0.clone()
}
_ => "".into(),
},
*c,
)
})
.filter(|(a, _c)| {
facet.prefix.is_empty() || a.starts_with(&facet.prefix)
})
.collect::<Vec<_>>()
};
if !v.is_empty() {
facets.insert(facet.field.clone(), v);
}
}
facets
};
}
result_object
}
}