use std::iter::FromIterator;
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use crate::indexer::output_config::InfiOutputConfig;
use crate::utils::escape_json;
pub static RELATIVE_FP_FIELD: &str = "_relative_fp";
pub static ADD_FILES_FIELD: &str = "_add_files";
pub type EnumKind = String;
fn get_default_num_docs_per_store() -> u32 {
100000000
}
fn get_default_num_field_stores_per_dir() -> u32 {
1000
}
fn get_default_cache_all_field_stores() -> bool {
true
}
fn get_default_merge_default_fields() -> bool {
true
}
fn get_default_fields() -> FxHashMap<String, Option<FieldConfig>> {
FxHashMap::from_iter(vec![
("title".to_owned(), Some(FieldConfig {
storage: get_default_storage(),
weight: 2.0, k: 1.2, b: 0.15
})),
("h1".to_owned(), Some(FieldConfig {
storage: get_default_storage(),
weight: 2.0, k: 1.2, b: 0.15
})),
("heading".to_owned(), Some(FieldConfig {
storage: get_default_storage(),
weight: 1.5, k: 1.2, b: 0.25
})),
("body".to_owned(), Some(FieldConfig {
storage: get_default_storage(),
weight: 1.0, k: 1.2, b: 0.75
})),
("headingLink".to_owned(), Some(FieldConfig {
storage: get_default_storage(),
weight: 0.0, k: 1.2, b: 0.75
})),
(RELATIVE_FP_FIELD.to_owned(), Some(FieldConfig {
storage: get_default_storage(),
weight: 0.0, k: 1.2, b: 0.75
})),
("link".to_owned(), Some(FieldConfig {
storage: get_default_storage(),
weight: 0.0, k: 1.2, b: 0.75
})),
])
}
#[derive(Deserialize)]
pub struct FieldsConfig {
#[serde(default = "get_default_num_docs_per_store")]
pub num_docs_per_store: u32,
#[serde(default = "get_default_num_field_stores_per_dir")]
pub num_stores_per_dir: u32,
#[serde(default="get_default_cache_all_field_stores")]
pub cache_all_field_stores: bool,
#[serde(default="FxHashMap::default")]
pub fields: FxHashMap<String, Option<FieldConfig>>,
#[serde(default="get_default_merge_default_fields")]
pub merge_default_fields: bool,
}
impl Default for FieldsConfig {
fn default() -> Self {
FieldsConfig {
num_docs_per_store: get_default_num_docs_per_store(),
num_stores_per_dir: get_default_num_field_stores_per_dir(),
cache_all_field_stores: get_default_cache_all_field_stores(),
fields: FxHashMap::default(),
merge_default_fields: get_default_merge_default_fields(),
}
}
}
impl FieldsConfig {
pub fn merge_default_fields(&mut self) {
if self.merge_default_fields {
let mut fields = get_default_fields();
std::mem::swap(&mut fields, &mut self.fields);
self.fields.extend(fields)
}
}
pub fn get_field_infos(
&self,
output_folder_path_inner: &Path,
incremental_output_config: Option<&InfiOutputConfig>,
) -> Arc<FieldInfos> {
let mut field_infos_by_name: FxHashMap<String, FieldInfo> = FxHashMap::default();
let mut field_infos_by_id: Vec<FieldInfo> = Vec::with_capacity(self.fields.len());
let mut num_scored_fields = 0;
let mut num_enum_fields = 0;
let mut num_i64_fields = 0;
for (field_name, field_config) in self.fields.iter() {
if field_config.is_none() {
continue;
}
let field_config = unsafe { field_config.as_ref().unwrap_unchecked() };
if field_config.weight != 0.0 {
num_scored_fields += 1;
}
field_infos_by_id.push(FieldInfo {
name: field_name.to_owned(),
escaped_name: escape_json::escape(field_name).into_owned(),
id: 0,
enum_info: if field_config.storage.iter().any(|s| matches!(s, StorageType::Enum)) {
let (enum_id, enum_values) = if let Some(incremental_output_config) = incremental_output_config {
let old_enum_info = incremental_output_config.field_infos
.iter()
.find_map(|fi| if fi.name.as_str() == field_name {
fi.enum_info.as_ref()
} else {
None
})
.unwrap();
(old_enum_info.enum_id, old_enum_info.enum_values.clone())
} else {
(num_enum_fields, Vec::new())
};
num_enum_fields += 1;
Some(EnumInfo {
enum_id,
enum_values,
})
} else {
None
},
store_text: field_config.storage.iter().any(|s| matches!(s, StorageType::Text)),
i64_info: field_config.storage
.iter()
.find_map(|s| if let StorageType::I64(num_info) = s {
let info = if let Some(incremental_output_config) = incremental_output_config {
let old_num_info = incremental_output_config.field_infos
.iter()
.find_map(|fi| if fi.name.as_str() == field_name {
fi.i64_info.as_ref()
} else {
None
})
.unwrap();
old_num_info.clone()
} else {
I64Info {
id: num_i64_fields,
..num_info.clone()
}
};
num_i64_fields += 1;
Some(info)
} else {
None
}),
weight: field_config.weight,
k: field_config.k,
b: field_config.b,
});
}
field_infos_by_id.sort_by(|a, b| {
b.weight.partial_cmp(&a.weight).unwrap_or(a.name.cmp(&b.name))
});
for (field_id, field_info) in field_infos_by_id.iter_mut().enumerate() {
field_info.id = field_id as u8;
field_infos_by_name.insert(field_info.name.to_owned(), field_info.clone());
}
let field_output_folder_path = output_folder_path_inner.join("field_store");
std::fs::create_dir_all(&field_output_folder_path)
.expect("Failed to create field store output folder in output directory");
Arc::new(FieldInfos {
field_infos_by_name,
field_infos_by_id,
num_scored_fields,
num_enum_fields,
num_i64_fields,
num_docs_per_store: self.num_docs_per_store,
num_stores_per_dir: self.num_stores_per_dir,
field_output_folder_path,
})
}
}
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct EnumInfo {
pub enum_id: usize,
pub enum_values: Vec<String>,
}
fn get_default_weight() -> f32 {
0.0
}
fn get_default_k() -> f32 {
1.2
}
fn get_default_b() -> f32 {
0.75
}
fn get_default_i64_parse() -> I64ParseStrategy {
I64ParseStrategy::Integer
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "lowercase", tag = "method")]
pub enum I64ParseStrategy {
Integer,
Round,
Datetime {
datetime_fmt: String,
#[serde(default)]
timezone: Option<i32>,
#[serde(default)]
time: Option<u32>,
},
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct I64Info {
#[serde(default)]
pub id: usize,
#[serde(default = "get_default_i64_parse")]
pub parse: I64ParseStrategy,
pub default: i64,
}
#[derive(Deserialize)]
#[serde(rename_all = "lowercase", tag = "type")]
pub enum StorageType {
Text,
Enum,
I64(I64Info),
}
fn get_default_storage() -> Vec<StorageType> {
vec![StorageType::Text]
}
#[derive(Deserialize)]
pub struct FieldConfig {
#[serde(default = "get_default_storage")]
pub storage: Vec<StorageType>,
#[serde(default = "get_default_weight")]
pub weight: f32,
#[serde(default = "get_default_k")]
pub k: f32,
#[serde(default = "get_default_b")]
pub b: f32,
}
#[derive(Debug, Clone)]
pub struct FieldInfo {
pub name: String,
pub escaped_name: String,
pub id: u8,
pub weight: f32,
pub k: f32,
pub b: f32,
pub enum_info: Option<EnumInfo>,
pub store_text: bool,
pub i64_info: Option<I64Info>,
}
pub struct FieldInfos {
pub field_infos_by_name: FxHashMap<String, FieldInfo>,
pub field_infos_by_id: Vec<FieldInfo>,
pub num_scored_fields: usize,
pub num_enum_fields: usize,
pub num_i64_fields: usize,
pub num_docs_per_store: u32,
pub num_stores_per_dir: u32,
pub field_output_folder_path: PathBuf,
}
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct FieldInfoOutput {
pub id: u8,
pub name: String,
pub weight: f32,
pub k: f32,
pub b: f32,
pub store_text: bool,
pub enum_info: Option<EnumInfo>,
pub i64_info: Option<I64Info>,
}
impl FieldInfos {
pub fn to_output(&self) -> Vec<FieldInfoOutput> {
let mut field_infos: Vec<FieldInfoOutput> = Vec::with_capacity(self.field_infos_by_name.len());
for (field_name, field_info) in self.field_infos_by_name.iter() {
field_infos.push(FieldInfoOutput {
id: field_info.id,
enum_info: field_info.enum_info.clone(),
name: field_name.to_owned(),
store_text: field_info.store_text,
i64_info: field_info.i64_info.clone(),
weight: field_info.weight,
k: field_info.k, b: field_info.b,
})
}
field_infos.sort_by_key(|fi| fi.id);
field_infos
}
}