use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
use crate::indexer::indexing_term::IndexingTerm;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
use crate::{DateTime, DocId, Term};
/// Keeps one [`IndexingPosition`] per JSON path, keyed by the path's `u32` id.
///
/// The default value is an empty map.
#[derive(Default)]
pub(crate) struct IndexingPositionsPerPath {
// Maps a path id (the `u32` passed to `get_position_from_id`) to its position state.
positions_per_path: FxHashMap<u32, IndexingPosition>,
}
impl IndexingPositionsPerPath {
    /// Returns the position state registered for `id`.
    ///
    /// If `id` has no entry yet, a default-initialized [`IndexingPosition`] is inserted first.
    fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition {
        let slot = self.positions_per_path.entry(id);
        slot.or_default()
    }

    /// Removes every recorded position.
    pub fn clear(&mut self) {
        let Self { positions_per_path } = self;
        positions_per_path.clear();
    }
}
/// Rewrites `path` in place, substituting `.` for the `JSON_PATH_SEGMENT_SEP` byte.
pub fn json_path_sep_to_dot(path: &mut str) {
    // SAFETY: `as_bytes_mut` requires that the bytes are still valid UTF-8 when the borrow ends.
    // NOTE(review): this holds as long as `JSON_PATH_SEGMENT_SEP` is a single ASCII byte, since
    // swapping one ASCII byte for `b'.'` cannot split or create a multi-byte sequence — confirm
    // against the constant's definition.
    unsafe {
        let raw_bytes = path.as_bytes_mut();
        replace_in_place(JSON_PATH_SEGMENT_SEP, b'.', raw_bytes);
    }
}
/// Indexes every entry of a JSON object.
///
/// For each `(key, value)` pair yielded by `json_visitor`, the key is pushed onto
/// `json_path_writer`, the value is handed to [`index_json_value`], and the key is popped again
/// before moving on to the next entry. Entries whose key contains the `JSON_END_OF_PATH` byte
/// are skipped without being indexed.
#[expect(clippy::too_many_arguments)]
fn index_json_object<'a, V: Value<'a>>(
    doc: DocId,
    json_visitor: V::ObjectIter,
    text_analyzer: &mut TextAnalyzer,
    term_buffer: &mut IndexingTerm,
    json_path_writer: &mut JsonPathWriter,
    postings_writer: &mut dyn PostingsWriter,
    ctx: &mut IndexingContext,
    positions_per_path: &mut IndexingPositionsPerPath,
) {
    for (key, value) in json_visitor {
        // A key carrying the end-of-path marker is never pushed onto the path writer.
        let key_is_indexable = !key.as_bytes().contains(&JSON_END_OF_PATH);
        if key_is_indexable {
            json_path_writer.push(key);
            index_json_value(
                doc,
                value,
                text_analyzer,
                term_buffer,
                json_path_writer,
                postings_writer,
                ctx,
                positions_per_path,
            );
            // Restore the parent path so the next sibling starts from the same prefix.
            json_path_writer.pop();
        }
    }
}
/// Indexes one JSON value located at the path currently held by `json_path_writer`.
///
/// * `Null` leaves are ignored.
/// * `Str` leaves are tokenized with `text_analyzer` and indexed through
///   `PostingsWriter::index_text`, using the per-path position state from `positions_per_path`.
/// * `U64`, `I64`, `F64`, `Bool` and `Date` leaves are written to `term_buffer` as a typed fast
///   value and registered through `PostingsWriter::subscribe`. A `U64` that fits in an `i64` is
///   written as `i64`; an `F64` is normalized via `NumericalValue::normalize` first, and
///   non-finite floats are dropped; dates are truncated to `DATE_TIME_PRECISION_INDEXED`.
/// * Arrays are indexed element by element under the same path.
/// * Objects are delegated to `index_json_object`.
///
/// # Panics
///
/// Panics (`unimplemented!`) on `PreTokStr`, `Bytes`, `Facet` and `IpAddr` leaves.
#[expect(clippy::too_many_arguments)]
pub(crate) fn index_json_value<'a, V: Value<'a>>(
    doc: DocId,
    json_value: V,
    text_analyzer: &mut TextAnalyzer,
    term_buffer: &mut IndexingTerm,
    json_path_writer: &mut JsonPathWriter,
    postings_writer: &mut dyn PostingsWriter,
    ctx: &mut IndexingContext,
    positions_per_path: &mut IndexingPositionsPerPath,
) {
    /// Looks up (allocating if needed) the unordered id of `path`, then resets the value bytes
    /// of `term_buffer` so that they contain only that id in big-endian form.
    fn start_term_for_path(
        term_buffer: &mut IndexingTerm,
        ctx: &mut IndexingContext,
        path: &str,
    ) -> u32 {
        let unordered_id = ctx.path_to_unordered_id.get_or_allocate_unordered_id(path);
        term_buffer.truncate_value_bytes(0);
        term_buffer.append_bytes(&unordered_id.to_be_bytes());
        unordered_id
    }

    match json_value.as_value() {
        ReferenceValue::Leaf(leaf) => match leaf {
            ReferenceValueLeaf::Null => {}
            ReferenceValueLeaf::Str(text) => {
                let mut token_stream = text_analyzer.token_stream(text);
                let path_id = start_term_for_path(term_buffer, ctx, json_path_writer.as_str());
                // Text terms carry an explicit type code right after the path id.
                term_buffer.append_bytes(&[Type::Str.to_code()]);
                let indexing_position = positions_per_path.get_position_from_id(path_id);
                postings_writer.index_text(
                    doc,
                    &mut *token_stream,
                    term_buffer,
                    ctx,
                    indexing_position,
                );
            }
            ReferenceValueLeaf::U64(unsigned) => {
                start_term_for_path(term_buffer, ctx, json_path_writer.as_str());
                // Prefer the i64 representation whenever the value fits into it.
                match i64::try_from(unsigned) {
                    Ok(signed) => {
                        term_buffer.append_type_and_fast_value::<i64>(signed);
                    }
                    Err(_) => {
                        term_buffer.append_type_and_fast_value::<u64>(unsigned);
                    }
                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::I64(signed) => {
                start_term_for_path(term_buffer, ctx, json_path_writer.as_str());
                term_buffer.append_type_and_fast_value(signed);
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::F64(float) => {
                // NaN and infinities are skipped before any path id is allocated.
                if float.is_nan() || float.is_infinite() {
                    return;
                }
                start_term_for_path(term_buffer, ctx, json_path_writer.as_str());
                let normalized = NumericalValue::F64(float).normalize();
                match normalized {
                    NumericalValue::I64(as_i64) => {
                        term_buffer.append_type_and_fast_value::<i64>(as_i64);
                    }
                    NumericalValue::U64(as_u64) => {
                        term_buffer.append_type_and_fast_value::<u64>(as_u64);
                    }
                    NumericalValue::F64(as_f64) => {
                        term_buffer.append_type_and_fast_value::<f64>(as_f64);
                    }
                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::Bool(flag) => {
                start_term_for_path(term_buffer, ctx, json_path_writer.as_str());
                term_buffer.append_type_and_fast_value(flag);
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::Date(date) => {
                start_term_for_path(term_buffer, ctx, json_path_writer.as_str());
                term_buffer.append_type_and_fast_value(date.truncate(DATE_TIME_PRECISION_INDEXED));
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::PreTokStr(_) => {
                unimplemented!(
                    "Pre-tokenized string support in dynamic fields is not yet implemented"
                )
            }
            ReferenceValueLeaf::Bytes(_) => {
                unimplemented!("Bytes support in dynamic fields is not yet implemented")
            }
            ReferenceValueLeaf::Facet(_) => {
                unimplemented!("Facet support in dynamic fields is not yet implemented")
            }
            ReferenceValueLeaf::IpAddr(_) => {
                unimplemented!("IP address support in dynamic fields is not yet implemented")
            }
        },
        ReferenceValue::Array(elements) => {
            // Array elements do not add a path segment: each one is indexed under the same path.
            for element in elements {
                index_json_value(
                    doc,
                    element,
                    text_analyzer,
                    term_buffer,
                    json_path_writer,
                    postings_writer,
                    ctx,
                    positions_per_path,
                );
            }
        }
        ReferenceValue::Object(entries) => {
            index_json_object::<V>(
                doc,
                entries,
                text_analyzer,
                term_buffer,
                json_path_writer,
                postings_writer,
                ctx,
                positions_per_path,
            );
        }
    }
}
/// Tries to interpret `text` as a typed fast value and returns a copy of `term` with that value
/// appended.
///
/// The interpretations are attempted in this order, and the first one that succeeds wins:
/// RFC 3339 datetime, number, boolean. Returns `None` when `text` matches none of them.
///
/// `truncate_date_for_search` is forwarded to the datetime conversion, where it controls whether
/// the parsed datetime is truncated to `DATE_TIME_PRECISION_INDEXED`.
///
/// # Panics
///
/// Panics if `term` does not expose JSON value bytes, or if those value bytes are not empty.
pub fn convert_to_fast_value_and_append_to_json_term(
    term: &Term,
    text: &str,
    truncate_date_for_search: bool,
) -> Option<Term> {
    let json_value_len = term
        .value()
        .as_json_value_bytes()
        .expect("expecting a Term with a json type and json path")
        .as_serialized()
        .len();
    assert_eq!(json_value_len, 0, "JSON value bytes should be empty");

    if let Some(datetime_term) =
        try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
    {
        return Some(datetime_term);
    }
    if let Some(number_term) = try_convert_to_number_and_append_to_json_term(term, text) {
        return Some(number_term);
    }
    try_convert_to_bool_and_append_to_json_term_typed(term, text)
}
/// Parses `text` as an RFC 3339 datetime and, on success, returns a clone of `term` with the
/// datetime (converted to UTC) appended as a typed fast value.
///
/// When `truncate_date_for_search` is set, the datetime is truncated to
/// `DATE_TIME_PRECISION_INDEXED` before being appended. Returns `None` if parsing fails.
fn try_convert_to_datetime_and_append_to_json_term(
    term: &Term,
    text: &str,
    truncate_date_for_search: bool,
) -> Option<Term> {
    OffsetDateTime::parse(text, &Rfc3339).ok().map(|parsed| {
        let utc_datetime = DateTime::from_utc(parsed.to_offset(UtcOffset::UTC));
        let datetime = if truncate_date_for_search {
            utc_datetime.truncate(DATE_TIME_PRECISION_INDEXED)
        } else {
            utc_datetime
        };
        let mut typed_term = term.clone();
        typed_term.append_type_and_fast_value(datetime);
        typed_term
    })
}
/// Parses `text` as a [`NumericalValue`] and, on success, returns a clone of `term` with the
/// normalized number appended as an `i64`, `u64` or `f64` fast value.
///
/// Returns `None` if `text` is not a number.
fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
    let parsed = text.parse::<NumericalValue>().ok()?;
    let normalized = parsed.normalize();
    let mut typed_term = term.clone();
    // The variant chosen by `normalize` decides which numeric type code ends up in the term.
    match normalized {
        NumericalValue::I64(as_i64) => {
            typed_term.append_type_and_fast_value::<i64>(as_i64);
        }
        NumericalValue::U64(as_u64) => {
            typed_term.append_type_and_fast_value::<u64>(as_u64);
        }
        NumericalValue::F64(as_f64) => {
            typed_term.append_type_and_fast_value::<f64>(as_f64);
        }
    }
    Some(typed_term)
}
/// Parses `text` as a `bool` (`"true"` or `"false"`) and, on success, returns a clone of `term`
/// with the boolean appended as a typed fast value. Returns `None` otherwise.
fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
    match text.parse::<bool>() {
        Ok(flag) => {
            let mut typed_term = term.clone();
            typed_term.append_type_and_fast_value(flag);
            Some(typed_term)
        }
        Err(_) => None,
    }
}
/// Splits a user-supplied JSON path into its segments.
///
/// Segments are separated by `.`. A backslash escapes the character that follows it: that
/// character is copied verbatim into the current segment (so `\.` yields a literal dot and `\\`
/// a literal backslash) and the backslash itself is dropped. A backslash at the very end of the
/// input is dropped as well.
///
/// The result always contains at least one segment; leading, trailing or consecutive dots
/// produce empty segments.
pub fn split_json_path(json_path: &str) -> Vec<String> {
    let mut segments: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut chars = json_path.chars();
    while let Some(ch) = chars.next() {
        match ch {
            '\\' => {
                // Consume the escaped character right away instead of tracking a flag.
                if let Some(escaped) = chars.next() {
                    current.push(escaped);
                }
            }
            '.' => segments.push(std::mem::take(&mut current)),
            other => current.push(other),
        }
    }
    // Whatever is left (possibly nothing) forms the final segment.
    segments.push(current);
    segments
}
/// Builds the encoded column name for `json_path` inside the field `field_name`.
///
/// `field_name` is pushed onto a fresh [`JsonPathWriter`] first; only afterwards is
/// `expand_dots_enabled` applied, and then every segment returned by [`split_json_path`] is
/// pushed in order. The writer is finally converted into the resulting `String`.
pub(crate) fn encode_column_name(
    field_name: &str,
    json_path: &str,
    expand_dots_enabled: bool,
) -> String {
    let mut path = JsonPathWriter::default();
    // Order matters here: the field name goes in before the expand-dots setting is changed.
    path.push(field_name);
    path.set_expand_dots(expand_dots_enabled);
    let segments = split_json_path(json_path);
    for segment in &segments {
        path.push(segment);
    }
    path.into()
}
// Unit tests covering the serialized layout of JSON terms and the behavior of
// `split_json_path`.
#[cfg(test)]
mod tests {
use super::split_json_path;
use crate::schema::Field;
use crate::Term;
// Checks the `Debug` rendering of JSON terms holding a string and an i64 value.
#[test]
fn test_json_writer() {
let field = Field::from_field_id(1);
let mut term = Term::from_field_json_path(field, "attributes.color", false);
term.append_type_and_str("red");
assert_eq!(
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
);
let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
term.append_type_and_fast_value(400i64);
assert_eq!(
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
);
}
// A string term serializes as: path, a `0x00` byte, the type code `s`, then the raw text.
#[test]
fn test_string_term() {
let field = Field::from_field_id(1);
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_str("red");
assert_eq!(term.serialized_value_bytes(), b"color\x00sred".to_vec())
}
// An i64 term serializes as: path, `0x00`, the type code `i`, then 8 bytes.
// For -4 the expected bytes are its two's-complement big-endian form with the top bit flipped.
#[test]
fn test_i64_term() {
let field = Field::from_field_id(1);
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_fast_value(-4i64);
assert_eq!(
term.serialized_value_bytes(),
b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc".to_vec()
)
}
// A u64 term serializes as: path, `0x00`, the type code `u`, then the value as 8 big-endian
// bytes.
#[test]
fn test_u64_term() {
let field = Field::from_field_id(1);
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_fast_value(4u64);
assert_eq!(
term.serialized_value_bytes(),
b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04".to_vec()
)
}
// An f64 term serializes as: path, `0x00`, the type code `f`, then 8 bytes.
// For 4.0 the expected bytes are its IEEE 754 bit pattern (0x4010...) with the top bit set.
#[test]
fn test_f64_term() {
let field = Field::from_field_id(1);
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_fast_value(4.0f64);
assert_eq!(
term.serialized_value_bytes(),
b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00".to_vec()
)
}
// A bool term serializes as: path, `0x00`, the type code `o`, then 8 bytes; `true` ends in
// `0x01`.
#[test]
fn test_bool_term() {
let field = Field::from_field_id(1);
let mut term = Term::from_field_json_path(field, "color", false);
term.append_type_and_fast_value(true);
assert_eq!(
term.serialized_value_bytes(),
b"color\x00o\x00\x00\x00\x00\x00\x00\x00\x01".to_vec()
)
}
// An unescaped dot separates two segments.
#[test]
fn test_split_json_path_simple() {
let json_path = split_json_path("titi.toto");
assert_eq!(&json_path, &["titi", "toto"]);
}
// A path without any dot is returned as a single segment.
#[test]
fn test_split_json_path_single_segment() {
let json_path = split_json_path("toto");
assert_eq!(&json_path, &["toto"]);
}
// A trailing dot produces an empty last segment.
#[test]
fn test_split_json_path_trailing_dot() {
let json_path = split_json_path("toto.");
assert_eq!(&json_path, &["toto", ""]);
}
// A leading dot produces an empty first segment.
#[test]
fn test_split_json_path_heading_dot() {
let json_path = split_json_path(".toto");
assert_eq!(&json_path, &["", "toto"]);
}
// A backslash-escaped dot stays inside the segment instead of splitting it.
#[test]
fn test_split_json_path_escaped_dot() {
let json_path = split_json_path(r"toto\.titi");
assert_eq!(&json_path, &["toto.titi"]);
let json_path_2 = split_json_path(r"k8s\.container\.name");
assert_eq!(&json_path_2, &["k8s.container.name"]);
}
// A doubled backslash yields one literal backslash.
#[test]
fn test_split_json_path_escaped_backslash() {
let json_path = split_json_path(r"toto\\titi");
assert_eq!(&json_path, &[r"toto\titi"]);
}
// Escaping an ordinary letter drops the backslash and keeps the letter.
#[test]
fn test_split_json_path_escaped_normal_letter() {
let json_path = split_json_path(r"toto\titi");
assert_eq!(&json_path, &[r#"tototiti"#]);
}
}