#[cfg(not(test))]
use alloc::vec::Vec;
use crate::trees::BalancedParens;
use crate::util::broadword::select_in_word;
#[derive(Clone, Debug)]
pub struct SimpleJsonIndex<W = Vec<u64>> {
ib: W,
ib_len: usize,
bp: BalancedParens<W>,
}
impl SimpleJsonIndex<Vec<u64>> {
pub fn build(json: &[u8]) -> Self {
#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
let semi = crate::json::simd::build_semi_index_simple(json);
#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
let semi = crate::json::simple::build_semi_index(json);
let ib_len = json.len();
let bp_bit_count = count_bp_bits(&semi.bp, &semi.ib);
Self {
ib: semi.ib,
ib_len,
bp: BalancedParens::new(semi.bp, bp_bit_count),
}
}
}
impl<W: AsRef<[u64]>> SimpleJsonIndex<W> {
pub fn from_parts(ib: W, ib_len: usize, bp: W, bp_len: usize) -> Self {
Self {
ib,
ib_len,
bp: BalancedParens::from_words(bp, bp_len),
}
}
#[inline]
pub fn ib(&self) -> &[u64] {
self.ib.as_ref()
}
#[inline]
pub fn ib_len(&self) -> usize {
self.ib_len
}
#[inline]
pub fn bp(&self) -> &BalancedParens<W> {
&self.bp
}
#[inline]
pub fn structural_pos(&self, k: usize) -> Option<usize> {
self.ib_select1(k)
}
pub fn structural_count(&self) -> usize {
self.ib
.as_ref()
.iter()
.map(|w| w.count_ones() as usize)
.sum()
}
pub fn structural_index(&self, pos: usize) -> Option<usize> {
if pos >= self.ib_len {
return None;
}
let word_idx = pos / 64;
let bit_idx = pos % 64;
let words = self.ib.as_ref();
if word_idx >= words.len() {
return None;
}
if (words[word_idx] >> bit_idx) & 1 == 0 {
return None; }
Some(self.ib_rank1(pos))
}
pub fn find_close(&self, json: &[u8], pos: usize) -> Option<usize> {
if pos >= json.len() {
return None;
}
let c = json[pos];
if c != b'{' && c != b'[' {
return None;
}
let struct_idx = self.structural_index(pos)?;
let bp_pos = struct_idx * 2;
let close_bp_pos = self.bp.find_close(bp_pos)?;
let close_struct_idx = close_bp_pos / 2;
self.structural_pos(close_struct_idx)
}
pub fn skip_value(&self, json: &[u8], pos: usize) -> Option<usize> {
if pos >= json.len() {
return None;
}
match json[pos] {
b'{' | b'[' => {
let close_pos = self.find_close(json, pos)?;
Some(close_pos + 1)
}
b'"' => {
Some(find_string_end(json, pos) + 1)
}
b't' => {
if pos + 4 <= json.len() && &json[pos..pos + 4] == b"true" {
Some(pos + 4)
} else {
None
}
}
b'f' => {
if pos + 5 <= json.len() && &json[pos..pos + 5] == b"false" {
Some(pos + 5)
} else {
None
}
}
b'n' => {
if pos + 4 <= json.len() && &json[pos..pos + 4] == b"null" {
Some(pos + 4)
} else {
None
}
}
c if c == b'-' || c.is_ascii_digit() => {
Some(find_number_end(json, pos))
}
_ => None,
}
}
#[inline]
pub fn structural_positions<'a>(&'a self, _json: &'a [u8]) -> StructuralPositions<'a, W> {
StructuralPositions { index: self, k: 0 }
}
pub fn children<'a>(&'a self, json: &'a [u8], container_pos: usize) -> Option<Children<'a, W>> {
if container_pos >= json.len() {
return None;
}
let c = json[container_pos];
if c != b'{' && c != b'[' {
return None;
}
let close_pos = self.find_close(json, container_pos)?;
let start_idx = self.structural_index(container_pos)? + 1;
let end_idx = self.structural_index(close_pos)?;
Some(Children {
index: self,
json,
current_idx: start_idx,
end_idx,
})
}
fn ib_select1(&self, k: usize) -> Option<usize> {
let words = self.ib.as_ref();
let mut remaining = k;
for (word_idx, &word) in words.iter().enumerate() {
let ones = word.count_ones() as usize;
if ones > remaining {
let bit_pos = select_in_word(word, remaining as u32) as usize;
let result = word_idx * 64 + bit_pos;
return if result < self.ib_len {
Some(result)
} else {
None
};
}
remaining -= ones;
}
None
}
fn ib_rank1(&self, pos: usize) -> usize {
if pos == 0 {
return 0;
}
let words = self.ib.as_ref();
let word_idx = pos / 64;
let bit_idx = pos % 64;
let mut count = 0usize;
for &word in words.iter().take(word_idx) {
count += word.count_ones() as usize;
}
if word_idx < words.len() && bit_idx > 0 {
let mask = (1u64 << bit_idx) - 1;
count += (words[word_idx] & mask).count_ones() as usize;
}
count
}
}
fn count_bp_bits(_bp_words: &[u64], ib_words: &[u64]) -> usize {
let structural_count: usize = ib_words.iter().map(|w| w.count_ones() as usize).sum();
structural_count * 2
}
fn find_string_end(json: &[u8], start: usize) -> usize {
let mut i = start + 1; while i < json.len() {
match json[i] {
b'"' => return i,
b'\\' => i += 2, _ => i += 1,
}
}
json.len()
}
fn find_number_end(json: &[u8], start: usize) -> usize {
let mut i = start;
while i < json.len() {
match json[i] {
b'0'..=b'9' | b'-' | b'+' | b'.' | b'e' | b'E' => i += 1,
_ => break,
}
}
i
}
pub struct StructuralPositions<'a, W = Vec<u64>> {
index: &'a SimpleJsonIndex<W>,
k: usize,
}
impl<'a, W: AsRef<[u64]>> Iterator for StructuralPositions<'a, W> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
let pos = self.index.structural_pos(self.k)?;
self.k += 1;
Some(pos)
}
}
pub struct Children<'a, W = Vec<u64>> {
index: &'a SimpleJsonIndex<W>,
json: &'a [u8],
current_idx: usize,
end_idx: usize,
}
impl<'a, W: AsRef<[u64]>> Iterator for Children<'a, W> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
if self.current_idx >= self.end_idx {
return None;
}
let pos = self.index.structural_pos(self.current_idx)?;
self.current_idx += 1;
if pos < self.json.len() {
let c = self.json[pos];
if c == b':' || c == b',' {
return self.next();
}
}
Some(pos)
}
}
pub type OwnedSimpleJsonIndex = SimpleJsonIndex<Vec<u64>>;
pub type BorrowedSimpleJsonIndex<'a> = SimpleJsonIndex<&'a [u64]>;
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_build_index() {
let json = br#"{"a": 1}"#;
let index = SimpleJsonIndex::build(json);
assert!(!index.bp().is_empty());
}
#[test]
fn test_structural_pos_empty_object() {
let json = b"{}";
let index = SimpleJsonIndex::build(json);
assert_eq!(index.structural_pos(0), Some(0));
assert_eq!(index.structural_pos(1), Some(1));
assert_eq!(index.structural_pos(2), None);
}
#[test]
fn test_structural_pos_simple_object() {
let json = br#"{"a":1}"#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.structural_pos(0), Some(0)); assert_eq!(index.structural_pos(1), Some(4)); assert_eq!(index.structural_pos(2), Some(6)); assert_eq!(index.structural_pos(3), None);
}
#[test]
fn test_structural_count() {
let json = br#"{"a":1,"b":2}"#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.structural_count(), 5);
}
#[test]
fn test_structural_index() {
let json = br#"{"a":1}"#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.structural_index(0), Some(0));
assert_eq!(index.structural_index(4), Some(1));
assert_eq!(index.structural_index(6), Some(2));
assert_eq!(index.structural_index(1), None);
}
#[test]
fn test_find_close_empty_object() {
let json = b"{}";
let index = SimpleJsonIndex::build(json);
assert_eq!(index.find_close(json, 0), Some(1));
}
#[test]
fn test_find_close_empty_array() {
let json = b"[]";
let index = SimpleJsonIndex::build(json);
assert_eq!(index.find_close(json, 0), Some(1));
}
#[test]
fn test_find_close_nested() {
let json = br#"{"a":{"b":1}}"#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.find_close(json, 0), Some(12));
assert_eq!(index.find_close(json, 5), Some(11));
}
#[test]
fn test_find_close_array_in_object() {
let json = br#"{"a":[1,2,3]}"#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.find_close(json, 0), Some(12));
assert_eq!(index.find_close(json, 5), Some(11));
}
#[test]
fn test_skip_value_object() {
let json = br#"{"a":1}"#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.skip_value(json, 0), Some(7));
}
#[test]
fn test_skip_value_string() {
let json = br#""hello""#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.skip_value(json, 0), Some(7));
}
#[test]
fn test_skip_value_number() {
let json = b"12345";
let index = SimpleJsonIndex::build(json);
assert_eq!(index.skip_value(json, 0), Some(5));
}
#[test]
fn test_skip_value_boolean() {
let json_true = b"true";
let index_true = SimpleJsonIndex::build(json_true);
assert_eq!(index_true.skip_value(json_true, 0), Some(4));
let json_false = b"false";
let index_false = SimpleJsonIndex::build(json_false);
assert_eq!(index_false.skip_value(json_false, 0), Some(5));
}
#[test]
fn test_skip_value_null() {
let json = b"null";
let index = SimpleJsonIndex::build(json);
assert_eq!(index.skip_value(json, 0), Some(4));
}
#[test]
fn test_structural_positions_iterator() {
let json = br#"[1,2,3]"#;
let index = SimpleJsonIndex::build(json);
let positions: Vec<_> = index.structural_positions(json).collect();
assert_eq!(positions, vec![0, 2, 4, 6]);
}
#[test]
fn test_children_object() {
let json = br#"{"a":1,"b":2}"#;
let index = SimpleJsonIndex::build(json);
let children: Vec<_> = index.children(json, 0).unwrap().collect();
assert!(children.is_empty());
}
#[test]
fn test_children_array() {
let json = b"[1,2,3]";
let index = SimpleJsonIndex::build(json);
let children: Vec<_> = index.children(json, 0).unwrap().collect();
assert!(children.is_empty());
}
#[test]
fn test_nested_arrays() {
let json = b"[[1,2],[3]]";
let index = SimpleJsonIndex::build(json);
assert_eq!(index.find_close(json, 0), Some(10));
assert_eq!(index.find_close(json, 1), Some(5));
assert_eq!(index.find_close(json, 7), Some(9));
}
#[test]
fn test_from_parts() {
let json = br#"{"a":1}"#;
let index = SimpleJsonIndex::build(json);
let ib = index.ib().to_vec();
let bp = index.bp().words().to_vec();
let ib_len = index.ib_len();
let bp_len = index.bp().len();
let index2 = SimpleJsonIndex::from_parts(ib, ib_len, bp, bp_len);
assert_eq!(index2.structural_pos(0), Some(0));
assert_eq!(index2.structural_pos(1), Some(4));
assert_eq!(index2.structural_pos(2), Some(6));
}
#[test]
fn test_escaped_string() {
let json = br#"{"a":"hello\"world"}"#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.find_close(json, 0), Some(19));
}
#[test]
fn test_skip_value_with_escapes() {
let json = br#""hello\"world""#;
let index = SimpleJsonIndex::build(json);
assert_eq!(index.skip_value(json, 0), Some(14));
}
#[test]
fn test_deeply_nested() {
let json = b"[[[[1]]]]";
let index = SimpleJsonIndex::build(json);
assert_eq!(index.find_close(json, 0), Some(8));
assert_eq!(index.find_close(json, 1), Some(7));
assert_eq!(index.find_close(json, 2), Some(6));
assert_eq!(index.find_close(json, 3), Some(5));
}
}