#[cfg(not(test))]
use alloc::vec::Vec;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use super::BitWriter;
// Byte values of the ASCII characters the scanner classifies.

// String handling: a quote opens or closes a string, a backslash starts an escape.
const DOUBLE_QUOTE: u8 = b'"';
const BACKSLASH: u8 = b'\\';
// Container delimiters, tested by `is_open` / `is_close`.
const OPEN_BRACE: u8 = b'{';
const CLOSE_BRACE: u8 = b'}';
const OPEN_BRACKET: u8 = b'[';
const CLOSE_BRACKET: u8 = b']';
// Separators, tested by `is_delim`.
const COMMA: u8 = b',';
const COLON: u8 = b':';
// Non-alphanumeric characters that `is_value_char` accepts inside a bare value.
const PERIOD: u8 = b'.';
const MINUS: u8 = b'-';
const PLUS: u8 = b'+';
/// State of the byte-at-a-time JSON scanner implemented by `state_machine`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum State {
/// Between tokens: not inside a string or a bare value. This is the state
/// the builders start in.
InJson,
/// Inside a string, after its opening double quote.
InString,
/// Immediately after a backslash inside a string; the next byte is consumed
/// unconditionally and the scanner returns to `InString`.
InEscape,
/// Inside a run of value characters (letters, digits, `.`, `-`, `+`).
InValue,
}
/// Result of scanning a JSON byte slice: two bit vectors packed into `u64`
/// words, plus the scanner state after the last byte.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SemiIndex {
/// Scanner state after the final input byte, as reported by the builders.
/// `from_bytes` cannot recover it and always stores `State::InJson`.
pub state: State,
/// One bit per input byte. The scalar builder sets the bit on bytes that
/// open a container or start a leaf (string or bare value).
pub ib: Vec<u64>,
/// Parenthesis-style bits. The scalar builder appends `1` for a container
/// open, `0` for a container close, and `1` then `0` for each leaf.
pub bp: Vec<u64>,
}
impl SemiIndex {
    /// Returns the `ib` words as bytes, as produced by
    /// `crate::binary::words_to_bytes`.
    #[inline]
    pub fn ib_as_bytes(&self) -> &[u8] {
        let Self { ib, .. } = self;
        crate::binary::words_to_bytes(ib)
    }

    /// Returns the `bp` words as bytes, as produced by
    /// `crate::binary::words_to_bytes`.
    #[inline]
    pub fn bp_as_bytes(&self) -> &[u8] {
        let Self { bp, .. } = self;
        crate::binary::words_to_bytes(bp)
    }

    /// Rebuilds an index from the byte forms of its two bit vectors.
    ///
    /// Each slice is converted with `crate::binary::bytes_to_words_vec`.
    /// The scanner state is not part of the byte form, so the result always
    /// carries `State::InJson`.
    pub fn from_bytes(ib_bytes: &[u8], bp_bytes: &[u8]) -> Self {
        let ib = crate::binary::bytes_to_words_vec(ib_bytes);
        let bp = crate::binary::bytes_to_words_vec(bp_bytes);
        Self {
            state: State::InJson,
            ib,
            bp,
        }
    }
}
#[inline]
fn is_open(c: u8) -> bool {
c == OPEN_BRACKET || c == OPEN_BRACE
}
#[inline]
fn is_close(c: u8) -> bool {
c == CLOSE_BRACKET || c == CLOSE_BRACE
}
#[inline]
fn is_delim(c: u8) -> bool {
c == COMMA || c == COLON
}
/// Returns `true` when `c` is an ASCII letter (`a`-`z` or `A`-`Z`).
#[inline]
fn is_alphabetic(c: u8) -> bool {
    u8::is_ascii_alphabetic(&c)
}
/// Returns `true` when `c` is an ASCII decimal digit (`0`-`9`).
#[inline]
fn is_digit(c: u8) -> bool {
    u8::is_ascii_digit(&c)
}
#[inline]
fn is_value_char(c: u8) -> bool {
is_alphabetic(c) || is_digit(c) || c == PERIOD || c == MINUS || c == PLUS
}
/// Per-byte output of the scanner, encoded as three flag bits.
///
/// From most to least significant: bit 2 is the value written to the `ib`
/// vector, bit 1 requests a `1` in the `bp` vector, bit 0 requests a `0`.
#[derive(Debug, Clone, Copy)]
struct Phi(u8);

impl Phi {
    /// Byte contributes nothing beyond a clear `ib` bit.
    const NONE: Self = Self(0b000);
    /// Container close: clear `ib` bit, `0` appended to `bp`.
    const CLOSE: Self = Self(0b001);
    /// Container open: set `ib` bit, `1` appended to `bp`.
    const OPEN: Self = Self(0b110);
    /// Start of a leaf: set `ib` bit, both `bp` flags set.
    const LEAF: Self = Self(0b111);

    /// Value to write to the `ib` vector for this byte.
    #[inline]
    fn ib(self) -> bool {
        (self.0 >> 2) & 1 == 1
    }

    /// Whether a `1` should be appended to the `bp` vector.
    #[inline]
    fn bp_open(self) -> bool {
        (self.0 >> 1) & 1 == 1
    }

    /// Whether a `0` should be appended to the `bp` vector.
    #[inline]
    fn bp_close(self) -> bool {
        self.0 & 1 == 1
    }
}
/// Advances the scanner by one byte.
///
/// Given the current `state` and input byte `c`, returns the next state and
/// the `Phi` flags describing what to emit for this byte.
#[inline]
fn state_machine(c: u8, state: State) -> (State, Phi) {
    match state {
        // Between tokens: structural bytes are classified, and a value
        // character or a quote starts a new leaf.
        State::InJson => match c {
            b if is_open(b) => (State::InJson, Phi::OPEN),
            b if is_close(b) => (State::InJson, Phi::CLOSE),
            b if is_delim(b) => (State::InJson, Phi::NONE),
            b if is_value_char(b) => (State::InValue, Phi::LEAF),
            DOUBLE_QUOTE => (State::InString, Phi::LEAF),
            _ => (State::InJson, Phi::NONE),
        },
        // Inside a string only the closing quote and the escape introducer
        // change state; nothing is emitted.
        State::InString => match c {
            DOUBLE_QUOTE => (State::InJson, Phi::NONE),
            BACKSLASH => (State::InEscape, Phi::NONE),
            _ => (State::InString, Phi::NONE),
        },
        // The escaped byte is swallowed whatever it is.
        State::InEscape => (State::InString, Phi::NONE),
        // Inside a bare value: further value characters extend it silently;
        // any other byte ends it and is classified like a structural byte,
        // except that a quote does not start a string from this state.
        State::InValue => match c {
            b if is_open(b) => (State::InJson, Phi::OPEN),
            b if is_close(b) => (State::InJson, Phi::CLOSE),
            b if is_delim(b) => (State::InJson, Phi::NONE),
            b if is_value_char(b) => (State::InValue, Phi::NONE),
            _ => (State::InJson, Phi::NONE),
        },
    }
}
/// Builds a [`SemiIndex`] for `json` by delegating the scan to
/// `pfsm_optimized::pfsm_process_chunk_optimized`.
///
/// The scan starts in the `InJson` state and covers the whole slice as a
/// single chunk. The returned `state` is the state reported by the scanner
/// after the final byte, translated from `PfsmState` to [`State`].
pub fn build_semi_index(json: &[u8]) -> SemiIndex {
    use crate::json::pfsm_optimized;
    use crate::json::pfsm_tables::PfsmState;

    // Pre-size the writers: one 64-bit word per 64 input bytes for `ib`,
    // and twice that for `bp`.
    let words = json.len().div_ceil(64);
    let mut ib_writer = BitWriter::with_capacity(words);
    let mut bp_writer = BitWriter::with_capacity(words * 2);

    let final_state = match pfsm_optimized::pfsm_process_chunk_optimized(
        json,
        PfsmState::InJson,
        &mut ib_writer,
        &mut bp_writer,
    ) {
        PfsmState::InJson => State::InJson,
        PfsmState::InString => State::InString,
        PfsmState::InEscape => State::InEscape,
        PfsmState::InValue => State::InValue,
    };

    SemiIndex {
        state: final_state,
        ib: ib_writer.finish(),
        bp: bp_writer.finish(),
    }
}
/// Builds a [`SemiIndex`] for `json` one byte at a time using
/// `state_machine`.
///
/// For every input byte exactly one bit is written to `ib`; `bp` receives a
/// `1` when the byte's flags request an open and a `0` when they request a
/// close (a leaf requests both, in that order). The returned `state` is the
/// scanner state after the final byte.
pub fn build_semi_index_scalar(json: &[u8]) -> SemiIndex {
    // A leaf writes two `bp` bits for one input byte, so `bp` is given
    // twice the capacity of `ib`.
    let words = json.len().div_ceil(64);
    let mut ib_writer = BitWriter::with_capacity(words);
    let mut bp_writer = BitWriter::with_capacity(words * 2);

    let state = json.iter().fold(State::InJson, |current, &byte| {
        let (next, phi) = state_machine(byte, current);
        ib_writer.write_bit(phi.ib());
        if phi.bp_open() {
            bp_writer.write_1();
        }
        if phi.bp_close() {
            bp_writer.write_0();
        }
        next
    });

    SemiIndex {
        state,
        ib: ib_writer.finish(),
        bp: bp_writer.finish(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads bit `i` from `words`, taking bit 0 as the least significant bit
    /// of the first word. Positions past the end read as `false`.
    fn get_bit(words: &[u64], i: usize) -> bool {
        words
            .get(i / 64)
            .is_some_and(|&word| (word >> (i % 64)) & 1 == 1)
    }

    /// Renders the first `n` bits of `words` as a string of `'0'`/`'1'`.
    fn bits_to_string(words: &[u64], n: usize) -> String {
        let mut rendered = String::with_capacity(n);
        for i in 0..n {
            rendered.push(if get_bit(words, i) { '1' } else { '0' });
        }
        rendered
    }

    #[test]
    fn test_empty_object() {
        let index = build_semi_index(b"{}");
        assert_eq!(bits_to_string(&index.ib, 2), "10");
        assert_eq!(bits_to_string(&index.bp, 2), "10");
        assert_eq!(index.state, State::InJson);
    }

    #[test]
    fn test_empty_array() {
        let index = build_semi_index(b"[]");
        assert_eq!(bits_to_string(&index.ib, 2), "10");
        assert_eq!(bits_to_string(&index.bp, 2), "10");
    }

    #[test]
    fn test_simple_object_with_string() {
        // Both the key and the value string count as leaves.
        let index = build_semi_index(br#"{"a":"b"}"#);
        assert_eq!(bits_to_string(&index.ib, 9), "110001000");
        assert_eq!(bits_to_string(&index.bp, 6), "110100");
    }

    #[test]
    fn test_array_with_numbers() {
        let index = build_semi_index(b"[1,2,3]");
        assert_eq!(bits_to_string(&index.ib, 7), "1101010");
        assert_eq!(bits_to_string(&index.bp, 8), "11010100");
    }

    #[test]
    fn test_boolean_true() {
        let index = build_semi_index(b"[true]");
        assert_eq!(bits_to_string(&index.ib, 6), "110000");
        assert_eq!(bits_to_string(&index.bp, 4), "1100");
    }

    #[test]
    fn test_boolean_false() {
        let index = build_semi_index(b"[false]");
        assert_eq!(bits_to_string(&index.ib, 7), "1100000");
    }

    #[test]
    fn test_null() {
        let index = build_semi_index(b"[null]");
        assert_eq!(bits_to_string(&index.ib, 6), "110000");
    }

    #[test]
    fn test_negative_number() {
        let index = build_semi_index(b"[-123]");
        assert_eq!(bits_to_string(&index.ib, 6), "110000");
    }

    #[test]
    fn test_decimal_number() {
        let index = build_semi_index(b"[3.14]");
        assert_eq!(bits_to_string(&index.ib, 6), "110000");
    }

    #[test]
    fn test_nested_object() {
        let index = build_semi_index(br#"{"a":{"b":1}}"#);
        assert_eq!(bits_to_string(&index.ib, 13), "1100011000100");
    }

    #[test]
    fn test_escaped_quote() {
        // The escaped quote must not terminate the string.
        let index = build_semi_index(br#""a\"b""#);
        assert_eq!(bits_to_string(&index.ib, 6), "100000");
        assert_eq!(bits_to_string(&index.bp, 2), "10");
    }

    #[test]
    fn test_whitespace() {
        let index = build_semi_index(b"{ \"a\" : 1 }");
        assert_eq!(bits_to_string(&index.ib, 11), "10100000100");
    }

    #[test]
    fn test_final_state_in_value() {
        // Input ends in the middle of a number.
        let index = build_semi_index(b"[123");
        assert_eq!(index.state, State::InValue);
    }

    #[test]
    fn test_final_state_in_string() {
        // Input ends before the closing quote.
        let index = build_semi_index(br#"["abc"#);
        assert_eq!(index.state, State::InString);
    }

    #[test]
    fn test_scientific_notation() {
        let index = build_semi_index(b"[1e+10]");
        assert_eq!(bits_to_string(&index.ib, 7), "1100000");
    }

    #[test]
    fn test_complex_json() {
        let document = br#"{"items":[1,2],"flag":true}"#;
        let index = build_semi_index(document);
        assert_eq!(index.state, State::InJson);
        assert!(!index.bp.is_empty());
    }
}