1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
//! Classification of bytes within JSON quote sequences.
//!
//! Provides the [`QuoteClassifiedBlock`] struct and [`QuoteClassifiedIterator`] trait
//! that allow effectively enriching JSON inputs with quote sequence information.
//!
//! The output of quote classification is an iterator of [`QuoteClassifiedBlock`]
//! which contain bitmasks whose lit bits signify characters that are within quotes
//! in the source document. These characters need to be ignored.
//!
//! Note that the actual quote characters are not guaranteed to be classified
//! as "within themselves" or otherwise. In particular the current implementation
//! marks _opening_ quotes with lit bits, but _closing_ quotes are always unmarked.
//! This behavior should not be presumed to be stable, though, and can change
//! without a major semver bump.
//!
//! # Examples
//! ```
//! use rsonpath::classification::quotes::{classify_quoted_sequences, QuoteClassifiedIterator};
//! use rsonpath::input::{Input, OwnedBytes};
//! use rsonpath::result::empty::EmptyRecorder;
//! use rsonpath::FallibleIterator;
//!
//! let json = r#"{"x": "string", "y": {"z": "\"escaped\""}}"#.to_owned();
//! // 011000111111100011000011000111111111111000
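//! // The mask below appears reversed because binary literals are written
//! // most-significant-bit first, while the first byte maps to the least significant bit.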
//! let expd = 0b000111111111111000110000110001111111000110;
//! let input = OwnedBytes::try_from(json).unwrap();
//! let iter = input.iter_blocks::<_, 64>(&EmptyRecorder);
//! let mut quote_classifier = classify_quoted_sequences(iter);
//!
//! let block = quote_classifier.next().unwrap().unwrap();
//! assert_eq!(expd, block.within_quotes_mask);
//! ```
use crate::input::error::InputError;
use crate::input::{InputBlock, InputBlockIterator};
use crate::{FallibleIterator, BLOCK_SIZE};
use cfg_if::cfg_if;
/// A block of input paired with a bitmask describing its quoted characters.
///
/// Every character of `block` that lies within a quoted sequence has its
/// corresponding bit lit in `within_quotes_mask`. As the example in the module
/// documentation demonstrates, bits are assigned starting from the least
/// significant one: bit $0$ describes the first character of `block`,
/// bit $1$ the second character, and so on.
///
/// No promise is made about the quote characters delimiting a sequence;
/// depending on the implementation their bits may be lit or unlit.
pub struct QuoteClassifiedBlock<B, const N: usize> {
    /// The input block this classification refers to.
    pub block: B,
    /// Bitmask with lit bits for the characters inside a quoted sequence.
    pub within_quotes_mask: u64,
}
/// Quote classifier iterators: fallible iterators that yield
/// [`QuoteClassifiedBlock`]s, i.e. blocks of input enriched with quote bitmasks.
///
/// `I` is the underlying block iterator and `N` the block size it is parameterized with.
/// The lifetime `'i` is the one the underlying [`InputBlockIterator`] is parameterized
/// with; the iterator is allowed to hold a reference to the JSON document valid for `'i`.
pub trait QuoteClassifiedIterator<'i, I, const N: usize>:
    FallibleIterator<Item = QuoteClassifiedBlock<I::Block, N>, Error = InputError>
where
    I: InputBlockIterator<'i, N>,
{
    /// Total offset, in bytes, from the beginning of the input.
    fn get_offset(&self) -> usize;

    /// Move the iterator forward by `count` blocks.
    ///
    /// NOTE(review): presumably this skips `count * N` bytes, `N` being the block
    /// size; confirm against the implementations.
    fn offset(&mut self, count: isize);

    /// Flip the bit that records whether the previous block ended with a nonescaped quote.
    ///
    /// Only meant for the very specific situation in which classification was stopped
    /// and resumed at a later point, which could have damaged the previous-block state.
    fn flip_quotes_bit(&mut self);
}
/// A higher-level classifier wrapping an
/// [`Input::BlockIterator`](crate::input::Input::BlockIterator)
/// that can be taken apart to get the wrapped iterator back.
pub trait InnerIter<I> {
    /// Consumes the classifier and hands back the wrapped
    /// [`Input::BlockIterator`](crate::input::Input::BlockIterator).
    fn into_inner(self) -> I;
}
impl<'i, B: InputBlock<'i, N>, const N: usize> QuoteClassifiedBlock<B, N> {
    /// Length of the underlying input block, as reported by the block itself.
    #[must_use]
    #[inline(always)]
    pub fn len(&self) -> usize {
        self.block.len()
    }

    /// Returns `true` when the underlying input block reports itself as empty.
    #[must_use]
    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.block.is_empty()
    }
}
// Select the quote classifier implementation at compile time.
// `cfg_if!` emits only the first branch whose condition holds, so the order below matters.
cfg_if! {
// Documentation builds and builds without the `simd` feature use
// `SequentialQuoteClassifier` from the `nosimd` module.
if #[cfg(any(doc, not(feature = "simd")))] {
mod nosimd;
type ClassifierImpl<'i, I, const N: usize> = nosimd::SequentialQuoteClassifier<'i, I, N>;
}
// With the `simd` feature enabled and the `simd = "avx2"` cfg set,
// use `Avx2QuoteClassifier` from the `avx2` module.
// NOTE(review): this alias takes no `const N` parameter, unlike the `nosimd` alias;
// confirm the difference in arity is intended.
else if #[cfg(simd = "avx2")] {
mod avx2;
type ClassifierImpl<'i, I> = avx2::Avx2QuoteClassifier<'i, I>;
}
// `simd` was requested but no supported SIMD cfg is set: fail the build with a clear message.
else {
compile_error!("Target architecture is not supported by SIMD features of this crate. Disable the default `simd` feature.");
}
}
/// Create a quote classifier over the input blocks produced by `iter`.
///
/// The classifier is built by the `ClassifierImpl` selected at compile time.
/// The returned value classifies quoted sequences block by block and, through
/// [`InnerIter`], can be consumed to get `iter` back.
#[must_use]
#[inline(always)]
pub fn classify_quoted_sequences<'i, I: InputBlockIterator<'i, BLOCK_SIZE>>(
    iter: I,
) -> impl QuoteClassifiedIterator<'i, I, BLOCK_SIZE> + InnerIter<I> {
    ClassifierImpl::new(iter)
}
/// Resume quote classification over `iter`, optionally starting from an
/// already-read `first_block`.
///
/// Delegates to `ClassifierImpl::resume` and returns its result unchanged:
/// the classifier, together with an optional [`QuoteClassifiedBlock`].
/// NOTE(review): presumably the returned block is the classified `first_block`;
/// confirm against the classifier implementations.
pub(crate) fn resume_quote_classification<'i, I: InputBlockIterator<'i, BLOCK_SIZE>>(
    iter: I,
    first_block: Option<I::Block>,
) -> (
    impl QuoteClassifiedIterator<'i, I, BLOCK_SIZE> + InnerIter<I>,
    Option<QuoteClassifiedBlock<I::Block, BLOCK_SIZE>>,
) {
    ClassifierImpl::resume(iter, first_block)
}