use super::avx;
use super::error::{Error, ErrorKind};
use super::index_builder;
use super::parser;
use super::query::Query;
use super::result::Result;
use super::utf8::{BACKSLASH, COLON, DOLLAR, DOT, LEFT_BRACE, QUOTE, RIGHT_BRACE};
use std::cmp;
use fnv::{FnvHashMap, FnvHashSet};
use x86intrin::m256i;
const ROOT_QUERY_STR_OFFSET: usize = 2;
pub struct Pikkr<'a> {
backslash: m256i,
quote: m256i,
colon: m256i,
left_brace: m256i,
right_brace: m256i,
query_strs_len: usize,
queries: FnvHashMap<&'a [u8], Query<'a>>,
queries_len: usize,
level: usize,
b_backslash: Vec<u64>,
b_quote: Vec<u64>,
b_colon: Vec<u64>,
b_left: Vec<u64>,
b_right: Vec<u64>,
b_string_mask: Vec<u64>,
index: Vec<Vec<u64>>,
train_num: usize,
trained_num: usize,
trained: bool,
stats: Vec<FnvHashSet<usize>>,
}
impl<'a> Pikkr<'a> {
#[inline]
pub fn new<S: ?Sized + AsRef<[u8]>>(query_strs: &[&'a S], train_num: usize) -> Result<Pikkr<'a>> {
if query_strs.iter().any(|s| !is_valid_query_str(s.as_ref())) {
return Err(Error::from(ErrorKind::InvalidQuery));
}
let mut p = Pikkr {
backslash: avx::mm256i(BACKSLASH as i8),
quote: avx::mm256i(QUOTE as i8),
colon: avx::mm256i(COLON as i8),
left_brace: avx::mm256i(LEFT_BRACE as i8),
right_brace: avx::mm256i(RIGHT_BRACE as i8),
query_strs_len: query_strs.len(),
queries: FnvHashMap::default(),
queries_len: 0,
level: 0,
b_backslash: Vec::new(),
b_quote: Vec::new(),
b_colon: Vec::new(),
b_left: Vec::new(),
b_right: Vec::new(),
b_string_mask: Vec::new(),
index: Vec::new(),
train_num,
trained_num: 0,
trained: false,
stats: Vec::new(),
};
let mut qi = 0;
for (ri, query_str) in query_strs.iter().enumerate() {
let (level, next_qi) = set_queries(
&mut p.queries,
(*query_str).as_ref(),
ROOT_QUERY_STR_OFFSET,
qi,
ri,
);
p.level = cmp::max(p.level, level);
qi = next_qi;
}
p.queries_len = p.queries.len();
p.index = vec![Vec::new(); p.level];
for _ in 0..qi {
p.stats.push(FnvHashSet::default());
}
Ok(p)
}
#[inline(always)]
fn build_structural_indices(&mut self, rec: &[u8]) -> Result<()> {
let b_len = (rec.len() + 63) / 64;
self.b_backslash.clear();
self.b_quote.clear();
self.b_colon.clear();
self.b_left.clear();
self.b_right.clear();
self.b_string_mask.clear();
for b in self.index.iter_mut() {
b.clear();
}
if b_len > self.b_backslash.capacity() {
self.b_backslash.reserve_exact(b_len);
self.b_quote.reserve_exact(b_len);
self.b_colon.reserve_exact(b_len);
self.b_left.reserve_exact(b_len);
self.b_right.reserve_exact(b_len);
self.b_string_mask.reserve_exact(b_len);
for b in self.index.iter_mut() {
b.reserve_exact(b_len);
}
}
index_builder::build_structural_character_bitmap(
rec,
&mut self.b_backslash,
&mut self.b_quote,
&mut self.b_colon,
&mut self.b_left,
&mut self.b_right,
&self.backslash,
&self.quote,
&self.colon,
&self.left_brace,
&self.right_brace,
);
index_builder::build_structural_quote_bitmap(&self.b_backslash, &mut self.b_quote);
index_builder::build_string_mask_bitmap(&self.b_quote, &mut self.b_string_mask);
for (i, b) in self.b_string_mask.iter().enumerate() {
self.b_colon[i] &= *b;
self.b_left[i] &= *b;
self.b_right[i] &= *b;
}
index_builder::build_leveled_colon_bitmap(
&self.b_colon,
&self.b_left,
&self.b_right,
self.level,
&mut self.index,
)
}
#[inline]
pub fn parse<'b, S: ?Sized + AsRef<[u8]>>(&mut self, rec: &'b S) -> Result<Vec<Option<&'b [u8]>>> {
let rec = rec.as_ref();
if rec.len() == 0 {
return Err(Error::from(ErrorKind::InvalidRecord));
}
self.build_structural_indices(rec)?;
let mut results = vec![None; self.query_strs_len];
if self.trained {
let found = parser::speculative_parse(
rec,
&self.index,
&self.queries,
0,
rec.len() - 1,
0,
&self.stats,
&mut results,
&self.b_quote,
)?;
if !found {
parser::basic_parse(
rec,
&self.index,
&mut self.queries,
0,
rec.len() - 1,
0,
self.queries_len,
&mut self.stats,
false,
&mut results,
&self.b_quote,
)?;
}
} else {
parser::basic_parse(
rec,
&self.index,
&mut self.queries,
0,
rec.len() - 1,
0,
self.queries_len,
&mut self.stats,
true,
&mut results,
&self.b_quote,
)?;
self.trained_num += 1;
if self.trained_num >= self.train_num {
self.trained = true;
}
}
Ok(results)
}
}
#[inline]
fn is_valid_query_str(query_str: &[u8]) -> bool {
if query_str.len() < ROOT_QUERY_STR_OFFSET + 1 || query_str[0] != DOLLAR || query_str[1] != DOT {
return false;
}
let mut s = ROOT_QUERY_STR_OFFSET - 1;
for i in s + 1..query_str.len() {
if query_str[i] != DOT {
continue;
}
if i == s + 1 || i == query_str.len() - 1 {
return false;
}
s = i;
}
true
}
#[inline]
fn set_queries<'a>(queries: &mut FnvHashMap<&'a [u8], Query<'a>>, s: &'a [u8], i: usize, qi: usize, ri: usize) -> (usize, usize) {
for j in i..s.len() {
if s[j] == DOT {
let t = &s[i..j];
let query = queries.entry(t).or_insert(Query {
i: qi,
ri: ri,
target: false,
children: None,
children_len: 0,
});
let mut children = query.children.get_or_insert(FnvHashMap::default());
let (child_level, next_qi) = set_queries(
&mut children,
s,
j + 1,
if qi == query.i { qi + 1 } else { qi },
ri,
);
query.children_len = children.len();
return (child_level + 1, next_qi);
}
}
let t = &s[i..];
if !queries.contains_key(t) {
queries.insert(
t,
Query {
i: qi,
ri: ri,
target: true,
children: None,
children_len: 0,
},
);
return (1, qi + 1);
} else {
queries.get_mut(t).unwrap().target = true;
}
(1, qi)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pikkr_new() {
struct TestCase<'a> {
query_strs: Vec<&'a [u8]>,
err: bool,
}
let test_cases = vec![
TestCase {
query_strs: vec![],
err: false,
},
TestCase {
query_strs: vec!["".as_bytes()],
err: true,
},
TestCase {
query_strs: vec!["$".as_bytes()],
err: true,
},
TestCase {
query_strs: vec!["$.".as_bytes()],
err: true,
},
TestCase {
query_strs: vec!["$.aaaa".as_bytes()],
err: false,
},
TestCase {
query_strs: vec!["$.aaaa".as_bytes(), "".as_bytes()],
err: true,
},
TestCase {
query_strs: vec!["$.aaaa".as_bytes(), "$".as_bytes()],
err: true,
},
TestCase {
query_strs: vec!["$.aaaa".as_bytes(), "$.".as_bytes()],
err: true,
},
TestCase {
query_strs: vec!["$.aaaa".as_bytes(), "$.bbbb".as_bytes()],
err: false,
},
];
for t in test_cases {
let err = Pikkr::new(&t.query_strs, 1).is_err();
assert_eq!(t.err, err);
}
}
#[test]
fn test_pikkr_basic_parse() {
let queries = vec![
"$.f1".as_bytes(),
"$.f2".as_bytes(),
"$.f2.f1".as_bytes(),
"$.f2.f2.f1".as_bytes(),
"$.f2.f3".as_bytes(),
"$.f3".as_bytes(),
"$.f4".as_bytes(),
];
let mut p = Pikkr::new(&queries, 1000000000).unwrap();
struct TestCase<'a> {
rec: &'a str,
want: Result<Vec<Option<&'a [u8]>>>,
}
let test_cases = vec![
TestCase {
rec: r#"{}"#,
want: Ok(vec![None, None, None, None, None, None, None]),
},
TestCase {
rec: r#"{"f0": "a"}"#,
want: Ok(vec![None, None, None, None, None, None, None]),
},
TestCase {
rec: r#"{"f0": "a", "f1": "b"}"#,
want: Ok(vec![
Some(r#""b""#.as_bytes()),
None,
None,
None,
None,
None,
None,
]),
},
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}, "f3": [1, 2, 3]}"#,
want: Ok(vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
None,
Some(r#"[1, 2, 3]"#.as_bytes()),
None,
]),
},
TestCase {
rec: r#"{"f1": "Português do Brasil,Català,Deutsch,Español,Français,Bahasa,Italiano,עִבְרִית,日本語,한국어,Română,中文(简体),中文(繁體),Українська,Ўзбекча,Türkçe"}"#,
want: Ok(vec![
Some(
r#""Português do Brasil,Català,Deutsch,Español,Français,Bahasa,Italiano,עִבְרִית,日本語,한국어,Română,中文(简体),中文(繁體),Українська,Ўзбекча,Türkçe""#.as_bytes(),
),
None,
None,
None,
None,
None,
None,
]),
},
TestCase {
rec: r#"{"f1": "\"f1\": \\"}"#,
want: Ok(vec![
Some(r#""\"f1\": \\""#.as_bytes()),
None,
None,
None,
None,
None,
None,
]),
},
TestCase {
rec: r#"
{
"f1" : "b"
}
"#,
want: Ok(vec![
Some(r#""b""#.as_bytes()),
None,
None,
None,
None,
None,
None,
]),
},
];
for t in test_cases {
let got = p.parse(t.rec.as_bytes());
assert_eq!(t.want, got);
}
}
#[test]
fn test_pikkr_speculative_parse() {
let queries = vec![
"$.f1".as_bytes(),
"$.f2".as_bytes(),
"$.f2.f1".as_bytes(),
"$.f2.f2.f1".as_bytes(),
"$.f3".as_bytes(),
];
let mut p = Pikkr::new(&queries, 1).unwrap();
struct TestCase<'a> {
rec: &'a str,
want: Result<Vec<Option<&'a [u8]>>>,
}
let test_cases = vec![
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}, "f3": [1, 2, 3]}"#,
want: Ok(vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
Some(r#"[1, 2, 3]"#.as_bytes()),
]),
},
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}, "f3": [1, 2, 3]}"#,
want: Ok(vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
Some(r#"[1, 2, 3]"#.as_bytes()),
]),
},
TestCase {
rec: r#"{"f1": "b", "f0": "a", "f3": [1, 2, 3], "f2": {"f2": {"f2": "d", "f1": "c"}, "f1": 1}}"#,
want: Ok(vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f2": {"f2": "d", "f1": "c"}, "f1": 1}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
Some(r#"[1, 2, 3]"#.as_bytes()),
]),
},
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}}"#,
want: Ok(vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
None,
]),
},
TestCase {
rec: r#"{}"#,
want: Ok(vec![None, None, None, None, None]),
},
];
for t in test_cases {
let got = p.parse(t.rec.as_bytes());
assert_eq!(t.want, got);
}
}
#[test]
fn test_is_valid_query_str() {
struct TestCase<'a> {
query_str: &'a str,
want: bool,
}
let test_cases = vec![
TestCase {
query_str: "",
want: false,
},
TestCase {
query_str: "$",
want: false,
},
TestCase {
query_str: "$.",
want: false,
},
TestCase {
query_str: "$..",
want: false,
},
TestCase {
query_str: "a.a",
want: false,
},
TestCase {
query_str: "$aa",
want: false,
},
TestCase {
query_str: "$.a",
want: true,
},
TestCase {
query_str: "$.aaaa",
want: true,
},
TestCase {
query_str: "$.aaaa.",
want: false,
},
TestCase {
query_str: "$.aaaa.b",
want: true,
},
TestCase {
query_str: "$.aaaa.bbbb",
want: true,
},
TestCase {
query_str: "$.aaaa.bbbb.",
want: false,
},
];
for t in test_cases {
let got = is_valid_query_str(t.query_str.as_bytes());
assert_eq!(t.want, got);
}
}
}