use super::avx;
use super::bit;
use super::index_builder;
use super::parser;
use super::query::Query;
use super::stat::Stat;
use super::utf8::{BACKSLASH, COLON, DOT, LEFT_BRACE, QUOTE, RIGHT_BRACE};
use std::cmp;
use fnv::{FnvHashMap, FnvHashSet};
use x86intrin::m256i;
const ROOT_QUERY_STR_OFFSET: usize = 2;
pub struct Pikkr<'a> {
backslash: m256i,
quote: m256i,
colon: m256i,
left_brace: m256i,
right_brace: m256i,
query_strs: &'a Vec<&'a[u8]>,
query_strs_len: usize,
queries: FnvHashMap<&'a[u8], Query<'a>>,
query_num: usize,
level: usize,
train_num: usize,
trained_num: usize,
trained: bool,
stats: FnvHashMap<&'a[u8], Stat<'a>>,
}
impl<'a> Pikkr<'a> {
#[inline]
pub fn new(query_strs: &'a Vec<&'a[u8]>, train_num: usize) -> Pikkr<'a> {
let mut p = Pikkr {
backslash: avx::mm256i(BACKSLASH as i8),
quote: avx::mm256i(QUOTE as i8),
colon: avx::mm256i(COLON as i8),
left_brace: avx::mm256i(LEFT_BRACE as i8),
right_brace: avx::mm256i(RIGHT_BRACE as i8),
query_strs: query_strs,
query_strs_len: query_strs.len(),
queries: FnvHashMap::default(),
query_num: 0,
level: 0,
train_num: train_num,
trained_num: 0,
trained: false,
stats: FnvHashMap::default(),
};
let mut level = 0;
for query_str in query_strs {
let query_num = set_queries(&mut p.queries, query_str, ROOT_QUERY_STR_OFFSET);
p.query_num += query_num;
level = cmp::max(level, query_num);
}
p.level = level;
p
}
#[inline]
pub fn parse<'b>(&mut self, rec: &'b[u8]) -> Vec<Option<&'b[u8]>> {
let rec_len = rec.len();
let rec_m256i_len = (rec_len + 31) / 32;
let mut rec_m256i = Vec::with_capacity(rec_m256i_len);
avx::u8_to_m256i(rec, &mut rec_m256i);
let b_len = (rec_m256i_len + 1) / 2;
let mut b_backslash = Vec::with_capacity(b_len);
index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_backslash, self.backslash);
let mut b_quote = Vec::with_capacity(b_len);
index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_quote, self.quote);
let mut b_colon = Vec::with_capacity(b_len);
index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_colon, self.colon);
let mut b_left = Vec::with_capacity(b_len);
index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_left, self.left_brace);
let mut b_right = Vec::with_capacity(b_len);
index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_right, self.right_brace);
index_builder::build_structural_quote_bitmap(&b_backslash, &mut b_quote);
index_builder::build_string_mask_bitmap(&mut b_quote);
let b_string_mask = b_quote;
bit::and(&b_string_mask, &mut b_colon);
bit::and(&b_string_mask, &mut b_left);
bit::and(&b_string_mask, &mut b_right);
let mut index = Vec::with_capacity(self.level);
index_builder::build_leveled_colon_bitmap(&b_colon, &b_left, &b_right, self.level, &mut index);
clear_query_results(&mut self.queries);
if self.trained {
if !parser::speculative_parse(rec, &index, &mut self.queries, 0, rec_len-1, 0, &self.stats) {
parser::basic_parse(rec, &index, &mut self.queries, 0, rec_len-1, 0, self.query_num, 0);
}
} else {
parser::basic_parse(rec, &index, &mut self.queries, 0, rec_len-1, 0, self.query_num, 0);
set_stats(&self.queries, &mut self.stats);
self.trained_num += 1;
if self.trained_num >= self.train_num {
self.trained = true;
}
}
let mut results = Vec::with_capacity(self.query_strs_len);
for query_str in self.query_strs {
set_result(rec, &self.queries, query_str, &mut results, ROOT_QUERY_STR_OFFSET);
}
results
}
}
#[inline]
fn set_queries<'a>(queries: &mut FnvHashMap<&'a[u8], Query<'a>>, s: &'a[u8], i: usize) -> usize {
for j in i..s.len() {
if s[j] == DOT {
let t = s.get(i..j).unwrap();
let query = queries.entry(t).or_insert(Query {
result: None,
children: None,
});
let mut children = query.children.get_or_insert(FnvHashMap::default());
return set_queries(&mut children, s, j+1) + 1;
}
}
let t = s.get(i..s.len()).unwrap();
if !queries.contains_key(t) {
queries.insert(t, Query {
result: None,
children: None,
});
return 1;
}
0
}
#[inline]
fn clear_query_results(queries: &mut FnvHashMap<&[u8], Query>) {
for (_, q) in queries.iter_mut() {
q.result = None;
if let Some(ref mut children) = q.children {
clear_query_results(children);
}
}
}
#[inline]
fn set_stats<'a>(queries: &FnvHashMap<&'a[u8], Query<'a>>, stats: &mut FnvHashMap<&'a[u8], Stat<'a>>) {
for (s, q) in queries.iter() {
if let Some(result) = q.result {
let st = stats.entry(s).or_insert(Stat {
locs: FnvHashSet::default(),
children: None,
});
st.locs.insert(result.2);
if let Some(ref children) = q.children {
let st_children = st.children.get_or_insert(FnvHashMap::default());
set_stats(&children, st_children);
}
}
}
}
#[inline]
fn set_result<'a>(rec: &'a[u8], queries: &FnvHashMap<&[u8], Query>, s: &[u8], d: &mut Vec<Option<&'a[u8]>>, i: usize) {
for j in i..s.len() {
if s[j] == DOT {
let t = s.get(i..j).unwrap();
match queries.get(t) {
Some(query) => {
match query.children {
Some(ref children) => set_result(rec, children, s, d, j+1),
_ => d.push(None)
}
},
_ => d.push(None)
}
return;
}
}
let t = s.get(i..s.len()).unwrap();
d.push(match queries.get(t) {
Some(query) => {
match query.result {
Some(result) => {
Some(rec.get(result.0..result.1).unwrap())
},
_ => None,
}
},
_ => None,
});
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pikkr_basic_parse() {
let queries = vec![
"$.f1".as_bytes(),
"$.f2".as_bytes(),
"$.f2.f1".as_bytes(),
"$.f2.f2.f1".as_bytes(),
"$.f2.f3".as_bytes(),
"$.f3".as_bytes(),
"$.f4".as_bytes(),
];
let mut p = Pikkr::new(&queries, 1000000000);
struct TestCase<'a> {
rec: &'a str,
want: Vec<Option<&'a[u8]>>,
}
let test_cases = vec![
TestCase {
rec: r#"{}"#,
want: vec![None, None, None, None, None, None, None],
},
TestCase {
rec: r#"{"f0": "a"}"#,
want: vec![None, None, None, None, None, None, None],
},
TestCase {
rec: r#"{"f0": "a", "f1": "b"}"#,
want: vec![Some(r#""b""#.as_bytes()), None, None, None, None, None, None],
},
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}, "f3": [1, 2, 3]}"#,
want: vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
None,
Some(r#"[1, 2, 3]"#.as_bytes()),
None,
]
},
TestCase {
rec: r#"{"f1": "Português do Brasil,Català,Deutsch,Español,Français,Bahasa,Italiano,עִבְרִית,日本語,한국어,Română,中文(简体),中文(繁體),Українська,Ўзбекча,Türkçe"}"#,
want: vec![Some(r#""Português do Brasil,Català,Deutsch,Español,Français,Bahasa,Italiano,עִבְרִית,日本語,한국어,Română,中文(简体),中文(繁體),Українська,Ўзбекча,Türkçe""#.as_bytes()), None, None, None, None, None, None],
},
TestCase {
rec: r#"{"f1": "\"f1\": \\"}"#,
want: vec![Some(r#""\"f1\": \\""#.as_bytes()), None, None, None, None, None, None],
},
TestCase {
rec: r#"
{
"f1" : "b"
}
"#,
want: vec![Some(r#""b""#.as_bytes()), None, None, None, None, None, None],
}
];
for t in test_cases {
let got = p.parse(t.rec.as_bytes());
assert_eq!(t.want, got);
}
}
#[test]
fn test_pikkr_speculative_parse() {
let queries = vec![
"$.f1".as_bytes(),
"$.f2".as_bytes(),
"$.f2.f1".as_bytes(),
"$.f2.f2.f1".as_bytes(),
"$.f3".as_bytes(),
];
let mut p = Pikkr::new(&queries, 1);
struct TestCase<'a> {
rec: &'a str,
want: Vec<Option<&'a[u8]>>,
}
let test_cases = vec![
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}, "f3": [1, 2, 3]}"#,
want: vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
Some(r#"[1, 2, 3]"#.as_bytes()),
]
},
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}, "f3": [1, 2, 3]}"#,
want: vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
Some(r#"[1, 2, 3]"#.as_bytes()),
]
},
TestCase {
rec: r#"{"f1": "b", "f0": "a", "f3": [1, 2, 3], "f2": {"f2": {"f2": "d", "f1": "c"}, "f1": 1}}"#,
want: vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f2": {"f2": "d", "f1": "c"}, "f1": 1}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
Some(r#"[1, 2, 3]"#.as_bytes()),
]
},
TestCase {
rec: r#"{"f0": "a", "f1": "b", "f2": {"f1": 1, "f2": {"f1": "c", "f2": "d"}}}"#,
want: vec![
Some(r#""b""#.as_bytes()),
Some(r#"{"f1": 1, "f2": {"f1": "c", "f2": "d"}}"#.as_bytes()),
Some(r#"1"#.as_bytes()),
Some(r#""c""#.as_bytes()),
None,
]
},
TestCase {
rec: r#"{}"#,
want: vec![None, None, None, None, None],
}
];
for t in test_cases {
let got = p.parse(t.rec.as_bytes());
assert_eq!(t.want, got);
}
}
}