//! pikkr 0.4.1
//!
//! JSON parser which picks up values directly without performing tokenization.
use super::avx;
use super::bit;
use super::index_builder;
use super::parser;
use super::query::Query;
use super::stat::Stat;
use super::utf8::{BACKSLASH, COLON, DOT, LEFT_BRACE, QUOTE, RIGHT_BRACE};
use std::cmp;
use fnv::{FnvHashMap, FnvHashSet};
use x86intrin::m256i;

/// Byte index at which the first path component starts inside a query string.
///
/// `Pikkr::new` and `Pikkr::parse` pass this as the start index when walking each
/// query, so the first two bytes of every query are skipped (the queries used by
/// this file's test all begin with `$.`).
const ROOT_QUERY_STR_OFFSET: usize = 2;

/// Parser state: the prepared character vectors, the query tree built from the
/// query strings, and the training bookkeeping used by `parse`.
pub struct Pikkr<'a> {
    // Vectors built by `avx::mm256i` from the corresponding byte constant; each is
    // handed to `index_builder::build_structural_character_bitmap` as the character
    // to look for in a record.
    backslash: m256i,
    quote: m256i,
    colon: m256i,
    left_brace: m256i,
    right_brace: m256i,

    /// Query strings as given to `new`; `parse` walks them in this order to emit results.
    query_strs: &'a Vec<&'a[u8]>,
    /// Cached `query_strs.len()`, used to size the result vector.
    query_strs_len: usize,
    /// Query tree keyed by path component, filled by `set_queries` in `new`.
    queries: FnvHashMap<&'a[u8], Query<'a>>,
    /// Sum of the values `set_queries` returned for each query string; passed to
    /// `parser::basic_parse`.
    query_num: usize,
    /// Largest value `set_queries` returned for any single query string; passed to
    /// `index_builder::build_leveled_colon_bitmap` as the number of levels.
    level: usize,

    /// Number of `parse` calls after which `trained` is switched on.
    train_num: usize,
    /// Number of `parse` calls made so far while `trained` was still false.
    trained_num: usize,
    /// When true, `parse` tries `parser::speculative_parse` first and falls back to
    /// `parser::basic_parse` only if that returns false.
    trained: bool,

    /// Statistics gathered by `set_stats` during the training calls; passed to
    /// `parser::speculative_parse`.
    stats: FnvHashMap<&'a[u8], Stat<'a>>,
}

impl<'a> Pikkr<'a> {
    #[inline]
    pub fn new(query_strs: &'a Vec<&'a[u8]>, train_num: usize) -> Pikkr<'a> {
        let mut p = Pikkr {
            backslash: avx::mm256i(BACKSLASH as i8),
            quote: avx::mm256i(QUOTE as i8),
            colon: avx::mm256i(COLON as i8),
            left_brace: avx::mm256i(LEFT_BRACE as i8),
            right_brace: avx::mm256i(RIGHT_BRACE as i8),

            query_strs: query_strs,
            query_strs_len: query_strs.len(),
            queries: FnvHashMap::default(),
            query_num: 0,
            level: 0,

            train_num: train_num,
            trained_num: 0,
            trained: false,

            stats: FnvHashMap::default(),
        };

        let mut level = 0;
        for query_str in query_strs {
            let query_num = set_queries(&mut p.queries, query_str, ROOT_QUERY_STR_OFFSET);
            p.query_num += query_num;
            level = cmp::max(level, query_num);
        }
        p.level = level;

        p
    }

    #[inline]
    pub fn parse<'b>(&mut self, rec: &'b[u8]) -> Vec<Option<&'b[u8]>> {
        let rec_len = rec.len();

        let rec_m256i_len = (rec_len + 31) / 32;
        let mut rec_m256i = Vec::with_capacity(rec_m256i_len);
        avx::u8_to_m256i(rec, &mut rec_m256i);

        let b_len = (rec_m256i_len + 1) / 2;
        let mut b_backslash = Vec::with_capacity(b_len);
        index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_backslash, self.backslash);
        let mut b_quote = Vec::with_capacity(b_len);
        index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_quote, self.quote);
        let mut b_colon = Vec::with_capacity(b_len);
        index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_colon, self.colon);
        let mut b_left = Vec::with_capacity(b_len);
        index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_left, self.left_brace);
        let mut b_right = Vec::with_capacity(b_len);
        index_builder::build_structural_character_bitmap(&rec_m256i, &mut b_right, self.right_brace);

        index_builder::build_structural_quote_bitmap(&b_backslash, &mut b_quote);

        index_builder::build_string_mask_bitmap(&mut b_quote);
        let b_string_mask = b_quote;

        bit::and(&b_string_mask, &mut b_colon);
        bit::and(&b_string_mask, &mut b_left);
        bit::and(&b_string_mask, &mut b_right);

        let mut index = Vec::with_capacity(self.level);
        index_builder::build_leveled_colon_bitmap(&b_colon, &b_left, &b_right, self.level, &mut index);

        clear_query_results(&mut self.queries);

        if self.trained {
            if !parser::speculative_parse(rec, &index, &mut self.queries, 0, rec_len-1, 0, &self.stats) {
                parser::basic_parse(rec, &index, &mut self.queries, 0, rec_len-1, 0, self.query_num, 0);
            }
        } else {
            parser::basic_parse(rec, &index, &mut self.queries, 0, rec_len-1, 0, self.query_num, 0);
            set_stats(&self.queries, &mut self.stats);
            self.trained_num += 1;
            if self.trained_num >= self.train_num {
                self.trained = true;
            }
        }

        let mut results = Vec::with_capacity(self.query_strs_len);
        for query_str in self.query_strs {
            set_result(rec, &self.queries, query_str, &mut results, ROOT_QUERY_STR_OFFSET);
        }

        results
    }
}

/// Registers the path `s[i..]` in the query tree rooted at `queries`.
///
/// The path is split at the first `DOT` found at or after index `i`. The component
/// before the dot becomes a node (or reuses the existing one), and the rest of the
/// path is registered recursively in that node's children. A path without a dot is a
/// leaf and is inserted only if it is not already present.
///
/// Returns 1 for every intermediate component visited, whether or not it already
/// existed, plus 1 if the final component was newly inserted.
///
/// Panics if `i` is greater than `s.len()`.
#[inline]
fn set_queries<'a>(queries: &mut FnvHashMap<&'a[u8], Query<'a>>, s: &'a[u8], i: usize) -> usize {
    let dot = s.iter().skip(i).position(|&b| b == DOT).map(|off| i + off);

    match dot {
        Some(j) => {
            let name = s.get(i..j).unwrap();
            let node = queries.entry(name).or_insert(Query {
                result: None,
                children: None,
            });
            let children = node.children.get_or_insert(FnvHashMap::default());
            set_queries(children, s, j + 1) + 1
        }
        None => {
            let name = s.get(i..s.len()).unwrap();
            match queries.entry(name) {
                ::std::collections::hash_map::Entry::Vacant(slot) => {
                    slot.insert(Query {
                        result: None,
                        children: None,
                    });
                    1
                }
                ::std::collections::hash_map::Entry::Occupied(_) => 0,
            }
        }
    }
}

/// Resets `result` to `None` on every node of the query tree, recursing into each
/// node's children when it has any.
#[inline]
fn clear_query_results(queries: &mut FnvHashMap<&[u8], Query>) {
    for query in queries.values_mut() {
        query.result = None;
        if let Some(children) = query.children.as_mut() {
            clear_query_results(children);
        }
    }
}

/// Records statistics for every query node that currently holds a result.
///
/// For each such node, the third component of its result tuple is added to the
/// `locs` set of the `Stat` stored under the same key (created on first use). If the
/// node has children, they are recorded recursively into the `Stat`'s own children
/// map. A node without a result is skipped together with its whole subtree.
#[inline]
fn set_stats<'a>(queries: &FnvHashMap<&'a[u8], Query<'a>>, stats: &mut FnvHashMap<&'a[u8], Stat<'a>>) {
    for (&name, query) in queries.iter() {
        let result = match query.result {
            Some(result) => result,
            None => continue,
        };

        let stat = stats.entry(name).or_insert(Stat {
            locs: FnvHashSet::default(),
            children: None,
        });
        stat.locs.insert(result.2);

        if let Some(children) = query.children.as_ref() {
            let child_stats = stat.children.get_or_insert(FnvHashMap::default());
            set_stats(children, child_stats);
        }
    }
}

/// Looks up the path `s[i..]` in the query tree and pushes exactly one entry onto `d`.
///
/// The path is followed component by component, splitting on `DOT`. If an
/// intermediate component is missing from the tree or has no children, `None` is
/// pushed. At the final component, the pushed value is `rec[result.0..result.1]`
/// when that node holds a result and `None` otherwise.
///
/// Panics if `i` is greater than `s.len()` or if a stored result range does not lie
/// within `rec`.
#[inline]
fn set_result<'a>(rec: &'a[u8], queries: &FnvHashMap<&[u8], Query>, s: &[u8], d: &mut Vec<Option<&'a[u8]>>, i: usize) {
    if let Some(j) = (i..s.len()).find(|&k| s[k] == DOT) {
        // Intermediate component: descend into its children, if there are any.
        let name = s.get(i..j).unwrap();
        match queries.get(name).and_then(|query| query.children.as_ref()) {
            Some(children) => set_result(rec, children, s, d, j + 1),
            None => d.push(None),
        }
        return;
    }

    // Final component: translate the stored range into a slice of the record.
    let name = s.get(i..s.len()).unwrap();
    let value = queries
        .get(name)
        .and_then(|query| query.result)
        .map(|result| rec.get(result.0..result.1).unwrap());
    d.push(value);
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: runs the same parser over several records with differing field
    /// order and a missing field, printing what each query yields.
    ///
    /// With `train_num` set to 2, the first two records take the training path in
    /// `parse` and the remaining ones take the speculative path.
    #[test]
    fn test_pikkr_parse() {
        let queries = vec![
            "$.aaa".as_bytes(),
            "$.bbb.ddd".as_bytes(),
            "$.fff".as_bytes(),
        ];

        let mut p = Pikkr::new(&queries, 2);

        let recs = [
            r#" {"aaa": "AAA", "bbb": {"ccc": "CCC", "ddd":"DDD", "eee": "EEE"}  , "fff":111  } "#,
            r#" {"fff": 222, "bbb": {"ccc": "CCC", "ddd":"DDD", "eee": "EEE"}  , "aaa":"AAAA"  } "#,
            r#" {"aaa": "AAA", "bbb": {"ccc": "CCC", "ddd":"DDD", "eee": "EEE"}  , "fff":111  } "#,
            r#" {"fff": 222, "bbb": {"ccc": "CCC", "ddd":"DDD", "eee": "EEE"}  , "aaa":"AAAA"  } "#,
            r#" {"fff": 222, "bbb": {"ccc": "CCC", "eee":"EEEE", "ddd": "DDDD"}  , "aaa":"AAAA"  } "#,
            r#" {"fff": 222, "bbb": {"ccc": "CCC", "eee":"EEEE"}  , "aaa":"AAAA"  } "#,
        ];

        for (n, rec) in recs.iter().enumerate() {
            // Separator between records, not after the last one.
            if n > 0 {
                println!("====");
            }

            let rec = rec.as_bytes();
            let v = p.parse(rec);

            // `parse` pushes exactly one entry per query string.
            assert_eq!(v.len(), queries.len());

            println!("rec: {}", String::from_utf8_lossy(rec));
            for (query, x) in queries.iter().zip(v.iter()) {
                let shown = match *x {
                    Some(x) => String::from_utf8_lossy(x).into_owned(),
                    None => String::from("None"),
                };
                println!("{}: {}", String::from_utf8_lossy(query), shown);
            }
        }
    }
}