use crate::idb::trie::{PrefixIter, Trie};
use crate::ig::docs::{Doc, Element, Query};
use crate::ig::symbols::Symbol;
use std::sync::atomic::{AtomicUsize, Ordering};
/// A single entry in the index trie, pointing at one row of the table.
pub struct IndexEntry {
// Symbol of the field in which the indexed value was found.
field: Symbol,
// Row index into the root list of the underlying doc.
row: usize,
// True if the entry represents a complete value; false for partial
// fulltext tokens derived from it (see Table::tokenize).
exact: bool,
}
// Fulltext tokens shorter than this are not indexed.
const MIN_TOKEN_LENGTH: usize = 3;
/// Describes an index to create over a field of a [`Table`].
pub enum IndexType {
    /// Exact-match index over the raw field value.
    LookupIndex(String),
    /// Exact-match index plus lowercased fulltext tokens of the field value.
    FulltextIndex(String),
}

impl IndexType {
    /// Creates an exact-match (lookup) index for the given field path.
    pub fn lookup(path: &str) -> Self {
        Self::LookupIndex(path.to_owned())
    }

    /// Creates a tokenizing fulltext index for the given field path.
    pub fn fulltext(path: &str) -> Self {
        Self::FulltextIndex(path.to_owned())
    }

    /// Returns the field path this index is defined on.
    pub fn field_name(&self) -> &str {
        match self {
            Self::LookupIndex(name) | Self::FulltextIndex(name) => name.as_str(),
        }
    }
}
impl PartialEq for IndexEntry {
    /// Two entries are equal when they reference the same field of the same
    /// row. Note that the `exact` flag does not participate in equality.
    fn eq(&self, other: &Self) -> bool {
        self.row == other.row && self.field == other.field
    }
}
/// A lookup table over a doc, with trie-based indices and usage statistics.
pub struct Table {
// The underlying document; its root list forms the rows of the table.
doc: Doc,
// Trie mapping indexed values/tokens to the rows containing them.
index: Trie<IndexEntry>,
// Pre-compiled query for the default language (DEFAULT_LANG).
default_lang_query: Query,
// Symbols of all fields for which an index exists.
known_indices: fnv::FnvHashSet<Symbol>,
// Statistics: total number of queries issued against this table.
queries: AtomicUsize,
// Statistics: number of queries answered by scanning.
scan_queries: AtomicUsize,
// Statistics: number of full table scans.
scans: AtomicUsize,
}
// Internal representation of which fields a query should search.
enum SearchFields {
// Search across all indexed fields.
All,
// Search only the given indexed fields (resolved to symbols).
IndexLookup(fnv::FnvHashSet<Symbol>),
// No usable index: fall back to scanning with these compiled queries.
Scan(Vec<Query>),
}
// Language code used to compile the default-language query.
const DEFAULT_LANG: &str = "xx";
impl Table {
pub fn new(mut doc: Doc, indices: Vec<IndexType>) -> anyhow::Result<Self> {
let mut trie = Trie::new();
let known_indices = Table::build_indices(&mut doc, indices, &mut trie)?;
let default_lang_query = doc.compile(DEFAULT_LANG);
Ok(Table {
doc,
index: trie,
default_lang_query,
known_indices,
queries: AtomicUsize::new(0),
scan_queries: AtomicUsize::new(0),
scans: AtomicUsize::new(0),
})
}
// Builds the requested indices over all rows of `doc`, filling `trie` and
// returning the set of field symbols for which an index now exists.
fn build_indices(
doc: &mut Doc,
indices: Vec<IndexType>,
trie: &mut Trie<IndexEntry>,
) -> anyhow::Result<fnv::FnvHashSet<Symbol>> {
let mut known_indices = fnv::FnvHashSet::default();
for index_type in indices {
// Query selecting the indexed field; the symbol identifies the field
// in the index entries created below.
let query = doc.compile(index_type.field_name());
let field_name = index_type.field_name();
let field_symbol = doc.symbols_mut().find_or_create(field_name)?;
let _ = known_indices.insert(field_symbol);
let mut row = 0;
let max = doc.root().len();
while row < max {
let element = doc.root().at(row);
// Avoids inserting the same value twice for this row.
let mut dedup_set = fnv::FnvHashSet::default();
// Values found under nested field paths (e.g. "mappings.acme"); they
// receive their own index entries after this row is processed.
let mut additional_index_values = Vec::new();
for child in query.execute_all(element) {
Table::search_values(child, field_name, |field, value| {
if !dedup_set.contains(value) {
// Exact entry for the value as-is (case-sensitive).
trie.insert_unique(
value,
IndexEntry {
field: field_symbol,
row,
exact: true,
},
);
let _ = dedup_set.insert(value.to_string());
}
// search_values reports nested object values with an extended
// field path; remember them for separate indexing below.
if field_name != field {
additional_index_values.push((field.to_string(), value.to_string()));
}
});
if let IndexType::FulltextIndex(_) = index_type {
// Fulltext indices additionally store lowercased tokens; only
// the full lowercased value is flagged as exact.
Table::search_values(child, "", |_, value| {
Table::tokenize(value, |token, full_token| {
if !dedup_set.contains(token) {
trie.insert_unique(
token,
IndexEntry {
field: field_symbol,
row,
exact: full_token,
},
);
let _ = dedup_set.insert(token.to_string());
}
});
});
}
}
// Register the nested field paths as indices of their own.
for (field, value) in additional_index_values {
if let Ok(field_symbol) = doc.symbols_mut().find_or_create(field) {
trie.insert_unique(
value.as_str(),
IndexEntry {
field: field_symbol,
row,
exact: true,
},
);
let _ = known_indices.insert(field_symbol);
}
}
row += 1;
}
}
Ok(known_indices)
}
/// Reports every scalar value reachable from `element` to `callback`.
///
/// The callback receives the effective field path and the string value.
/// Lists are flattened under the caller's `prefix`; for objects, each key is
/// appended to the prefix as `prefix.key` (one nesting level deep, with list
/// values flattened as well). A plain scalar is reported under `prefix`.
fn search_values<C>(element: Element, prefix: &str, mut callback: C)
where
    C: FnMut(&str, &str),
{
    if element.is_list() {
        // Flat list: every entry is reported under the caller's prefix.
        for item in element.iter() {
            callback(prefix, item.to_str().as_ref());
        }
    } else if element.is_object() {
        // Object: qualify each entry with its key.
        for (key, value) in element.entries() {
            let qualified = format!("{}.{}", prefix, key);
            if value.is_list() {
                for item in value.iter() {
                    callback(qualified.as_str(), item.to_str().as_ref());
                }
            } else {
                callback(qualified.as_str(), value.to_str().as_ref());
            }
        }
    } else {
        // Plain scalar.
        callback(prefix, element.to_str().as_ref());
    }
}
/// Emits fulltext tokens for `value` to `callback`.
///
/// The callback's second argument is true only for the complete lowercased
/// value; all derived tokens are reported with false. Tokens shorter than
/// MIN_TOKEN_LENGTH are skipped. The callback may receive the same token
/// more than once; deduplication is left to the caller.
fn tokenize<C>(value: &str, mut callback: C)
where
    C: FnMut(&str, bool),
{
    // Work on a lowercased copy so the fulltext index is case-insensitive.
    let lowered = value.to_lowercase();
    // The whole lowercased value always counts as an exact token.
    callback(lowered.as_str(), true);
    for raw_token in lowered.split(Table::split_pattern) {
        // Strip non-alphanumeric characters from both ends of the token.
        let token = raw_token.trim_matches(|ch: char| !ch.is_alphanumeric());
        if token.len() < MIN_TOKEN_LENGTH {
            continue;
        }
        callback(token, false);
        // Additionally split on the separator characters which split_pattern
        // deliberately keeps inside tokens (e.g. '-', '.', '@').
        for sub_token in token.split(|ch: char| !ch.is_alphanumeric()) {
            if sub_token.len() >= MIN_TOKEN_LENGTH {
                callback(sub_token, false);
            }
        }
    }
}
/// Determines whether `ch` separates two fulltext tokens.
///
/// Alphanumeric characters and a set of separator characters commonly
/// embedded in identifiers ('-', '_', '.', ':', '/', '\\', '@') are kept
/// inside tokens; everything else splits.
fn split_pattern(ch: char) -> bool {
    !(ch.is_alphanumeric() || matches!(ch, '-' | '_' | '.' | ':' | '/' | '\\' | '@'))
}
/// Resolves the comma-separated `fields_string` into a search strategy.
///
/// `"*"` searches all indexed fields. Otherwise every listed field must be
/// an indexed field; as soon as one field is neither indexed nor a known
/// nested sub-path of an indexed field, the whole query falls back to a
/// scan over compiled field queries.
fn determine_search_fields(&self, fields_string: &str) -> SearchFields {
    if fields_string == "*" {
        return SearchFields::All;
    }
    let mut resolved = fnv::FnvHashSet::default();
    for field in fields_string.split(',') {
        let symbol = self
            .doc
            .symbols()
            .resolve(field)
            .filter(|symbol| self.known_indices.contains(symbol));
        match symbol {
            Some(field_symbol) => {
                let _ = resolved.insert(field_symbol);
            }
            // A sub-path of an indexed field that simply has no entries:
            // skip it instead of degrading the whole query to a scan.
            None if self.is_non_existent_inner_index(field) => {}
            None => return self.compile_scan_fields(fields_string),
        }
    }
    SearchFields::IndexLookup(resolved)
}
/// Determines if `field` is a dotted sub-path of some indexed field.
///
/// E.g. for an index on "mappings", the field "mappings.acme" counts as an
/// inner index path even if no entry for it exists.
fn is_non_existent_inner_index(&self, field: &str) -> bool {
    field.match_indices('.').any(|(index, _)| {
        self.doc
            .symbols()
            .resolve(&field[..index])
            .filter(|symbol| self.known_indices.contains(symbol))
            .is_some()
    })
}
/// Compiles each field of the comma-separated `fields_string` into a query
/// so the search can be answered by scanning rows.
fn compile_scan_fields(&self, fields_string: &str) -> SearchFields {
    SearchFields::Scan(
        fields_string
            .split(',')
            .map(|field| self.doc.compile(field))
            .collect(),
    )
}
/// Compiles the given query string against this table's document.
pub fn compile(&self, query: impl AsRef<str>) -> Query {
self.doc.compile(query.as_ref())
}
/// Searches `fields` for rows matching `value`.
///
/// `fields` is either `"*"` (search all indexed fields) or a comma-separated
/// list of field names. With `exact == true`, only exact (case-sensitive)
/// index hits are returned; otherwise `value` is treated as a lowercased
/// prefix. Non-indexed fields are answered by a scan, which only supports
/// exact searches — a prefix search on them yields an error.
pub fn query<'a>(
    &'a self,
    fields: &'a str,
    value: &'a str,
    exact: bool,
) -> anyhow::Result<TableIter<'a>> {
    // fetch_add is an atomic read-modify-write; the previous
    // load-then-store pattern was not atomic and could lose increments
    // when queries run concurrently.
    let _ = self.queries.fetch_add(1, Ordering::Relaxed);
    match self.determine_search_fields(fields) {
        SearchFields::All => {
            if exact {
                Ok(TableIter::ExactIndexQuery(
                    &self.doc,
                    self.index.query_values(value),
                    None,
                    0,
                ))
            } else {
                // Prefix tokens are stored lowercased, so lowercase the needle.
                Ok(TableIter::PrefixIndexQuery(
                    &self.doc,
                    self.index.prefix_query(value.to_lowercase().as_str()),
                    fnv::FnvHashSet::default(),
                    None,
                ))
            }
        }
        SearchFields::IndexLookup(fields) => {
            if exact {
                Ok(TableIter::ExactIndexQuery(
                    &self.doc,
                    self.index.query_values(value),
                    Some(fields),
                    0,
                ))
            } else {
                Ok(TableIter::PrefixIndexQuery(
                    &self.doc,
                    self.index.prefix_query(value.to_lowercase().as_str()),
                    fnv::FnvHashSet::default(),
                    Some(fields),
                ))
            }
        }
        SearchFields::Scan(fields) => {
            if !exact {
                Err(anyhow::anyhow!(
                    "Cannot use a search query on non-indexed fields!"
                ))
            } else {
                // Same atomicity fix as for the `queries` counter above.
                let _ = self.scan_queries.fetch_add(1, Ordering::Relaxed);
                Ok(TableIter::ScanQuery(&self.doc, fields, value, 0))
            }
        }
    }
}
/// Returns an iterator over all rows of the table.
pub fn table_scan(&'_ self) -> TableIter<'_> {
    // fetch_add is atomic; the previous load-then-store pattern could lose
    // increments under concurrent scans.
    let _ = self.scans.fetch_add(1, Ordering::Relaxed);
    TableIter::TableScan(&self.doc, 0)
}
/// Returns the number of rows in the table.
pub fn len(&self) -> usize {
self.doc.root().len()
}
/// Returns true if the table contains no rows.
pub fn is_empty(&self) -> bool {
self.doc.root().is_empty()
}
/// Number of queries issued so far (statistics counter).
pub fn num_queries(&self) -> usize {
self.queries.load(Ordering::Relaxed)
}
/// Number of queries answered by scanning (statistics counter).
pub fn num_scan_queries(&self) -> usize {
self.scan_queries.load(Ordering::Relaxed)
}
/// Number of full table scans performed (statistics counter).
pub fn num_scans(&self) -> usize {
self.scans.load(Ordering::Relaxed)
}
/// Estimated memory held by the doc plus the index trie, in bytes.
pub fn allocated_memory(&self) -> usize {
self.doc.allocated_size() + self.index.allocated_size()
}
/// Returns the pre-compiled query for the default language.
pub fn default_lang_query(&self) -> &Query {
&self.default_lang_query
}
}
/// Iterator over the rows produced by a table query or scan.
pub enum TableIter<'a> {
// Exact index lookup: doc, matching index entries (if any), optional
// field filter, and the position of the next entry to inspect.
ExactIndexQuery(
&'a Doc,
Option<&'a Vec<IndexEntry>>,
Option<fnv::FnvHashSet<Symbol>>,
usize,
),
// Prefix index lookup: doc, trie prefix iterator, set of rows already
// yielded (for deduplication), and optional field filter.
PrefixIndexQuery(
&'a Doc,
PrefixIter<'a, IndexEntry>,
fnv::FnvHashSet<usize>,
Option<fnv::FnvHashSet<Symbol>>,
),
// Full scan matching the given field queries against an expected value;
// usize is the next row index to test.
ScanQuery(&'a Doc, Vec<Query>, &'a str, usize),
// Unfiltered scan over all rows; usize is the next row index.
TableScan(&'a Doc, usize),
}
impl<'a> Iterator for TableIter<'a> {
type Item = Element<'a>;
// Advances the iterator, yielding the next matching row element.
fn next(&mut self) -> Option<Self::Item> {
match self {
TableIter::ExactIndexQuery(doc, hits, fields, next_hit) => {
if let Some(hits) = hits {
// Skip entries that are not exact matches or whose field is
// filtered out.
while *next_hit < hits.len() {
let hit = &hits[*next_hit];
*next_hit += 1;
if hit.exact
&& (fields.is_none() || fields.as_ref().unwrap().contains(&hit.field))
{
return Some(doc.root().at(hit.row));
}
}
}
None
}
TableIter::PrefixIndexQuery(doc, iter, de_dup, fields) => {
// Several tokens can point at the same row; de_dup ensures each
// row is yielded at most once.
for hit in iter {
if (fields.is_none() || fields.as_ref().unwrap().contains(&hit.field))
&& !de_dup.contains(&hit.row)
{
let _ = de_dup.insert(hit.row);
return Some(doc.root().at(hit.row));
}
}
None
}
TableIter::ScanQuery(doc, fields, value, next_hit) => {
// Linear scan: test each remaining row against the field queries.
while *next_hit < doc.root().len() {
let row = doc.root().at(*next_hit);
*next_hit += 1;
if TableIter::has_match(row, fields, value) {
return Some(row);
}
}
None
}
TableIter::TableScan(doc, next_hit) => {
// Plain enumeration of all rows.
if *next_hit < doc.root().len() {
let row = doc.root().at(*next_hit);
*next_hit += 1;
Some(row)
} else {
None
}
}
}
}
}
impl TableIter<'_> {
    /// Determines whether `row` contains `value` in any of the given fields.
    fn has_match(row: Element, fields: &[Query], value: &str) -> bool {
        fields.iter().any(|field| {
            let mut found = false;
            Table::search_values(field.execute(row), "", |_, candidate| {
                if candidate == value {
                    found = true;
                }
            });
            found
        })
    }
}
#[cfg(test)]
mod tests {
use crate::idb::table::{IndexType, Table};
use crate::ig::docs::Doc;
use crate::ig::yaml::list_to_doc;
use yaml_rust::YamlLoader;
// End-to-end test covering index creation, exact/prefix/fulltext queries,
// scans and the statistics counters.
#[test]
fn table_test() {
let dataset = create_example_dataset();
let table = Table::new(
dataset,
vec![
IndexType::lookup("code"),
IndexType::lookup("iso.two"),
IndexType::lookup("mappings"),
IndexType::fulltext("name"),
],
)
.unwrap();
assert_eq!(table.len(), 2);
assert_eq!(table.is_empty(), false);
// Exact lookups are case-sensitive: "D" matches, "d" does not.
let row = table.query("code", "D", true).unwrap().next().unwrap();
assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
assert_eq!(
table.query("code", "d", true).unwrap().next().is_none(),
true
);
// Fulltext matches the complete lowercased value...
let row = table
.query("name", "deutschland", true)
.unwrap()
.next()
.unwrap();
assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
// ...and prefix search is case-insensitive.
let row = table.query("name", "deu", false).unwrap().next().unwrap();
assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
let row = table.query("name", "Deu", false).unwrap().next().unwrap();
assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
// Short tokens (below MIN_TOKEN_LENGTH) and unknown values miss.
assert_eq!(
table.query("name", "de", true).unwrap().next().is_none(),
true
);
assert_eq!(
table.query("name", "xxx", true).unwrap().next().is_none(),
true
);
// Prefix search on a non-indexed field is rejected...
assert_eq!(table.query("iso.three", "xxx", false).is_err(), true);
// ...but an exact search falls back to a scan.
let row = table
.query("iso.three", "aut", true)
.unwrap()
.next()
.unwrap();
assert_eq!(row.query("name.en").as_str().unwrap(), "Austria");
let scan_queries_so_far = table.num_scan_queries();
// Inner paths of an indexed field are served by the index, not a scan.
let row = table
.query("mappings.acme", "DD", true)
.unwrap()
.next()
.unwrap();
assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
let _ = table.query("mappings.acme1", "DD", true);
let _ = table.query("mappings.acme", "YY", true);
assert_eq!(table.num_scan_queries(), scan_queries_so_far);
assert_eq!(table.table_scan().count(), 2);
// Verify the statistics counters.
assert_eq!(table.num_queries(), 12);
assert_eq!(table.num_scan_queries(), 1);
assert_eq!(table.num_scans(), 1);
}
// Builds a two-row example dataset (two YAML documents) for the test.
fn create_example_dataset() -> Doc {
let input = r#"
code: "D"
iso:
two: "de"
three: "deu"
name:
de: "Deutschland"
en: "Germany"
mappings:
acme: "DD"
---
code: "A"
iso:
two: "at"
three: "aut"
upper: "AAA"
name:
de: "Österreich"
en: "Austria"
"#;
let rows = YamlLoader::load_from_str(input).unwrap();
list_to_doc(rows.as_slice(), |_| true).unwrap()
}
}