//! A table wraps a [crate::ig::docs::Doc] together with an [Trie] as index to support InfoGraphDB queries.
//!
//! Note that a table consumes a given doc to ensure it is immutable, as the index is built
//! up on creation of the table and then kept as long as the table lives.
use crate::idb::trie::{PrefixIter, Trie};
use crate::ig::docs::{Doc, Element, Query};
use crate::ig::symbols::Symbol;
use std::sync::atomic::{AtomicUsize, Ordering};
/// Represents an entry in the lookup index.
///
/// Basically an entry is either exact (matching the whole field contents) or loose (matching
/// a word / token within a field while comparing case-insensitive). Internally each entry
/// contains the field symbol it points to along with the position (row number) for which it
/// was created.
pub struct IndexEntry {
    /// The symbol of the field this entry was created for.
    field: Symbol,
    /// The row number (within the table doc) this entry points to.
    row: usize,
    /// Determines if this entry represents an exact (whole field value) match
    /// or a loose (tokenized, lowercased) match.
    exact: bool,
}
/// Specifies the minimal token length for search tokens to be indexed.
const MIN_TOKEN_LENGTH: usize = 3;
/// Describes the type of index being built for a field.
///
/// The wrapped string is in both cases the field name or path (e.g. `field.inner.other`)
/// to be indexed.
pub enum IndexType {
    /// A lookup index permits to quickly find all rows which contain the exact field value
    /// as given in a query.
    LookupIndex(String),
    /// A fulltext index permits to search case-insensitive and also contains words or other
    /// sub tokens within the text.
    FulltextIndex(String),
}
impl IndexType {
    /// Creates a new `LookupIndex` for the given field or path.
    pub fn lookup(path: &str) -> Self {
        IndexType::LookupIndex(path.to_owned())
    }

    /// Creates a new `FulltextIndex` for the given field or path.
    pub fn fulltext(path: &str) -> Self {
        IndexType::FulltextIndex(path.to_owned())
    }

    /// Returns the field name or path on which this index operates.
    pub fn field_name(&self) -> &str {
        // Both variants carry the field name as their sole payload.
        match self {
            IndexType::LookupIndex(name) | IndexType::FulltextIndex(name) => name.as_str(),
        }
    }
}
/// Note that when comparing index entries, we only check whether field and row match, no
/// matter if the entries are exact or loose.
///
/// When building the index we first record all exact matches and then compute the loose
/// ones. We still want to de-duplicate entries, so that per match (path within the TRIE) a
/// document only occurs once. Two entries for the same word and field (one loose, one exact)
/// are unnecessary, as exact matches are also considered valid when performing a loose
/// (`IDB.SEARCH`) search.
impl PartialEq for IndexEntry {
    fn eq(&self, other: &Self) -> bool {
        // Deliberately ignore `exact` - see the note above.
        self.row == other.row && self.field == other.field
    }
}
/// Wraps a [crate::ig::docs::Doc] and permits to execute fast lookups backed by a [Trie].
///
/// Note that this is the work horse of the InfoGraphDB.
pub struct Table {
    /// The underlying (immutable) doc which contains the table data.
    doc: Doc,
    /// The index which maps exact values and loose tokens to `IndexEntry` records.
    index: Trie<IndexEntry>,
    /// The pre-compiled query for the fallback/default language ("xx").
    default_lang_query: Query,
    /// The set of field symbols for which an index has been built.
    known_indices: fnv::FnvHashSet<Symbol>,
    /// Counts the queries executed against this table.
    queries: AtomicUsize,
    /// Counts the queries which resorted to a table scan.
    scan_queries: AtomicUsize,
    /// Counts the deliberate table scans.
    scans: AtomicUsize,
}
/// Represents the result when parsing a list of fields to search in.
enum SearchFields {
    /// Search in all fields. Note that this will only search in indexed fields.
    All,
    /// Search in the given set of fields, which are all backed by an index.
    IndexLookup(fnv::FnvHashSet<Symbol>),
    /// Perform a table scan for the given set of fields.
    Scan(Vec<Query>),
}
/// Contains the (fake) language code used to retrieve a fallback or placeholder value if no
/// proper translation is present.
const DEFAULT_LANG: &str = "xx";
impl Table {
/// Creates a new table from the given doc while building a lookup index for the given paths.
///
/// Note that each value in `indices` can be a path like `field.inner.other`.
pub fn new(mut doc: Doc, indices: Vec<IndexType>) -> anyhow::Result<Self> {
    let mut index = Trie::new();
    // Building the indices may create new symbols, therefore this happens before the
    // doc is moved into the (then immutable) table.
    let known_indices = Table::build_indices(&mut doc, indices, &mut index)?;
    let default_lang_query = doc.compile(DEFAULT_LANG);

    Ok(Table {
        doc,
        index,
        default_lang_query,
        known_indices,
        queries: AtomicUsize::new(0),
        scan_queries: AtomicUsize::new(0),
        scans: AtomicUsize::new(0),
    })
}
/// Compiles the list of indices into a set of symbols and also fills the appropriate data
/// into the index TRIE.
///
/// For each `IndexType` this walks over all rows of the doc, extracts the exact field
/// values and (for fulltext indices) additionally the lowercased loose tokens, and inserts
/// them into `trie`. Returns the set of field symbols for which an index now exists.
fn build_indices(
    doc: &mut Doc,
    indices: Vec<IndexType>,
    trie: &mut Trie<IndexEntry>,
) -> anyhow::Result<fnv::FnvHashSet<Symbol>> {
    let mut known_indices = fnv::FnvHashSet::default();
    for index_type in indices {
        let query = doc.compile(index_type.field_name());
        let field_name = index_type.field_name();
        let field_symbol = doc.symbols_mut().find_or_create(field_name)?;
        let _ = known_indices.insert(field_symbol);
        let mut row = 0;
        let max = doc.root().len();
        while row < max {
            let element = doc.root().at(row);
            // Tracks which values were already indexed for this row so that each value
            // yields at most one entry per row and field (see PartialEq on IndexEntry).
            let mut dedup_set = fnv::FnvHashSet::default();
            let mut additional_index_values = Vec::new();
            for child in query.execute_all(element) {
                // Extract all exact matches and insert them into the index...
                Table::search_values(child, field_name, |field, value| {
                    if !dedup_set.contains(value) {
                        trie.insert_unique(
                            value,
                            IndexEntry {
                                field: field_symbol,
                                row,
                                exact: true,
                            },
                        );
                        let _ = dedup_set.insert(value.to_string());
                    }
                    // For inner maps, we also create a sub-index e.g. when indexing:
                    // mappings:
                    //   acme: test
                    // For an index on mapping, we also record an exact match for mappings.acme
                    // with the value test (along with an entry for "mappings").
                    if field_name != field {
                        additional_index_values.push((field.to_string(), value.to_string()));
                    }
                });
                if let IndexType::FulltextIndex(_) = index_type {
                    // Extract and tokenize all loose matches and insert them into the index.
                    // Note that this has to be performed in a 2nd step, as a value can be a list or
                    // even a nested object, where we always need to first store all exact values
                    // and then collect the loose matches (which are not already present as exact
                    // match).
                    // An example would be a list which contains the entries "hello world" and "hello".
                    // As the tokenized version of "hello world" would also emit "hello" (as loose
                    // term) we first want to collect it as exact term.
                    //
                    // Note that we index the complete (but lowercased) token as a potential
                    // exact match - see README.md for details
                    Table::search_values(child, "", |_, value| {
                        Table::tokenize(value, |token, full_token| {
                            if !dedup_set.contains(token) {
                                trie.insert_unique(
                                    token,
                                    IndexEntry {
                                        field: field_symbol,
                                        row,
                                        exact: full_token,
                                    },
                                );
                                let _ = dedup_set.insert(token.to_string());
                            }
                        });
                    });
                }
            }
            // Resolve and create "inner indices". This has to happen outside of the
            // search_values closure as we need mutable access on symbols (and thus doc).
            for (field, value) in additional_index_values {
                if let Ok(field_symbol) = doc.symbols_mut().find_or_create(field) {
                    trie.insert_unique(
                        value.as_str(),
                        IndexEntry {
                            field: field_symbol,
                            row,
                            exact: true,
                        },
                    );
                    let _ = known_indices.insert(field_symbol);
                }
            }
            row += 1;
        }
    }
    Ok(known_indices)
}
/// Extracts all field values to be inserted into the index.
///
/// Next to the trivial case of a simple string or int, this also iterates over inner lists
/// or an object which contains values or an inner list. Note that we only support this single
/// level (to catch translation maps) but do not index complete "subtrees" as this seems quite
/// an overkill.
fn search_values<C>(element: Element, prefix: &str, mut callback: C)
where
    C: FnMut(&str, &str),
{
    if element.is_list() {
        // A plain list: report each entry under the given prefix...
        for item in element.iter() {
            callback(prefix, item.to_str().as_ref());
        }
        return;
    }

    if !element.is_object() {
        // A scalar value: report it directly under the given prefix...
        callback(prefix, element.to_str().as_ref());
        return;
    }

    // An object: report each entry (or list entry) under "prefix.key"...
    for (key, value) in element.entries() {
        let inner_field = format!("{}.{}", prefix, key);
        if value.is_list() {
            for item in value.iter() {
                callback(inner_field.as_str(), item.to_str().as_ref());
            }
        } else {
            callback(inner_field.as_str(), value.to_str().as_ref());
        }
    }
}
/// Tokenizes a string value by splitting at all whitespaces or occurrences of
/// `split_pattern`. Each inner token is trimmed (removing both, whitespaces as well
/// as non-alphanumeric characters) and turned to lower case before being submitted to the
/// callback.
///
/// The callback receives the token plus a flag which is `true` only for the complete
/// (lowercased) value - this is recorded as a potential exact match.
fn tokenize<C>(value: &str, mut callback: C)
where
    C: FnMut(&str, bool),
{
    let lowercased = value.to_lowercase();
    // The whole lowercased value is reported first, flagged as "full token"...
    callback(lowercased.as_str(), true);

    for raw_token in lowercased.split(Table::split_pattern) {
        let token = raw_token.trim_matches(|ch: char| !ch.is_alphanumeric());
        if token.len() < MIN_TOKEN_LENGTH {
            continue;
        }
        callback(token, false);
        // Also emit sub tokens split at the remaining non-alphanumeric characters
        // (e.g. "foo-bar" additionally yields "foo" and "bar")...
        for sub_token in token.split(|ch: char| !ch.is_alphanumeric()) {
            if sub_token.len() >= MIN_TOKEN_LENGTH {
                callback(sub_token, false);
            }
        }
    }
}
/// Provides a pattern matcher which identifies characters to split tokens at.
///
/// We split at whitespace and all "special characters" except:
/// `'-' | '_' | '.' | ':' | '/' | '\\' | '@'`.
fn split_pattern(ch: char) -> bool {
    // Keep alphanumerics and the whitelisted special characters together,
    // split at everything else.
    !(ch.is_alphanumeric() || matches!(ch, '-' | '_' | '.' | ':' | '/' | '\\' | '@'))
}
/// Parses a field name or comma separated list of field names or paths and compiles
/// them into a `SearchFields` specifier.
fn determine_search_fields(&self, fields_string: &str) -> SearchFields {
    if fields_string == "*" {
        return SearchFields::All;
    }

    let mut fields = fnv::FnvHashSet::default();
    for field in fields_string.split(',') {
        // Try to resolve the field into a symbol for which we know that an index is
        // present...
        let indexed_symbol = self
            .doc
            .symbols()
            .resolve(field)
            .filter(|symbol| self.known_indices.contains(symbol));
        match indexed_symbol {
            Some(symbol) => {
                let _ = fields.insert(symbol);
            }
            // If we cannot find an index for the given field, we perform a table scan.
            // However, before we do that, we ensure that there isn't an index for the
            // base part of a composite field (as in this case we would know that the
            // query cannot be fulfilled using this field) See `is_non_existent_inner_index`
            // for more details...
            None if self.is_non_existent_inner_index(field) => {
                // Known to be unfulfillable via this field - no scan required.
            }
            None => return self.compile_scan_fields(fields_string),
        }
    }
    SearchFields::IndexLookup(fields)
}
/// Determines if this is a composite field and if an index exists for any of its parent fields.
///
/// A composite field would be "mappings.acme". If we now detect that no index is present for
/// this field, but that one exists for "mappings", we know that the query cannot be fulfilled
/// for this field, as otherwise the inner index would have been created during indexing.
fn is_non_existent_inner_index(&self, field: &str) -> bool {
    // Check every "." position, i.e. every parent path of the composite field...
    field.match_indices('.').any(|(index, _)| {
        self.doc
            .symbols()
            .resolve(&field[..index])
            .filter(|symbol| self.known_indices.contains(symbol))
            .is_some()
    })
}
/// Parses the list of fields or paths into a list of queries to execute during the table scan.
fn compile_scan_fields(&self, fields_string: &str) -> SearchFields {
    SearchFields::Scan(
        fields_string
            .split(',')
            .map(|field| self.doc.compile(field))
            .collect(),
    )
}
/// Compiles the given field or path name into a fast lookup query which extracts the matching
/// value from a given doc.
///
/// This simply delegates to the compiler of the underlying doc.
pub fn compile(&self, query: impl AsRef<str>) -> Query {
    self.doc.compile(query.as_ref())
}
/// Executes a query by searching for the given value in the given list of fields
/// (see `determine_search_fields`).
///
/// If `exact` is true, the given value must match the field contents, otherwise it can also
/// be a token or even the prefix of a token within the exact or loose index values of the
/// selected fields.
///
/// Returns an iterator which yields all matching docs or an error, if a search query is
/// executed against non-indexed fields.
pub fn query<'a>(
    &'a self,
    fields: &'a str,
    value: &'a str,
    exact: bool,
) -> anyhow::Result<TableIter<'a>> {
    // Use an atomic read-modify-write to bump the counter. The previous
    // load-then-store pair was not atomic and could lose increments when
    // queries run concurrently.
    let _ = self.queries.fetch_add(1, Ordering::Relaxed);
    match self.determine_search_fields(fields) {
        SearchFields::All => {
            if exact {
                Ok(TableIter::ExactIndexQuery(
                    &self.doc,
                    self.index.query_values(value),
                    None,
                    0,
                ))
            } else {
                // Loose tokens are indexed lowercased, therefore lowercase the needle...
                Ok(TableIter::PrefixIndexQuery(
                    &self.doc,
                    self.index.prefix_query(value.to_lowercase().as_str()),
                    fnv::FnvHashSet::default(),
                    None,
                ))
            }
        }
        SearchFields::IndexLookup(fields) => {
            if exact {
                Ok(TableIter::ExactIndexQuery(
                    &self.doc,
                    self.index.query_values(value),
                    Some(fields),
                    0,
                ))
            } else {
                Ok(TableIter::PrefixIndexQuery(
                    &self.doc,
                    self.index.prefix_query(value.to_lowercase().as_str()),
                    fnv::FnvHashSet::default(),
                    Some(fields),
                ))
            }
        }
        SearchFields::Scan(fields) => {
            if !exact {
                Err(anyhow::anyhow!(
                    "Cannot use a search query on non-indexed fields!"
                ))
            } else {
                // Same fix as above: atomic increment instead of load + store.
                let _ = self.scan_queries.fetch_add(1, Ordering::Relaxed);
                Ok(TableIter::ScanQuery(&self.doc, fields, value, 0))
            }
        }
    }
}
/// Performs a table scan which simply iterates over all rows of a table.
pub fn table_scan(&self) -> TableIter {
    // fetch_add performs the increment as a single atomic operation; the previous
    // load-then-store pair could lose updates under concurrent access.
    let _ = self.scans.fetch_add(1, Ordering::Relaxed);
    TableIter::TableScan(&self.doc, 0)
}
/// Returns the number of rows in the table.
pub fn len(&self) -> usize {
    self.doc.root().len()
}

/// Determines if this table is empty.
pub fn is_empty(&self) -> bool {
    self.doc.root().is_empty()
}

/// Returns the number of queries which have been executed against this table.
///
/// Note that Relaxed ordering is sufficient here, as these are plain statistics counters.
pub fn num_queries(&self) -> usize {
    self.queries.load(Ordering::Relaxed)
}

/// Returns the number of queries which have resorted to a table scan in order to execute
/// properly.
pub fn num_scan_queries(&self) -> usize {
    self.scan_queries.load(Ordering::Relaxed)
}

/// Returns the number of deliberate table scans.
pub fn num_scans(&self) -> usize {
    self.scans.load(Ordering::Relaxed)
}

/// Estimates the allocated memory for both, the table data and its index.
///
/// The returned value is in bytes.
pub fn allocated_memory(&self) -> usize {
    self.doc.allocated_size() + self.index.allocated_size()
}

/// Returns the selector or query for the fallback/default language ("xx").
pub fn default_lang_query(&self) -> &Query {
    &self.default_lang_query
}
}
/// Represents the various strategies used to iterate over contents or query results within a table.
pub enum TableIter<'a> {
    /// Represents an iterator which is based on a direct index hit.
    ///
    /// Fields: the doc to yield rows from, the matching index entries (if any),
    /// an optional field filter and the position of the next entry to inspect.
    ExactIndexQuery(
        &'a Doc,
        Option<&'a Vec<IndexEntry>>,
        Option<fnv::FnvHashSet<Symbol>>,
        usize,
    ),
    /// Represents an iterator which is based on a prefix hit within the index.
    ///
    /// Fields: the doc to yield rows from, the underlying prefix iterator of the Trie,
    /// the set of rows already yielded (for de-duplication) and an optional field filter.
    PrefixIndexQuery(
        &'a Doc,
        PrefixIter<'a, IndexEntry>,
        fnv::FnvHashSet<usize>,
        Option<fnv::FnvHashSet<Symbol>>,
    ),
    /// Represents an iterator which scans over all index items to check for matches.
    ///
    /// Fields: the doc to scan, the queries to execute per row, the value to search for
    /// and the next row to inspect.
    ScanQuery(&'a Doc, Vec<Query>, &'a str, usize),
    /// Represents an iterator which represents all entries in the index.
    ///
    /// Fields: the doc to iterate and the next row to yield.
    TableScan(&'a Doc, usize),
}
impl<'a> Iterator for TableIter<'a> {
    type Item = Element<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            // For an exact query, we operate on the underlying index vector (which is per matching
            // token). We therefore have to check and skip over tokens of non-matching fields...
            TableIter::ExactIndexQuery(doc, hits, fields, next_hit) => {
                if let Some(hits) = hits {
                    while *next_hit < hits.len() {
                        let hit = &hits[*next_hit];
                        *next_hit += 1;
                        // Only "exact" entries qualify here - loose (tokenized) entries are
                        // reserved for prefix / search queries.
                        if hit.exact
                            && (fields.is_none() || fields.as_ref().unwrap().contains(&hit.field))
                        {
                            return Some(doc.root().at(hit.row));
                        }
                    }
                }
                None
            }
            // For prefix matches, we can simply operate on the underlying PrefixIter of the Trie
            // and also filter out matches, which do not belong to our selected fields...
            TableIter::PrefixIndexQuery(doc, iter, de_dup, fields) => {
                for hit in iter {
                    // Skip rows which were already yielded for another matching token.
                    if (fields.is_none() || fields.as_ref().unwrap().contains(&hit.field))
                        && !de_dup.contains(&hit.row)
                    {
                        let _ = de_dup.insert(hit.row);
                        return Some(doc.root().at(hit.row));
                    }
                }
                None
            }
            // A table scan is a bit more involved. Naturally the documents to check are easily
            // identified (we simply iterate through all). However, we now have to perform the whole
            // value extraction process which normally happens during indexing...
            TableIter::ScanQuery(doc, fields, value, next_hit) => {
                while *next_hit < doc.root().len() {
                    let row = doc.root().at(*next_hit);
                    *next_hit += 1;
                    if TableIter::has_match(row, fields, value) {
                        return Some(row);
                    }
                }
                None
            }
            // A raw table scan is quite simple - iterate over each row, each row is a match...
            TableIter::TableScan(doc, next_hit) => {
                if *next_hit < doc.root().len() {
                    let row = doc.root().at(*next_hit);
                    *next_hit += 1;
                    Some(row)
                } else {
                    None
                }
            }
        }
    }
}
impl<'a> TableIter<'a> {
    /// Checks if for the given row any of the given fields or paths contains the given value.
    fn has_match(row: Element, fields: &[Query], value: &str) -> bool {
        // Stop at the first field which yields a matching value...
        fields.iter().any(|field| {
            let mut found = false;
            Table::search_values(field.execute(row), "", |_, candidate| {
                if candidate == value {
                    found = true;
                }
            });
            found
        })
    }
}
#[cfg(test)]
mod tests {
    use crate::idb::table::{IndexType, Table};
    use crate::ig::docs::Doc;
    use crate::ig::yaml::list_to_doc;
    use yaml_rust::YamlLoader;

    #[test]
    fn table_test() {
        // Load an extremely elaborate example dataset...
        let dataset = create_example_dataset();
        let table = Table::new(
            dataset,
            vec![
                IndexType::lookup("code"),
                IndexType::lookup("iso.two"),
                IndexType::lookup("mappings"),
                IndexType::fulltext("name"),
            ],
        )
        .unwrap();
        assert_eq!(table.len(), 2);
        // Use assert!(..) instead of assert_eq!(.., true/false) for boolean conditions
        // (clippy: bool_assert_comparison).
        assert!(!table.is_empty());
        // Check that exact queries work...
        let row = table.query("code", "D", true).unwrap().next().unwrap();
        assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
        // Ensure that exact queries are case sensitive...
        assert!(table.query("code", "d", true).unwrap().next().is_none());
        // Ensure that exact queries for which a fulltext index exists, can be case-insensitive
        // if the search value is already lowercased...
        let row = table
            .query("name", "deutschland", true)
            .unwrap()
            .next()
            .unwrap();
        assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
        // Check that inexact queries work...
        let row = table.query("name", "deu", false).unwrap().next().unwrap();
        assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
        let row = table.query("name", "Deu", false).unwrap().next().unwrap();
        assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
        // Ensure that exact queries don't match prefixes...
        assert!(table.query("name", "de", true).unwrap().next().is_none());
        // Ensure that exact queries remain empty if no match is present...
        assert!(table.query("name", "xxx", true).unwrap().next().is_none());
        // Cannot perform inexact search in non-indexed field...
        assert!(table.query("iso.three", "xxx", false).is_err());
        // but can perform an exact search in non-indexed field...
        let row = table
            .query("iso.three", "aut", true)
            .unwrap()
            .next()
            .unwrap();
        assert_eq!(row.query("name.en").as_str().unwrap(), "Austria");
        // Ensure inner indices are created and used...
        let scan_queries_so_far = table.num_scan_queries();
        let row = table
            .query("mappings.acme", "DD", true)
            .unwrap()
            .next()
            .unwrap();
        assert_eq!(row.query("name.de").as_str().unwrap(), "Deutschland");
        // Test that even unsuccessful queries never resort to a table scan...
        let _ = table.query("mappings.acme1", "DD", true);
        let _ = table.query("mappings.acme", "YY", true);
        // Ensure neither query needed a scan to be fulfilled...
        assert_eq!(table.num_scan_queries(), scan_queries_so_far);
        // Enforce a simple table scan...
        assert_eq!(table.table_scan().count(), 2);
        // Ensure that the metrics are correctly updated...
        assert_eq!(table.num_queries(), 12);
        assert_eq!(table.num_scan_queries(), 1);
        assert_eq!(table.num_scans(), 1);
    }

    /// Creates a two-row example dataset (two YAML documents).
    fn create_example_dataset() -> Doc {
        let input = r#"
code: "D"
iso:
  two: "de"
  three: "deu"
name:
  de: "Deutschland"
  en: "Germany"
mappings:
  acme: "DD"
---
code: "A"
iso:
  two: "at"
  three: "aut"
  upper: "AAA"
name:
  de: "Österreich"
  en: "Austria"
"#;
        let rows = YamlLoader::load_from_str(input).unwrap();
        list_to_doc(rows.as_slice(), |_| true).unwrap()
    }
}