kglite 0.10.8

Pure-Rust knowledge graph engine — Cypher pipeline, snapshot/working CoW transactions, columnar/mmap/disk storage backends, optional dataset loaders (SEC EDGAR, Sodir, Wikidata). PyO3 wrappers live in the sibling kglite-py crate (the Python wheel); embeddable directly from any Rust binary without PyO3 in the dep tree.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
//! Join-order optimisation — pick pattern-start nodes, reorder MATCH
//! patterns by estimated selectivity.

use super::super::ast::*;
use crate::graph::core::pattern_matching::{PatternElement, PropertyMatcher};
use crate::graph::schema::DirGraph;
use crate::graph::storage::GraphRead;
use std::collections::{HashMap, HashSet};

pub(super) fn optimize_pattern_start_node(query: &mut CypherQuery, graph: &DirGraph) {
    use crate::graph::core::pattern_matching::EdgeDirection;

    // Track variables bound by earlier clauses so an unconstrained pattern
    // node like `(p)` in M2 — which will be pre-bound at runtime — is treated
    // as effectively-anchored (selectivity 1). Without this, the planner sees
    // `(p)-[:T]->(target:Type)` and reverses it because `(p)` is statically
    // unconstrained → looks worst-case, even though it'll resolve to a single
    // pre-bound NodeIndex when the executor reaches this clause.
    let mut bound_vars: HashSet<String> = HashSet::new();

    for clause in &mut query.clauses {
        let (patterns, path_assignments) = match clause {
            Clause::Match(m) => (&mut m.patterns, &m.path_assignments),
            Clause::OptionalMatch(m) => (&mut m.patterns, &m.path_assignments),
            // Other clauses don't introduce node bindings the optimizer cares
            // about; advance without modifying patterns.
            _ => continue,
        };
        for (pi, pattern) in patterns.iter_mut().enumerate() {
            if pattern.elements.len() < 3 {
                continue;
            }
            // Don't reverse patterns with path assignments — breaks path semantics
            if path_assignments.iter().any(|pa| pa.pattern_index == pi) {
                continue;
            }

            let first_node = match &pattern.elements[0] {
                PatternElement::Node(np) => np,
                _ => continue,
            };
            let last_node = match pattern.elements.last() {
                Some(PatternElement::Node(np)) => np,
                _ => continue,
            };

            // Reversing is safe for undirected and variable-length edges:
            // - `Both` flips to `Both` (identity).
            // - Var-length without path assignment is symmetric — `(a)-[*1..3]-(b)`
            //   is `(b)-[*1..3]-(a)` and `(a)-[*1..3]->(b)` reversed yields
            //   `(b)<-[*1..3]-(a)` (same edges traversed in reverse). Path-bound
            //   patterns are protected by the `path_assignments` check above.
            // No early-exit needed for direction/var-length anymore.

            let first_sel = estimate_node_selectivity_in_context(first_node, graph, &bound_vars);
            let last_sel = estimate_node_selectivity_in_context(last_node, graph, &bound_vars);

            // Only reverse if last node is significantly more selective (5× threshold).
            // A 5x advantage already saves 80% of expansion work. `saturating_mul`
            // because unconstrained nodes report `usize::MAX` and would otherwise
            // overflow.
            if last_sel.saturating_mul(5) >= first_sel {
                continue;
            }

            // Reverse: flip element order and flip each edge direction
            pattern.elements.reverse();
            for elem in &mut pattern.elements {
                if let PatternElement::Edge(ep) = elem {
                    ep.direction = match ep.direction {
                        EdgeDirection::Outgoing => EdgeDirection::Incoming,
                        EdgeDirection::Incoming => EdgeDirection::Outgoing,
                        EdgeDirection::Both => EdgeDirection::Both,
                    };
                }
            }
        }

        // Accumulate node variables introduced by this clause's patterns so
        // subsequent clauses see them as bound.
        for pattern in patterns.iter() {
            for elem in &pattern.elements {
                if let PatternElement::Node(np) = elem {
                    if let Some(ref v) = np.variable {
                        bound_vars.insert(v.clone());
                    }
                }
            }
        }
    }
}

/// Selectivity estimate that knows about variables bound by earlier clauses.
/// A pre-bound node resolves to a single NodeIndex at runtime, so its
/// effective candidate count is 1 — the most selective possible.
fn estimate_node_selectivity_in_context(
    np: &crate::graph::core::pattern_matching::NodePattern,
    graph: &DirGraph,
    bound_vars: &HashSet<String>,
) -> usize {
    if let Some(ref v) = np.variable {
        if bound_vars.contains(v) {
            return 1;
        }
    }
    estimate_node_selectivity(np, graph)
}

/// Estimate the number of candidate nodes for a node pattern.
/// Lower = more selective = better as start node.
pub(super) fn estimate_node_selectivity(
    np: &crate::graph::core::pattern_matching::NodePattern,
    graph: &DirGraph,
) -> usize {
    let type_count = np
        .node_type
        .as_ref()
        .and_then(|t| graph.type_indices.get(t))
        .map(|idx| idx.len())
        .unwrap_or(GraphRead::node_count(&graph.graph));

    // Unconstrained nodes (no type, no properties) match every node in the
    // graph — they represent the *worst* possible start node. Returning
    // `usize::MAX` ensures the optimizer never picks an unconstrained node
    // over a constrained one, regardless of how the graph is populated.
    // (On a freshly-created graph, `type_count = 0`, which would otherwise
    // make unconstrained nodes look maximally selective.)
    let unconstrained = np.node_type.is_none();
    // Floor typed-no-property and empty-property branches at 1 so they never
    // beat a legitimately-anchored node (`{id: X}` returns 1, pre-bound vars
    // also map to 1 in the in-context estimator). Without the floor, a typed
    // node on an empty graph reports 0 and the optimizer reverses toward it
    // even when the other end is a bound anchor.
    match &np.properties {
        None if unconstrained => usize::MAX,
        None => type_count.max(1),
        Some(props) if props.is_empty() && unconstrained => usize::MAX,
        Some(props) if props.is_empty() => type_count.max(1),
        Some(props) => {
            // {id: X} is always selectivity 1 regardless of type
            for (prop, matcher) in props {
                if prop == "id" {
                    match matcher {
                        PropertyMatcher::Equals(_) | PropertyMatcher::EqualsParam(_) => return 1,
                        PropertyMatcher::In(vals) => return vals.len(),
                        _ => {}
                    }
                }
            }
            // Check if any property has equality on an indexed field
            if let Some(ref nt) = np.node_type {
                for (prop, matcher) in props {
                    match matcher {
                        PropertyMatcher::Equals(val) => {
                            let key = (nt.clone(), prop.clone());
                            if graph.property_indices.contains_key(&key) {
                                if let Some(results) = graph.lookup_by_index(nt, prop, val) {
                                    return results.len().max(1);
                                }
                                return 1;
                            }
                        }
                        PropertyMatcher::In(vals) => return vals.len(),
                        _ => {}
                    }
                }
            }
            // Heuristic: equality filters on string properties typically have many
            // distinct values (~100x reduction each). Range/comparison filters are
            // gentler (~10x). Multiple filters multiply the reduction.
            let eq_count = props
                .values()
                .filter(|m| {
                    matches!(
                        m,
                        PropertyMatcher::Equals(_) | PropertyMatcher::EqualsParam(_)
                    )
                })
                .count();
            let other_count = props.len() - eq_count;
            let mut est = type_count;
            for _ in 0..eq_count {
                est /= 100;
            }
            for _ in 0..other_count {
                est /= 10;
            }
            est.max(1)
        }
    }
}

/// Reorder consecutive MATCH clauses by edge-type total-count cost.
///
/// For a span of `MATCH … MATCH …` clauses where every clause has an
/// `{id: X}` anchor on one endpoint and a known connection type, drive
/// the cheaper one first. The cost proxy is the connection type's total
/// edge count from `edge_type_counts_cache` — O(1) per clause when the
/// cache is populated (always for graphs loaded from disk; warmed during
/// build for in-memory graphs that exercise it).
///
/// **The motivating case (Wikidata, 124M nodes / 861M edges):**
/// ```cypher
/// MATCH (p)-[:P31]->({id:5})       -- ~80M P31 edges total
/// MATCH (p)-[:P27]->({id:183})     -- ~3M P27 edges total
/// RETURN p.title LIMIT 20
/// ```
/// Without this pass the executor enumerates 13.4M humans then filters
/// each by P27 — observed at ~500s. Driving from M2 first (3M Germans →
/// per-row P31 check) is ~25× cheaper.
///
/// Safety conditions (any miss → no reorder):
/// - At least 2 consecutive `Match` clauses (not OPTIONAL, no path
///   assignments).
/// - `edge_type_counts_cache` is populated (avoids triggering a fresh
///   O(E) scan at plan time).
/// - Every clause in the span has an `{id: X}` anchor on at least one
///   endpoint of every pattern, AND a connection type whose total-edge
///   count is known. This restricts us to queries where the proxy is a
///   real upper bound on expansion size.
/// - All clauses in the span share at least one variable (otherwise no
///   join, no benefit).
/// - The cost ordering would actually change.
///
/// Runs *before* `optimize_pattern_start_node` so subsequent reversal
/// sees the new clause order and accumulates `bound_vars` correctly.
pub(super) fn reorder_match_clauses(query: &mut CypherQuery, graph: &DirGraph) {
    if !graph.has_edge_type_counts_cache() {
        return;
    }
    let edge_counts = graph.get_edge_type_counts();

    // 0.9.35 (AgensGraph-inspired): when the label-pair connectivity
    // cache is also populated, use the per-triple `(src_type, edge_type,
    // tgt_type)` counts instead of the broader edge-type total. Drops
    // the cost-estimate error on label-asymmetric patterns from "all
    // edges of type R" to "only edges of type R between the matched
    // labels" — typically 10–100× tighter on Wikidata-shaped graphs.
    // Gating on `has_type_connectivity_cache()` mirrors the existing
    // `has_edge_type_counts_cache()` gate so plan-time stays O(1).
    let triple_counts: Option<HashMap<(String, String, String), usize>> =
        if graph.has_type_connectivity_cache() {
            graph.get_type_connectivity().map(|triples| {
                triples
                    .into_iter()
                    .map(|t| ((t.src, t.conn, t.tgt), t.count))
                    .collect()
            })
        } else {
            None
        };

    let mut i = 0;
    while i < query.clauses.len() {
        // Find a span of consecutive non-OPTIONAL MATCH clauses with no
        // path assignments. Stops at any other clause kind (WITH, WHERE,
        // RETURN, etc.) to preserve their semantic boundaries.
        let mut j = i;
        while j < query.clauses.len() {
            match &query.clauses[j] {
                Clause::Match(m) if m.path_assignments.is_empty() => j += 1,
                _ => break,
            }
        }
        if j - i < 2 {
            i = j.max(i + 1);
            continue;
        }

        // Estimate cost for each MATCH in the span. Bail on the whole span
        // if any clause is unscoreable — partial knowledge could mislead
        // a sort.
        let mut costs: Vec<usize> = Vec::with_capacity(j - i);
        let mut all_scored = true;
        for k in i..j {
            let m = match &query.clauses[k] {
                Clause::Match(m) => m,
                _ => unreachable!(),
            };
            match estimate_match_edge_cost(m, &edge_counts, triple_counts.as_ref()) {
                Some(c) => costs.push(c),
                None => {
                    all_scored = false;
                    break;
                }
            }
        }
        if !all_scored {
            i = j;
            continue;
        }

        if !shares_variable_across(&query.clauses[i..j]) {
            i = j;
            continue;
        }

        // Stable sort by (cost, original_index) so equal costs preserve
        // textual order (no churn).
        let mut order: Vec<(usize, usize)> = costs.iter().copied().enumerate().collect();
        order.sort_by_key(|&(orig, c)| (c, orig));

        let already_sorted = order
            .iter()
            .enumerate()
            .all(|(pos, &(orig, _))| pos == orig);
        if !already_sorted {
            let extracted: Vec<Clause> = query.clauses.drain(i..j).collect();
            for (offset, &(orig, _)) in order.iter().enumerate() {
                query.clauses.insert(i + offset, extracted[orig].clone());
            }
        }

        i = j;
    }
}

/// Cost proxy for a MATCH clause: sum of total edge counts (over all
/// connection types in its patterns), provided every pattern is
/// id-anchored. Returns `None` if the clause is unscoreable under the
/// safety rules in [`reorder_match_clauses`].
fn estimate_match_edge_cost(
    m: &MatchClause,
    edge_counts: &HashMap<String, usize>,
    triple_counts: Option<&HashMap<(String, String, String), usize>>,
) -> Option<usize> {
    let mut total: usize = 0;
    for pattern in &m.patterns {
        if pattern.elements.len() < 3 {
            // A node-only pattern carries no edge cost; ordering it
            // relative to edge-bearing patterns is meaningless under
            // this proxy. Bail.
            return None;
        }
        // Need at least one id-anchored endpoint on every pattern in
        // the clause. Mid-pattern nodes are not checked — typical case
        // is `(node)-[:T]->(node)`.
        let first = &pattern.elements[0];
        let last = pattern.elements.last().unwrap();
        if !is_id_anchored(first) && !is_id_anchored(last) {
            return None;
        }
        // Sum the edge count for every typed edge in the pattern.
        // Prefer the label-pair triple count (src_type, edge, tgt_type)
        // when both endpoints are labelled AND triple_counts is
        // populated — this drops the cost estimate from "all R edges"
        // to "R edges only between (T1, T2)", which on label-skewed
        // graphs (humans-in-Germany style queries) is the difference
        // between picking the right driving side and not.
        let elems = &pattern.elements;
        for idx in 0..elems.len() {
            let ep = match &elems[idx] {
                PatternElement::Edge(ep) => ep,
                _ => continue,
            };
            let ct = ep.connection_type.as_ref()?;
            let mut count: Option<usize> = None;
            if let Some(triples) = triple_counts {
                // Lookup neighbouring node-type labels. Pattern shape
                // is always (node, edge, node, edge, ...) so the
                // surrounding nodes are at idx-1 and idx+1. Fall back
                // to per-edge total when either side is untyped or the
                // (src, edge, tgt) triple isn't in the cache.
                let src_label = idx
                    .checked_sub(1)
                    .and_then(|i| elems.get(i))
                    .and_then(node_label);
                let tgt_label = elems.get(idx + 1).and_then(node_label);
                if let (Some(sl), Some(tl)) = (src_label, tgt_label) {
                    let key_fwd = (sl.clone(), ct.clone(), tl.clone());
                    let key_rev = (tl, ct.clone(), sl);
                    // Direction-agnostic for `()-[]-()` patterns:
                    // honour both directions, take the sum.
                    let fwd = triples.get(&key_fwd).copied().unwrap_or(0);
                    let rev = triples.get(&key_rev).copied().unwrap_or(0);
                    if fwd > 0 || rev > 0 {
                        count = Some(fwd + rev);
                    }
                }
            }
            let resolved = match count {
                Some(c) => c,
                None => *edge_counts.get(ct)?,
            };
            total = total.saturating_add(resolved);
        }
    }
    Some(total)
}

/// Extract the node-type label (e.g. `Person`) from a NodePattern
/// element. Returns `None` for edges, anonymous nodes, or nodes
/// without a label.
fn node_label(elem: &PatternElement) -> Option<String> {
    let np = match elem {
        PatternElement::Node(np) => np,
        _ => return None,
    };
    np.node_type.clone()
}

fn is_id_anchored(elem: &PatternElement) -> bool {
    let np = match elem {
        PatternElement::Node(np) => np,
        _ => return false,
    };
    let props = match &np.properties {
        Some(p) => p,
        None => return false,
    };
    props.iter().any(|(prop, matcher)| {
        prop == "id"
            && matches!(
                matcher,
                PropertyMatcher::Equals(_) | PropertyMatcher::EqualsParam(_)
            )
    })
}

fn shares_variable_across(clauses: &[Clause]) -> bool {
    let mut common: Option<HashSet<String>> = None;
    for clause in clauses {
        let m = match clause {
            Clause::Match(m) => m,
            _ => return false,
        };
        let vars: HashSet<String> = m
            .patterns
            .iter()
            .flat_map(|p| p.elements.iter())
            .filter_map(|e| match e {
                PatternElement::Node(np) => np.variable.clone(),
                _ => None,
            })
            .collect();
        common = Some(match common {
            None => vars,
            Some(prev) => prev.intersection(&vars).cloned().collect(),
        });
    }
    common.is_some_and(|s| !s.is_empty())
}

/// Reorder patterns within a MATCH clause so the most selective pattern runs first.
///
/// For `MATCH (n)-[:P31]->({id:6256}), (n)-[:P30]->({id:46})`, the pattern with
/// the more selective start node should execute first to minimize the number of
/// rows passed to subsequent patterns via shared-variable join.
///
/// Estimates selectivity by looking at the first node of each pattern (after
/// start-node optimization has already picked the best direction).
pub(super) fn reorder_match_patterns(query: &mut CypherQuery, graph: &DirGraph) {
    let mut bound_vars: HashSet<String> = HashSet::new();

    for clause in &mut query.clauses {
        let mc = match clause {
            Clause::Match(mc) => mc,
            Clause::OptionalMatch(mc) => {
                // OPTIONAL MATCH still binds vars for downstream clauses;
                // accumulate but don't reorder OPTIONAL MATCH patterns.
                for pat in mc.patterns.iter() {
                    for elem in &pat.elements {
                        if let PatternElement::Node(np) = elem {
                            if let Some(ref v) = np.variable {
                                bound_vars.insert(v.clone());
                            }
                        }
                    }
                }
                continue;
            }
            _ => continue,
        };
        if mc.patterns.len() < 2 || !mc.path_assignments.is_empty() {
            // Still accumulate vars even when not reordering.
            for pat in mc.patterns.iter() {
                for elem in &pat.elements {
                    if let PatternElement::Node(np) = elem {
                        if let Some(ref v) = np.variable {
                            bound_vars.insert(v.clone());
                        }
                    }
                }
            }
            continue;
        }
        // Estimate selectivity for each pattern based on its start node,
        // accounting for variables already bound by prior clauses.
        let mut pattern_scores: Vec<(usize, usize)> = mc
            .patterns
            .iter()
            .enumerate()
            .map(|(i, pat)| {
                let sel = if let Some(PatternElement::Node(np)) = pat.elements.first() {
                    estimate_node_selectivity_in_context(np, graph, &bound_vars)
                } else {
                    usize::MAX
                };
                (i, sel)
            })
            .collect();

        // Sort by selectivity (lower = more selective = should go first)
        pattern_scores.sort_by_key(|&(_, sel)| sel);

        // Only reorder if the order actually changes
        let already_ordered = pattern_scores
            .iter()
            .enumerate()
            .all(|(pos, &(idx, _))| pos == idx);
        if !already_ordered {
            let old_patterns = std::mem::take(&mut mc.patterns);
            mc.patterns = pattern_scores
                .iter()
                .map(|&(idx, _)| old_patterns[idx].clone())
                .collect();
        }

        // Accumulate vars from this clause's (possibly reordered) patterns.
        for pat in mc.patterns.iter() {
            for elem in &pat.elements {
                if let PatternElement::Node(np) = elem {
                    if let Some(ref v) = np.variable {
                        bound_vars.insert(v.clone());
                    }
                }
            }
        }
    }
}