Skip to main content

brk_bindgen/analysis/
tree.rs

1//! Tree traversal helpers for pattern analysis.
2//!
3//! This module provides utilities for working with the TreeNode structure,
4//! including leaf name extraction and index pattern detection.
5
6use std::collections::{BTreeMap, BTreeSet};
7
8use brk_types::{Index, TreeNode, extract_json_type};
9use indexmap::IndexMap;
10
11use crate::{IndexSetPattern, PatternField, child_type_name};
12
13use super::{find_common_prefix, find_common_suffix, normalize_prefix};
14
15/// Get the shortest leaf name from a tree node.
16///
17/// This is useful for pattern base analysis where we want the "base" case
18/// (e.g., the leaf without suffix like `_btc` or `_usd`).
19pub(super) fn get_shortest_leaf_name(node: &TreeNode) -> Option<String> {
20    match node {
21        TreeNode::Leaf(leaf) => Some(leaf.name().to_string()),
22        TreeNode::Branch(children) => children
23            .values()
24            .filter_map(get_shortest_leaf_name)
25            .min_by_key(|name| name.len()),
26    }
27}
28
29/// Get the field signature for a branch node's children.
30/// Fields are sorted alphabetically for consistent pattern matching.
31pub fn get_node_fields(
32    children: &IndexMap<String, TreeNode>,
33    pattern_lookup: &BTreeMap<Vec<PatternField>, String>,
34) -> Vec<PatternField> {
35    let mut fields: Vec<PatternField> = children
36        .iter()
37        .map(|(name, node)| {
38            let (rust_type, json_type, indexes) = match node {
39                TreeNode::Leaf(leaf) => (
40                    leaf.kind().to_string(),
41                    extract_json_type(&leaf.schema),
42                    leaf.indexes().clone(),
43                ),
44                TreeNode::Branch(grandchildren) => {
45                    let child_fields = get_node_fields(grandchildren, pattern_lookup);
46                    let pattern_name = pattern_lookup
47                        .get(&child_fields)
48                        .cloned()
49                        .unwrap_or_else(|| "Unknown".to_string());
50                    (pattern_name.clone(), pattern_name, BTreeSet::new())
51                }
52            };
53            PatternField {
54                name: name.clone(),
55                rust_type,
56                json_type,
57                indexes,
58                type_param: None,
59            }
60        })
61        .collect();
62    // Sort for consistent pattern matching (display order preserved in IndexMap)
63    fields.sort_by(|a, b| a.name.cmp(&b.name));
64    fields
65}
66
67/// Detect index patterns (sets of indexes that appear together on series).
68pub fn detect_index_patterns(tree: &TreeNode) -> Vec<IndexSetPattern> {
69    let mut unique_index_sets: BTreeSet<BTreeSet<Index>> = BTreeSet::new();
70    collect_index_sets_from_tree(tree, &mut unique_index_sets);
71
72    // Sort by count (descending) then by first index name for deterministic ordering
73    let mut sorted_sets: Vec<_> = unique_index_sets
74        .into_iter()
75        .filter(|indexes| !indexes.is_empty())
76        .collect();
77    sorted_sets.sort_by(|a, b| {
78        b.len()
79            .cmp(&a.len())
80            .then_with(|| a.iter().next().cmp(&b.iter().next()))
81    });
82
83    // Assign unique sequential names
84    sorted_sets
85        .into_iter()
86        .enumerate()
87        .map(|(i, indexes)| IndexSetPattern {
88            name: format!("SeriesPattern{}", i + 1),
89            indexes,
90        })
91        .collect()
92}
93
94fn collect_index_sets_from_tree(
95    node: &TreeNode,
96    unique_index_sets: &mut BTreeSet<BTreeSet<Index>>,
97) {
98    match node {
99        TreeNode::Leaf(leaf) => {
100            unique_index_sets.insert(leaf.indexes().clone());
101        }
102        TreeNode::Branch(children) => {
103            for child in children.values() {
104                collect_index_sets_from_tree(child, unique_index_sets);
105            }
106        }
107    }
108}
109
110/// Result of analyzing a pattern instance's base.
111#[derive(Debug, Clone)]
112pub struct PatternBaseResult {
113    /// The computed base name for the pattern.
114    pub base: String,
115    /// Whether an outlier child was excluded to find the pattern.
116    /// If true, pattern factory should not be used.
117    pub has_outlier: bool,
118    /// Whether this instance uses suffix mode (common prefix) or prefix mode (common suffix).
119    /// Used to check compatibility with the pattern's mode.
120    pub is_suffix_mode: bool,
121    /// The field parts (suffix in suffix mode, prefix in prefix mode) for each field.
122    /// Used to check if instance field parts match the pattern's field parts.
123    pub field_parts: BTreeMap<String, String>,
124}
125
126impl PatternBaseResult {
127    /// Create a default result that forces inlining (has_outlier = true).
128    /// Use when no pattern base could be computed during lookup.
129    pub fn force_inline() -> Self {
130        Self {
131            base: String::new(),
132            has_outlier: true,
133            is_suffix_mode: true,
134            field_parts: BTreeMap::new(),
135        }
136    }
137
138    /// Create an empty result with no outlier.
139    /// Use for root-level patterns or when children have no common pattern.
140    pub fn empty() -> Self {
141        Self {
142            base: String::new(),
143            has_outlier: false,
144            is_suffix_mode: true,
145            field_parts: BTreeMap::new(),
146        }
147    }
148}
149
150/// Get the series base for a pattern instance by analyzing direct children.
151///
152/// Uses the shortest leaf names from direct children to find common prefix/suffix.
153///
154/// If the initial analysis fails to find a common pattern, it tries excluding
155/// each child one at a time to detect outliers (e.g., a mismatched "base" field
156/// from indexer/computed tree merging).
157///
158/// Returns both the base and whether an outlier was detected.
159pub fn get_pattern_instance_base(node: &TreeNode) -> PatternBaseResult {
160    let child_names = get_direct_children_for_analysis(node);
161    if child_names.is_empty() {
162        return PatternBaseResult::empty();
163    }
164
165    // Try to find common base from leaf names
166    if let Some(result) = try_find_base(&child_names, false) {
167        return PatternBaseResult {
168            base: result.base,
169            has_outlier: result.has_outlier,
170            is_suffix_mode: result.is_suffix_mode,
171            field_parts: result.field_parts,
172        };
173    }
174
175    // If no common pattern found and we have enough children, try excluding outliers
176    if child_names.len() > 2 {
177        for i in 0..child_names.len() {
178            let filtered: Vec<_> = child_names
179                .iter()
180                .enumerate()
181                .filter(|(j, _)| *j != i)
182                .map(|(_, v)| v.clone())
183                .collect();
184
185            if let Some(result) = try_find_base(&filtered, true) {
186                return PatternBaseResult {
187                    base: result.base,
188                    has_outlier: true,
189                    is_suffix_mode: result.is_suffix_mode,
190                    field_parts: result.field_parts,
191                };
192            }
193        }
194    }
195
196    // Fallback: no common prefix/suffix found - this is a root-level pattern
197    // Return empty base so series names are used directly
198    PatternBaseResult::empty()
199}
200
201/// Result of try_find_base: base name, has_outlier flag, is_suffix_mode flag, and field_parts.
202struct FindBaseResult {
203    base: String,
204    has_outlier: bool,
205    is_suffix_mode: bool,
206    field_parts: BTreeMap<String, String>,
207}
208
209/// Try to find a common base from child names using prefix/suffix detection.
210/// Returns Some(FindBaseResult) if found.
211fn try_find_base(
212    child_names: &[(String, String)],
213    is_outlier_attempt: bool,
214) -> Option<FindBaseResult> {
215    let leaf_names: Vec<&str> = child_names.iter().map(|(_, n)| n.as_str()).collect();
216
217    // Try common prefix first (suffix mode)
218    if let Some(prefix) = find_common_prefix(&leaf_names) {
219        let base = prefix.trim_end_matches('_').to_string();
220        let mut field_parts = BTreeMap::new();
221        for (field_name, leaf_name) in child_names {
222            // Compute the suffix part for this field
223            let suffix = if leaf_name == &base {
224                String::new()
225            } else {
226                leaf_name
227                    .strip_prefix(&prefix)
228                    .unwrap_or(leaf_name)
229                    .to_string()
230            };
231            field_parts.insert(field_name.clone(), suffix);
232        }
233        return Some(FindBaseResult {
234            base,
235            has_outlier: is_outlier_attempt,
236            is_suffix_mode: true,
237            field_parts,
238        });
239    }
240
241    // Try common suffix (prefix mode)
242    if let Some(suffix) = find_common_suffix(&leaf_names) {
243        let base = suffix.trim_start_matches('_').to_string();
244        let mut field_parts = BTreeMap::new();
245        for (field_name, leaf_name) in child_names {
246            // Compute the prefix part for this field, normalized to end with _
247            let prefix_part = leaf_name
248                .strip_suffix(&suffix)
249                .map(normalize_prefix)
250                .unwrap_or_default();
251            field_parts.insert(field_name.clone(), prefix_part);
252        }
253        return Some(FindBaseResult {
254            base,
255            has_outlier: is_outlier_attempt,
256            is_suffix_mode: false,
257            field_parts,
258        });
259    }
260
261    None
262}
263
264/// Get (field_name, shortest_leaf_name) pairs for direct children of a branch node.
265///
266/// Uses the shortest leaf name from each child subtree to find the "base" case
267/// (the leaf without suffix modifiers like `_btc` or `_usd`).
268fn get_direct_children_for_analysis(node: &TreeNode) -> Vec<(String, String)> {
269    match node {
270        TreeNode::Leaf(leaf) => vec![(leaf.name().to_string(), leaf.name().to_string())],
271        TreeNode::Branch(children) => children
272            .iter()
273            .filter_map(|(field_name, child)| {
274                get_shortest_leaf_name(child).map(|leaf_name| (field_name.clone(), leaf_name))
275            })
276            .collect(),
277    }
278}
279
280/// Infer the accumulated name for a child node based on a descendant leaf name.
281pub fn infer_accumulated_name(parent_acc: &str, field_name: &str, descendant_leaf: &str) -> String {
282    if let Some(pos) = descendant_leaf.find(field_name) {
283        if pos == 0 {
284            return field_name.to_string();
285        }
286        if pos > 0 && descendant_leaf.chars().nth(pos - 1) == Some('_') {
287            return if parent_acc.is_empty() {
288                field_name.to_string()
289            } else {
290                format!("{}_{}", parent_acc, field_name)
291            };
292        }
293    }
294
295    if parent_acc.is_empty() {
296        field_name.to_string()
297    } else {
298        format!("{}_{}", parent_acc, field_name)
299    }
300}
301
302/// Get fields with child field information for generic pattern lookup.
303pub fn get_fields_with_child_info(
304    children: &IndexMap<String, TreeNode>,
305    parent_name: &str,
306    pattern_lookup: &BTreeMap<Vec<PatternField>, String>,
307) -> Vec<(PatternField, Option<Vec<PatternField>>)> {
308    children
309        .iter()
310        .map(|(name, node)| {
311            let (rust_type, json_type, indexes, child_fields) = match node {
312                TreeNode::Leaf(leaf) => (
313                    leaf.kind().to_string(),
314                    extract_json_type(&leaf.schema),
315                    leaf.indexes().clone(),
316                    None,
317                ),
318                TreeNode::Branch(grandchildren) => {
319                    let child_fields = get_node_fields(grandchildren, pattern_lookup);
320                    let pattern_name = pattern_lookup
321                        .get(&child_fields)
322                        .cloned()
323                        .unwrap_or_else(|| child_type_name(parent_name, name));
324                    (
325                        pattern_name.clone(),
326                        pattern_name,
327                        BTreeSet::new(),
328                        Some(child_fields),
329                    )
330                }
331            };
332            (
333                PatternField {
334                    name: name.clone(),
335                    rust_type,
336                    json_type,
337                    indexes,
338                    type_param: None,
339                },
340                child_fields,
341            )
342        })
343        .collect()
344}
345
346#[cfg(test)]
347mod tests {
348    use super::*;
349    use brk_types::{SeriesLeaf, SeriesLeafWithSchema, TreeNode};
350
351    fn make_leaf(name: &str) -> TreeNode {
352        let leaf = SeriesLeaf {
353            name: name.to_string(),
354            kind: "TestType".to_string(),
355            indexes: BTreeSet::new(),
356        };
357        TreeNode::Leaf(SeriesLeafWithSchema::new(leaf, serde_json::json!({})))
358    }
359
360    fn make_branch(children: Vec<(&str, TreeNode)>) -> TreeNode {
361        let map: IndexMap<String, TreeNode> = children
362            .into_iter()
363            .map(|(k, v)| (k.to_string(), v))
364            .collect();
365        TreeNode::Branch(map)
366    }
367
368    #[test]
369    fn test_get_pattern_instance_base_with_base_field() {
370        // Simulates vbytes tree: has base field with block_vbytes leaf
371        let tree = make_branch(vec![
372            (
373                "base",
374                make_branch(vec![("day1", make_leaf("block_vbytes"))]),
375            ),
376            (
377                "average",
378                make_branch(vec![("day1", make_leaf("block_vbytes_average"))]),
379            ),
380            (
381                "sum",
382                make_branch(vec![("day1", make_leaf("block_vbytes_sum"))]),
383            ),
384        ]);
385
386        let result = get_pattern_instance_base(&tree);
387        assert_eq!(result.base, "block_vbytes");
388        assert!(!result.has_outlier);
389    }
390
391    #[test]
392    fn test_get_pattern_instance_base_without_base_field() {
393        // Simulates weight tree: NO base field, only suffixed series
394        let tree = make_branch(vec![
395            (
396                "average",
397                make_branch(vec![("day1", make_leaf("block_weight_average"))]),
398            ),
399            (
400                "sum",
401                make_branch(vec![("day1", make_leaf("block_weight_sum"))]),
402            ),
403            (
404                "cumulative",
405                make_branch(vec![("day1", make_leaf("block_weight_cumulative"))]),
406            ),
407            (
408                "max",
409                make_branch(vec![("day1", make_leaf("block_weight_max"))]),
410            ),
411            (
412                "min",
413                make_branch(vec![("day1", make_leaf("block_weight_min"))]),
414            ),
415        ]);
416
417        let result = get_pattern_instance_base(&tree);
418        assert_eq!(result.base, "block_weight");
419        assert!(!result.has_outlier);
420    }
421
422    #[test]
423    fn test_get_pattern_instance_base_with_duplicate_base_field() {
424        // What if there's a "base" field that points to the same leaf as "average"?
425        // This could happen if the tree generation creates a base field that shares leaves with average
426        let tree = make_branch(vec![
427            (
428                "base",
429                make_branch(vec![("day1", make_leaf("block_weight_average"))]),
430            ),
431            (
432                "average",
433                make_branch(vec![("day1", make_leaf("block_weight_average"))]),
434            ),
435            (
436                "sum",
437                make_branch(vec![("day1", make_leaf("block_weight_sum"))]),
438            ),
439        ]);
440
441        let result = get_pattern_instance_base(&tree);
442        // Common prefix among all children is "block_weight_"
443        assert_eq!(result.base, "block_weight");
444        assert!(!result.has_outlier);
445    }
446
447    #[test]
448    fn test_get_pattern_instance_base_with_mismatched_base_name() {
449        // Simulates the actual bug: indexed tree's "base" field has name "weight"
450        // but computed tree's derived series use "block_weight_*" prefix.
451        // After tree merge, we get a base field with mismatched naming.
452        let tree = make_branch(vec![
453            ("base", make_leaf("weight")), // Outlier - doesn't match pattern
454            ("average", make_leaf("block_weight_average")),
455            ("sum", make_leaf("block_weight_sum")),
456            ("cumulative", make_leaf("block_weight_cumulative")),
457            ("max", make_leaf("block_weight_max")),
458            ("min", make_leaf("block_weight_min")),
459        ]);
460
461        let result = get_pattern_instance_base(&tree);
462        // Should detect "weight" as outlier and find common prefix from others
463        assert_eq!(result.base, "block_weight");
464        assert!(result.has_outlier); // Pattern factory should NOT be used
465    }
466
467    #[test]
468    fn test_get_pattern_instance_base_root_level_no_common_pattern() {
469        // Simulates root-level pattern with series that have no common prefix/suffix.
470        // These names have no shared prefix or suffix, even when excluding any one.
471        // In this case, we should return empty base so series names are used directly.
472        let tree = make_branch(vec![
473            ("alpha", make_leaf("foo_series")),
474            ("beta", make_leaf("bar_value")),
475            ("gamma", make_leaf("baz_count")),
476        ]);
477
478        let result = get_pattern_instance_base(&tree);
479        // No common prefix or suffix - return empty base
480        assert_eq!(result.base, "");
481        assert!(!result.has_outlier);
482    }
483
484    #[test]
485    fn test_get_pattern_instance_base_two_children_no_pattern() {
486        // Two children with no common pattern - should still return empty base
487        let tree = make_branch(vec![
488            ("foo", make_leaf("alpha")),
489            ("bar", make_leaf("beta")),
490        ]);
491
492        let result = get_pattern_instance_base(&tree);
493        assert_eq!(result.base, "");
494        assert!(!result.has_outlier);
495    }
496
497    #[test]
498    fn test_get_pattern_instance_base_with_outlier_excluded() {
499        // Simulates the realized pattern: adjusted_sopr, sopr, asopr.
500        // When "asopr" is excluded as outlier, "adjusted_sopr" and "sopr" share suffix "_sopr".
501        // The outlier detection should find base="sopr" with has_outlier=true.
502        let tree = make_branch(vec![
503            ("adjustedSopr", make_leaf("adjusted_sopr")),
504            ("sopr", make_leaf("sopr")),
505            ("asopr", make_leaf("asopr")),
506        ]);
507
508        let result = get_pattern_instance_base(&tree);
509        // Outlier detected - pattern base found by excluding "asopr"
510        assert_eq!(result.base, "sopr");
511        assert!(result.has_outlier); // Pattern factory should NOT be used (inline instead)
512    }
513
514    #[test]
515    fn test_get_pattern_instance_base_suffix_mode_price_ago() {
516        // Simulates price_ago pattern: price_24h_ago, price_1w_ago, price_10y_ago
517        // Common prefix is "price_", so this is suffix mode
518        let tree = make_branch(vec![
519            ("_24h", make_leaf("price_24h_ago")),
520            ("_1w", make_leaf("price_1w_ago")),
521            ("_1m", make_leaf("price_1m_ago")),
522            ("_10y", make_leaf("price_10y_ago")),
523        ]);
524
525        let result = get_pattern_instance_base(&tree);
526        assert_eq!(result.base, "price");
527        assert!(result.is_suffix_mode); // Suffix mode: _m(base, "24h_ago")
528        assert!(!result.has_outlier);
529    }
530
531    #[test]
532    fn test_get_pattern_instance_base_prefix_mode_price_returns() {
533        // Simulates price_returns pattern: 24h_price_returns, 1w_price_returns, 10y_price_returns
534        // Common suffix is "_price_returns", so this is prefix mode
535        let tree = make_branch(vec![
536            ("_24h", make_leaf("24h_price_returns")),
537            ("_1w", make_leaf("1w_price_returns")),
538            ("_1m", make_leaf("1m_price_returns")),
539            ("_10y", make_leaf("10y_price_returns")),
540        ]);
541
542        let result = get_pattern_instance_base(&tree);
543        assert_eq!(result.base, "price_returns");
544        assert!(!result.is_suffix_mode); // Prefix mode: _p("24h_", base)
545        assert!(!result.has_outlier);
546    }
547
548    #[test]
549    fn test_mode_detection_distinguishes_similar_structures() {
550        // Two patterns with identical structure but different naming conventions
551        // should have different modes detected
552
553        // Suffix mode pattern
554        let suffix_tree = make_branch(vec![
555            ("_1y", make_leaf("lump_sum_1y")),
556            ("_2y", make_leaf("lump_sum_2y")),
557            ("_5y", make_leaf("lump_sum_5y")),
558        ]);
559        let suffix_result = get_pattern_instance_base(&suffix_tree);
560        assert_eq!(suffix_result.base, "lump_sum");
561        assert!(suffix_result.is_suffix_mode);
562
563        // Prefix mode pattern (same structure, different naming)
564        let prefix_tree = make_branch(vec![
565            ("_1y", make_leaf("1y_returns")),
566            ("_2y", make_leaf("2y_returns")),
567            ("_5y", make_leaf("5y_returns")),
568        ]);
569        let prefix_result = get_pattern_instance_base(&prefix_tree);
570        assert_eq!(prefix_result.base, "returns");
571        assert!(!prefix_result.is_suffix_mode);
572    }
573}