1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
use super::RangeSpec;
use crate::annis::operator::EstimationType;
use crate::{
    annis::{
        db::aql::model::AnnotationComponentType,
        operator::{UnaryOperator, UnaryOperatorSpec},
    },
    errors::Result,
    graph::{GraphStorage, Match},
    AnnotationGraph,
};
use graphannis_core::types::{Component, NodeID};
use std::collections::HashSet;
use std::sync::Arc;

use rustc_hash::FxHashSet;

#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct AritySpec {
    pub children: RangeSpec,
}

impl UnaryOperatorSpec for AritySpec {
    fn necessary_components(
        &self,
        db: &AnnotationGraph,
    ) -> HashSet<Component<AnnotationComponentType>> {
        let mut result = HashSet::default();
        result.extend(db.get_all_components(Some(AnnotationComponentType::Dominance), None));
        result.extend(db.get_all_components(Some(AnnotationComponentType::Pointing), None));
        result
    }

    fn create_operator(&self, db: &AnnotationGraph) -> Result<Box<dyn UnaryOperator>> {
        // collect all relevant graph storages
        let mut graphstorages = Vec::default();

        for component in db.get_all_components(Some(AnnotationComponentType::Dominance), None) {
            if let Some(gs) = db.get_graphstorage(&component) {
                graphstorages.push(gs);
            }
        }
        for component in db.get_all_components(Some(AnnotationComponentType::Pointing), None) {
            if let Some(gs) = db.get_graphstorage(&component) {
                graphstorages.push(gs);
            }
        }

        Ok(Box::new(ArityOperator {
            graphstorages,
            allowed_range: self.children.clone(),
        }))
    }
}

struct ArityOperator {
    graphstorages: Vec<Arc<dyn GraphStorage>>,
    allowed_range: RangeSpec,
}

impl std::fmt::Display for ArityOperator {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, ":arity={}", self.allowed_range)
    }
}

impl UnaryOperator for ArityOperator {
    fn filter_match(&self, m: &Match) -> Result<bool> {
        let mut children: FxHashSet<NodeID> = FxHashSet::default();
        for gs in self.graphstorages.iter() {
            for out in gs.get_outgoing_edges(m.node) {
                let out = out?;
                children.insert(out);
            }
        }

        let num_children = children.len();
        if num_children >= self.allowed_range.min_dist() {
            match self.allowed_range.max_dist() {
                std::ops::Bound::Unbounded => Ok(true),
                std::ops::Bound::Included(max_dist) => Ok(num_children <= max_dist),
                std::ops::Bound::Excluded(max_dist) => Ok(num_children < max_dist),
            }
        } else {
            Ok(false)
        }
    }

    fn estimation_type(&self) -> EstimationType {
        if let RangeSpec::Bound { min_dist, max_dist } = self.allowed_range {
            let mut min_matches_any = false;
            let mut max_sel: f64 = 0.0;

            for gs in self.graphstorages.iter() {
                if let Some(stats) = gs.get_statistics() {
                    if min_dist <= stats.max_fan_out {
                        min_matches_any = true;
                    }

                    let max_fan_out = stats.max_fan_out;
                    // clip to asssumed maximum
                    let max_dist = std::cmp::min(max_dist, max_fan_out);
                    let min_dist = std::cmp::min(min_dist, max_dist);

                    // TODO: we would need a histogram of the distribution of outgoing edges
                    // for guessing the the number of matching nodes. For now just assume it is much
                    // more likely to have a larger range instead of a single value
                    let spec_range_len = max_dist - min_dist + 1;
                    let stats_range_len = max_fan_out;
                    let sel = spec_range_len as f64 / (stats_range_len as f64);
                    max_sel = max_sel.max(sel);
                } else {
                    // use default
                    max_sel = max_sel.max(0.1);
                }
            }

            if min_matches_any {
                if max_sel >= 1.0 {
                    EstimationType::Selectivity(1.0)
                } else {
                    EstimationType::Selectivity(max_sel)
                }
            } else {
                // no graph storages has the minimum required amount of outgoing edges
                EstimationType::Selectivity(0.0)
            }
        } else {
            // this range spec allows any number of outgoing edges
            EstimationType::Selectivity(1.0)
        }
    }
}