graphannis/annis/db/aql/operators/
inclusion.rs

1use crate::annis::db::exec::CostEstimate;
2use crate::annis::db::token_helper;
3use crate::annis::db::token_helper::TokenHelper;
4use crate::annis::errors::GraphAnnisError;
5use crate::annis::operator::{BinaryOperator, BinaryOperatorIndex, EstimationType};
6use crate::{AnnotationGraph, try_as_boxed_iter};
7use crate::{
8    annis::operator::{BinaryOperatorBase, BinaryOperatorSpec},
9    errors::Result,
10    graph::{GraphStorage, Match},
11    model::AnnotationComponentType,
12};
13use graphannis_core::types::NodeID;
14use graphannis_core::{
15    graph::{ANNIS_NS, DEFAULT_ANNO_KEY},
16    types::Component,
17};
18use itertools::Itertools;
19
20use std::collections::HashSet;
21use std::sync::Arc;
22
23#[derive(Clone, Debug, PartialOrd, Ord, Hash, PartialEq, Eq)]
24pub struct InclusionSpec;
25
26pub struct Inclusion<'a> {
27    gs_order: Arc<dyn GraphStorage>,
28    tok_helper: TokenHelper<'a>,
29}
30
31lazy_static! {
32    static ref COMPONENT_ORDER: Component<AnnotationComponentType> = {
33        Component::new(
34            AnnotationComponentType::Ordering,
35            ANNIS_NS.into(),
36            "".into(),
37        )
38    };
39}
40
41impl BinaryOperatorSpec for InclusionSpec {
42    fn necessary_components(
43        &self,
44        db: &AnnotationGraph,
45    ) -> HashSet<Component<AnnotationComponentType>> {
46        let mut v = HashSet::default();
47        v.insert(COMPONENT_ORDER.clone());
48        v.extend(token_helper::necessary_components(db));
49        v
50    }
51
52    fn create_operator<'a>(
53        &self,
54        db: &'a AnnotationGraph,
55        _cost_estimate: Option<(&CostEstimate, &CostEstimate)>,
56    ) -> Result<BinaryOperator<'a>> {
57        let optional_op = Inclusion::new(db);
58        optional_op.map(|op| BinaryOperator::Index(Box::new(op)))
59    }
60
61    #[cfg(test)]
62    fn into_any(self: Arc<Self>) -> Arc<dyn std::any::Any> {
63        self
64    }
65
66    #[cfg(test)]
67    fn any_ref(&self) -> &dyn std::any::Any {
68        self
69    }
70}
71
72impl<'a> Inclusion<'a> {
73    pub fn new(db: &'a AnnotationGraph) -> Result<Inclusion<'a>> {
74        let gs_order = db.get_graphstorage(&COMPONENT_ORDER).ok_or_else(|| {
75            GraphAnnisError::ImpossibleSearch(
76                "Ordering component missing (needed for _i_ operator)".to_string(),
77            )
78        })?;
79
80        let tok_helper = TokenHelper::new(db)?;
81
82        Ok(Inclusion {
83            gs_order,
84            tok_helper,
85        })
86    }
87}
88
89impl std::fmt::Display for Inclusion<'_> {
90    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
91        write!(f, "_i_")
92    }
93}
94
95impl BinaryOperatorBase for Inclusion<'_> {
96    fn filter_match(&self, lhs: &Match, rhs: &Match) -> Result<bool> {
97        let left_right_lhs = self.tok_helper.left_right_token_for(lhs.node)?;
98        let left_right_rhs = self.tok_helper.left_right_token_for(rhs.node)?;
99        if let (Some(start_lhs), Some(end_lhs), Some(start_rhs), Some(end_rhs)) = (
100            left_right_lhs.0,
101            left_right_lhs.1,
102            left_right_rhs.0,
103            left_right_rhs.1,
104        ) {
105            // span length of LHS
106            if let Some(l) = self.gs_order.distance(start_lhs, end_lhs)? {
107                // path between left-most tokens exists in ORDERING component and has maximum length l
108                if self.gs_order.is_connected(start_lhs, start_rhs, 0, std::ops::Bound::Included(l))?
109                // path between right-most tokens exists in ORDERING component and has maximum length l
110                && self.gs_order.is_connected(end_rhs, end_lhs, 0, std::ops::Bound::Included(l))?
111                {
112                    return Ok(true);
113                }
114            }
115        }
116
117        Ok(false)
118    }
119
120    fn is_reflexive(&self) -> bool {
121        false
122    }
123
124    fn estimation_type(&self) -> Result<EstimationType> {
125        if let (Some(stats_order), Some(stats_left)) = (
126            self.gs_order.get_statistics(),
127            self.tok_helper.get_gs_left_token().get_statistics(),
128        ) {
129            let mut sum_cov_nodes = 0;
130            let mut sum_included = 0;
131
132            let num_of_token = stats_order.nodes as f64;
133            for gs_cov in self.tok_helper.get_gs_coverage().iter() {
134                if let Some(stats_cov) = gs_cov.get_statistics() {
135                    sum_cov_nodes += stats_cov.nodes;
136
137                    let covered_token_per_node = stats_cov.fan_out_99_percentile;
138                    let aligned_non_token =
139                        covered_token_per_node * stats_left.inverse_fan_out_99_percentile;
140
141                    sum_included += covered_token_per_node + aligned_non_token;
142                }
143            }
144            if sum_cov_nodes == 0 {
145                // only token in this corpus
146                return Ok(EstimationType::Selectivity(1.0 / num_of_token));
147            } else {
148                return Ok(EstimationType::Selectivity(
149                    (sum_included as f64) / (sum_cov_nodes as f64),
150                ));
151            }
152        }
153
154        Ok(EstimationType::Selectivity(0.1))
155    }
156}
157
158enum NodeType {
159    Token(NodeID),
160    Other(NodeID),
161}
162
163impl BinaryOperatorIndex for Inclusion<'_> {
164    fn retrieve_matches<'b>(&'b self, lhs: &Match) -> Box<dyn Iterator<Item = Result<Match>> + 'b> {
165        let left_right_token = try_as_boxed_iter!(self.tok_helper.left_right_token_for(lhs.node));
166        if let (Some(start_lhs), Some(end_lhs)) = left_right_token {
167            // span length of LHS
168            let l = try_as_boxed_iter!(self.gs_order.distance(start_lhs, end_lhs));
169            if let Some(l) = l {
170                // find each token which is between the left and right border
171                let overlapped_token =
172                    self.gs_order
173                        .find_connected(start_lhs, 0, std::ops::Bound::Included(l));
174                // get the nodes that are covering these overlapped tokens
175                let candidates = overlapped_token
176                    .map_ok(move |t| {
177                        let others = self
178                            .tok_helper
179                            .get_gs_left_token()
180                            .get_ingoing_edges(t)
181                            .map_ok(NodeType::Other);
182                        // return the token itself and all aligned nodes
183                        std::iter::once(Ok(NodeType::Token(t))).chain(others)
184                    })
185                    .flatten_ok()
186                    // Unwrap the Result<Result<_>>
187                    .map(|c| match c {
188                        Ok(c) => match c {
189                            Ok(c) => Ok(c),
190                            Err(e) => Err(GraphAnnisError::from(e)),
191                        },
192                        Err(e) => Err(GraphAnnisError::from(e)),
193                    });
194                // we need to check if the the RHS of these candidates is also included by the original span
195                let result = candidates
196                    .map(move |n| {
197                        let n = n?;
198                        match n {
199                            NodeType::Token(t) => Ok(Some(t)),
200                            NodeType::Other(n) => {
201                                // get right-aligned token of candidate
202                                let mut end_n =
203                                    self.tok_helper.get_gs_right_token().get_outgoing_edges(n);
204                                if let Some(end_n) = end_n.next() {
205                                    let end_n = end_n?;
206                                    if self.gs_order.is_connected(
207                                        end_n,
208                                        end_lhs,
209                                        0,
210                                        std::ops::Bound::Included(l),
211                                    )? {
212                                        // path between right-most tokens exists in ORDERING component
213                                        // and has maximum length l
214                                        return Ok(Some(n));
215                                    }
216                                }
217                                Ok(None)
218                            }
219                        }
220                    })
221                    // Only include the ones where the constraint was met
222                    .filter_map_ok(|n| n)
223                    .map_ok(|n| Match {
224                        node: n,
225                        anno_key: DEFAULT_ANNO_KEY.clone(),
226                    });
227                return Box::new(result);
228            }
229        }
230
231        Box::new(std::iter::empty())
232    }
233
234    fn as_binary_operator(&self) -> &dyn BinaryOperatorBase {
235        self
236    }
237}