annis_web/
converter.rs

1use graphannis_core::{
2    annostorage::ValueSearch,
3    graph::ANNIS_NS,
4    types::{Component, NodeID},
5};
6use serde::{Deserialize, Serialize};
7use serde_with::{serde_as, DisplayFromStr, NoneAsEmptyString};
8use std::{
9    collections::{BTreeMap, BTreeSet, HashSet},
10    sync::Arc,
11};
12use tokio::sync::mpsc::Sender;
13
14use graphannis::{
15    graph::{AnnoKey, GraphStorage},
16    model::AnnotationComponentType,
17    AnnotationGraph,
18};
19use transient_btree_index::BtreeIndex;
20
21use crate::{
22    client::{
23        corpora,
24        search::{self, FindQuery},
25    },
26    state::{GlobalAppState, SessionArg},
27    Result,
28};
29
30#[serde_as]
31#[derive(Serialize, Deserialize, Debug, Clone)]
32pub struct CSVConfig {
33    #[serde(default)]
34    #[serde_as(as = "NoneAsEmptyString")]
35    pub span_segmentation: Option<String>,
36    #[serde(default)]
37    #[serde_as(as = "DisplayFromStr")]
38    pub left_context: usize,
39    #[serde(default)]
40    #[serde_as(as = "DisplayFromStr")]
41    pub right_context: usize,
42}
43
44pub struct CSVExporter {
45    query: FindQuery,
46    config: CSVConfig,
47    annotations_for_matched_nodes: BTreeMap<usize, BTreeSet<AnnoKey>>,
48    gap_edges: bimap::BiHashMap<NodeID, NodeID>,
49    subgraphs: BTreeMap<u64, AnnotationGraph>,
50    progress: Option<Sender<f32>>,
51}
52
/// Share of the overall progress that a single pass contributes.
const SINGLE_PASS_PROGRESS: f32 = 0.5;
/// Overall progress reported once the first pass has finished.
const AFTER_FIRST_PASS_PROGRESS: f32 = SINGLE_PASS_PROGRESS;
55
56impl CSVExporter {
57    pub fn new(query: FindQuery, config: CSVConfig, progress: Option<Sender<f32>>) -> Self {
58        Self {
59            query,
60            config,
61            annotations_for_matched_nodes: BTreeMap::new(),
62            progress,
63            gap_edges: bimap::BiHashMap::new(),
64            subgraphs: BTreeMap::new(),
65        }
66    }
67
68    pub async fn convert_text<W: std::io::Write>(
69        &mut self,
70        session: SessionArg,
71        state: &GlobalAppState,
72        limit: Option<u64>,
73        output: &mut W,
74    ) -> Result<()> {
75        // Get all the matches as Salt ID
76        let mut query = self.query.clone();
77        query.limit = limit;
78
79        let result = search::find(&session, &query, state).await?;
80
81        self.first_pass(&result, state, &session).await?;
82
83        if let Some(progress) = &self.progress {
84            progress.send(AFTER_FIRST_PASS_PROGRESS).await?;
85        }
86        self.second_pass(&result, output).await?;
87
88        if let Some(progress) = &self.progress {
89            progress.send(1.0).await?;
90        }
91        Ok(())
92    }
93
94    async fn first_pass(
95        &mut self,
96        matches: &BtreeIndex<u64, Vec<String>>,
97        state: &GlobalAppState,
98        session: &SessionArg,
99    ) -> Result<()> {
100        let datasource_gap_component = Component::new(
101            AnnotationComponentType::Ordering,
102            ANNIS_NS.into(),
103            "datasource-gap".into(),
104        );
105        for m in matches.range(..)? {
106            let (match_nr, node_ids) = m?;
107            // Get the corpus from the first node
108            if let Some(id) = node_ids.first() {
109                let (corpus, _) = id.split_once('/').unwrap_or_default();
110                // Get the subgraph for the IDs
111                let g = corpora::subgraph(
112                    session,
113                    corpus,
114                    node_ids.clone(),
115                    self.config.span_segmentation.clone(),
116                    self.config.left_context,
117                    self.config.right_context,
118                    state,
119                )
120                .await?;
121                // Collect annotations for the matched nodes
122                for (pos_in_match, node_name) in node_ids.iter().enumerate() {
123                    if let Some(n_id) = g.get_node_id_from_name(node_name)? {
124                        let annos = g
125                            .get_node_annos()
126                            .get_annotations_for_item(&n_id)?
127                            .into_iter()
128                            .filter(|a| a.key.ns != "annis")
129                            .map(|a| a.key);
130                        self.annotations_for_matched_nodes
131                            .entry(pos_in_match)
132                            .or_default()
133                            .extend(annos);
134                    }
135                }
136                // Remember all datasource gaph edges
137                if let Some(gs) = g.get_graphstorage_as_ref(&datasource_gap_component) {
138                    for source in gs.source_nodes() {
139                        let source = source?;
140                        for target in gs.get_outgoing_edges(source) {
141                            let target = target?;
142                            self.gap_edges.insert(source, target);
143                        }
144                    }
145                }
146                self.subgraphs.insert(match_nr, g);
147            }
148            if match_nr % 10 == 0 {
149                if let Some(sender) = &self.progress {
150                    let partial_progress = match_nr as f32 / matches.len() as f32;
151                    sender.send(partial_progress * SINGLE_PASS_PROGRESS).await?;
152                }
153            }
154        }
155        Ok(())
156    }
157
158    async fn second_pass<W>(
159        &self,
160        matches: &BtreeIndex<u64, Vec<String>>,
161        output: &mut W,
162    ) -> Result<()>
163    where
164        W: std::io::Write,
165    {
166        let mut writer = csv::Writer::from_writer(output);
167        // Create the header from the first entry
168        if matches.contains_key(&0)? {
169            let mut header = Vec::default();
170            header.push("text".to_string());
171            for (m_nr, annos) in &self.annotations_for_matched_nodes {
172                for anno_key in annos {
173                    let anno_qname =
174                        graphannis_core::util::join_qname(&anno_key.ns, &anno_key.name);
175                    header.push(format!("{} ({})", anno_qname, m_nr + 1));
176                }
177            }
178            writer.write_record(header)?;
179        }
180
181        // Iterate over all matches
182        for m in matches.range(..)? {
183            let (idx, node_ids) = m?;
184            // Get the subgraph for the IDs
185            if let Some(g) = self.subgraphs.get(&idx) {
186                let mut record: Vec<String> = Vec::with_capacity(node_ids.len() + 1);
187                // Output all columns for this match, first column is the matched text
188                let text = self.get_spannd_text(g)?;
189                record.push(text);
190                for (m_nr, annos) in &self.annotations_for_matched_nodes {
191                    if let Some(id) = g.get_node_id_from_name(&node_ids[*m_nr])? {
192                        // Get the annotation values for this node
193                        for anno_key in annos {
194                            let value = g
195                                .get_node_annos()
196                                .get_value_for_item(&id, anno_key)?
197                                .unwrap_or_default();
198                            record.push(value.to_string());
199                        }
200                    }
201                }
202                writer.write_record(record)?;
203            }
204
205            if idx % 10 == 0 {
206                if let Some(sender) = &self.progress {
207                    let partial_progress = idx as f32 / matches.len() as f32;
208                    sender
209                        .send(AFTER_FIRST_PASS_PROGRESS + (partial_progress * SINGLE_PASS_PROGRESS))
210                        .await?;
211                }
212            }
213        }
214        Ok(())
215    }
216
217    fn get_spannd_text(&self, g: &AnnotationGraph) -> Result<String> {
218        // Get ordering component that matches the configured segmentation
219        let ordering_component = if let Some(seg) = &self.config.span_segmentation {
220            Component::new(
221                AnnotationComponentType::Ordering,
222                "default_ns".into(),
223                seg.into(),
224            )
225        } else {
226            Component::new(
227                AnnotationComponentType::Ordering,
228                ANNIS_NS.into(),
229                "".into(),
230            )
231        };
232
233        let filtering_anno_key = self.config.span_segmentation.as_ref().map(|seg| AnnoKey {
234            name: seg.into(),
235            ns: "default_ns".into(),
236        });
237
238        let ordering_gs = g.get_graphstorage_as_ref(&ordering_component);
239        let cov_edges: Vec<Arc<dyn GraphStorage>> = g
240            .get_all_components(Some(AnnotationComponentType::Coverage), None)
241            .into_iter()
242            .filter_map(|c| g.get_graphstorage(&c))
243            .filter(|gs| {
244                if let Some(stats) = gs.get_statistics() {
245                    stats.nodes > 0
246                } else {
247                    true
248                }
249            })
250            .collect();
251
252        let mut roots: HashSet<_> = HashSet::new();
253        for n in g
254            .get_node_annos()
255            .exact_anno_search(Some(ANNIS_NS), "tok", ValueSearch::Any)
256        {
257            let n = n?;
258
259            let has_anno = if let Some(filter) = &filtering_anno_key {
260                // For segmentation search, only include the nodes that have a matching annotation
261                g.get_node_annos()
262                    .get_value_for_item(&n.node, filter)?
263                    .is_some()
264            } else {
265                // Check that this is an actual token and there are no outgoing coverage edges
266                let mut actual_token = true;
267                for c in cov_edges.iter() {
268                    if c.has_outgoing_edges(n.node)? {
269                        actual_token = false;
270                        break;
271                    }
272                }
273                actual_token
274            };
275
276            if has_anno
277                && (ordering_gs.is_none()
278                    || ordering_gs.is_some_and(|gs| gs.get_ingoing_edges(n.node).next().is_none()))
279            {
280                roots.insert(n.node);
281            }
282        }
283
284        // Order the roots in the overall text position by using the
285        // explicit gap edges. First find the root node that has no incoming
286        // gap, than follow the ordering and gap edges and construct the
287        // text in between.
288        let mut result = String::new();
289        let mut token = roots
290            .into_iter()
291            .find(|r| !self.gap_edges.contains_right(r));
292        let token_value_key = AnnoKey {
293            ns: ANNIS_NS.into(),
294            name: "tok".into(),
295        };
296        let whitespace_before_key = AnnoKey {
297            ns: ANNIS_NS.into(),
298            name: "tok-whitespace-before".into(),
299        };
300        let whitespace_after_key = AnnoKey {
301            ns: ANNIS_NS.into(),
302            name: "tok-whitespace-after".into(),
303        };
304
305        let mut is_first_token = true;
306
307        while let Some(current_token) = token {
308            // Add prefix whitespace only for first token
309            if is_first_token {
310                if let Some(val) = g
311                    .get_node_annos()
312                    .get_value_for_item(&current_token, &whitespace_before_key)?
313                {
314                    result.push_str(&val);
315                }
316            }
317
318            if let Some(val) = g
319                .get_node_annos()
320                .get_value_for_item(&current_token, &token_value_key)?
321            {
322                result.push_str(&val);
323            }
324
325            is_first_token = false;
326
327            // Try to get the outgoing ordering edge first
328            token = if let Some(ordering_gs) = ordering_gs {
329                if let Some(next_token) = ordering_gs.get_outgoing_edges(current_token).next() {
330                    let next_token = next_token?;
331                    Some(next_token)
332                } else if let Some(next_token) = self.gap_edges.get_by_left(&current_token) {
333                    result.push_str("(...) ");
334                    Some(*next_token)
335                } else {
336                    None
337                }
338            } else {
339                None
340            };
341
342            // Add postfix whitespace (but not for the last token)
343            if token.is_some() {
344                if let Some(val) = g
345                    .get_node_annos()
346                    .get_value_for_item(&current_token, &whitespace_after_key)?
347                {
348                    result.push_str(&val);
349                } else if self.config.span_segmentation.is_some() {
350                    // Use a space character as default seperation character
351                    result.push(' ');
352                }
353            }
354        }
355
356        Ok(result)
357    }
358}