1use graphannis_core::{
2 annostorage::ValueSearch,
3 graph::ANNIS_NS,
4 types::{Component, NodeID},
5};
6use serde::{Deserialize, Serialize};
7use serde_with::{serde_as, DisplayFromStr, NoneAsEmptyString};
8use std::{
9 collections::{BTreeMap, BTreeSet, HashSet},
10 sync::Arc,
11};
12use tokio::sync::mpsc::Sender;
13
14use graphannis::{
15 graph::{AnnoKey, GraphStorage},
16 model::AnnotationComponentType,
17 AnnotationGraph,
18};
19use transient_btree_index::BtreeIndex;
20
21use crate::{
22 client::{
23 corpora,
24 search::{self, FindQuery},
25 },
26 state::{GlobalAppState, SessionArg},
27 Result,
28};
29
/// User-configurable options for the CSV export.
///
/// Every field falls back to its `Default` value when it is missing from the
/// deserialized input (`#[serde(default)]`). The values are (de)serialized in
/// their string form via `serde_with`.
#[serde_as]
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct CSVConfig {
    /// Name of the segmentation whose ordering component is followed when the
    /// text of a match is reconstructed; it is also forwarded to the subgraph
    /// request. `None` selects the ordering component with an empty name.
    /// An empty string in the serialized form is read as `None`
    /// (`NoneAsEmptyString`).
    #[serde(default)]
    #[serde_as(as = "NoneAsEmptyString")]
    pub span_segmentation: Option<String>,
    /// Size of the context before the match, forwarded unchanged to the
    /// subgraph request. Parsed from / written as a string (`DisplayFromStr`).
    // NOTE(review): the unit (presumably tokens or segmentation units) is
    // defined by `corpora::subgraph` — confirm there.
    #[serde(default)]
    #[serde_as(as = "DisplayFromStr")]
    pub left_context: usize,
    /// Size of the context after the match, forwarded unchanged to the
    /// subgraph request. Parsed from / written as a string (`DisplayFromStr`).
    #[serde(default)]
    #[serde_as(as = "DisplayFromStr")]
    pub right_context: usize,
}
43
/// Exports the matches of a query as CSV in two passes: the first pass
/// fetches a subgraph per match and collects the annotation keys that become
/// columns, the second pass writes the header and one record per match.
pub struct CSVExporter {
    /// Query whose matches are exported; its `limit` is overridden per export.
    query: FindQuery,
    /// Export options (segmentation and context sizes).
    config: CSVConfig,
    /// For each 0-based position of a node inside a match: the annotation
    /// keys (excluding the `annis` namespace) seen on the nodes at that
    /// position in any match. Each key becomes one CSV column.
    annotations_for_matched_nodes: BTreeMap<usize, BTreeSet<AnnoKey>>,
    /// Edges of the `datasource-gap` ordering component (source <-> target),
    /// collected from the subgraphs of all matches.
    // NOTE(review): edges of all subgraphs share this one map — this assumes
    // node IDs do not collide between different subgraphs; confirm.
    gap_edges: bimap::BiHashMap<NodeID, NodeID>,
    /// Subgraph fetched for each match, keyed by the match number.
    subgraphs: BTreeMap<u64, AnnotationGraph>,
    /// Optional channel that receives progress values; `1.0` is sent when the
    /// export has finished.
    progress: Option<Sender<f32>>,
}
52
/// Share of the overall progress that each of the two export passes
/// contributes; the partial progress of a pass is scaled by this factor.
const SINGLE_PASS_PROGRESS: f32 = 0.5;
/// Progress value reported once the first pass has finished. The second pass
/// adds its own scaled progress on top of this value.
const AFTER_FIRST_PASS_PROGRESS: f32 = SINGLE_PASS_PROGRESS;
55
56impl CSVExporter {
57 pub fn new(query: FindQuery, config: CSVConfig, progress: Option<Sender<f32>>) -> Self {
58 Self {
59 query,
60 config,
61 annotations_for_matched_nodes: BTreeMap::new(),
62 progress,
63 gap_edges: bimap::BiHashMap::new(),
64 subgraphs: BTreeMap::new(),
65 }
66 }
67
68 pub async fn convert_text<W: std::io::Write>(
69 &mut self,
70 session: SessionArg,
71 state: &GlobalAppState,
72 limit: Option<u64>,
73 output: &mut W,
74 ) -> Result<()> {
75 let mut query = self.query.clone();
77 query.limit = limit;
78
79 let result = search::find(&session, &query, state).await?;
80
81 self.first_pass(&result, state, &session).await?;
82
83 if let Some(progress) = &self.progress {
84 progress.send(AFTER_FIRST_PASS_PROGRESS).await?;
85 }
86 self.second_pass(&result, output).await?;
87
88 if let Some(progress) = &self.progress {
89 progress.send(1.0).await?;
90 }
91 Ok(())
92 }
93
94 async fn first_pass(
95 &mut self,
96 matches: &BtreeIndex<u64, Vec<String>>,
97 state: &GlobalAppState,
98 session: &SessionArg,
99 ) -> Result<()> {
100 let datasource_gap_component = Component::new(
101 AnnotationComponentType::Ordering,
102 ANNIS_NS.into(),
103 "datasource-gap".into(),
104 );
105 for m in matches.range(..)? {
106 let (match_nr, node_ids) = m?;
107 if let Some(id) = node_ids.first() {
109 let (corpus, _) = id.split_once('/').unwrap_or_default();
110 let g = corpora::subgraph(
112 session,
113 corpus,
114 node_ids.clone(),
115 self.config.span_segmentation.clone(),
116 self.config.left_context,
117 self.config.right_context,
118 state,
119 )
120 .await?;
121 for (pos_in_match, node_name) in node_ids.iter().enumerate() {
123 if let Some(n_id) = g.get_node_id_from_name(node_name)? {
124 let annos = g
125 .get_node_annos()
126 .get_annotations_for_item(&n_id)?
127 .into_iter()
128 .filter(|a| a.key.ns != "annis")
129 .map(|a| a.key);
130 self.annotations_for_matched_nodes
131 .entry(pos_in_match)
132 .or_default()
133 .extend(annos);
134 }
135 }
136 if let Some(gs) = g.get_graphstorage_as_ref(&datasource_gap_component) {
138 for source in gs.source_nodes() {
139 let source = source?;
140 for target in gs.get_outgoing_edges(source) {
141 let target = target?;
142 self.gap_edges.insert(source, target);
143 }
144 }
145 }
146 self.subgraphs.insert(match_nr, g);
147 }
148 if match_nr % 10 == 0 {
149 if let Some(sender) = &self.progress {
150 let partial_progress = match_nr as f32 / matches.len() as f32;
151 sender.send(partial_progress * SINGLE_PASS_PROGRESS).await?;
152 }
153 }
154 }
155 Ok(())
156 }
157
158 async fn second_pass<W>(
159 &self,
160 matches: &BtreeIndex<u64, Vec<String>>,
161 output: &mut W,
162 ) -> Result<()>
163 where
164 W: std::io::Write,
165 {
166 let mut writer = csv::Writer::from_writer(output);
167 if matches.contains_key(&0)? {
169 let mut header = Vec::default();
170 header.push("text".to_string());
171 for (m_nr, annos) in &self.annotations_for_matched_nodes {
172 for anno_key in annos {
173 let anno_qname =
174 graphannis_core::util::join_qname(&anno_key.ns, &anno_key.name);
175 header.push(format!("{} ({})", anno_qname, m_nr + 1));
176 }
177 }
178 writer.write_record(header)?;
179 }
180
181 for m in matches.range(..)? {
183 let (idx, node_ids) = m?;
184 if let Some(g) = self.subgraphs.get(&idx) {
186 let mut record: Vec<String> = Vec::with_capacity(node_ids.len() + 1);
187 let text = self.get_spannd_text(g)?;
189 record.push(text);
190 for (m_nr, annos) in &self.annotations_for_matched_nodes {
191 if let Some(id) = g.get_node_id_from_name(&node_ids[*m_nr])? {
192 for anno_key in annos {
194 let value = g
195 .get_node_annos()
196 .get_value_for_item(&id, anno_key)?
197 .unwrap_or_default();
198 record.push(value.to_string());
199 }
200 }
201 }
202 writer.write_record(record)?;
203 }
204
205 if idx % 10 == 0 {
206 if let Some(sender) = &self.progress {
207 let partial_progress = idx as f32 / matches.len() as f32;
208 sender
209 .send(AFTER_FIRST_PASS_PROGRESS + (partial_progress * SINGLE_PASS_PROGRESS))
210 .await?;
211 }
212 }
213 }
214 Ok(())
215 }
216
217 fn get_spannd_text(&self, g: &AnnotationGraph) -> Result<String> {
218 let ordering_component = if let Some(seg) = &self.config.span_segmentation {
220 Component::new(
221 AnnotationComponentType::Ordering,
222 "default_ns".into(),
223 seg.into(),
224 )
225 } else {
226 Component::new(
227 AnnotationComponentType::Ordering,
228 ANNIS_NS.into(),
229 "".into(),
230 )
231 };
232
233 let filtering_anno_key = self.config.span_segmentation.as_ref().map(|seg| AnnoKey {
234 name: seg.into(),
235 ns: "default_ns".into(),
236 });
237
238 let ordering_gs = g.get_graphstorage_as_ref(&ordering_component);
239 let cov_edges: Vec<Arc<dyn GraphStorage>> = g
240 .get_all_components(Some(AnnotationComponentType::Coverage), None)
241 .into_iter()
242 .filter_map(|c| g.get_graphstorage(&c))
243 .filter(|gs| {
244 if let Some(stats) = gs.get_statistics() {
245 stats.nodes > 0
246 } else {
247 true
248 }
249 })
250 .collect();
251
252 let mut roots: HashSet<_> = HashSet::new();
253 for n in g
254 .get_node_annos()
255 .exact_anno_search(Some(ANNIS_NS), "tok", ValueSearch::Any)
256 {
257 let n = n?;
258
259 let has_anno = if let Some(filter) = &filtering_anno_key {
260 g.get_node_annos()
262 .get_value_for_item(&n.node, filter)?
263 .is_some()
264 } else {
265 let mut actual_token = true;
267 for c in cov_edges.iter() {
268 if c.has_outgoing_edges(n.node)? {
269 actual_token = false;
270 break;
271 }
272 }
273 actual_token
274 };
275
276 if has_anno
277 && (ordering_gs.is_none()
278 || ordering_gs.is_some_and(|gs| gs.get_ingoing_edges(n.node).next().is_none()))
279 {
280 roots.insert(n.node);
281 }
282 }
283
284 let mut result = String::new();
289 let mut token = roots
290 .into_iter()
291 .find(|r| !self.gap_edges.contains_right(r));
292 let token_value_key = AnnoKey {
293 ns: ANNIS_NS.into(),
294 name: "tok".into(),
295 };
296 let whitespace_before_key = AnnoKey {
297 ns: ANNIS_NS.into(),
298 name: "tok-whitespace-before".into(),
299 };
300 let whitespace_after_key = AnnoKey {
301 ns: ANNIS_NS.into(),
302 name: "tok-whitespace-after".into(),
303 };
304
305 let mut is_first_token = true;
306
307 while let Some(current_token) = token {
308 if is_first_token {
310 if let Some(val) = g
311 .get_node_annos()
312 .get_value_for_item(¤t_token, &whitespace_before_key)?
313 {
314 result.push_str(&val);
315 }
316 }
317
318 if let Some(val) = g
319 .get_node_annos()
320 .get_value_for_item(¤t_token, &token_value_key)?
321 {
322 result.push_str(&val);
323 }
324
325 is_first_token = false;
326
327 token = if let Some(ordering_gs) = ordering_gs {
329 if let Some(next_token) = ordering_gs.get_outgoing_edges(current_token).next() {
330 let next_token = next_token?;
331 Some(next_token)
332 } else if let Some(next_token) = self.gap_edges.get_by_left(¤t_token) {
333 result.push_str("(...) ");
334 Some(*next_token)
335 } else {
336 None
337 }
338 } else {
339 None
340 };
341
342 if token.is_some() {
344 if let Some(val) = g
345 .get_node_annos()
346 .get_value_for_item(¤t_token, &whitespace_after_key)?
347 {
348 result.push_str(&val);
349 } else if self.config.span_segmentation.is_some() {
350 result.push(' ');
352 }
353 }
354 }
355
356 Ok(result)
357 }
358}