1use crate::align::print_transposition as print_translation;
2use serde::Deserialize;
3use stam::*;
4use std::borrow::Cow;
5use std::collections::HashMap;
6use toml;
7
8pub fn translate<'store>(
9 store: &'store mut AnnotationStore,
10 mut translation_queries: Vec<Query<'store>>,
11 queries: Vec<Query<'store>>,
12 use_translation_var: Option<&str>,
13 use_var: Option<&str>,
14 id_prefix: Option<String>,
15 idstrategy: IdStrategy,
16 ignore_errors: bool,
17 verbose: bool,
18 config: TranslateConfig,
19) -> Result<Vec<AnnotationHandle>, StamError> {
20 let mut builders = Vec::new();
21 while translation_queries.len() < queries.len() {
22 let query = translation_queries
23 .get(translation_queries.len() - 1)
24 .expect("there must be translation queries");
25 translation_queries.push(query.clone());
26 }
27 for (translation_query, query) in translation_queries.into_iter().zip(queries.into_iter()) {
28 let iter = store.query(translation_query)?;
29 let mut translation: Option<ResultItem<Annotation>> = None;
30 let translationdata = store
31 .find_data(
32 "https://w3id.org/stam/extensions/stam-translate/",
33 "Translation",
34 DataOperator::Null,
35 )
36 .next()
37 .ok_or_else(|| {
38 StamError::OtherError(
39 "No translations at all were found in the annotation store (the STAM translate vocabulary is not present in the store)",
40 )
41 })?;
42 for resultrow in iter {
43 if let Ok(QueryResultItem::Annotation(annotation)) =
44 resultrow.get_by_name_or_last(use_translation_var)
45 {
46 if !annotation.has_data(&translationdata) {
47 return Err(StamError::OtherError(
48 "The retrieved annotation is not explicitly marked as a translation, refusing to use",
49 ));
50 }
51 translation = Some(annotation.clone());
52 break;
53 }
54 }
55 if let Some(translation) = translation {
56 let iter = store.query(query)?;
57 for resultrow in iter {
58 if let Ok(QueryResultItem::Annotation(annotation)) =
59 resultrow.get_by_name_or_last(use_var)
60 {
61 let mut config = config.clone();
62 if let Some(id) = annotation.id() {
63 let randomid = generate_id("", "");
64 config.translation_id = if let Some(id_prefix) = &id_prefix {
65 Some(format!("{}{}-translation-{}", id_prefix, id, randomid))
66 } else {
67 Some(format!("{}-translation-{}", id, randomid))
68 };
69 config.resegmentation_id = if let Some(id_prefix) = &id_prefix {
70 Some(format!("{}{}-resegmentation-{}", id_prefix, id, randomid))
71 } else {
72 Some(format!("{}-resegmentation-{}", id, randomid))
73 };
74 config.source_side_id = Some(id.to_string());
75 config.existing_source_side = true;
76 config.target_side_ids = vec![if let Some(id_prefix) = &id_prefix {
77 format!("{}{}", id_prefix, regenerate_id(id, &idstrategy))
78 } else {
79 regenerate_id(id, &idstrategy)
80 }];
81 } else {
82 config.existing_source_side = false;
83 }
84 match annotation.translate(&translation, config) {
85 Ok(results) => builders.extend(results),
86 Err(StamError::NoText(e)) => {
87 eprintln!(
88 "WARNING: Skipping translation of annotation {} that references no text: {}",
89 annotation.id().unwrap_or("(no id)"),
90 e
91 );
92 }
93 Err(err) => {
94 eprintln!(
95 "WARNING: Failed to translate annotation {}: {}",
96 annotation.id().unwrap_or("(no id)"),
97 err
98 );
99 if !ignore_errors {
100 return Err(StamError::OtherError(
101 "Failed to translate annotation",
102 ));
103 }
104 }
105 }
106 } else {
107 return Err(StamError::OtherError(
108 "Query should return instances of ANNOTATION to translate, got something else instead",
109 ));
110 }
111 }
112 } else {
113 return Err(StamError::OtherError(
114 "Translation queries should return an ANNOTATION that is a translation, none found",
115 ));
116 }
117 }
118 let mut annotations = Vec::with_capacity(builders.len());
119 for builder in builders {
120 let annotation_handle = if !config.modify_existing {
121 let annotation_handle = store.annotate(builder)?;
123 annotations.push(annotation_handle);
124 annotation_handle
125 } else {
126 store.reannotate(builder, ReannotateMode::default())?
128 };
129 if verbose {
130 let annotation = store
131 .annotation(annotation_handle)
132 .expect("annotation was just added");
133 let translationdata = store
134 .find_data(
135 "https://w3id.org/stam/extensions/stam-translate/",
136 "Translation",
137 DataOperator::Null,
138 )
139 .next()
140 .ok_or_else(|| {
141 StamError::OtherError(
142 "No translations at all were found in the annotation store (the STAM translate vocabulary is not present in the store)",
143 )
144 })?;
145 if annotation.has_data(&translationdata) {
146 print_translation(&annotation);
147 } else if config.modify_existing {
148 eprintln!(
149 "# updated annotation {}",
150 annotation.id().expect("annotation must have ID")
151 );
152 } else {
153 eprintln!(
154 "# added annotation {}",
155 annotation.id().expect("annotation must have ID")
156 );
157 }
158 }
159 }
160 if verbose {
161 if !config.modify_existing {
162 eprintln!("{} annotations(s) created", annotations.len());
163 } else {
164 eprintln!("{} annotations(s) updated", annotations.len());
165 }
166 }
167 Ok(annotations)
168}
169
/// A single rule for rule-based text translation, deserialized from the configuration.
///
/// A rule matches a `source` at the current cursor position, optionally restricted by the
/// text directly to the `left` and `right` of the match and by query-based `constraints`,
/// and produces `target` as output.
#[derive(Clone, Default, Deserialize, Debug)]
pub struct TranslateTextRule {
    /// The text to match. A value enclosed in slashes (`/.../`) is compiled as a regular
    /// expression by `TranslateTextConfig::compile_regexps()`, anything else is matched
    /// literally. Rules without a source are rejected when the configuration is loaded.
    source: Option<String>,
    /// The output text. The special values `$UPPER`, `$LOWER` and `$REVERSED` derive the
    /// output from the matched source text instead (see `get_target()`).
    target: String,
    /// Required context immediately before the match: a literal, or a regular expression
    /// when enclosed in slashes (`/.../`).
    left: Option<String>,
    /// Required context immediately after the match: a literal, or a regular expression
    /// when enclosed in slashes (`/.../`).
    right: Option<String>,

    /// Whether matching is case sensitive. Defaults to `true` when absent from the
    /// configuration.
    #[serde(default = "f_true")]
    case_sensitive: bool,

    /// Inverts the outcome of the left/right context test.
    #[serde(default)]
    invert_context_match: bool,

    /// Additional query-based constraints, evaluated when translating text.
    #[serde(default)]
    constraints: Vec<TranslateTextConstraint>,

    /// Compiled form of `source`, set by `compile_regexps()` if it is a regular expression.
    #[serde(skip)]
    source_regex: Option<Regex>,
    /// Compiled form of `left`, set by `compile_regexps()` if it is a regular expression.
    #[serde(skip)]
    left_regex: Option<Regex>,
    /// Compiled form of `right`, set by `compile_regexps()` if it is a regular expression.
    #[serde(skip)]
    right_regex: Option<Regex>,
}
193
/// Serde default provider that yields `true`; used for `TranslateTextRule::case_sensitive`.
fn f_true() -> bool {
    true
}
198
/// A query-based constraint on a translation rule.
///
/// The constraint is met when the query yields a result (and, if `test` is set, when that
/// result contains the named variable); `invert` flips this outcome.
#[derive(Clone, Default, Deserialize, Debug)]
pub struct TranslateTextConstraint {
    /// The query string, parsed with `stam::Query::parse()`. Before evaluation the variables
    /// `resource`, `source`, `left` and `right` are bound to the current match.
    query: String,

    /// Optional name of a variable that must be present in the first result row.
    #[serde(default)]
    test: Option<String>,

    /// When set, the constraint is met if the test fails rather than succeeds.
    #[serde(default)]
    invert: bool,
}
209
/// The outcome of a successfully matched translation rule.
pub struct MatchedRule<'a> {
    /// The source text covered by the match; its length determines how much input is consumed.
    source: &'a str,
    /// The text to emit in place of the source.
    target: Cow<'a, str>,
}
214
215impl TranslateTextRule {
216 pub fn test<'a>(&'a self, text: &'a str, bytecursor: usize) -> Option<MatchedRule<'a>> {
218 if let Some(source_regex) = self.source_regex.as_ref() {
219 if let Some(m) = source_regex.find(&text[bytecursor..]) {
221 if self.test_context(text, bytecursor, m.len()) {
222 return Some(MatchedRule {
223 target: self.get_target(m.as_str()),
224 source: m.as_str(),
225 });
226 }
227 }
228 } else if let Some(source) = self.source.as_ref() {
229 if bytecursor + source.len() <= text.len() {
231 if let Some(candidate) = text.get(bytecursor..bytecursor + source.len()) {
232 if ((self.case_sensitive && candidate == *source)
233 || (!self.case_sensitive && candidate.to_lowercase() == *source))
234 && self.test_context(text, bytecursor, source.len())
235 {
236 return Some(MatchedRule {
237 target: self.get_target(source.as_str()),
238 source: source.as_str().into(),
239 });
240 }
241 }
242 }
243 }
244 None
245 }
246
247 fn test_context(&self, text: &str, bytecursor: usize, matchbytelen: usize) -> bool {
249 if let Some(left_regex) = self.left_regex.as_ref() {
250 let leftcontext = &text[..bytecursor];
252 if !left_regex.is_match(leftcontext) {
253 if self.invert_context_match {
254 return true;
255 } else {
256 return false;
257 }
258 }
259 } else if let Some(left_pattern) = self.left.as_ref() {
260 let leftcontext = &text[..bytecursor];
262 if (self.case_sensitive && !leftcontext.ends_with(left_pattern))
263 || (!self.case_sensitive
264 && leftcontext[std::cmp::min(0, bytecursor - left_pattern.len())..]
265 .to_lowercase()
266 != left_pattern.to_lowercase())
267 {
268 if self.invert_context_match {
269 return true;
270 } else {
271 return false;
272 }
273 }
274 }
275 if let Some(right_regex) = self.right_regex.as_ref() {
276 let rightcontext = &text[bytecursor + matchbytelen..];
278 if !right_regex.is_match(rightcontext) {
279 if self.invert_context_match {
280 return true;
281 } else {
282 return false;
283 }
284 }
285 } else if let Some(right_pattern) = self.right.as_ref() {
286 let rightcontext = &text[bytecursor + matchbytelen..];
288 if (self.case_sensitive && !rightcontext.starts_with(right_pattern))
289 || (!self.case_sensitive
290 && rightcontext[..std::cmp::min(rightcontext.len(), right_pattern.len())]
291 .to_lowercase()
292 != right_pattern.to_lowercase())
293 {
294 if self.invert_context_match {
295 return true;
296 } else {
297 return false;
298 }
299 }
300 }
301 if self.invert_context_match {
302 return false;
303 } else {
304 return true;
305 }
306 }
307
308 fn get_target<'a>(&'a self, source: &'a str) -> Cow<'a, str> {
309 match self.target.as_str() {
310 "$UPPER" => source.to_uppercase().into(),
311 "$LOWER" => source.to_lowercase().into(),
312 "$REVERSED" => Cow::Owned(source.chars().rev().collect::<String>()),
313 _ => Cow::Borrowed(self.target.as_str()),
314 }
315 }
316}
317
/// Configuration for rule-based text translation, deserialized from TOML.
#[derive(Clone, Default, Deserialize, Debug)]
pub struct TranslateTextConfig {
    /// The translation rules. They are tried in reverse order, so later rules take precedence.
    rules: Vec<TranslateTextRule>,

    /// Suffix used when deriving IDs and filenames for the translated resources; falls back
    /// to `translation` when unset.
    #[serde(default)]
    id_suffix: Option<String>,

    /// When set, characters that are not covered by any rule are dropped from the output
    /// instead of being copied as-is.
    #[serde(default)]
    discard_unmatched: bool,

    /// When set, only the translated texts are produced and no annotations linking source
    /// and target are generated.
    #[serde(default)]
    no_annotations: bool,

    /// When set, output is produced even if the translated text is identical to the source.
    #[serde(default)]
    force_when_unchanged: bool,

    /// Print debug information to standard error.
    #[serde(default)]
    debug: bool,
}
341
342impl TranslateTextConfig {
343 pub fn from_toml_str(tomlstr: &str, debug: bool) -> Result<Self, String> {
345 let mut config: Self = toml::from_str(tomlstr).map_err(|e| format!("{}", e))?;
346 config.debug = debug;
347 config.compile_regexps()?;
348 Ok(config)
349 }
350
351 pub fn with_id_suffix(mut self, suffix: impl Into<String>) -> Self {
353 self.id_suffix = Some(suffix.into());
354 self
355 }
356
357 pub fn with_force_when_unchanged(mut self) -> Self {
359 self.force_when_unchanged = true;
360 self
361 }
362
363 pub fn with_debug(mut self, value: bool) -> Self {
365 self.debug = value;
366 self
367 }
368
369 fn compile_regexps<'a>(&'a mut self) -> Result<(), String> {
370 for rule in self.rules.iter_mut() {
371 if let Some(v) = rule.source.as_ref() {
372 if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
373 let regex = format!("^{}", &v[1..v.len() - 1]);
374 rule.source_regex = Some(
375 RegexBuilder::new(®ex)
376 .case_insensitive(!rule.case_sensitive)
377 .build()
378 .map_err(|e| {
379 format!("Invalid regular expression for source: {}: {}", regex, e)
380 })?,
381 );
382 if self.debug {
383 eprintln!(
384 "[stam translatetext] compiled source regex {:?}",
385 rule.source_regex
386 )
387 }
388 }
389 }
390 if let Some(v) = rule.left.as_ref() {
391 if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
392 let regex = format!(".*{}$", &v[1..v.len() - 1]);
393 rule.left_regex = Some(
394 RegexBuilder::new(®ex)
395 .case_insensitive(!rule.case_sensitive)
396 .build()
397 .map_err(|e| {
398 format!(
399 "Invalid regular expression for left context: {}: {}",
400 regex, e
401 )
402 })?,
403 );
404 if self.debug {
405 eprintln!(
406 "[stam translatetext] compiled left context regex {:?}",
407 rule.left_regex
408 )
409 }
410 }
411 }
412 if let Some(v) = rule.right.as_ref() {
413 if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
414 let regex = format!("^{}.*", &v[1..v.len() - 1]);
415 rule.right_regex = Some(
416 RegexBuilder::new(®ex)
417 .case_insensitive(!rule.case_sensitive)
418 .build()
419 .map_err(|e| {
420 format!(
421 "Invalid regular expression for right context: {}: {}",
422 regex, e
423 )
424 })?,
425 );
426 if self.debug {
427 eprintln!(
428 "[stam translatetext] compiled right context regex {:?}",
429 rule.right_regex
430 )
431 }
432 }
433 }
434 if rule.source.is_none() {
435 return Err("Translation rules must have both a source".into());
436 }
437 }
438 if self.debug {
439 eprintln!("[stam translatetext] {} rules read", self.rules.len())
440 }
441 Ok(())
442 }
443
444 pub fn compile_queries<'a>(&'a self) -> Result<HashMap<String, Query<'a>>, String> {
445 let mut compiled_queries = HashMap::new();
446 for rule in self.rules.iter() {
447 for constraint in rule.constraints.iter() {
448 if !compiled_queries.contains_key(constraint.query.as_str()) {
449 compiled_queries.insert(
450 constraint.query.clone(),
451 stam::Query::parse(constraint.query.as_str())
452 .map_err(|err| format!("{}", err))?
453 .0,
454 );
455 }
456 }
457 }
458 Ok(compiled_queries)
459 }
460}
461
462pub fn translate_text<'store>(
464 store: &'store AnnotationStore,
465 queries: Vec<Query<'store>>,
466 usevar: Option<&'store str>,
467 config: &TranslateTextConfig,
468) -> Result<(Vec<TextResourceBuilder>, Vec<AnnotationBuilder<'static>>), String> {
469 let mut annotations = Vec::new();
470 let mut resourcebuilders = Vec::new();
471 let constraint_queries = config.compile_queries()?;
472
473 let mut seqnr = 0;
474 for query in queries.into_iter() {
475 let iter = store.query(query).map_err(|e| format!("{}", e))?;
476 for resultrow in iter {
477 if let Ok(result) = resultrow.get_by_name_or_last(usevar) {
478 match result {
479 QueryResultItem::TextResource(resource) => {
480 let resource_id = resource.id().expect("resource must have ID");
481 let new_resource_id = format!(
482 "{}.{}{}",
483 if resource_id.ends_with(".txt") {
484 &resource_id[..resource_id.len() - 4]
485 } else if resource_id.ends_with(".md") {
486 &resource_id[..resource_id.len() - 3]
487 } else {
488 resource_id
489 },
490 config
491 .id_suffix
492 .as_ref()
493 .map(|s| s.as_str())
494 .unwrap_or("translation"),
495 if resource_id.ends_with(".txt") {
496 ".txt"
497 } else if resource_id.ends_with(".md") {
498 ".md"
499 } else {
500 ""
501 }
502 );
503 let new_filename = if let Some(filename) = resource.as_ref().filename() {
504 Some(format!(
505 "{}.{}.txt",
506 if filename.ends_with(".txt") {
507 &filename[..filename.len() - 4]
508 } else if filename.ends_with(".md") {
509 &filename[..filename.len() - 3]
510 } else {
511 filename
512 },
513 config
514 .id_suffix
515 .as_ref()
516 .map(|s| s.as_str())
517 .unwrap_or("translation")
518 ))
519 } else {
520 None
521 };
522 translate_text_helper(
523 config,
524 store,
525 resource.text(),
526 resource,
527 0,
528 new_resource_id,
529 new_filename,
530 &mut resourcebuilders,
531 &mut annotations,
532 &constraint_queries,
533 )?;
534 }
535 QueryResultItem::TextSelection(textselection) => {
536 seqnr += 1;
537 let resource = textselection.resource();
538 let new_resource_id = format!(
539 "{}.{}.{}",
540 resource.id().expect("resource must have ID"),
541 config
542 .id_suffix
543 .as_ref()
544 .map(|s| s.as_str())
545 .unwrap_or("translation"),
546 seqnr
547 );
548 let new_filename = if let Some(filename) = resource.as_ref().filename() {
549 Some(format!(
550 "{}.{}.{}.txt",
551 if filename.ends_with(".txt") {
552 &filename[..filename.len() - 4]
553 } else if filename.ends_with(".md") {
554 &filename[..filename.len() - 3]
555 } else {
556 filename
557 },
558 config
559 .id_suffix
560 .as_ref()
561 .map(|s| s.as_str())
562 .unwrap_or("translation"),
563 seqnr
564 ))
565 } else {
566 None
567 };
568 translate_text_helper(
569 config,
570 store,
571 textselection.text(),
572 &resource,
573 textselection.begin(),
574 new_resource_id,
575 new_filename,
576 &mut resourcebuilders,
577 &mut annotations,
578 &constraint_queries,
579 )?;
580 }
581 _ => {
582 return Err(
583 "translatetext is only implemented for resources and text selections at the moment"
584 .into(),
585 );
586 }
587 }
588 }
589 }
590 }
591
592 Ok((resourcebuilders, annotations))
593}
594
595fn translate_text_helper<'store, 'a>(
596 config: &TranslateTextConfig,
597 store: &'store AnnotationStore,
598 text: &'store str,
599 resource: &ResultItem<'store, TextResource>,
600 baseoffset: usize,
601 new_resource_id: String,
602 new_filename: Option<String>,
603 resourcebuilders: &mut Vec<TextResourceBuilder>,
604 annotations: &mut Vec<AnnotationBuilder<'static>>,
605 constraint_queries: &HashMap<String, Query<'a>>,
606) -> Result<(), String> {
607 let mut new_text =
608 String::with_capacity(text.len() + (0.1 * text.len() as f64).round() as usize); let mut sourceselectors: Vec<SelectorBuilder<'static>> = Vec::new();
611 let mut targetselectors: Vec<SelectorBuilder<'static>> = Vec::new();
612
613 let mut skipbytes = 0;
614 let mut targetcharpos = 0;
615 for (charpos, (bytepos, c)) in text.char_indices().enumerate() {
616 if skipbytes > 0 {
617 skipbytes -= c.len_utf8();
618 continue;
619 }
620 let mut foundrule = false;
621 for rule in config.rules.iter().rev() {
622 if let Some(m) = rule.test(text, bytepos) {
623 if !rule.constraints.is_empty() {
624 let mut constraints_match = true; let sourcecharlen = m.source.chars().count();
626 let source = resource
627 .textselection(&Offset::simple(charpos, sourcecharlen))
628 .map_err(|e| format!("Failed to extract source: {}", e))?;
629 let left = resource
630 .textselection(&Offset::new(
631 Cursor::BeginAligned(0),
632 Cursor::BeginAligned(charpos),
633 ))
634 .map_err(|e| format!("Failed to extract left context: {}", e))?;
635 let right = resource
636 .textselection(&Offset::new(
637 Cursor::BeginAligned(charpos + sourcecharlen), Cursor::EndAligned(0),
639 ))
640 .map_err(|e| format!("Failed to extract right context: {}", e))?;
641 for constraint in rule.constraints.iter() {
642 let mut query = constraint_queries
644 .get(constraint.query.as_str())
645 .expect("constraint query should have been compiled earlier")
646 .clone();
647 query.bind_resourcevar("resource", resource);
648 query.bind_textvar("source", &source);
649 query.bind_textvar("left", &left);
650 query.bind_textvar("right", &right);
651 let mut iter = store
652 .query(query)
653 .map_err(|e| format!("Constraint query failed: {}", e))?;
654 if let Some(result) = iter.next() {
655 if let Some(testvar) = constraint.test.as_ref() {
657 if result.get_by_name(testvar.as_str()).is_ok() {
658 if constraint.invert {
659 constraints_match = false;
660 break;
661 }
662 } else if !constraint.invert {
663 constraints_match = false;
664 break;
665 }
666 } else if constraint.invert {
667 constraints_match = false;
669 break;
670 }
671 } else if !constraint.invert {
672 constraints_match = false;
674 break;
675 }
676 }
677 if !constraints_match {
678 if config.debug {
679 eprintln!(
680 "[stam translatetext] @{} failed to matched rule {:?} -> {:?} because of unmet constraints",
681 charpos, m.source, m.target
682 )
683 }
684 continue; }
686 }
687
688 skipbytes += m.source.len() - c.len_utf8(); if config.debug {
691 eprintln!(
692 "[stam translatetext] @{} (byte {}) matched rule {:?} -> {:?}",
693 charpos, bytepos, m.source, m.target
694 )
695 }
696
697 new_text += &m.target;
698
699 if !config.no_annotations {
700 sourceselectors.push(SelectorBuilder::TextSelector(
701 resource.handle().into(),
702 Offset::simple(
703 baseoffset + charpos,
704 baseoffset + charpos + m.source.chars().count(),
705 ),
706 ));
707 let targetlen = m.target.chars().count();
708 targetselectors.push(SelectorBuilder::TextSelector(
709 new_resource_id.clone().into(),
710 Offset::simple(targetcharpos, targetcharpos + targetlen),
711 ));
712 targetcharpos += targetlen;
713 }
714
715 foundrule = true;
716 continue; }
718 }
719
720 if !foundrule && !config.discard_unmatched {
721 if config.debug {
722 eprintln!(
723 "[stam translatetext] @{} (byte {}) no rule matches {:?}, falling back",
724 charpos, bytepos, c
725 )
726 }
727 new_text.push(c);
729 if !config.no_annotations {
730 sourceselectors.push(SelectorBuilder::TextSelector(
731 resource.handle().into(),
732 Offset::simple(baseoffset + charpos, baseoffset + charpos + 1),
733 ));
734 targetselectors.push(SelectorBuilder::TextSelector(
735 new_resource_id.clone().into(),
736 Offset::simple(targetcharpos, targetcharpos + 1),
737 ));
738 }
739 targetcharpos += 1;
740 }
741 }
742
743 if !config.force_when_unchanged && new_text.as_str() == text {
744 eprintln!(
745 "[stam translatetext] text for {} has not changed after translation, skipping..",
746 new_resource_id
747 );
748 return Ok(());
749 }
750
751 let mut resourcebuilder = TextResourceBuilder::new()
752 .with_text(new_text)
753 .with_id(new_resource_id.clone());
754 if let Some(new_filename) = new_filename {
755 resourcebuilder = resourcebuilder.with_filename(new_filename);
756 }
757 resourcebuilders.push(resourcebuilder);
758
759 if !config.no_annotations {
760 annotations.push(
761 AnnotationBuilder::new()
762 .with_id(format!("{}.translation-source", new_resource_id.as_str()))
763 .with_target(SelectorBuilder::DirectionalSelector(sourceselectors)),
764 );
765 annotations.push(
766 AnnotationBuilder::new()
767 .with_id(format!("{}.translation-target", new_resource_id.as_str()))
768 .with_target(SelectorBuilder::DirectionalSelector(targetselectors)),
769 );
770 annotations.push(
771 AnnotationBuilder::new()
772 .with_id(format!("{}.translation", new_resource_id.as_str()))
773 .with_data(
774 "https://w3id.org/stam/extensions/stam-translate/",
775 "Translation",
776 DataValue::Null,
777 )
778 .with_target(SelectorBuilder::DirectionalSelector(vec![
779 SelectorBuilder::AnnotationSelector(
780 format!("{}.translation-source", &new_resource_id).into(),
781 None,
782 ),
783 SelectorBuilder::AnnotationSelector(
784 format!("{}.translation-target", &new_resource_id).into(),
785 None,
786 ),
787 ])),
788 );
789 }
790 Ok(())
791}