1use crate::align::print_transposition as print_translation;
2use serde::Deserialize;
3use stam::*;
4use std::borrow::Cow;
5use std::collections::HashMap;
6use toml;
7
8pub fn translate<'store>(
9 store: &'store mut AnnotationStore,
10 mut translation_queries: Vec<Query<'store>>,
11 queries: Vec<Query<'store>>,
12 use_translation_var: Option<&str>,
13 use_var: Option<&str>,
14 id_prefix: Option<String>,
15 idstrategy: IdStrategy,
16 ignore_errors: bool,
17 verbose: bool,
18 config: TranslateConfig,
19) -> Result<Vec<AnnotationHandle>, StamError> {
20 let mut builders = Vec::new();
21 while translation_queries.len() < queries.len() {
22 let query = translation_queries
23 .get(translation_queries.len() - 1)
24 .expect("there must be translation queries");
25 translation_queries.push(query.clone());
26 }
27 for (translation_query, query) in translation_queries.into_iter().zip(queries.into_iter()) {
28 let iter = store.query(translation_query)?;
29 let mut translation: Option<ResultItem<Annotation>> = None;
30 let translationdata = store
31 .find_data(
32 "https://w3id.org/stam/extensions/stam-translate/",
33 "Translation",
34 DataOperator::Null,
35 )
36 .next()
37 .ok_or_else(|| {
38 StamError::OtherError(
39 "No translations at all were found in the annotation store (the STAM translate vocabulary is not present in the store)",
40 )
41 })?;
42 for resultrow in iter {
43 if let Ok(QueryResultItem::Annotation(annotation)) =
44 resultrow.get_by_name_or_last(use_translation_var)
45 {
46 if !annotation.has_data(&translationdata) {
47 return Err(StamError::OtherError(
48 "The retrieved annotation is not explicitly marked as a translation, refusing to use",
49 ));
50 }
51 translation = Some(annotation.clone());
52 break;
53 }
54 }
55 if let Some(translation) = translation {
56 let iter = store.query(query)?;
57 for resultrow in iter {
58 if let Ok(QueryResultItem::Annotation(annotation)) =
59 resultrow.get_by_name_or_last(use_var)
60 {
61 let mut config = config.clone();
62 if let Some(id) = annotation.id() {
63 let randomid = generate_id("", "");
64 config.translation_id = if let Some(id_prefix) = &id_prefix {
65 Some(format!("{}{}-translation-{}", id_prefix, id, randomid))
66 } else {
67 Some(format!("{}-translation-{}", id, randomid))
68 };
69 config.resegmentation_id = if let Some(id_prefix) = &id_prefix {
70 Some(format!("{}{}-resegmentation-{}", id_prefix, id, randomid))
71 } else {
72 Some(format!("{}-resegmentation-{}", id, randomid))
73 };
74 config.source_side_id = Some(id.to_string());
75 config.existing_source_side = true;
76 config.target_side_ids = vec![if let Some(id_prefix) = &id_prefix {
77 format!("{}{}", id_prefix, regenerate_id(id, &idstrategy))
78 } else {
79 regenerate_id(id, &idstrategy)
80 }];
81 } else {
82 config.existing_source_side = false;
83 }
84 match annotation.translate(&translation, config) {
85 Ok(results) => builders.extend(results),
86 Err(StamError::NoText(_)) => {
87 eprintln!(
88 "WARNING: Skipping translation of annotation that references no text: {}",
89 annotation.id().unwrap_or("(no id)"),
90 );
91 }
92 Err(err) => {
93 eprintln!(
94 "WARNING: Failed to translate annotation {}: {}",
95 annotation.id().unwrap_or("(no id)"),
96 err
97 );
98 if !ignore_errors {
99 return Err(StamError::OtherError(
100 "Failed to translate annotation",
101 ));
102 }
103 }
104 }
105 } else {
106 return Err(StamError::OtherError(
107 "Query should return instances of ANNOTATION to translate, got something else instead",
108 ));
109 }
110 }
111 } else {
112 return Err(StamError::OtherError(
113 "Translation queries should return an ANNOTATION that is a translation, none found",
114 ));
115 }
116 }
117 let mut annotations = Vec::with_capacity(builders.len());
118 for builder in builders {
119 let annotation_handle = if !config.modify_existing {
120 let annotation_handle = store.annotate(builder)?;
122 annotations.push(annotation_handle);
123 annotation_handle
124 } else {
125 store.reannotate(builder, ReannotateMode::default())?
127 };
128 if verbose {
129 let annotation = store
130 .annotation(annotation_handle)
131 .expect("annotation was just added");
132 let translationdata = store
133 .find_data(
134 "https://w3id.org/stam/extensions/stam-translate/",
135 "Translation",
136 DataOperator::Null,
137 )
138 .next()
139 .ok_or_else(|| {
140 StamError::OtherError(
141 "No translations at all were found in the annotation store (the STAM translate vocabulary is not present in the store)",
142 )
143 })?;
144 if annotation.has_data(&translationdata) {
145 print_translation(&annotation);
146 } else if config.modify_existing {
147 eprintln!(
148 "# updated annotation {}",
149 annotation.id().expect("annotation must have ID")
150 );
151 } else {
152 eprintln!(
153 "# added annotation {}",
154 annotation.id().expect("annotation must have ID")
155 );
156 }
157 }
158 }
159 if verbose {
160 if !config.modify_existing {
161 eprintln!("{} annotations(s) created", annotations.len());
162 } else {
163 eprintln!("{} annotations(s) updated", annotations.len());
164 }
165 }
166 Ok(annotations)
167}
168
169#[derive(Clone, Default, Deserialize, Debug)]
170pub struct TranslateTextRule {
171 source: Option<String>,
172 target: String,
173 left: Option<String>,
174 right: Option<String>,
175
176 #[serde(default = "f_true")]
177 case_sensitive: bool,
178
179 #[serde(default)]
180 invert_context_match: bool,
181
182 #[serde(default)]
183 constraints: Vec<TranslateTextConstraint>,
184
185 #[serde(skip)]
186 source_regex: Option<Regex>,
187 #[serde(skip)]
188 left_regex: Option<Regex>,
189 #[serde(skip)]
190 right_regex: Option<Regex>,
191}
192
/// Serde default provider that yields `true` (used for `case_sensitive`).
fn f_true() -> bool {
    true
}
197
198#[derive(Clone, Default, Deserialize, Debug)]
199pub struct TranslateTextConstraint {
200 query: String,
201
202 #[serde(default)]
203 test: Option<String>,
204
205 #[serde(default)]
206 invert: bool,
207}
208
/// The outcome of a successfully matched translation rule.
pub struct MatchedRule<'a> {
    /// The source text the rule matched.
    source: &'a str,

    /// The replacement text: borrowed from the rule when it is literal, owned
    /// when it was derived from the source.
    target: Cow<'a, str>,
}
213
214impl TranslateTextRule {
215 pub fn test<'a>(&'a self, text: &'a str, bytecursor: usize) -> Option<MatchedRule<'a>> {
217 if let Some(source_regex) = self.source_regex.as_ref() {
218 if let Some(m) = source_regex.find(&text[bytecursor..]) {
220 if self.test_context(text, bytecursor, m.len()) {
221 return Some(MatchedRule {
222 target: self.get_target(m.as_str()),
223 source: m.as_str(),
224 });
225 }
226 }
227 } else if let Some(source) = self.source.as_ref() {
228 if bytecursor + source.len() <= text.len() {
230 if let Some(candidate) = text.get(bytecursor..bytecursor + source.len()) {
231 if ((self.case_sensitive && candidate == *source)
232 || (!self.case_sensitive && candidate.to_lowercase() == *source))
233 && self.test_context(text, bytecursor, source.len())
234 {
235 return Some(MatchedRule {
236 target: self.get_target(source.as_str()),
237 source: source.as_str().into(),
238 });
239 }
240 }
241 }
242 }
243 None
244 }
245
246 fn test_context(&self, text: &str, bytecursor: usize, matchbytelen: usize) -> bool {
248 if let Some(left_regex) = self.left_regex.as_ref() {
249 let leftcontext = &text[..bytecursor];
251 if !left_regex.is_match(leftcontext) {
252 if self.invert_context_match {
253 return true;
254 } else {
255 return false;
256 }
257 }
258 } else if let Some(left_pattern) = self.left.as_ref() {
259 let leftcontext = &text[..bytecursor];
261 if (self.case_sensitive && !leftcontext.ends_with(left_pattern))
262 || (!self.case_sensitive
263 && leftcontext[std::cmp::min(0, bytecursor - left_pattern.len())..]
264 .to_lowercase()
265 != left_pattern.to_lowercase())
266 {
267 if self.invert_context_match {
268 return true;
269 } else {
270 return false;
271 }
272 }
273 }
274 if let Some(right_regex) = self.right_regex.as_ref() {
275 let rightcontext = &text[bytecursor + matchbytelen..];
277 if !right_regex.is_match(rightcontext) {
278 if self.invert_context_match {
279 return true;
280 } else {
281 return false;
282 }
283 }
284 } else if let Some(right_pattern) = self.right.as_ref() {
285 let rightcontext = &text[bytecursor + matchbytelen..];
287 if (self.case_sensitive && !rightcontext.starts_with(right_pattern))
288 || (!self.case_sensitive
289 && rightcontext[..std::cmp::min(rightcontext.len(), right_pattern.len())]
290 .to_lowercase()
291 != right_pattern.to_lowercase())
292 {
293 if self.invert_context_match {
294 return true;
295 } else {
296 return false;
297 }
298 }
299 }
300 if self.invert_context_match {
301 return false;
302 } else {
303 return true;
304 }
305 }
306
307 fn get_target<'a>(&'a self, source: &'a str) -> Cow<'a, str> {
308 match self.target.as_str() {
309 "$UPPER" => source.to_uppercase().into(),
310 "$LOWER" => source.to_lowercase().into(),
311 "$REVERSED" => Cow::Owned(source.chars().rev().collect::<String>()),
312 _ => Cow::Borrowed(self.target.as_str()),
313 }
314 }
315}
316
317#[derive(Clone, Default, Deserialize, Debug)]
318pub struct TranslateTextConfig {
319 rules: Vec<TranslateTextRule>,
320
321 #[serde(default)]
323 id_suffix: Option<String>,
324
325 #[serde(default)]
327 discard_unmatched: bool,
328
329 #[serde(default)]
331 no_annotations: bool,
332
333 #[serde(default)]
335 force_when_unchanged: bool,
336
337 #[serde(default)]
338 debug: bool,
339}
340
341impl TranslateTextConfig {
342 pub fn from_toml_str(tomlstr: &str, debug: bool) -> Result<Self, String> {
344 let mut config: Self = toml::from_str(tomlstr).map_err(|e| format!("{}", e))?;
345 config.debug = debug;
346 config.compile_regexps()?;
347 Ok(config)
348 }
349
350 pub fn with_id_suffix(mut self, suffix: impl Into<String>) -> Self {
352 self.id_suffix = Some(suffix.into());
353 self
354 }
355
356 pub fn with_force_when_unchanged(mut self) -> Self {
358 self.force_when_unchanged = true;
359 self
360 }
361
362 pub fn with_debug(mut self, value: bool) -> Self {
364 self.debug = value;
365 self
366 }
367
368 fn compile_regexps<'a>(&'a mut self) -> Result<(), String> {
369 for rule in self.rules.iter_mut() {
370 if let Some(v) = rule.source.as_ref() {
371 if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
372 let regex = format!("^{}", &v[1..v.len() - 1]);
373 rule.source_regex = Some(
374 RegexBuilder::new(®ex)
375 .case_insensitive(!rule.case_sensitive)
376 .build()
377 .map_err(|e| {
378 format!("Invalid regular expression for source: {}: {}", regex, e)
379 })?,
380 );
381 if self.debug {
382 eprintln!(
383 "[stam translatetext] compiled source regex {:?}",
384 rule.source_regex
385 )
386 }
387 }
388 }
389 if let Some(v) = rule.left.as_ref() {
390 if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
391 let regex = format!(".*{}$", &v[1..v.len() - 1]);
392 rule.left_regex = Some(
393 RegexBuilder::new(®ex)
394 .case_insensitive(!rule.case_sensitive)
395 .build()
396 .map_err(|e| {
397 format!(
398 "Invalid regular expression for left context: {}: {}",
399 regex, e
400 )
401 })?,
402 );
403 if self.debug {
404 eprintln!(
405 "[stam translatetext] compiled left context regex {:?}",
406 rule.left_regex
407 )
408 }
409 }
410 }
411 if let Some(v) = rule.right.as_ref() {
412 if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
413 let regex = format!("^{}.*", &v[1..v.len() - 1]);
414 rule.right_regex = Some(
415 RegexBuilder::new(®ex)
416 .case_insensitive(!rule.case_sensitive)
417 .build()
418 .map_err(|e| {
419 format!(
420 "Invalid regular expression for right context: {}: {}",
421 regex, e
422 )
423 })?,
424 );
425 if self.debug {
426 eprintln!(
427 "[stam translatetext] compiled right context regex {:?}",
428 rule.right_regex
429 )
430 }
431 }
432 }
433 if rule.source.is_none() {
434 return Err("Translation rules must have both a source".into());
435 }
436 }
437 if self.debug {
438 eprintln!("[stam translatetext] {} rules read", self.rules.len())
439 }
440 Ok(())
441 }
442
443 pub fn compile_queries<'a>(&'a self) -> Result<HashMap<String, Query<'a>>, String> {
444 let mut compiled_queries = HashMap::new();
445 for rule in self.rules.iter() {
446 for constraint in rule.constraints.iter() {
447 if !compiled_queries.contains_key(constraint.query.as_str()) {
448 compiled_queries.insert(
449 constraint.query.clone(),
450 stam::Query::parse(constraint.query.as_str())
451 .map_err(|err| format!("{}", err))?
452 .0,
453 );
454 }
455 }
456 }
457 Ok(compiled_queries)
458 }
459}
460
461pub fn translate_text<'store>(
463 store: &'store AnnotationStore,
464 queries: Vec<Query<'store>>,
465 usevar: Option<&'store str>,
466 config: &TranslateTextConfig,
467) -> Result<(Vec<TextResourceBuilder>, Vec<AnnotationBuilder<'static>>), String> {
468 let mut annotations = Vec::new();
469 let mut resourcebuilders = Vec::new();
470 let constraint_queries = config.compile_queries()?;
471
472 let mut seqnr = 0;
473 for query in queries.into_iter() {
474 let iter = store.query(query).map_err(|e| format!("{}", e))?;
475 for resultrow in iter {
476 if let Ok(result) = resultrow.get_by_name_or_last(usevar) {
477 match result {
478 QueryResultItem::TextResource(resource) => {
479 let resource_id = resource.id().expect("resource must have ID");
480 let new_resource_id = format!(
481 "{}.{}{}",
482 if resource_id.ends_with(".txt") {
483 &resource_id[..resource_id.len() - 4]
484 } else if resource_id.ends_with(".md") {
485 &resource_id[..resource_id.len() - 3]
486 } else {
487 resource_id
488 },
489 config
490 .id_suffix
491 .as_ref()
492 .map(|s| s.as_str())
493 .unwrap_or("translation"),
494 if resource_id.ends_with(".txt") {
495 ".txt"
496 } else if resource_id.ends_with(".md") {
497 ".md"
498 } else {
499 ""
500 }
501 );
502 let new_filename = if let Some(filename) = resource.as_ref().filename() {
503 Some(format!(
504 "{}.{}.txt",
505 if filename.ends_with(".txt") {
506 &filename[..filename.len() - 4]
507 } else if filename.ends_with(".md") {
508 &filename[..filename.len() - 3]
509 } else {
510 filename
511 },
512 config
513 .id_suffix
514 .as_ref()
515 .map(|s| s.as_str())
516 .unwrap_or("translation")
517 ))
518 } else {
519 None
520 };
521 translate_text_helper(
522 config,
523 store,
524 resource.text(),
525 resource,
526 0,
527 new_resource_id,
528 new_filename,
529 &mut resourcebuilders,
530 &mut annotations,
531 &constraint_queries,
532 )?;
533 }
534 QueryResultItem::TextSelection(textselection) => {
535 seqnr += 1;
536 let resource = textselection.resource();
537 let new_resource_id = format!(
538 "{}.{}.{}",
539 resource.id().expect("resource must have ID"),
540 config
541 .id_suffix
542 .as_ref()
543 .map(|s| s.as_str())
544 .unwrap_or("translation"),
545 seqnr
546 );
547 let new_filename = if let Some(filename) = resource.as_ref().filename() {
548 Some(format!(
549 "{}.{}.{}.txt",
550 if filename.ends_with(".txt") {
551 &filename[..filename.len() - 4]
552 } else if filename.ends_with(".md") {
553 &filename[..filename.len() - 3]
554 } else {
555 filename
556 },
557 config
558 .id_suffix
559 .as_ref()
560 .map(|s| s.as_str())
561 .unwrap_or("translation"),
562 seqnr
563 ))
564 } else {
565 None
566 };
567 translate_text_helper(
568 config,
569 store,
570 textselection.text(),
571 &resource,
572 textselection.begin(),
573 new_resource_id,
574 new_filename,
575 &mut resourcebuilders,
576 &mut annotations,
577 &constraint_queries,
578 )?;
579 }
580 _ => {
581 return Err(
582 "translatetext is only implemented for resources and text selections at the moment"
583 .into(),
584 );
585 }
586 }
587 }
588 }
589 }
590
591 Ok((resourcebuilders, annotations))
592}
593
594fn translate_text_helper<'store, 'a>(
595 config: &TranslateTextConfig,
596 store: &'store AnnotationStore,
597 text: &'store str,
598 resource: &ResultItem<'store, TextResource>,
599 baseoffset: usize,
600 new_resource_id: String,
601 new_filename: Option<String>,
602 resourcebuilders: &mut Vec<TextResourceBuilder>,
603 annotations: &mut Vec<AnnotationBuilder<'static>>,
604 constraint_queries: &HashMap<String, Query<'a>>,
605) -> Result<(), String> {
606 let mut new_text =
607 String::with_capacity(text.len() + (0.1 * text.len() as f64).round() as usize); let mut sourceselectors: Vec<SelectorBuilder<'static>> = Vec::new();
610 let mut targetselectors: Vec<SelectorBuilder<'static>> = Vec::new();
611
612 let mut skipbytes = 0;
613 let mut targetcharpos = 0;
614 for (charpos, (bytepos, c)) in text.char_indices().enumerate() {
615 if skipbytes > 0 {
616 skipbytes -= c.len_utf8();
617 continue;
618 }
619 let mut foundrule = false;
620 for rule in config.rules.iter().rev() {
621 if let Some(m) = rule.test(text, bytepos) {
622 if !rule.constraints.is_empty() {
623 let mut constraints_match = true; let sourcecharlen = m.source.chars().count();
625 let source = resource
626 .textselection(&Offset::simple(charpos, sourcecharlen))
627 .map_err(|e| format!("Failed to extract source: {}", e))?;
628 let left = resource
629 .textselection(&Offset::new(
630 Cursor::BeginAligned(0),
631 Cursor::BeginAligned(charpos),
632 ))
633 .map_err(|e| format!("Failed to extract left context: {}", e))?;
634 let right = resource
635 .textselection(&Offset::new(
636 Cursor::BeginAligned(charpos + sourcecharlen), Cursor::EndAligned(0),
638 ))
639 .map_err(|e| format!("Failed to extract right context: {}", e))?;
640 for constraint in rule.constraints.iter() {
641 let mut query = constraint_queries
643 .get(constraint.query.as_str())
644 .expect("constraint query should have been compiled earlier")
645 .clone();
646 query.bind_resourcevar("resource", resource);
647 query.bind_textvar("source", &source);
648 query.bind_textvar("left", &left);
649 query.bind_textvar("right", &right);
650 let mut iter = store
651 .query(query)
652 .map_err(|e| format!("Constraint query failed: {}", e))?;
653 if let Some(result) = iter.next() {
654 if let Some(testvar) = constraint.test.as_ref() {
656 if result.get_by_name(testvar.as_str()).is_ok() {
657 if constraint.invert {
658 constraints_match = false;
659 break;
660 }
661 } else if !constraint.invert {
662 constraints_match = false;
663 break;
664 }
665 } else if constraint.invert {
666 constraints_match = false;
668 break;
669 }
670 } else if !constraint.invert {
671 constraints_match = false;
673 break;
674 }
675 }
676 if !constraints_match {
677 if config.debug {
678 eprintln!(
679 "[stam translatetext] @{} failed to matched rule {:?} -> {:?} because of unmet constraints",
680 charpos, m.source, m.target
681 )
682 }
683 continue; }
685 }
686
687 skipbytes += m.source.len() - c.len_utf8(); if config.debug {
690 eprintln!(
691 "[stam translatetext] @{} (byte {}) matched rule {:?} -> {:?}",
692 charpos, bytepos, m.source, m.target
693 )
694 }
695
696 new_text += &m.target;
697
698 if !config.no_annotations {
699 sourceselectors.push(SelectorBuilder::TextSelector(
700 resource.handle().into(),
701 Offset::simple(
702 baseoffset + charpos,
703 baseoffset + charpos + m.source.chars().count(),
704 ),
705 ));
706 let targetlen = m.target.chars().count();
707 targetselectors.push(SelectorBuilder::TextSelector(
708 new_resource_id.clone().into(),
709 Offset::simple(targetcharpos, targetcharpos + targetlen),
710 ));
711 targetcharpos += targetlen;
712 }
713
714 foundrule = true;
715 continue; }
717 }
718
719 if !foundrule && !config.discard_unmatched {
720 if config.debug {
721 eprintln!(
722 "[stam translatetext] @{} (byte {}) no rule matches {:?}, falling back",
723 charpos, bytepos, c
724 )
725 }
726 new_text.push(c);
728 if !config.no_annotations {
729 sourceselectors.push(SelectorBuilder::TextSelector(
730 resource.handle().into(),
731 Offset::simple(baseoffset + charpos, baseoffset + charpos + 1),
732 ));
733 targetselectors.push(SelectorBuilder::TextSelector(
734 new_resource_id.clone().into(),
735 Offset::simple(targetcharpos, targetcharpos + 1),
736 ));
737 }
738 targetcharpos += 1;
739 }
740 }
741
742 if !config.force_when_unchanged && new_text.as_str() == text {
743 eprintln!(
744 "[stam translatetext] text for {} has not changed after translation, skipping..",
745 new_resource_id
746 );
747 return Ok(());
748 }
749
750 let mut resourcebuilder = TextResourceBuilder::new()
751 .with_text(new_text)
752 .with_id(new_resource_id.clone());
753 if let Some(new_filename) = new_filename {
754 resourcebuilder = resourcebuilder.with_filename(new_filename);
755 }
756 resourcebuilders.push(resourcebuilder);
757
758 if !config.no_annotations {
759 annotations.push(
760 AnnotationBuilder::new()
761 .with_id(format!("{}.translation-source", new_resource_id.as_str()))
762 .with_target(SelectorBuilder::DirectionalSelector(sourceselectors)),
763 );
764 annotations.push(
765 AnnotationBuilder::new()
766 .with_id(format!("{}.translation-target", new_resource_id.as_str()))
767 .with_target(SelectorBuilder::DirectionalSelector(targetselectors)),
768 );
769 annotations.push(
770 AnnotationBuilder::new()
771 .with_id(format!("{}.translation", new_resource_id.as_str()))
772 .with_data(
773 "https://w3id.org/stam/extensions/stam-translate/",
774 "Translation",
775 DataValue::Null,
776 )
777 .with_target(SelectorBuilder::DirectionalSelector(vec![
778 SelectorBuilder::AnnotationSelector(
779 format!("{}.translation-source", &new_resource_id).into(),
780 None,
781 ),
782 SelectorBuilder::AnnotationSelector(
783 format!("{}.translation-target", &new_resource_id).into(),
784 None,
785 ),
786 ])),
787 );
788 }
789 Ok(())
790}