use crate::align::print_transposition as print_translation;
use serde::Deserialize;
use stam::*;
use std::borrow::Cow;
use std::collections::HashMap;
use toml;
/// Translates annotations through existing translation annotations and adds (or updates)
/// the resulting annotations in the store.
///
/// Queries are processed pairwise: each entry in `queries` is paired with the entry at the
/// same position in `translation_queries`. When fewer translation queries than queries are
/// given, the last translation query is reused for the remaining queries.
///
/// # Parameters
/// * `store` - the annotation store that is queried and then modified.
/// * `translation_queries` - queries that select the translation annotation to translate through;
///   only the first annotation each one yields is used, and it must carry the
///   STAM translate `Translation` data.
/// * `queries` - queries that select the annotations to translate.
/// * `use_translation_var` / `use_var` - name of the query variable to read from each result row
///   (the last result is used when `None`).
/// * `id_prefix` - optional prefix for all generated IDs.
/// * `idstrategy` - strategy used to derive the target-side ID from the source annotation ID.
/// * `ignore_errors` - when set, annotations that fail to translate are skipped with a warning.
/// * `verbose` - report every added/updated annotation on standard output/error.
/// * `config` - base translation configuration; cloned and adapted per annotation.
///
/// # Returns
/// The handles of all annotations that were added or, if `config.modify_existing` is set, updated.
///
/// # Errors
/// Fails when `queries` is non-empty but no translation query was supplied, when a query fails,
/// when the store holds no translation vocabulary, when a translation query yields no annotation
/// or one not marked as a translation, when a query yields something other than an annotation,
/// or (unless `ignore_errors` is set) when translating an annotation fails.
pub fn translate<'store>(
    store: &'store mut AnnotationStore,
    mut translation_queries: Vec<Query<'store>>,
    queries: Vec<Query<'store>>,
    use_translation_var: Option<&str>,
    use_var: Option<&str>,
    id_prefix: Option<String>,
    idstrategy: IdStrategy,
    ignore_errors: bool,
    verbose: bool,
    config: TranslateConfig,
) -> Result<Vec<AnnotationHandle>, StamError> {
    let mut builders = Vec::new();

    // Pad the translation queries by repeating the last one, so that every query has a partner.
    if translation_queries.len() < queries.len() {
        let last = match translation_queries.last() {
            Some(last) => last.clone(),
            None => {
                // Previously this underflowed on `len() - 1` and panicked.
                return Err(StamError::OtherError(
                    "There must be at least one translation query",
                ));
            }
        };
        translation_queries.resize(queries.len(), last);
    }

    for (translation_query, query) in translation_queries.into_iter().zip(queries.into_iter()) {
        let iter = store.query(translation_query)?;
        let mut translation: Option<ResultItem<Annotation>> = None;
        let translationdata = store
            .find_data(
                "https://w3id.org/stam/extensions/stam-translate/",
                "Translation",
                DataOperator::Null,
            )
            .next()
            .ok_or_else(|| {
                StamError::OtherError(
                    "No translations at all were found in the annotation store (the STAM translate vocabulary is not present in the store)",
                )
            })?;

        // Only the first annotation returned by the translation query is used.
        for resultrow in iter {
            if let Ok(QueryResultItem::Annotation(annotation)) =
                resultrow.get_by_name_or_last(use_translation_var)
            {
                if !annotation.has_data(&translationdata) {
                    return Err(StamError::OtherError(
                        "The retrieved annotation is not explicitly marked as a translation, refusing to use",
                    ));
                }
                translation = Some(annotation.clone());
                break;
            }
        }

        let translation = match translation {
            Some(translation) => translation,
            None => {
                return Err(StamError::OtherError(
                    "Translation queries should return an ANNOTATION that is a translation, none found",
                ));
            }
        };

        let iter = store.query(query)?;
        for resultrow in iter {
            if let Ok(QueryResultItem::Annotation(annotation)) =
                resultrow.get_by_name_or_last(use_var)
            {
                let mut item_config = config.clone();
                if let Some(id) = annotation.id() {
                    // Derive all IDs for the new annotations from the source annotation's ID.
                    let randomid = generate_id("", "");
                    let prefix = id_prefix.as_deref().unwrap_or("");
                    item_config.translation_id =
                        Some(format!("{}{}-translation-{}", prefix, id, randomid));
                    item_config.resegmentation_id =
                        Some(format!("{}{}-resegmentation-{}", prefix, id, randomid));
                    item_config.source_side_id = Some(id.to_string());
                    item_config.existing_source_side = true;
                    item_config.target_side_ids =
                        vec![format!("{}{}", prefix, regenerate_id(id, &idstrategy))];
                } else {
                    item_config.existing_source_side = false;
                }
                match annotation.translate(&translation, item_config) {
                    Ok(results) => builders.extend(results),
                    Err(StamError::NoText(e)) => {
                        eprintln!(
                            "WARNING: Skipping translation of annotation {} that references no text: {}",
                            annotation.id().unwrap_or("(no id)"),
                            e
                        );
                    }
                    Err(err) => {
                        eprintln!(
                            "WARNING: Failed to translate annotation {}: {}",
                            annotation.id().unwrap_or("(no id)"),
                            err
                        );
                        if !ignore_errors {
                            return Err(StamError::OtherError(
                                "Failed to translate annotation",
                            ));
                        }
                    }
                }
            } else {
                return Err(StamError::OtherError(
                    "Query should return instances of ANNOTATION to translate, got something else instead",
                ));
            }
        }
    }

    let mut annotations = Vec::with_capacity(builders.len());
    for builder in builders {
        let annotation_handle = if !config.modify_existing {
            store.annotate(builder)?
        } else {
            store.reannotate(builder, ReannotateMode::default())?
        };
        // Record the handle in both modes, so the returned list and the "updated" count
        // are correct when modifying existing annotations too.
        annotations.push(annotation_handle);
        if verbose {
            let annotation = store
                .annotation(annotation_handle)
                .expect("annotation was just added");
            let translationdata = store
                .find_data(
                    "https://w3id.org/stam/extensions/stam-translate/",
                    "Translation",
                    DataOperator::Null,
                )
                .next()
                .ok_or_else(|| {
                    StamError::OtherError(
                        "No translations at all were found in the annotation store (the STAM translate vocabulary is not present in the store)",
                    )
                })?;
            if annotation.has_data(&translationdata) {
                print_translation(&annotation);
            } else if config.modify_existing {
                eprintln!(
                    "# updated annotation {}",
                    annotation.id().expect("annotation must have ID")
                );
            } else {
                eprintln!(
                    "# added annotation {}",
                    annotation.id().expect("annotation must have ID")
                );
            }
        }
    }
    if verbose {
        if !config.modify_existing {
            eprintln!("{} annotations(s) created", annotations.len());
        } else {
            eprintln!("{} annotations(s) updated", annotations.len());
        }
    }
    Ok(annotations)
}
/// A single text translation rule: maps a source pattern to a target text, optionally
/// restricted by left/right context and by query constraints.
///
/// Note that the derived `Default` yields `case_sensitive == false`, whereas a rule
/// deserialized without that key gets `true` (via `f_true`).
#[derive(Clone, Default, Deserialize, Debug)]
pub struct TranslateTextRule {
/// The source text to match. A value enclosed in slashes (`/.../`) is compiled as a
/// regular expression. Rules without a source are rejected when the configuration is loaded.
source: Option<String>,
/// The text to emit for a match. The special values `$UPPER`, `$LOWER` and `$REVERSED`
/// transform the matched source instead of replacing it with fixed text.
target: String,
/// Required left context, either literal text or a `/regex/`.
left: Option<String>,
/// Required right context, either literal text or a `/regex/`.
right: Option<String>,
/// Whether matching is case sensitive; defaults to `true` when deserialized.
#[serde(default = "f_true")]
case_sensitive: bool,
/// Inverts the outcome of the left/right context test.
#[serde(default)]
invert_context_match: bool,
/// Query-based constraints that must additionally hold for the rule to apply.
#[serde(default)]
constraints: Vec<TranslateTextConstraint>,
/// Compiled form of `source` when it is a `/regex/`; never deserialized.
#[serde(skip)]
source_regex: Option<Regex>,
/// Compiled form of `left` when it is a `/regex/`; never deserialized.
#[serde(skip)]
left_regex: Option<Regex>,
/// Compiled form of `right` when it is a `/regex/`; never deserialized.
#[serde(skip)]
right_regex: Option<Regex>,
}
/// Default-value provider for serde: boolean fields annotated with
/// `#[serde(default = "f_true")]` are `true` when the key is absent from the input.
fn f_true() -> bool {
    true
}
/// A query-based constraint on a translation rule.
#[derive(Clone, Default, Deserialize, Debug)]
pub struct TranslateTextConstraint {
/// The query, as a string; it is parsed with `stam::Query::parse` when queries are compiled.
query: String,
/// Optional name of a result variable that must be present in the first query result
/// for the constraint to hold.
#[serde(default)]
test: Option<String>,
/// Inverts the constraint: it then holds only when the query (and test) do NOT succeed.
#[serde(default)]
invert: bool,
}
/// The outcome of successfully testing a `TranslateTextRule` at some position in a text.
pub struct MatchedRule<'a> {
/// The source-side text of the match; its length determines how much input is consumed.
source: &'a str,
/// The text to emit in place of the source (borrowed from the rule, or computed for
/// the `$UPPER`/`$LOWER`/`$REVERSED` targets).
target: Cow<'a, str>,
}
impl TranslateTextRule {
    /// Tests whether this rule matches `text` at byte offset `bytecursor`.
    ///
    /// The source (regular expression or literal) must match exactly at the cursor and the
    /// left/right context requirements must be satisfied. Returns the matched source text
    /// together with the target text to emit, or `None` if the rule does not apply (which
    /// includes a cursor that is out of range or not on a character boundary).
    pub fn test<'a>(&'a self, text: &'a str, bytecursor: usize) -> Option<MatchedRule<'a>> {
        if let Some(source_regex) = self.source_regex.as_ref() {
            // The compiled source regex is anchored, so it only matches at the cursor itself.
            let m = source_regex.find(text.get(bytecursor..)?)?;
            if self.test_context(text, bytecursor, m.len()) {
                return Some(MatchedRule {
                    target: self.get_target(m.as_str()),
                    source: m.as_str(),
                });
            }
            return None;
        }

        let source = self.source.as_ref()?;
        // `get` yields None when the range is out of bounds or splits a character.
        let candidate = text.get(bytecursor..bytecursor + source.len())?;
        let source_matches = if self.case_sensitive {
            candidate == source.as_str()
        } else {
            // Lowercase both sides; comparing against the raw source would never match
            // a rule whose source contains uppercase characters.
            candidate.to_lowercase() == source.to_lowercase()
        };
        if source_matches && self.test_context(text, bytecursor, candidate.len()) {
            // Report the text that was actually matched, so case-insensitive matches
            // transform (`$UPPER`, `$REVERSED`, ...) the real input rather than the rule text.
            Some(MatchedRule {
                target: self.get_target(candidate),
                source: candidate,
            })
        } else {
            None
        }
    }

    /// Checks the left and right context around a match of `matchbytelen` bytes that starts
    /// at `bytecursor`. With `invert_context_match` set the outcome is inverted: the test
    /// then passes only if a context requirement is NOT satisfied.
    fn test_context(&self, text: &str, bytecursor: usize, matchbytelen: usize) -> bool {
        let left_ok = if let Some(left_regex) = self.left_regex.as_ref() {
            left_regex.is_match(&text[..bytecursor])
        } else if let Some(left_pattern) = self.left.as_ref() {
            let leftcontext = &text[..bytecursor];
            if self.case_sensitive {
                leftcontext.ends_with(left_pattern.as_str())
            } else {
                Self::ends_with_ignore_case(leftcontext, left_pattern)
            }
        } else {
            true
        };
        if !left_ok {
            return self.invert_context_match;
        }

        let right_ok = if let Some(right_regex) = self.right_regex.as_ref() {
            right_regex.is_match(&text[bytecursor + matchbytelen..])
        } else if let Some(right_pattern) = self.right.as_ref() {
            let rightcontext = &text[bytecursor + matchbytelen..];
            if self.case_sensitive {
                rightcontext.starts_with(right_pattern.as_str())
            } else {
                Self::starts_with_ignore_case(rightcontext, right_pattern)
            }
        } else {
            true
        };
        if !right_ok {
            return self.invert_context_match;
        }

        !self.invert_context_match
    }

    /// Case-insensitive `ends_with`: compares the last N characters of `context` (N being the
    /// number of characters in `pattern`) with `pattern`, both lowercased. Working in
    /// characters rather than bytes avoids slicing in the middle of a character.
    fn ends_with_ignore_case(context: &str, pattern: &str) -> bool {
        let charcount = pattern.chars().count();
        if charcount == 0 {
            return true;
        }
        match context.char_indices().rev().nth(charcount - 1) {
            Some((begin, _)) => context[begin..].to_lowercase() == pattern.to_lowercase(),
            // the context is shorter than the pattern
            None => false,
        }
    }

    /// Case-insensitive `starts_with`: compares the first N characters of `context` (N being
    /// the number of characters in `pattern`) with `pattern`, both lowercased.
    fn starts_with_ignore_case(context: &str, pattern: &str) -> bool {
        let charcount = pattern.chars().count();
        let end = context
            .char_indices()
            .nth(charcount)
            .map(|(bytepos, _)| bytepos)
            .unwrap_or(context.len());
        context[..end].to_lowercase() == pattern.to_lowercase()
    }

    /// Computes the target text for a matched `source`: `$UPPER`, `$LOWER` and `$REVERSED`
    /// transform the source, any other target is emitted verbatim.
    fn get_target<'a>(&'a self, source: &'a str) -> Cow<'a, str> {
        match self.target.as_str() {
            "$UPPER" => source.to_uppercase().into(),
            "$LOWER" => source.to_lowercase().into(),
            "$REVERSED" => Cow::Owned(source.chars().rev().collect::<String>()),
            _ => Cow::Borrowed(self.target.as_str()),
        }
    }
}
/// Configuration for rule-based text translation.
#[derive(Clone, Default, Deserialize, Debug)]
pub struct TranslateTextConfig {
/// The translation rules. They are tested in reverse order during translation.
rules: Vec<TranslateTextRule>,
/// Suffix used when deriving IDs and filenames for translated resources;
/// `"translation"` is used when unset.
#[serde(default)]
id_suffix: Option<String>,
/// When set, characters that no rule matches are dropped instead of copied to the output.
#[serde(default)]
discard_unmatched: bool,
/// When set, only the translated text is produced and no annotations are built.
#[serde(default)]
no_annotations: bool,
/// When set, output is produced even if the translated text equals the input text.
#[serde(default)]
force_when_unchanged: bool,
/// Enables diagnostic output on standard error.
#[serde(default)]
debug: bool,
}
impl TranslateTextConfig {
pub fn from_toml_str(tomlstr: &str, debug: bool) -> Result<Self, String> {
let mut config: Self = toml::from_str(tomlstr).map_err(|e| format!("{}", e))?;
config.debug = debug;
config.compile_regexps()?;
Ok(config)
}
pub fn with_id_suffix(mut self, suffix: impl Into<String>) -> Self {
self.id_suffix = Some(suffix.into());
self
}
pub fn with_force_when_unchanged(mut self) -> Self {
self.force_when_unchanged = true;
self
}
pub fn with_debug(mut self, value: bool) -> Self {
self.debug = value;
self
}
fn compile_regexps<'a>(&'a mut self) -> Result<(), String> {
for rule in self.rules.iter_mut() {
if let Some(v) = rule.source.as_ref() {
if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
let regex = format!("^{}", &v[1..v.len() - 1]);
rule.source_regex = Some(
RegexBuilder::new(®ex)
.case_insensitive(!rule.case_sensitive)
.build()
.map_err(|e| {
format!("Invalid regular expression for source: {}: {}", regex, e)
})?,
);
if self.debug {
eprintln!(
"[stam translatetext] compiled source regex {:?}",
rule.source_regex
)
}
}
}
if let Some(v) = rule.left.as_ref() {
if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
let regex = format!(".*{}$", &v[1..v.len() - 1]);
rule.left_regex = Some(
RegexBuilder::new(®ex)
.case_insensitive(!rule.case_sensitive)
.build()
.map_err(|e| {
format!(
"Invalid regular expression for left context: {}: {}",
regex, e
)
})?,
);
if self.debug {
eprintln!(
"[stam translatetext] compiled left context regex {:?}",
rule.left_regex
)
}
}
}
if let Some(v) = rule.right.as_ref() {
if v.starts_with('/') && v.ends_with('/') && v.len() > 1 {
let regex = format!("^{}.*", &v[1..v.len() - 1]);
rule.right_regex = Some(
RegexBuilder::new(®ex)
.case_insensitive(!rule.case_sensitive)
.build()
.map_err(|e| {
format!(
"Invalid regular expression for right context: {}: {}",
regex, e
)
})?,
);
if self.debug {
eprintln!(
"[stam translatetext] compiled right context regex {:?}",
rule.right_regex
)
}
}
}
if rule.source.is_none() {
return Err("Translation rules must have both a source".into());
}
}
if self.debug {
eprintln!("[stam translatetext] {} rules read", self.rules.len())
}
Ok(())
}
pub fn compile_queries<'a>(&'a self) -> Result<HashMap<String, Query<'a>>, String> {
let mut compiled_queries = HashMap::new();
for rule in self.rules.iter() {
for constraint in rule.constraints.iter() {
if !compiled_queries.contains_key(constraint.query.as_str()) {
compiled_queries.insert(
constraint.query.clone(),
stam::Query::parse(constraint.query.as_str())
.map_err(|err| format!("{}", err))?
.0,
);
}
}
}
Ok(compiled_queries)
}
}
/// Runs the given queries and translates every text resource or text selection they yield
/// according to the rules in `config`.
///
/// For each result row the item named `usevar` (or the last item when `None`) is taken.
/// IDs and filenames of the new resources are derived from the originals by inserting the
/// configured ID suffix (`"translation"` by default); text selections additionally get a
/// running sequence number.
///
/// Returns the builders for the new text resources and for the annotations linking
/// source and target text.
///
/// # Errors
/// Returns a message when a constraint query fails to parse, a query fails to run, a result
/// is neither a text resource nor a text selection, or the translation itself fails.
pub fn translate_text<'store>(
    store: &'store AnnotationStore,
    queries: Vec<Query<'store>>,
    usevar: Option<&'store str>,
    config: &TranslateTextConfig,
) -> Result<(Vec<TextResourceBuilder>, Vec<AnnotationBuilder<'static>>), String> {
    /// Splits a trailing `.txt` or `.md` off a name, returning `(stem, extension)`;
    /// the extension is empty when neither is present.
    fn split_known_extension(name: &str) -> (&str, &str) {
        if let Some(stem) = name.strip_suffix(".txt") {
            (stem, ".txt")
        } else if let Some(stem) = name.strip_suffix(".md") {
            (stem, ".md")
        } else {
            (name, "")
        }
    }

    let mut annotations = Vec::new();
    let mut resourcebuilders = Vec::new();
    let constraint_queries = config.compile_queries()?;
    let suffix = config.id_suffix.as_deref().unwrap_or("translation");
    let mut seqnr = 0;

    for query in queries {
        let rows = store.query(query).map_err(|e| format!("{}", e))?;
        for row in rows {
            // Rows that do not contain the requested variable are skipped.
            let item = match row.get_by_name_or_last(usevar) {
                Ok(item) => item,
                Err(_) => continue,
            };
            match item {
                QueryResultItem::TextResource(resource) => {
                    let resource_id = resource.id().expect("resource must have ID");
                    // The resource ID keeps its original extension ...
                    let (id_stem, id_extension) = split_known_extension(resource_id);
                    let new_resource_id = format!("{}.{}{}", id_stem, suffix, id_extension);
                    // ... whereas the filename always ends in .txt
                    let new_filename = resource.as_ref().filename().map(|filename| {
                        let (file_stem, _) = split_known_extension(filename);
                        format!("{}.{}.txt", file_stem, suffix)
                    });
                    translate_text_helper(
                        config,
                        store,
                        resource.text(),
                        resource,
                        0,
                        new_resource_id,
                        new_filename,
                        &mut resourcebuilders,
                        &mut annotations,
                        &constraint_queries,
                    )?;
                }
                QueryResultItem::TextSelection(textselection) => {
                    seqnr += 1;
                    let resource = textselection.resource();
                    let new_resource_id = format!(
                        "{}.{}.{}",
                        resource.id().expect("resource must have ID"),
                        suffix,
                        seqnr
                    );
                    let new_filename = resource.as_ref().filename().map(|filename| {
                        let (file_stem, _) = split_known_extension(filename);
                        format!("{}.{}.{}.txt", file_stem, suffix, seqnr)
                    });
                    translate_text_helper(
                        config,
                        store,
                        textselection.text(),
                        &resource,
                        textselection.begin(),
                        new_resource_id,
                        new_filename,
                        &mut resourcebuilders,
                        &mut annotations,
                        &constraint_queries,
                    )?;
                }
                _ => {
                    return Err(
                        "translatetext is only implemented for resources and text selections at the moment"
                            .into(),
                    );
                }
            }
        }
    }
    Ok((resourcebuilders, annotations))
}
fn translate_text_helper<'store, 'a>(
config: &TranslateTextConfig,
store: &'store AnnotationStore,
text: &'store str,
resource: &ResultItem<'store, TextResource>,
baseoffset: usize,
new_resource_id: String,
new_filename: Option<String>,
resourcebuilders: &mut Vec<TextResourceBuilder>,
annotations: &mut Vec<AnnotationBuilder<'static>>,
constraint_queries: &HashMap<String, Query<'a>>,
) -> Result<(), String> {
let mut new_text =
String::with_capacity(text.len() + (0.1 * text.len() as f64).round() as usize);
let mut sourceselectors: Vec<SelectorBuilder<'static>> = Vec::new();
let mut targetselectors: Vec<SelectorBuilder<'static>> = Vec::new();
let mut skipbytes = 0;
let mut targetcharpos = 0;
for (charpos, (bytepos, c)) in text.char_indices().enumerate() {
if skipbytes > 0 {
skipbytes -= c.len_utf8();
continue;
}
let mut foundrule = false;
for rule in config.rules.iter().rev() {
if let Some(m) = rule.test(text, bytepos) {
if !rule.constraints.is_empty() {
let mut constraints_match = true; let sourcecharlen = m.source.chars().count();
let source = resource
.textselection(&Offset::simple(charpos, charpos + sourcecharlen))
.map_err(|e| format!("Failed to extract source: {}", e))?;
let left = resource
.textselection(&Offset::new(
Cursor::BeginAligned(0),
Cursor::BeginAligned(charpos),
))
.map_err(|e| format!("Failed to extract left context: {}", e))?;
let right = resource
.textselection(&Offset::new(
Cursor::BeginAligned(charpos + sourcecharlen), Cursor::EndAligned(0),
))
.map_err(|e| format!("Failed to extract right context: {}", e))?;
for constraint in rule.constraints.iter() {
let mut query = constraint_queries
.get(constraint.query.as_str())
.expect("constraint query should have been compiled earlier")
.clone();
query.bind_resourcevar("resource", resource);
query.bind_textvar("source", &source);
query.bind_textvar("left", &left);
query.bind_textvar("right", &right);
let mut iter = store
.query(query)
.map_err(|e| format!("Constraint query failed: {}", e))?;
if let Some(result) = iter.next() {
if let Some(testvar) = constraint.test.as_ref() {
if result.get_by_name(testvar.as_str()).is_ok() {
if constraint.invert {
constraints_match = false;
break;
}
} else if !constraint.invert {
constraints_match = false;
break;
}
} else if constraint.invert {
constraints_match = false;
break;
}
} else if !constraint.invert {
constraints_match = false;
break;
}
}
if !constraints_match {
if config.debug {
eprintln!(
"[stam translatetext] @{} failed to matched rule {:?} -> {:?} because of unmet constraints",
charpos, m.source, m.target
)
}
continue; }
}
skipbytes += m.source.len() - c.len_utf8();
if config.debug {
eprintln!(
"[stam translatetext] @{} (byte {}) matched rule {:?} -> {:?}",
charpos, bytepos, m.source, m.target
)
}
new_text += &m.target;
if !config.no_annotations {
sourceselectors.push(SelectorBuilder::TextSelector(
resource.handle().into(),
Offset::simple(
baseoffset + charpos,
baseoffset + charpos + m.source.chars().count(),
),
));
let targetlen = m.target.chars().count();
targetselectors.push(SelectorBuilder::TextSelector(
new_resource_id.clone().into(),
Offset::simple(targetcharpos, targetcharpos + targetlen),
));
targetcharpos += targetlen;
}
foundrule = true;
continue; }
}
if !foundrule && !config.discard_unmatched {
if config.debug {
eprintln!(
"[stam translatetext] @{} (byte {}) no rule matches {:?}, falling back",
charpos, bytepos, c
)
}
new_text.push(c);
if !config.no_annotations {
sourceselectors.push(SelectorBuilder::TextSelector(
resource.handle().into(),
Offset::simple(baseoffset + charpos, baseoffset + charpos + 1),
));
targetselectors.push(SelectorBuilder::TextSelector(
new_resource_id.clone().into(),
Offset::simple(targetcharpos, targetcharpos + 1),
));
}
targetcharpos += 1;
}
}
if !config.force_when_unchanged && new_text.as_str() == text {
eprintln!(
"[stam translatetext] text for {} has not changed after translation, skipping..",
new_resource_id
);
return Ok(());
}
let mut resourcebuilder = TextResourceBuilder::new()
.with_text(new_text)
.with_id(new_resource_id.clone());
if let Some(new_filename) = new_filename {
resourcebuilder = resourcebuilder.with_filename(new_filename);
}
resourcebuilders.push(resourcebuilder);
if !config.no_annotations {
annotations.push(
AnnotationBuilder::new()
.with_id(format!("{}.translation-source", new_resource_id.as_str()))
.with_target(SelectorBuilder::DirectionalSelector(sourceselectors)),
);
annotations.push(
AnnotationBuilder::new()
.with_id(format!("{}.translation-target", new_resource_id.as_str()))
.with_target(SelectorBuilder::DirectionalSelector(targetselectors)),
);
annotations.push(
AnnotationBuilder::new()
.with_id(format!("{}.translation", new_resource_id.as_str()))
.with_data(
"https://w3id.org/stam/extensions/stam-translate/",
"Translation",
DataValue::Null,
)
.with_target(SelectorBuilder::DirectionalSelector(vec![
SelectorBuilder::AnnotationSelector(
format!("{}.translation-source", &new_resource_id).into(),
None,
),
SelectorBuilder::AnnotationSelector(
format!("{}.translation-target", &new_resource_id).into(),
None,
),
])),
);
}
Ok(())
}