#![cfg_attr(not(test), warn(clippy::unwrap_used))]
pub mod error;
pub mod estarde;
pub mod exporter;
pub mod importer;
pub mod manipulator;
pub mod models;
pub mod progress;
#[cfg(test)]
pub(crate) mod test_util;
pub mod util;
pub mod workflow;
use std::{
fmt::Display,
path::{Path, PathBuf},
};
use error::Result;
use exporter::{
Exporter, conllu::ExportCoNLLU, exmaralda::ExportExmaralda, graphml::GraphMLExporter,
meta::ExportMeta, saltxml::ExportSaltXml, sequence::ExportSequence, table::ExportTable,
textgrid::ExportTextGrid, xlsx::ExportXlsx,
};
use facet::Facet;
use facet_reflect::Peek;
use graphannis::AnnotationGraph;
use importer::{
Importer, conllu::ImportCoNLLU, exmaralda::ImportEXMARaLDA, file_nodes::CreateFileNodes,
graphml::GraphMLImporter, meta::AnnotateCorpus, none::CreateEmptyCorpus, opus::ImportOpusLinks,
ptb::ImportPTB, relannis::ImportRelAnnis, saltxml::ImportSaltXml, table::ImportTable,
textgrid::ImportTextgrid, toolbox::ImportToolBox, treetagger::ImportTreeTagger,
webanno::ImportWebAnnoTSV, whisper::ImportWhisper, xlsx::ImportSpreadsheet, xml::ImportXML,
};
use manipulator::{
Manipulator, align::AlignNodes, check::Check, chunker::Chunk, collapse::Collapse,
enumerate::EnumerateMatches, filter::FilterNodes, link::LinkNodes, map::MapAnnos, no_op::NoOp,
re::Revise, sleep::Sleep, split::SplitValues, time::Filltime, visualize::Visualize,
};
use serde::Serialize;
use serde_derive::Deserialize;
use tabled::Tabled;
use workflow::StatusSender;
use crate::{
exporter::treetagger::ExportTreeTagger,
importer::{GenericImportConfiguration, git::ImportGitMetadata, text::ImportText},
manipulator::{
diff::DiffSubgraphs, divide::DivideSegments, edit::EditGraph, span::CreateSpans,
},
};
/// A name/description pair that derives [`Tabled`], so a value of this type
/// can be used as a table row.
// NOTE(review): presumably one row per configurable option of a module — confirm against callers.
#[derive(Tabled)]
pub struct ModuleConfiguration {
/// Content of the `name` column.
pub name: String,
/// Content of the `description` column.
pub description: String,
}
// Selection of an export module together with its configuration.
//
// With serde the enum is adjacently tagged: the variant is chosen by the
// lowercased `format` key and the payload is read from the `config` key.
//
// Payloads marked `#[serde(default)]` fall back to their `Default` value when
// absent (presumably so that an empty `config` table can be left out — confirm);
// `TextGrid` carries no such attribute.
//
// The variant order defines the `u16` discriminants and must stay stable.
//
// NOTE(review): plain `//` comments are used on purpose — the `Facet` derive may
// expose `///` docs as reflection metadata; confirm before converting to rustdoc.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "format", rename_all = "lowercase", content = "config")]
#[repr(u16)]
pub enum WriteAs {
    // Boxed payload; all other payloads are stored inline.
    CoNLLU(#[serde(default)] Box<ExportCoNLLU>),
    EXMARaLDA(#[serde(default)] ExportExmaralda),
    GraphML(#[serde(default)] GraphMLExporter),
    Meta(#[serde(default)] ExportMeta),
    SaltXml(#[serde(default)] ExportSaltXml),
    Sequence(#[serde(default)] ExportSequence),
    Table(#[serde(default)] ExportTable),
    // No `#[serde(default)]` on this payload.
    TextGrid(ExportTextGrid),
    TreeTagger(#[serde(default)] ExportTreeTagger),
    Xlsx(#[serde(default)] ExportXlsx),
}
impl Default for WriteAs {
fn default() -> Self {
WriteAs::GraphML(GraphMLExporter::default())
}
}
impl WriteAs {
    /// Borrows the payload of the active variant as an [`Exporter`] trait object.
    fn writer(&self) -> &dyn Exporter {
        // Arms follow the declaration order of the enum; the match is kept
        // exhaustive so that a new variant triggers a compile error here.
        match self {
            // The CoNLL-U payload is boxed and has to be borrowed through the box.
            Self::CoNLLU(boxed) => boxed.as_ref(),
            Self::EXMARaLDA(exporter) => exporter,
            Self::GraphML(exporter) => exporter,
            Self::Meta(exporter) => exporter,
            Self::SaltXml(exporter) => exporter,
            Self::Sequence(exporter) => exporter,
            Self::Table(exporter) => exporter,
            Self::TextGrid(exporter) => exporter,
            Self::TreeTagger(exporter) => exporter,
            Self::Xlsx(exporter) => exporter,
        }
    }

    /// Returns the lowercased name of the active variant as reported by
    /// reflection.
    ///
    /// # Errors
    ///
    /// Propagates the error if the value cannot be viewed as an enum or the
    /// active variant cannot be determined.
    pub fn name(&self) -> Result<String> {
        let enum_view = Peek::new(self).into_enum()?;
        let lowered = enum_view.active_variant()?.name.to_lowercase();
        Ok(lowered)
    }
}
// Selection of an import module together with its configuration.
//
// With serde the enum is adjacently tagged: the variant is chosen by the
// lowercased `format` key and the payload is read from the `config` key.
//
// Payloads marked `#[serde(default)]` fall back to their `Default` value when
// absent (presumably so that an empty `config` table can be left out — confirm);
// `Git` and `Xml` carry no such attribute.
//
// The variant order defines the `u16` discriminants and must stay stable.
//
// NOTE(review): plain `//` comments are used on purpose — the `Facet` derive may
// expose `///` docs as reflection metadata; confirm before converting to rustdoc.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "format", rename_all = "lowercase", content = "config")]
#[repr(u16)]
pub enum ReadFrom {
CoNLLU(#[serde(default)] ImportCoNLLU),
EXMARaLDA(#[serde(default)] ImportEXMARaLDA),
// No `#[serde(default)]` on this payload.
Git(ImportGitMetadata),
GraphML(#[serde(default)] GraphMLImporter),
Meta(#[serde(default)] AnnotateCorpus),
None(#[serde(default)] CreateEmptyCorpus),
Opus(#[serde(default)] ImportOpusLinks),
Path(#[serde(default)] CreateFileNodes),
PTB(#[serde(default)] ImportPTB),
RelAnnis(#[serde(default)] ImportRelAnnis),
SaltXml(#[serde(default)] ImportSaltXml),
Table(#[serde(default)] ImportTable),
Text(#[serde(default)] ImportText),
TextGrid(#[serde(default)] ImportTextgrid),
Toolbox(#[serde(default)] ImportToolBox),
TreeTagger(#[serde(default)] ImportTreeTagger),
Webanno(#[serde(default)] ImportWebAnnoTSV),
Whisper(#[serde(default)] ImportWhisper),
Xlsx(#[serde(default)] ImportSpreadsheet),
// No `#[serde(default)]` on this payload.
Xml(ImportXML),
}
impl Default for ReadFrom {
fn default() -> Self {
ReadFrom::None(CreateEmptyCorpus::default())
}
}
impl ReadFrom {
    /// Borrows the payload of the active variant as an [`Importer`] trait object.
    fn reader(&self) -> &dyn Importer {
        // Arms follow the declaration order of the enum; the match is kept
        // exhaustive so that a new variant triggers a compile error here.
        match self {
            Self::CoNLLU(importer) => importer,
            Self::EXMARaLDA(importer) => importer,
            Self::Git(importer) => importer,
            Self::GraphML(importer) => importer,
            Self::Meta(importer) => importer,
            Self::None(importer) => importer,
            Self::Opus(importer) => importer,
            Self::Path(importer) => importer,
            Self::PTB(importer) => importer,
            Self::RelAnnis(importer) => importer,
            Self::SaltXml(importer) => importer,
            Self::Table(importer) => importer,
            Self::Text(importer) => importer,
            Self::TextGrid(importer) => importer,
            Self::Toolbox(importer) => importer,
            Self::TreeTagger(importer) => importer,
            Self::Webanno(importer) => importer,
            Self::Whisper(importer) => importer,
            Self::Xlsx(importer) => importer,
            Self::Xml(importer) => importer,
        }
    }

    /// Returns the lowercased name of the active variant as reported by
    /// reflection.
    ///
    /// # Errors
    ///
    /// Propagates the error if the value cannot be viewed as an enum or the
    /// active variant cannot be determined.
    pub fn name(&self) -> Result<String> {
        let enum_view = Peek::new(self).into_enum()?;
        let lowered = enum_view.active_variant()?.name.to_lowercase();
        Ok(lowered)
    }
}
// Selection of a graph operation (manipulator module) together with its
// configuration.
//
// With serde the enum is adjacently tagged: the variant is chosen by the
// lowercased `action` key and the payload is read from the `config` key.
//
// Payloads marked `#[serde(default)]` fall back to their `Default` value when
// absent (presumably so that an empty `config` table can be left out — confirm).
//
// The variant order defines the `u16` discriminants and must stay stable.
//
// NOTE(review): plain `//` comments are used on purpose — the `Facet` derive may
// expose `///` docs as reflection metadata; confirm before converting to rustdoc.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "action", rename_all = "lowercase", content = "config")]
#[repr(u16)]
pub enum GraphOp {
    Align(AlignNodes),
    Check(Check),
    Collapse(Collapse),
    // Explicitly renamed to `unstable:diff` for both serde and facet.
    #[serde(rename = "unstable:diff")]
    #[facet(rename = "unstable:diff")]
    Diff(DiffSubgraphs),
    Divide(DivideSegments),
    Edit(EditGraph),
    Filter(FilterNodes),
    Visualize(#[serde(default)] Visualize),
    Enumerate(#[serde(default)] EnumerateMatches),
    Link(LinkNodes),
    Map(MapAnnos),
    Revise(#[serde(default)] Revise),
    Span(CreateSpans),
    Time(#[serde(default)] Filltime),
    Chunk(#[serde(default)] Chunk),
    Split(#[serde(default)] SplitValues),
    Sleep(#[serde(default)] Sleep),
    None(#[serde(default)] NoOp),
}
impl Default for GraphOp {
fn default() -> Self {
GraphOp::None(NoOp::default())
}
}
impl GraphOp {
    /// Borrows the payload of the active variant as a [`Manipulator`] trait
    /// object.
    fn processor(&self) -> &dyn Manipulator {
        // Arms follow the declaration order of the enum; the match is kept
        // exhaustive so that a new variant triggers a compile error here.
        match self {
            Self::Align(op) => op,
            Self::Check(op) => op,
            Self::Collapse(op) => op,
            Self::Diff(op) => op,
            Self::Divide(op) => op,
            Self::Edit(op) => op,
            Self::Filter(op) => op,
            Self::Visualize(op) => op,
            Self::Enumerate(op) => op,
            Self::Link(op) => op,
            Self::Map(op) => op,
            Self::Revise(op) => op,
            Self::Span(op) => op,
            Self::Time(op) => op,
            Self::Chunk(op) => op,
            Self::Split(op) => op,
            Self::Sleep(op) => op,
            Self::None(op) => op,
        }
    }

    /// Returns the lowercased name of the active variant as reported by
    /// reflection.
    ///
    /// # Errors
    ///
    /// Propagates the error if the value cannot be viewed as an enum or the
    /// active variant cannot be determined.
    pub fn name(&self) -> Result<String> {
        let enum_view = Peek::new(self).into_enum()?;
        let lowered = enum_view.active_variant()?.name.to_lowercase();
        Ok(lowered)
    }
}
/// Identifier of a single workflow step.
///
/// Two identifiers are equal (and hash alike) when both the module name and
/// the optional path agree.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct StepID {
    /// Label of the step.
    pub module_name: String,
    /// Path associated with the step, if it has one.
    pub path: Option<PathBuf>,
}
impl StepID {
pub fn from_importer_step(step: &ImporterStep) -> StepID {
let module_name = if let Some(label) = &step.description {
label.to_string()
} else {
format!("import_{}", step.module.name().unwrap_or_default())
};
StepID {
module_name,
path: Some(step.path.clone()),
}
}
pub fn from_graphop_step(step: &ManipulatorStep, position_in_workflow: usize) -> StepID {
let module_name = if let Some(label) = &step.description {
label.to_string()
} else {
format!(
"{position_in_workflow}_{}",
step.module.name().unwrap_or_default()
)
};
StepID {
module_name,
path: None,
}
}
pub fn from_exporter_step(step: &ExporterStep) -> StepID {
let module_name = if let Some(label) = &step.description {
label.to_string()
} else {
format!("export_{}", step.module.name().unwrap_or_default())
};
StepID {
module_name,
path: Some(step.path.clone()),
}
}
}
/// Renders as `name (path)` when a path is set and as just `name` otherwise.
/// Paths are converted lossily, so invalid UTF-8 does not cause an error.
impl Display for StepID {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match &self.path {
            Some(location) => write!(f, "{} ({})", self.module_name, location.to_string_lossy()),
            None => write!(f, "{}", self.module_name),
        }
    }
}
/// Marker trait without any methods; in this file it is implemented by
/// `ImporterStep`, `ExporterStep` and `ManipulatorStep`.
pub trait Step {}
/// One import step: an import module, the path it is applied to, an optional
/// label and an optional generic import configuration.
///
/// The field order determines the order of the serialized keys.
// NOTE(review): serde documents `deny_unknown_fields` as unsupported in
// combination with `flatten`; verify that unknown keys are really rejected here.
#[derive(Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ImporterStep {
/// The selected import module; its keys are flattened into this step.
#[serde(flatten)]
module: ReadFrom,
/// Path handed to the importer.
path: PathBuf,
/// Optional custom label; also accepted under the key `label`.
#[serde(default, alias = "label")]
description: Option<String>,
/// Generic import settings, flattened into this step; optional.
#[serde(flatten, default)]
generic_config: Option<GenericImportConfiguration>,
}
impl ImporterStep {
    /// Creates an import step for `module` reading from `path`.
    ///
    /// The step has no description and starts out with the default generic
    /// configuration of the selected importer.
    pub fn new<P>(module: ReadFrom, path: P) -> Self
    where
        P: Into<PathBuf>,
    {
        let generic_config = Some(module.reader().default_configuration());
        Self {
            module,
            path: path.into(),
            description: None,
            generic_config,
        }
    }

    /// Runs the importer on this step's path and returns the resulting graph
    /// update (test builds only).
    ///
    /// Uses the step's generic configuration if one is set and falls back to
    /// the importer's default configuration otherwise.
    ///
    /// # Errors
    ///
    /// Returns whatever error the importer reports.
    #[cfg(test)]
    fn execute(
        &self,
        tx: Option<StatusSender>,
    ) -> std::result::Result<graphannis::update::GraphUpdate, Box<dyn std::error::Error>> {
        let reader = self.module.reader();
        // Only build the importer's default configuration when it is needed.
        let config = self
            .generic_config
            .clone()
            .unwrap_or_else(|| reader.default_configuration());
        // `self` already is a reference, so it is passed on without re-borrowing.
        reader.import_corpus(&self.path, StepID::from_importer_step(self), config, tx)
    }
}
// Marks `ImporterStep` as a `Step`; the trait has no methods to implement.
impl Step for ImporterStep {}
/// One export step: an export module, the path it writes to, an optional
/// label and an optional extension.
///
/// The field order determines the order of the serialized keys.
// NOTE(review): serde documents `deny_unknown_fields` as unsupported in
// combination with `flatten`; verify that unknown keys are really rejected here.
#[derive(Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ExporterStep {
/// The selected export module; its keys are flattened into this step.
#[serde(flatten)]
module: WriteAs,
/// Path handed to the exporter.
path: PathBuf,
/// Optional custom label; also accepted under the key `label`.
#[serde(default, alias = "label")]
description: Option<String>,
/// Optional extension.
// NOTE(review): not read anywhere in this file; presumably a file extension
// consumed elsewhere — confirm.
#[serde(default)]
extension: Option<String>,
}
impl ExporterStep {
    /// Creates an export step for `module` writing to `path`.
    ///
    /// The step has neither a description nor an extension.
    pub fn new<P>(module: WriteAs, path: P) -> Self
    where
        P: Into<PathBuf>,
    {
        Self {
            module,
            path: path.into(),
            description: None,
            extension: None,
        }
    }

    /// Runs the exporter for `graph` on this step's path (test builds only).
    ///
    /// # Errors
    ///
    /// Returns whatever error the exporter reports.
    #[cfg(test)]
    fn execute(
        &self,
        graph: &AnnotationGraph,
        tx: Option<StatusSender>,
    ) -> std::result::Result<(), Box<dyn std::error::Error>> {
        // `self` already is a reference, so it is passed on without re-borrowing.
        let step_id = StepID::from_exporter_step(self);
        self.module
            .writer()
            .export_corpus(graph, &self.path, step_id, tx)
    }
}
// Marks `ExporterStep` as a `Step`; the trait has no methods to implement.
impl Step for ExporterStep {}
/// One graph operation step: a manipulator module, an optional workflow
/// directory and an optional label.
///
/// The field order determines the order of the serialized keys.
// NOTE(review): serde documents `deny_unknown_fields` as unsupported in
// combination with `flatten`; verify that unknown keys are really rejected here.
#[derive(Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ManipulatorStep {
/// The selected graph operation; its keys are flattened into this step.
#[serde(flatten)]
module: GraphOp,
/// Optional workflow directory stored with the step.
// NOTE(review): `execute` takes the directory as an argument and does not
// read this field — confirm where it is consumed.
workflow_directory: Option<PathBuf>,
/// Optional custom label; also accepted under the key `label`.
#[serde(default, alias = "label")]
description: Option<String>,
}
impl ManipulatorStep {
    /// Creates a graph operation step for `module` with an optional workflow
    /// directory and no description.
    pub fn new<P>(module: GraphOp, workflow_directory: Option<P>) -> Self
    where
        P: Into<PathBuf>,
    {
        Self {
            module,
            workflow_directory: workflow_directory.map(Into::into),
            description: None,
        }
    }

    /// Validates `graph` with the selected manipulator and, if that succeeds,
    /// applies the manipulation.
    ///
    /// The `workflow_directory` argument is forwarded to the manipulator; the
    /// step's own `workflow_directory` field is not consulted here.
    /// `position_in_workflow` is only used to build the step identifier.
    ///
    /// # Errors
    ///
    /// Returns the validation error without manipulating the graph, or the
    /// error reported by the manipulation itself.
    fn execute(
        &self,
        graph: &mut AnnotationGraph,
        workflow_directory: &Path,
        position_in_workflow: usize,
        tx: Option<StatusSender>,
    ) -> std::result::Result<(), Box<dyn std::error::Error>> {
        let processor = self.module.processor();
        let step_id = StepID::from_graphop_step(self, position_in_workflow);
        // Identifier and sender are cloned because both calls take them by value.
        processor.validate_graph(graph, step_id.clone(), tx.clone())?;
        processor.manipulate_corpus(graph, workflow_directory, step_id, tx)
    }
}
// Marks `ManipulatorStep` as a `Step`; the trait has no methods to implement.
impl Step for ManipulatorStep {}
#[cfg(test)]
mod tests {
    use std::fs;

    use insta::assert_snapshot;
    use serde::de::DeserializeOwned;

    use crate::{GraphOp, ReadFrom, WriteAs, workflow::Workflow};

    /// Reads the TOML fixture at `path` and tries to deserialize it as `E`.
    ///
    /// Panics with the path and the I/O error if the fixture cannot be read;
    /// the deserialization result itself is returned to the caller.
    fn deserialize_toml<E: DeserializeOwned>(path: &str) -> Result<E, toml::de::Error> {
        let toml_string = fs::read_to_string(path)
            .unwrap_or_else(|err| panic!("could not read fixture `{path}`: {err}"));
        toml::from_str(&toml_string)
    }

    /// Fails the test with the deserialization error if `path` cannot be
    /// parsed as `E`.
    fn assert_deserializes<E: DeserializeOwned>(path: &str) {
        if let Err(err) = deserialize_toml::<E>(path) {
            panic!("`{path}` should deserialize, but failed: {err:?}");
        }
    }

    #[test]
    fn deser_read_from_pass() {
        assert_deserializes::<ReadFrom>("tests/deser/deser_read_from.toml");
    }

    #[test]
    fn deser_read_from_fail_unknown() {
        assert!(deserialize_toml::<ReadFrom>("tests/deser/deser_read_from_fail.toml").is_err());
    }

    #[test]
    fn deser_graph_op_pass() {
        assert_deserializes::<GraphOp>("tests/deser/deser_graph_op.toml");
    }

    #[test]
    fn deser_graph_op_fail_unknown() {
        assert!(deserialize_toml::<GraphOp>("tests/deser/deser_graph_op_fail.toml").is_err());
    }

    #[test]
    fn deser_write_as_pass() {
        assert_deserializes::<WriteAs>("tests/deser/deser_write_as.toml");
    }

    #[test]
    fn deser_write_as_fail_unknown() {
        assert!(deserialize_toml::<WriteAs>("tests/deser/deser_write_as_fail.toml").is_err());
    }

    #[test]
    fn deserialize_with_custom_id() {
        if let Err(err) =
            deserialize_toml::<Workflow>("tests/deser/workflow-with-custom-labels.toml")
        {
            panic!("Err: {err:?}");
        }
    }

    // The function name is part of the snapshot name and must not change.
    #[test]
    fn deserialize_with_generic_config() {
        let workflow =
            deserialize_toml::<Workflow>("tests/deser/workflow-with-generic-config.toml")
                .expect("workflow with generic config should deserialize");
        let import_step = &workflow.import_steps().unwrap()[0];
        let generic_config = import_step
            .generic_config
            .as_ref()
            .expect("import step should carry a generic configuration");
        assert_eq!(
            generic_config.custom_root_name().as_ref().unwrap(),
            "custom_corpus_root"
        );
        assert_eq!(generic_config.extensions(), &["xml"]);
        assert_snapshot!(toml::to_string(&workflow).unwrap());
    }
}