use std::path::PathBuf;
use encoding_rs::Encoding;
use encoding_rs_io::DecodeReaderBytesBuilder;
use facet::Facet;
use graphannis::update::{GraphUpdate, UpdateEvent};
use graphannis_core::graph::ANNIS_NS;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use crate::{
importer::{
GenericImportConfiguration, Importer,
text::tokenizer::{Token, TreeTaggerTokenizer},
},
progress::ProgressReporter,
util,
};
mod tokenizer;
/// Importer configuration for plain text files whose content is split into
/// tokens by the configured tokenizer.
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq, Default)]
#[serde(deny_unknown_fields)]
pub struct ImportText {
/// Label of the character encoding of the input files. The label is
/// resolved with `Encoding::for_label` when a file is read. If this is not
/// set, no explicit encoding is configured on the decoder.
#[serde(default)]
file_encoding: Option<String>,
/// The tokenizer used to split the text of each file into tokens.
#[serde(default)]
tokenizer: Tokenizer,
}
/// Selects the tokenizer implementation. In the configuration the variant is
/// chosen with the `strategy` key, written in snake case (e.g. `treetagger`).
#[derive(Facet, Deserialize, Serialize, Clone, PartialEq)]
#[repr(u8)]
#[serde(tag = "strategy", rename_all = "snake_case")]
pub enum Tokenizer {
/// Tokenize the text with the `TreeTaggerTokenizer`.
Treetagger {
/// Language handed to the tokenizer when it is created, e.g. `fr`.
/// NOTE(review): presumably a language code; confirm the accepted
/// values in the tokenizer module.
language: String,
},
}
impl Default for Tokenizer {
fn default() -> Self {
Tokenizer::Treetagger {
language: "".to_string(),
}
}
}
impl Importer for ImportText {
    /// Builds the corpus structure for the files found below `input_path`
    /// and appends the token nodes of every file to the returned update.
    ///
    /// # Errors
    ///
    /// Returns an error if the corpus structure, the tokenizer or the
    /// progress reporter cannot be created, or if importing any single file
    /// fails. The first failing file aborts the import.
    fn import_corpus(
        &self,
        input_path: &std::path::Path,
        step_id: crate::StepID,
        config: GenericImportConfiguration,
        tx: Option<crate::workflow::StatusSender>,
    ) -> Result<GraphUpdate, Box<dyn std::error::Error>> {
        let mut update = GraphUpdate::default();
        let all_files =
            util::graphupdate::import_corpus_graph_from_files(&mut update, input_path, &config)?;

        // One tokenizer is created up front; every file gets its own clone.
        let tokenizer = match &self.tokenizer {
            Tokenizer::Treetagger { language } => TreeTaggerTokenizer::new(language.into())?,
        };

        let reporter = ProgressReporter::new(tx, step_id.clone(), all_files.len())?;

        // Files are processed one after another in the order they were found.
        for (file_path, document_name) in all_files {
            let mapper = TextfileMapper {
                progress: &reporter,
                path: file_path.to_path_buf(),
                doc_node_name: document_name.to_string(),
                file_encoding: self.file_encoding.clone(),
                tokenizer: tokenizer.clone(),
            };
            mapper.import_textfile(&mut update)?;
        }

        Ok(update)
    }

    /// Plain text files are recognized by the `txt` extension by default.
    fn default_file_extensions(&self) -> &[&str] {
        &["txt"]
    }
}
/// Holds everything needed to import a single text file as the tokens of one
/// document node.
struct TextfileMapper<'a> {
/// Reporter used to announce that the file is being processed.
progress: &'a ProgressReporter,
/// Path of the text file to read.
path: PathBuf,
/// Name of the document node. Token node names are prefixed with it and
/// every token is connected to it with a `PartOf` edge.
doc_node_name: String,
/// Optional encoding label used to decode the file.
file_encoding: Option<String>,
/// Tokenizer applied to the decoded file content.
tokenizer: TreeTaggerTokenizer,
}
impl<'a> TextfileMapper<'a> {
    /// Reads the file at `self.path`, tokenizes its content and records the
    /// resulting token nodes and their ordering in `update`.
    ///
    /// Token nodes are named `<doc_node_name>#tok<index>`. After all token
    /// nodes have been added, consecutive tokens are connected with an
    /// `Ordering` edge. A file without any tokens adds no nodes and no edges.
    ///
    /// # Errors
    ///
    /// Returns an error if the progress message cannot be sent, the file
    /// cannot be opened, tokenization fails or an event cannot be added to
    /// `update`.
    fn import_textfile(&self, update: &mut GraphUpdate) -> anyhow::Result<()> {
        self.progress
            .info(format!("Processing {}", &self.path.to_string_lossy()))?;

        // Configure the builder in place instead of cloning a temporary.
        let mut decoder_builder = DecodeReaderBytesBuilder::new();
        if let Some(encoding) = &self.file_encoding {
            // NOTE(review): `Encoding::for_label` yields `None` for an unknown
            // label, in which case no explicit encoding is set and the
            // misconfiguration goes unnoticed. Consider reporting it.
            decoder_builder.encoding(Encoding::for_label(encoding.as_bytes()));
        }

        let f = std::fs::File::open(&self.path)?;
        let reader = decoder_builder.build(&f);
        let token_strings = self.tokenizer.tokenize(reader)?;

        let prefix = format!("{}#", self.doc_node_name);
        let token_name = |index: usize| format!("{}tok{}", prefix, index);

        for (i, t) in token_strings.iter().enumerate() {
            let Token {
                value: t,
                whitespace_after: ws,
            } = t;
            create_token_node(
                update,
                &token_name(i),
                t,
                ws.as_ref(),
                &self.doc_node_name,
            )?;
        }

        // Start at the second token and link it to its predecessor. The range
        // is empty for zero or one tokens, so an empty file can no longer
        // underflow `len() - 1`.
        for i in 1..token_strings.len() {
            update.add_event(UpdateEvent::AddEdge {
                source_node: token_name(i - 1),
                target_node: token_name(i),
                layer: ANNIS_NS.to_string(),
                component_type: "Ordering".to_string(),
                component_name: "".to_string(),
            })?;
        }
        Ok(())
    }
}
/// Adds the events describing a single token node to `update`.
///
/// The events are added in this order: the node itself (type `node`), its
/// `tok` label holding `token_value`, the optional `tok-whitespace-after`
/// label when `whitespace_after` is given, and finally a `PartOf` edge from
/// the token to `document_node`.
///
/// # Errors
///
/// Returns an error as soon as one of the events cannot be added.
pub fn create_token_node(
    update: &mut GraphUpdate,
    node_name: &str,
    token_value: &str,
    whitespace_after: Option<&String>,
    document_node: &str,
) -> anyhow::Result<()> {
    // Builds a label event in the `annis` namespace for this token node.
    let annis_label = |name: &str, value: &str| UpdateEvent::AddNodeLabel {
        node_name: node_name.to_owned(),
        anno_ns: ANNIS_NS.to_owned(),
        anno_name: name.to_owned(),
        anno_value: value.to_owned(),
    };

    update.add_event(UpdateEvent::AddNode {
        node_name: node_name.to_owned(),
        node_type: "node".to_owned(),
    })?;
    update.add_event(annis_label("tok", token_value))?;

    if let Some(whitespace) = whitespace_after {
        update.add_event(annis_label("tok-whitespace-after", whitespace.as_str()))?;
    }

    update.add_event(UpdateEvent::AddEdge {
        source_node: node_name.to_owned(),
        target_node: document_node.to_owned(),
        layer: ANNIS_NS.to_owned(),
        component_type: "PartOf".to_owned(),
        component_name: String::new(),
    })?;

    Ok(())
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use graphannis::AnnotationGraph;
    use insta::assert_snapshot;

    use crate::{
        exporter::graphml::GraphMLExporter,
        importer::{GenericImportConfiguration, Importer, text::ImportText},
        test_util::export_to_string,
        util::update_graph_silent,
    };

    /// Imports the corpus in `corpus_dir` with an importer configured from
    /// `importer_toml`, applies the update to a fresh graph and returns the
    /// GraphML export (with stable ordering) as a string.
    ///
    /// The snapshot assertion is left to the calling test function.
    fn import_and_export(corpus_dir: &str, importer_toml: &str) -> String {
        let import_path = Path::new(corpus_dir);
        let importer: ImportText = toml::from_str(importer_toml).unwrap();
        let step_id = crate::StepID {
            module_name: "test_text".to_string(),
            path: Some(import_path.to_path_buf()),
        };
        let import_config = GenericImportConfiguration::new_with_default_extensions(&importer);

        let import_result = importer.import_corpus(import_path, step_id, import_config, None);
        assert!(import_result.is_ok(), "Err: {:?}", import_result.err());
        let mut update = import_result.unwrap();

        let graph_result = AnnotationGraph::with_default_graphstorages(true);
        assert!(graph_result.is_ok());
        let mut graph = graph_result.unwrap();
        assert!(update_graph_silent(&mut graph, &mut update).is_ok());

        let exporter: GraphMLExporter = toml::from_str("stable_order = true").unwrap();
        let exported = export_to_string(&graph, exporter);
        assert!(exported.is_ok());
        exported.unwrap()
    }

    /// An empty configuration falls back to the default tokenizer.
    #[test]
    fn import_text_with_default_tokenizer() {
        let actual = import_and_export("tests/data/import/text/example/", "");
        assert_snapshot!(actual);
    }

    /// The TreeTagger strategy can be configured with an explicit language.
    #[test]
    fn import_text_with_french_tokenizer() {
        let actual = import_and_export(
            "tests/data/import/text/example_french/",
            r#"tokenizer = {strategy = "treetagger", language="fr"}"#,
        );
        assert_snapshot!(actual);
    }
}