use crate::importer::GenericImportConfiguration;
use crate::progress::ProgressReporter;
use super::{Importer, NODE_NAME_ENCODE_SET};
use anyhow::{Result, anyhow};
use facet::Facet;
use graphannis::model::AnnotationComponentType;
use graphannis::update::{GraphUpdate, UpdateEvent};
use percent_encoding::utf8_percent_encode;
use serde::{Deserialize, Serialize};
use graphannis_core::serializer::KeyVec;
use graphannis_core::{
graph::{ANNIS_NS, DEFAULT_NS},
serializer::KeySerializer,
types::{AnnoKey, Component, Edge, NodeID},
util::disk_collections::DiskMap,
};
use itertools::Itertools;
use lazy_static::lazy_static;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::File;
use std::io::prelude::*;
use std::ops::Bound;
use std::path::{Path, PathBuf};
lazy_static! {
/// Placeholder annotation value: a string consisting of the single character `char::MAX`.
/// `load_node_anno_tab` substitutes it when the value column of a node annotation is `NULL`.
static ref INVALID_STRING: String = std::char::MAX.to_string();
}
/// Importer that reads a directory of tab-separated relANNIS tables (`*.tab` files, or
/// `*.annis` files when an `annis.version` file marks the corpus as version 3.3) and turns
/// them into graph update events.
///
/// The struct has no fields, so there is nothing to configure; because of
/// `deny_unknown_fields`, deserialization rejects any key that is supplied.
#[derive(Facet, Default, Deserialize, Serialize, Clone, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct ImportRelAnnis {}
impl Importer for ImportRelAnnis {
    /// Imports the relANNIS corpus stored in the directory `input_path`.
    ///
    /// The node and corpus tables are loaded first, then the edge tables, and finally the
    /// automatically generated coverage edges are added. All changes are collected in the
    /// returned [`GraphUpdate`].
    ///
    /// The corpus is treated as ANNIS 3.3 (files with the `.annis` extension and the 3.3
    /// column layout) if the directory contains an `annis.version` file whose content is
    /// `3.3`; surrounding whitespace such as a trailing newline is ignored.
    ///
    /// # Errors
    ///
    /// Returns an error if `input_path` is not an existing directory, if the version file
    /// cannot be read, or if any of the loading steps fails.
    fn import_corpus(
        &self,
        input_path: &std::path::Path,
        step_id: crate::StepID,
        _config: GenericImportConfiguration,
        tx: Option<crate::workflow::StatusSender>,
    ) -> Result<graphannis::update::GraphUpdate, Box<dyn std::error::Error>> {
        let progress = ProgressReporter::new_unknown_total_work(tx, step_id.clone())?;

        // `is_dir()` is only true for paths that exist, so no separate `exists()` check is
        // needed.
        if !input_path.is_dir() {
            return Err(anyhow!("directory {} not found", input_path.to_string_lossy()).into());
        }

        let annis_version_path = input_path.join("annis.version");
        let is_annis_33 = if annis_version_path.exists() {
            let mut file = File::open(&annis_version_path)?;
            let mut version_str = std::string::String::new();
            file.read_to_string(&mut version_str)?;
            // Editors and export tools commonly terminate the file with a newline; compare
            // the trimmed content so that "3.3\n" is still recognized.
            version_str.trim() == "3.3"
        } else {
            false
        };

        let mut updates = GraphUpdate::new();
        let load_node_and_corpus_result =
            load_node_and_corpus_tables(input_path, &mut updates, is_annis_33, &progress)?;

        let text_coverage_edges = load_edge_tables(
            input_path,
            &mut updates,
            is_annis_33,
            &load_node_and_corpus_result.id_to_node_name,
            &progress,
        )?;
        calculate_automatic_coverage_edges(
            &mut updates,
            &load_node_and_corpus_result,
            &text_coverage_edges,
            &progress,
        )?;

        Ok(updates)
    }

    /// relANNIS corpora are directories, so no file extension is associated with this
    /// importer.
    fn default_file_extensions(&self) -> &[&str] {
        &[]
    }
}
/// Name of the node label that `add_white_space_token_labels` attaches to the first token of
/// a text to hold the text that precedes it.
const TOK_WHITESPACE_BEFORE: &str = "tok-whitespace-before";
/// Name of the node label that `add_white_space_token_labels` attaches to a token to hold the
/// text between the end of this token and the next token (or the end of the text).
const TOK_WHITESPACE_AFTER: &str = "tok-whitespace-after";
/// Encodes `val` as a key fragment: the UTF-8 bytes of the string followed by a single
/// terminating NUL byte.
pub fn create_str_key(val: &str) -> KeyVec {
    let mut key = KeyVec::default();
    key.extend(val.bytes());
    // The NUL byte marks the end of the string inside a composite key.
    key.push(0);
    key
}
/// A numeric value (`val`) that belongs to one text of one corpus and, optionally, to a named
/// segmentation. In this file it is used for character offsets, alignment positions and
/// token/segmentation indices, both as key and as value of the `DiskMap` tables.
#[derive(Eq, PartialEq, PartialOrd, Ord, Hash, Clone, Debug, Serialize, Deserialize)]
pub struct TextProperty {
// Name of the segmentation; the empty string is used for entries that do not belong to a
// named segmentation (e.g. the token entries created in `load_node_tab`).
segmentation: String,
// ID of the corpus (document) the text belongs to.
corpus_id: u32,
// ID of the text.
text_id: u32,
// The actual position or index value.
val: u32,
}
impl KeySerializer for TextProperty {
    /// Serializes the property as: NUL-terminated segmentation name, followed by the
    /// big-endian bytes of `corpus_id`, `text_id` and `val` (in this order).
    fn create_key(&self) -> KeyVec {
        let mut result = KeyVec::new();
        result.extend(create_str_key(&self.segmentation));
        result.extend(self.corpus_id.to_be_bytes());
        result.extend(self.text_id.to_be_bytes());
        result.extend(self.val.to_be_bytes());
        result
    }

    /// Parses a key in the layout written by `create_key`.
    ///
    /// The three numeric fields are read from the last 12 bytes of the key; the segmentation
    /// name is everything before the first NUL byte (decoded lossily as UTF-8).
    ///
    /// # Errors
    ///
    /// Returns an error instead of panicking when the key is too short to contain the three
    /// `u32` fields.
    fn parse_key(
        key: &[u8],
    ) -> std::result::Result<Self, Box<dyn std::error::Error + Send + Sync>> {
        let id_size = std::mem::size_of::<u32>();
        // A plain subtraction would underflow (and panic) for keys shorter than 12 bytes.
        let mut id_offset = key
            .len()
            .checked_sub(id_size * 3)
            .ok_or("key is too short to contain corpus ID, text ID and value")?;

        // Only the bytes before the first NUL byte belong to the segmentation name. A NUL
        // byte is always a character boundary, so decoding just this prefix yields the same
        // string as decoding the whole key and cutting at the first NUL character.
        let segmentation_bytes = key
            .iter()
            .position(|b| *b == 0)
            .map_or(key, |end| &key[..end]);
        let segmentation: String = String::from_utf8_lossy(segmentation_bytes).into();

        let corpus_id = u32::from_be_bytes(key[id_offset..(id_offset + id_size)].try_into()?);
        id_offset += id_size;
        let text_id = u32::from_be_bytes(key[id_offset..(id_offset + id_size)].try_into()?);
        id_offset += id_size;
        let val = u32::from_be_bytes(key[id_offset..(id_offset + id_size)].try_into()?);

        Ok(TextProperty {
            segmentation,
            corpus_id,
            text_id,
            val,
        })
    }
}
/// Key under which the primary texts are stored.
#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Debug)]
struct TextKey {
// ID of the text.
id: u32,
// ID of the corpus the text belongs to. `parse_text_tab` can only fill this for ANNIS 3.3
// files; for older files it is `None` until `load_node_tab` re-inserts the text with the
// corpus reference taken from a node row.
corpus_ref: Option<u32>,
}
impl KeySerializer for TextKey {
    /// Writes the big-endian text ID, followed by the big-endian corpus reference if there
    /// is one. The key is therefore either 4 or 8 bytes long.
    fn create_key(&self) -> KeyVec {
        let mut key = KeyVec::new();
        key.extend(self.id.to_be_bytes());
        match self.corpus_ref {
            Some(corpus_ref) => key.extend(corpus_ref.to_be_bytes()),
            None => {}
        }
        key
    }

    /// Reads the text ID from the first 4 bytes. A corpus reference is only parsed when
    /// exactly 4 further bytes follow; any other length yields `corpus_ref: None`.
    fn parse_key(
        key: &[u8],
    ) -> std::result::Result<Self, Box<dyn std::error::Error + Send + Sync>> {
        const ID_SIZE: usize = std::mem::size_of::<u32>();
        let (id_bytes, remainder) = key.split_at(ID_SIZE);
        let id = u32::from_be_bytes(id_bytes.try_into()?);
        let corpus_ref = if remainder.len() == ID_SIZE {
            Some(u32::from_be_bytes(remainder.try_into()?))
        } else {
            None
        };
        Ok(TextKey { id, corpus_ref })
    }
}
/// Key of the `nodes_by_text` table: one entry per node row, ordered by text, then corpus,
/// then node ID (the order of the fields in the serialized key).
#[derive(Clone, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize, Debug)]
struct NodeByTextEntry {
// Value of the `text_ref` column of the node.
text_id: u32,
// Value of the `corpus_ref` column of the node.
corpus_ref: u32,
// Value of the `id` column of the node.
node_id: NodeID,
}
impl KeySerializer for NodeByTextEntry {
    /// Concatenates the big-endian bytes of `text_id`, `corpus_ref` and `node_id`.
    fn create_key(&self) -> KeyVec {
        let mut key = KeyVec::new();
        key.extend(self.text_id.to_be_bytes());
        key.extend(self.corpus_ref.to_be_bytes());
        key.extend(self.node_id.to_be_bytes());
        key
    }

    /// Splits the key into two `u32` fields followed by the node ID, which must occupy
    /// all remaining bytes.
    fn parse_key(
        key: &[u8],
    ) -> std::result::Result<Self, Box<dyn std::error::Error + Send + Sync>> {
        const U32_SIZE: usize = std::mem::size_of::<u32>();
        let (text_bytes, remainder) = key.split_at(U32_SIZE);
        let (corpus_bytes, node_bytes) = remainder.split_at(U32_SIZE);
        let text_id = u32::from_be_bytes(text_bytes.try_into()?);
        let corpus_ref = u32::from_be_bytes(corpus_bytes.try_into()?);
        let node_id = NodeID::from_be_bytes(node_bytes.try_into()?);
        Ok(NodeByTextEntry {
            text_id,
            corpus_ref,
            node_id,
        })
    }
}
/// One row of the text table as read by `parse_text_tab`.
#[derive(Serialize, Deserialize, Clone)]
struct Text {
// Content of the `name` column.
name: String,
// Content of the `text` column, i.e. the primary text itself.
val: String,
}
/// One row of the corpus table as read by `parse_corpus_tab`.
struct CorpusTableEntry {
// Value of the `pre` column.
pre: u32,
// Value of the `post` column.
post: u32,
// Corpus/document name; duplicated document names get a
// `_duplicated_document_name_<n>` suffix in `parse_corpus_tab`.
name: String,
// `name` percent-encoded with `NODE_NAME_ENCODE_SET`.
normalized_name: String,
}
/// Result of `parse_corpus_tab`.
struct ParsedCorpusTable {
// Name of the corpus entry with the smallest pre-order value.
toplevel_corpus_name: String,
// Maps the pre-order value of a corpus entry to its ID.
corpus_by_preorder: BTreeMap<u32, u32>,
// Maps the ID of a corpus entry to its parsed row.
corpus_by_id: BTreeMap<u32, CorpusTableEntry>,
}
/// Position lookup tables that `load_node_tab` fills while reading the node table.
struct TextPosTable {
// Token node for a given left alignment position (only filled for rows with a token index).
token_by_left_textpos: DiskMap<TextProperty, NodeID>,
// Token node for a given right alignment position (only filled for rows with a token index).
token_by_right_textpos: DiskMap<TextProperty, NodeID>,
// Node for a given token index; also contains segmentation nodes keyed by segmentation
// name and segmentation index.
token_by_index: DiskMap<TextProperty, NodeID>,
// Token index for a token node (inverse of the token entries in `token_by_index`).
token_to_index: DiskMap<NodeID, TextProperty>,
// Left alignment position of every node.
node_to_left: DiskMap<NodeID, TextProperty>,
// Right alignment position of every node.
node_to_right: DiskMap<NodeID, TextProperty>,
// Value of the `left` column (character offset) of every node.
node_to_left_char: DiskMap<NodeID, TextProperty>,
// Value of the `right` column (character offset) of every node.
node_to_right_char: DiskMap<NodeID, TextProperty>,
}
/// Tables created by `load_rank_tab` and returned from `load_edge_tables`.
struct LoadRankResult {
// Component for a `u32` key — presumably the `pre` value of a rank entry; TODO confirm
// against `load_rank_tab`.
components_by_pre: DiskMap<u32, Component<AnnotationComponentType>>,
// Edge for a `u32` key — presumably the `pre` value of a rank entry; TODO confirm against
// `load_rank_tab`.
edges_by_pre: DiskMap<u32, Edge>,
// Edges that `add_automatic_cov_edge_for_node` treats as already existing text coverage
// edges: no automatic edge is generated for an edge contained in this map.
text_coverage_edges: DiskMap<Edge, bool>,
// Component looked up by node ID in `add_automatic_cov_edge_for_node` when the node has no
// entry in `text_coverage_edges`; its layer and name are then used for the generated edge.
component_for_parentless_target_node: DiskMap<NodeID, Component<AnnotationComponentType>>,
}
/// Result of `load_node_and_corpus_tables`: the subset of the node loading result that is
/// still needed when edges are loaded and coverage edges are calculated.
struct LoadNodeAndCorpusResult {
// Maps the numeric node ID from the node table to the generated node name.
id_to_node_name: DiskMap<NodeID, String>,
// Position lookup tables for all nodes.
textpos_table: TextPosTable,
}
/// Result of `load_node_tab`.
struct NodeTabParseResult {
// One entry per node row, keyed by text, corpus and node ID; the value is always `true`.
nodes_by_text: DiskMap<NodeByTextEntry, bool>,
// Segmentation nodes for which the node table provided no span value, mapped to their
// segmentation name. `load_node_anno_tab` uses it to derive the `tok` label from the
// annotation that has the same name as the segmentation.
missing_seg_span: DiskMap<NodeID, String>,
// Maps the numeric node ID from the node table to the generated node name.
id_to_node_name: DiskMap<NodeID, String>,
// Position lookup tables for all nodes.
textpos_table: TextPosTable,
}
/// Result of `load_nodes`: the node table result without the `missing_seg_span` table, which
/// is only needed while the node annotations are loaded.
struct LoadNodeResult {
// One entry per node row, keyed by text, corpus and node ID.
nodes_by_text: DiskMap<NodeByTextEntry, bool>,
// Maps the numeric node ID from the node table to the generated node name.
id_to_node_name: DiskMap<NodeID, String>,
// Position lookup tables for all nodes.
textpos_table: TextPosTable,
}
fn load_node_and_corpus_tables(
path: &Path,
updates: &mut GraphUpdate,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<LoadNodeAndCorpusResult> {
let corpus_table = parse_corpus_tab(path, is_annis_33, progress)?;
let mut texts = parse_text_tab(path, is_annis_33, progress)?;
let corpus_id_to_annos = load_corpus_annotation(path, is_annis_33, progress)?;
let load_nodes_result = load_nodes(
path,
updates,
&mut texts,
&corpus_table,
is_annis_33,
progress,
)?;
add_subcorpora(
updates,
&corpus_table,
&load_nodes_result,
&texts,
&corpus_id_to_annos,
is_annis_33,
path,
)?;
add_white_space_token_labels(
updates,
&load_nodes_result.textpos_table,
&mut texts,
&load_nodes_result.id_to_node_name,
progress,
)?;
Ok(LoadNodeAndCorpusResult {
id_to_node_name: load_nodes_result.id_to_node_name,
textpos_table: load_nodes_result.textpos_table,
})
}
/// Loads the component table, the rank table and the edge annotations (in this order) and
/// records the resulting events in `updates`.
///
/// Returns the tables created while reading the rank table.
fn load_edge_tables(
    path: &Path,
    updates: &mut GraphUpdate,
    is_annis_33: bool,
    id_to_node_name: &DiskMap<NodeID, String>,
    progress: &ProgressReporter,
) -> Result<LoadRankResult> {
    let components = load_component_tab(path, is_annis_33, progress)?;
    let rank_result = load_rank_tab(
        path,
        updates,
        &components,
        id_to_node_name,
        is_annis_33,
        progress,
    )?;
    // The component table is only needed while the rank table is read.
    drop(components);

    load_edge_annotation(
        path,
        updates,
        &rank_result,
        id_to_node_name,
        is_annis_33,
        progress,
    )?;
    Ok(rank_result)
}
/// Registers every regular file of the `ExtData` folder (or of `ExtData/<document>` when
/// `document` is given) as a node of type `file`.
///
/// Each file node is named `<parent_node_full_name>/<file name>`, gets an `annis::file`
/// label holding the canonicalized path of the file, and is connected to the parent node
/// with a `PartOf` edge. Nothing happens if the folder does not exist.
fn add_external_data_files(
    import_path: &Path,
    parent_node_full_name: &str,
    document: Option<&str>,
    updates: &mut GraphUpdate,
) -> Result<()> {
    let ext_data = match document {
        Some(document) => import_path.join("ExtData").join(document),
        None => import_path.join("ExtData"),
    };
    if !ext_data.is_dir() {
        return Ok(());
    }

    for entry in std::fs::read_dir(&ext_data)? {
        let entry = entry?;
        // Sub-directories and other non-regular entries are ignored.
        if !entry.file_type()?.is_file() {
            continue;
        }

        let file_node_name = format!(
            "{}/{}",
            parent_node_full_name,
            entry.file_name().to_string_lossy()
        );
        updates.add_event(UpdateEvent::AddNode {
            node_name: file_node_name.clone(),
            node_type: "file".to_string(),
        })?;
        updates.add_event(UpdateEvent::AddNodeLabel {
            node_name: file_node_name.clone(),
            anno_ns: ANNIS_NS.to_string(),
            anno_name: "file".to_string(),
            anno_value: entry.path().canonicalize()?.to_string_lossy().to_string(),
        })?;
        updates.add_event(UpdateEvent::AddEdge {
            source_node: file_node_name,
            target_node: parent_node_full_name.to_owned(),
            layer: ANNIS_NS.to_owned(),
            component_type: AnnotationComponentType::PartOf.to_string(),
            component_name: std::string::String::default(),
        })?;
    }
    Ok(())
}
/// Opens `path` as a CSV reader configured for the table files of this importer: no header
/// row, tab-separated columns, and the quote character set to the NUL byte.
fn postgresql_import_reader(path: &Path) -> std::result::Result<csv::Reader<File>, csv::Error> {
    let mut builder = csv::ReaderBuilder::new();
    builder.has_headers(false);
    builder.delimiter(b'\t');
    builder.quote(0);
    builder.from_path(path)
}
/// Returns the content of column `i` of `record` with backslash escapes resolved, or `None`
/// if the column holds the literal text `NULL`.
///
/// `column_name` and `file` are only used for the error message that is produced when the
/// record has no column at position `i`.
fn get_field(
    record: &csv::StringRecord,
    i: usize,
    column_name: &str,
    file: &Path,
) -> Result<Option<String>> {
    let raw = match record.get(i) {
        Some(raw) => raw,
        None => {
            return Err(anyhow!(
                "missing column at position {i} ({column_name}) in file {}",
                file.to_string_lossy()
            ));
        }
    };
    Ok((raw != "NULL").then(|| escape_field(raw)))
}
/// Resolves the backslash escape sequences of a raw table field.
///
/// A backslash followed by `\`, `"`, `'`, `` ` `` or `$` is replaced by that character, and
/// `\n`, `\r`, `\t` are replaced by newline, carriage return and tab. A backslash in front
/// of any other character (or at the very end of the input) is copied unchanged.
fn escape_field(val: &str) -> String {
    let mut output = String::new();
    let mut remaining = val.chars();
    while let Some(current) = remaining.next() {
        if current != '\\' {
            output.push(current);
            continue;
        }
        // Look at the character after the backslash without consuming it yet.
        let replacement = match remaining.clone().next() {
            Some(literal @ ('\\' | '"' | '\'' | '`' | '$')) => Some(literal),
            Some('n') => Some('\n'),
            Some('r') => Some('\r'),
            Some('t') => Some('\t'),
            _ => None,
        };
        match replacement {
            Some(resolved) => {
                output.push(resolved);
                // The escaped character has been handled together with the backslash.
                remaining.next();
            }
            // Not a known escape sequence: keep the backslash as it is.
            None => output.push(current),
        }
    }
    output
}
/// Like `get_field`, but treats a `NULL` column as an error.
///
/// The error message names the column, the file and, if the record carries position
/// information, the line of the offending record.
fn get_field_not_null(
    record: &csv::StringRecord,
    i: usize,
    column_name: &str,
    file: &Path,
) -> Result<String> {
    match get_field(record, i, column_name, file)? {
        Some(value) => Ok(value),
        None => {
            let line_number = record
                .position()
                .map_or_else(|| "<unknown>".to_string(), |p| p.line().to_string());
            Err(anyhow!(
                "unexpected value NULL in column {i} ({column_name}) in file {} at line {}",
                file.to_string_lossy(),
                line_number
            ))
        }
    }
}
/// Reads the corpus table (`corpus.annis` for ANNIS 3.3, otherwise `corpus.tab`) located in
/// the directory `path`.
///
/// The columns used are: 0 = `id`, 1 = `name`, 2 = `type`, 4 = `pre`, 5 = `post`. If the
/// same name occurs for more than one row of type `DOCUMENT`, every further occurrence is
/// renamed to `<name>_duplicated_document_name_<n>` and a warning is reported.
///
/// # Errors
///
/// Fails if the file cannot be read, a required column is missing, `NULL` or not a number,
/// or if the table is empty (no top-level corpus).
fn parse_corpus_tab(
    path: &Path,
    is_annis_33: bool,
    progress: &ProgressReporter,
) -> Result<ParsedCorpusTable> {
    let mut corpus_tab_path = PathBuf::from(path);
    corpus_tab_path.push(if is_annis_33 {
        "corpus.annis"
    } else {
        "corpus.tab"
    });
    progress.info(format!(
        "loading {}",
        corpus_tab_path.to_str().unwrap_or_default()
    ))?;

    let mut corpus_by_preorder = BTreeMap::new();
    let mut corpus_by_id = BTreeMap::new();
    let mut corpus_tab_csv = postgresql_import_reader(corpus_tab_path.as_path())?;
    // Counts how often each document name has been seen so far.
    let mut document_names: HashMap<String, usize> = HashMap::new();

    for line in corpus_tab_csv.records() {
        let line = line?;
        let id = get_field_not_null(&line, 0, "id", &corpus_tab_path)?.parse::<u32>()?;
        let mut name = get_field_not_null(&line, 1, "name", &corpus_tab_path)?;
        let corpus_type = get_field_not_null(&line, 2, "type", &corpus_tab_path)?;
        if corpus_type == "DOCUMENT" {
            let existing_count = document_names
                .entry(name.clone())
                .and_modify(|count| *count += 1)
                .or_insert(1);
            if *existing_count > 1 {
                let old_name = name.clone();
                name = format!("{name}_duplicated_document_name_{existing_count}");
                progress.warn(format!(
                    "duplicated document name \"{old_name}\" detected: will be renamed to \"{name}\""
                ))?;
            }
        }
        let pre_order = get_field_not_null(&line, 4, "pre", &corpus_tab_path)?.parse::<u32>()?;
        let post_order = get_field_not_null(&line, 5, "post", &corpus_tab_path)?.parse::<u32>()?;
        let normalized_name = utf8_percent_encode(&name, NODE_NAME_ENCODE_SET);
        corpus_by_id.insert(
            id,
            CorpusTableEntry {
                pre: pre_order,
                post: post_order,
                normalized_name: normalized_name.to_string(),
                name,
            },
        );
        corpus_by_preorder.insert(pre_order, id);
    }

    // The map is ordered by pre-order value, so its first entry is the top-level corpus.
    // `ok_or_else` makes sure the error value is only built when it is actually needed.
    let toplevel_corpus_id = *corpus_by_preorder
        .values()
        .next()
        .ok_or_else(|| anyhow!("toplevel corpus not found"))?;
    let toplevel_corpus_name = corpus_by_id
        .get(&toplevel_corpus_id)
        .ok_or_else(|| anyhow!("corpus with ID {toplevel_corpus_id} not found"))?
        .name
        .clone();

    Ok(ParsedCorpusTable {
        toplevel_corpus_name,
        corpus_by_preorder,
        corpus_by_id,
    })
}
fn parse_text_tab(
path: &Path,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<DiskMap<TextKey, Text>> {
let mut text_tab_path = PathBuf::from(path);
text_tab_path.push(if is_annis_33 {
"text.annis"
} else {
"text.tab"
});
progress.info(format!(
"loading {}",
text_tab_path.to_str().unwrap_or_default()
))?;
let mut texts: DiskMap<TextKey, Text> = DiskMap::default();
let mut text_tab_csv = postgresql_import_reader(text_tab_path.as_path())?;
for result in text_tab_csv.records() {
let line = result?;
let id = get_field_not_null(&line, if is_annis_33 { 1 } else { 0 }, "id", &text_tab_path)?
.parse::<u32>()?;
let name = get_field_not_null(
&line,
if is_annis_33 { 2 } else { 1 },
"name",
&text_tab_path,
)?;
let value = get_field_not_null(
&line,
if is_annis_33 { 3 } else { 2 },
"text",
&text_tab_path,
)?;
let corpus_ref = if is_annis_33 {
Some(get_field_not_null(&line, 0, "corpus_ref", &text_tab_path)?.parse::<u32>()?)
} else {
None
};
let key = TextKey { id, corpus_ref };
texts.insert(key.clone(), Text { name, val: value })?;
}
Ok(texts)
}
/// Adds an `Ordering` edge between every pair of consecutive entries of `token_by_index`
/// that belong to the same corpus, text and segmentation.
///
/// Edges between entries without a segmentation name are put into the `ANNIS_NS` layer with
/// an empty component name; edges of a named segmentation use the `DEFAULT_NS` layer and the
/// segmentation name as component name.
///
/// # Errors
///
/// Fails if a table cannot be read or if a node ID has no entry in `id_to_node_name`.
fn calculate_automatic_token_order(
    updates: &mut GraphUpdate,
    token_by_index: &DiskMap<TextProperty, NodeID>,
    id_to_node_name: &DiskMap<NodeID, String>,
    progress: &ProgressReporter,
) -> Result<()> {
    progress.info("calculating the automatically generated Ordering edges")?;

    let mut last_textprop: Option<TextProperty> = None;
    let mut last_token: Option<NodeID> = None;

    for token in token_by_index.iter()? {
        let (current_textprop, current_token) = token?;

        // Only connect entries that directly follow each other inside the same text and
        // segmentation.
        if let Some(last_token) = last_token
            && let Some(last_textprop) = &last_textprop
            && last_textprop.corpus_id == current_textprop.corpus_id
            && last_textprop.text_id == current_textprop.text_id
            && last_textprop.segmentation == current_textprop.segmentation
        {
            let ordering_layer = if current_textprop.segmentation.is_empty() {
                ANNIS_NS.to_owned()
            } else {
                DEFAULT_NS.to_owned()
            };
            updates.add_event(UpdateEvent::AddEdge {
                source_node: id_to_node_name
                    .get(&last_token)?
                    .ok_or_else(|| anyhow!("node with ID {last_token} not found"))?
                    .to_string(),
                target_node: id_to_node_name
                    .get(&current_token)?
                    .ok_or_else(|| anyhow!("node with ID {current_token} not found"))?
                    .to_string(),
                layer: ordering_layer,
                component_type: AnnotationComponentType::Ordering.to_string(),
                component_name: current_textprop.segmentation.clone(),
            })?;
        }

        // The current entry is not needed any longer, so it can be moved instead of cloned.
        last_textprop = Some(current_textprop);
        last_token = Some(current_token);
    }
    Ok(())
}
/// Adds `Coverage` edges from the node `n` to every token between its left-aligned and its
/// right-aligned token (both inclusive).
///
/// The aligned tokens are looked up by the left and right alignment position of `n`. If only
/// one of the two lookups succeeds, that token is used for both ends. No edge is added for a
/// token that is `n` itself or for which an edge is already contained in
/// `load_rank_result.text_coverage_edges`.
///
/// The component of a generated edge is `annis/autogenerated-coverage` if `n` already has
/// any entry in `text_coverage_edges`; otherwise layer and name are taken from
/// `component_for_parentless_target_node` (both empty if there is no entry for `n`).
///
/// # Errors
///
/// Fails if a table cannot be read, if positions or aligned tokens for `n` are missing, or
/// if a node name cannot be resolved.
fn add_automatic_cov_edge_for_node(
    updates: &mut GraphUpdate,
    n: NodeID,
    load_node_and_corpus_result: &LoadNodeAndCorpusResult,
    load_rank_result: &LoadRankResult,
) -> Result<()> {
    let textpos_table = &load_node_and_corpus_result.textpos_table;
    let id_to_node_name = &load_node_and_corpus_result.id_to_node_name;

    let left_pos = textpos_table
        .node_to_left
        .get(&n)?
        .ok_or_else(|| anyhow!("can't get left position of node {n}"))?;
    let right_pos = textpos_table
        .node_to_right
        .get(&n)?
        .ok_or_else(|| anyhow!("can't get right position of node {n}"))?;

    let left_aligned_tok: Option<NodeID> = textpos_table
        .token_by_left_textpos
        .get(&left_pos)?
        .map(|v| *v);
    let right_aligned_tok: Option<NodeID> = textpos_table
        .token_by_right_textpos
        .get(&right_pos)?
        .map(|v| *v);

    // Fall back to the other side if only one of the aligned tokens could be found.
    let left_aligned_tok = left_aligned_tok.or(right_aligned_tok).ok_or_else(|| {
        anyhow!("can't find any aligned token (left or right) for node with ID {n}")
    })?;
    let right_aligned_tok = right_aligned_tok.unwrap_or(left_aligned_tok);

    let left_tok_pos = textpos_table
        .token_to_index
        .get(&left_aligned_tok)?
        .ok_or_else(|| anyhow!("can't get left aligned token for node with ID {n}"))?;
    let right_tok_pos = textpos_table
        .token_to_index
        .get(&right_aligned_tok)?
        .ok_or_else(|| anyhow!("can't get right aligned token for node with ID {n}"))?;

    // Lookup key for the tokens of the text; only the index changes inside the loop.
    let mut tok_idx = TextProperty {
        segmentation: String::default(),
        corpus_id: left_tok_pos.corpus_id,
        text_id: left_tok_pos.text_id,
        val: 0,
    };

    for i in left_tok_pos.val..=right_tok_pos.val {
        tok_idx.val = i;
        let tok_id: NodeID = *textpos_table
            .token_by_index
            .get(&tok_idx)?
            .ok_or_else(|| anyhow!("can't get token ID for position {tok_idx:?}"))?;
        if n == tok_id {
            continue;
        }

        let edge = Edge {
            source: n,
            target: tok_id,
        };
        // Explicitly imported coverage edges are never duplicated.
        if load_rank_result.text_coverage_edges.contains_key(&edge)? {
            continue;
        }

        // All possible edges that start at `n`.
        let nodes_with_same_source = (
            Bound::Included(Edge {
                source: n,
                target: NodeID::MIN,
            }),
            Bound::Included(Edge {
                source: n,
                target: NodeID::MAX,
            }),
        );
        let has_any_outgoing_text_coverage_edge = load_rank_result
            .text_coverage_edges
            .range(nodes_with_same_source)
            .next()
            .is_some();

        let (component_layer, component_name) = if has_any_outgoing_text_coverage_edge {
            (ANNIS_NS.into(), "autogenerated-coverage".into())
        } else {
            load_rank_result
                .component_for_parentless_target_node
                .get(&n)?
                .map_or_else(
                    || (String::default(), String::default()),
                    |c| (c.layer.to_string(), c.name.to_string()),
                )
        };

        updates.add_event(UpdateEvent::AddEdge {
            source_node: id_to_node_name
                .get(&n)?
                .ok_or_else(|| anyhow!("node with ID {n} not found"))?
                .to_string(),
            target_node: id_to_node_name
                .get(&tok_id)?
                .ok_or_else(|| anyhow!("node with ID {tok_id} not found"))?
                .to_string(),
            layer: component_layer,
            component_type: AnnotationComponentType::Coverage.to_string(),
            component_name,
        })?;
    }
    Ok(())
}
/// Generates the automatic `Coverage` edges for all nodes that have a left alignment entry
/// without a segmentation name and that are not tokens themselves.
///
/// A failure for an individual node is reported as a warning and does not abort the
/// import; only errors while reading the tables or reporting progress are propagated.
fn calculate_automatic_coverage_edges(
    updates: &mut GraphUpdate,
    load_node_and_corpus_result: &LoadNodeAndCorpusResult,
    load_rank_result: &LoadRankResult,
    progress: &ProgressReporter,
) -> Result<()> {
    progress.info("calculating the automatically generated Coverage edges")?;

    let textpos_table = &load_node_and_corpus_result.textpos_table;
    for entry in textpos_table.node_to_left.iter()? {
        let (node_id, left_position) = entry?;
        if !left_position.segmentation.is_empty() {
            continue;
        }
        // Tokens do not need coverage edges to other tokens.
        if textpos_table.token_to_index.contains_key(&node_id)? {
            continue;
        }
        let outcome = add_automatic_cov_edge_for_node(
            updates,
            node_id,
            load_node_and_corpus_result,
            load_rank_result,
        );
        if let Err(e) = outcome {
            progress.warn(format!(
                "Adding coverage edges (connects spans with tokens) failed: {e}"
            ))?;
        }
    }
    Ok(())
}
/// Stores the parts of each primary text that are not covered by a token as labels on the
/// neighbouring tokens.
///
/// For every text, the tokens without a segmentation name are visited in index order:
/// - the first token gets a `tok-whitespace-before` label holding the text between the
///   start of the text and the left character offset of the token (only if there is any),
/// - every token gets a `tok-whitespace-after` label holding the text between its right
///   character offset and the left character offset of the next token, or the rest of the
///   text for the last token.
///
/// Character offsets are counted in `char`s of the text value. Tokens without character
/// offsets or without a node name are skipped.
fn add_white_space_token_labels(
    updates: &mut GraphUpdate,
    textpos_table: &TextPosTable,
    texts: &mut DiskMap<TextKey, Text>,
    id_to_node_name: &DiskMap<NodeID, String>,
    progress: &ProgressReporter,
) -> Result<()> {
    progress.info("adding non-tokenized primary text segments as white-space label to tokens")?;
    let mut added_whitespace_label_count = 0;

    for text in texts.iter()? {
        let (text_key, text) = text?;
        // The text is consumed strictly from left to right; `current_text_offset` is the
        // number of characters already taken from the iterator.
        let mut text_char_it = text.val.chars();
        let mut current_text_offset = 0;

        // Range that selects all base tokens of this text.
        let min_text_prop = TextProperty {
            corpus_id: text_key.corpus_ref.unwrap_or_default(),
            text_id: text_key.id,
            segmentation: "".into(),
            val: u32::MIN,
        };
        let max_text_prop = TextProperty {
            corpus_id: text_key.corpus_ref.unwrap_or_default(),
            text_id: text_key.id,
            segmentation: "".into(),
            val: u32::MAX,
        };

        let mut previous_token_id = None;
        let mut token_iterator = textpos_table
            .token_by_index
            .range(min_text_prop..max_text_prop)
            .peekable();

        while let Some(token) = token_iterator.next() {
            let (_, current_token_id) = token?;
            if let Some(left_text_pos) = textpos_table.node_to_left_char.get(&current_token_id)?
                && let Some(right_text_pos) =
                    textpos_table.node_to_right_char.get(&current_token_id)?
            {
                let token_left_char = left_text_pos.val as usize;
                let token_right_char = right_text_pos.val as usize;

                if previous_token_id.is_none() && current_text_offset < token_left_char {
                    // This is the first token and there is text in front of it.
                    let mut covered_text_before =
                        std::string::String::with_capacity(token_left_char - current_text_offset);
                    let mut skipped_before_token = 0;
                    // Take every character up to (but excluding) the first character of
                    // the token; stopping one character earlier would lose the last
                    // character of the leading text.
                    for _ in current_text_offset..token_left_char {
                        if let Some(c) = text_char_it.next() {
                            covered_text_before.push(c);
                            skipped_before_token += 1;
                        }
                    }
                    current_text_offset += skipped_before_token;

                    if let Some(token_name) = id_to_node_name.get(&current_token_id)? {
                        updates.add_event(UpdateEvent::AddNodeLabel {
                            node_name: token_name.to_string(),
                            anno_ns: ANNIS_NS.to_string(),
                            anno_name: TOK_WHITESPACE_BEFORE.to_string(),
                            anno_value: covered_text_before,
                        })?;
                        added_whitespace_label_count += 1;
                    }
                }

                // Skip the characters that are covered by the token itself.
                let mut skipped_token_characters = 0;
                for _ in current_text_offset..token_right_char {
                    if text_char_it.next().is_some() {
                        skipped_token_characters += 1;
                    }
                }
                current_text_offset += skipped_token_characters;

                // The text after the token ends where the next token starts (if any).
                let mut whitespace_end_pos = None;
                if let Some(Ok((_, next_token_id))) = token_iterator.peek()
                    && let Some(next_token_left_pos) =
                        textpos_table.node_to_left_char.get(next_token_id)?
                {
                    whitespace_end_pos = Some(next_token_left_pos.val as usize);
                }

                let mut covered_text_after = if let Some(end_pos) = whitespace_end_pos {
                    std::string::String::with_capacity(end_pos.saturating_sub(current_text_offset))
                } else {
                    std::string::String::default()
                };

                if let Some(end_pos) = whitespace_end_pos {
                    let covered_text_start = current_text_offset;
                    for _ in covered_text_start..end_pos {
                        if let Some(c) = text_char_it.next() {
                            covered_text_after.push(c);
                            current_text_offset += 1;
                        }
                    }
                } else {
                    // No following token: the rest of the text belongs to this token.
                    for c in text_char_it.by_ref() {
                        covered_text_after.push(c);
                        current_text_offset += 1;
                    }
                }

                if let Some(token_name) = id_to_node_name.get(&current_token_id)? {
                    updates.add_event(UpdateEvent::AddNodeLabel {
                        node_name: token_name.to_string(),
                        anno_ns: ANNIS_NS.to_string(),
                        anno_name: TOK_WHITESPACE_AFTER.to_string(),
                        anno_value: covered_text_after,
                    })?;
                    added_whitespace_label_count += 1;
                }
            }
            previous_token_id = Some(current_token_id);
        }
    }

    progress.info(format!(
        "added {added_whitespace_label_count} non-tokenized primary text segments as white-space labels to the existing tokens"
    ))?;
    Ok(())
}
/// Reads the node table (`node.annis` for ANNIS 3.3, otherwise `node.tab`) located in the
/// directory `path`.
///
/// For every row an `AddNode` event (plus optional `annis::layer` and `annis::tok` labels) is
/// added to `updates`, and the lookup tables of the returned `NodeTabParseResult` are filled.
/// For pre-3.3 files, `texts` is modified: a text stored without corpus reference is
/// re-inserted under a key that carries the `corpus_ref` of the first node row referring to it.
/// After the file has been read, the tables are compacted and the automatic `Ordering` edges
/// are calculated if any token or segmentation index exists.
///
/// Columns used (0-based): 0 `id`, 1 `text_ref`, 2 `corpus_ref`, 3 `layer`, 4 `name`,
/// 5 `left`, 6 `right`, 7 `token_index`; the remaining columns depend on the format version
/// and are described next to the code that reads them.
fn load_node_tab(
path: &Path,
updates: &mut GraphUpdate,
texts: &mut DiskMap<TextKey, Text>,
corpus_table: &ParsedCorpusTable,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<NodeTabParseResult> {
let mut nodes_by_text: DiskMap<NodeByTextEntry, bool> = DiskMap::default();
let mut missing_seg_span: DiskMap<NodeID, String> = DiskMap::default();
let mut id_to_node_name: DiskMap<NodeID, String> = DiskMap::default();
let mut node_tab_path = PathBuf::from(path);
node_tab_path.push(if is_annis_33 {
"node.annis"
} else {
"node.tab"
});
progress.info(format!(
"loading {}",
node_tab_path.to_str().unwrap_or_default()
))?;
let mut textpos_table = TextPosTable {
token_by_left_textpos: DiskMap::default(),
token_by_right_textpos: DiskMap::default(),
node_to_left: DiskMap::default(),
node_to_right: DiskMap::default(),
token_by_index: DiskMap::default(),
token_to_index: DiskMap::default(),
node_to_left_char: DiskMap::default(),
node_to_right_char: DiskMap::default(),
};
// The CSV reader lives in its own scope so that it is dropped before the tables are
// compacted.
{
let mut node_tab_csv = postgresql_import_reader(node_tab_path.as_path())?;
for result in node_tab_csv.records() {
let line = result?;
let node_nr = get_field_not_null(&line, 0, "id", &node_tab_path)?.parse::<NodeID>()?;
// ANNIS 3.3 rows always have segmentation columns; older rows only if they have more
// than 10 columns.
let has_segmentations = is_annis_33 || line.len() > 10;
// `None` (a NULL column) means that this row is not a token.
let token_index_raw = get_field(&line, 7, "token_index", &node_tab_path)?;
let text_id =
get_field_not_null(&line, 1, "text_ref", &node_tab_path)?.parse::<u32>()?;
let corpus_id =
get_field_not_null(&line, 2, "corpus_ref", &node_tab_path)?.parse::<u32>()?;
let layer = get_field(&line, 3, "layer", &node_tab_path)?;
let node_name = get_field_not_null(&line, 4, "name", &node_tab_path)?;
nodes_by_text.insert(
NodeByTextEntry {
corpus_ref: corpus_id,
text_id,
node_id: node_nr,
},
true,
)?;
// Pre-3.3 text tables have no corpus reference: move the text to a key that includes the
// corpus of this node. Once moved, the lookup without corpus reference finds nothing.
if !is_annis_33 {
let text_key_without_corpus = TextKey {
id: text_id,
corpus_ref: None,
};
if let Some(existing_text) = texts.remove(&text_key_without_corpus)? {
let text_key = TextKey {
id: text_id,
corpus_ref: Some(corpus_id),
};
texts.insert(text_key, existing_text)?;
}
}
// The node name is the path of the corpus, a `#`, and the name from the node table.
let node_path = format!(
"{}#{}",
get_corpus_path(corpus_id, corpus_table)?,
node_name
);
updates.add_event(UpdateEvent::AddNode {
node_name: node_path.clone(),
node_type: "node".to_owned(),
})?;
id_to_node_name.insert(node_nr, node_path.clone())?;
// Empty layer names are not stored as label.
if let Some(layer) = layer
&& !layer.is_empty()
{
updates.add_event(UpdateEvent::AddNodeLabel {
node_name: node_path.clone(),
anno_ns: ANNIS_NS.to_owned(),
anno_name: "layer".to_owned(),
anno_value: layer.to_string(),
})?;
}
// Character offsets (columns 5 and 6) are stored for every node.
let left_char_val =
get_field_not_null(&line, 5, "left", &node_tab_path)?.parse::<u32>()?;
let left_char_pos = TextProperty {
segmentation: String::from(""),
val: left_char_val,
corpus_id,
text_id,
};
let right_char_val =
get_field_not_null(&line, 6, "right", &node_tab_path)?.parse::<u32>()?;
let right_char_pos = TextProperty {
segmentation: String::from(""),
val: right_char_val,
corpus_id,
text_id,
};
textpos_table
.node_to_left_char
.insert(node_nr, left_char_pos)?;
textpos_table
.node_to_right_char
.insert(node_nr, right_char_pos)?;
// The alignment positions come from columns 8 and 9 in ANNIS 3.3; older files reuse the
// character offset columns 5 and 6.
let left_alignment_column = if is_annis_33 { 8 } else { 5 };
let right_alignment_column = if is_annis_33 { 9 } else { 6 };
let left_alignment_val =
get_field_not_null(&line, left_alignment_column, "left_token", &node_tab_path)?
.parse::<u32>()?;
let left_alignment = TextProperty {
segmentation: String::from(""),
val: left_alignment_val,
corpus_id,
text_id,
};
let right_alignment_val =
get_field_not_null(&line, right_alignment_column, "right_token", &node_tab_path)?
.parse::<u32>()?;
let right_alignment = TextProperty {
segmentation: String::from(""),
val: right_alignment_val,
corpus_id,
text_id,
};
textpos_table
.node_to_left
.insert(node_nr, left_alignment.clone())?;
textpos_table
.node_to_right
.insert(node_nr, right_alignment.clone())?;
if let Some(token_index_raw) = token_index_raw {
// Token row: the covered text is in column 12 if segmentation columns exist, otherwise in
// column 9.
let span = if has_segmentations {
get_field_not_null(&line, 12, "span", &node_tab_path)?
} else {
get_field_not_null(&line, 9, "span", &node_tab_path)?
};
updates.add_event(UpdateEvent::AddNodeLabel {
node_name: node_path,
anno_ns: ANNIS_NS.to_owned(),
anno_name: "tok".to_owned(),
anno_value: span.to_string(),
})?;
let index = TextProperty {
segmentation: String::from(""),
val: token_index_raw.parse::<u32>()?,
text_id,
corpus_id,
};
textpos_table
.token_by_index
.insert(index.clone(), node_nr)?;
textpos_table.token_to_index.insert(node_nr, index)?;
textpos_table
.token_by_left_textpos
.insert(left_alignment, node_nr)?;
textpos_table
.token_by_right_textpos
.insert(right_alignment, node_nr)?;
} else if has_segmentations {
// Non-token row that may belong to a segmentation: name in column 11 (3.3) or 8 (older),
// index in column 10 (3.3) or 9 (older).
let segmentation_name = if is_annis_33 {
get_field(&line, 11, "seg_name", &node_tab_path)?
} else {
get_field(&line, 8, "seg_name", &node_tab_path)?
};
if let Some(segmentation_name) = segmentation_name {
let seg_index = if is_annis_33 {
get_field_not_null(&line, 10, "seg_index", &node_tab_path)?
.parse::<u32>()?
} else {
get_field_not_null(&line, 9, "seg_index", &node_tab_path)?.parse::<u32>()?
};
// Only ANNIS 3.3 rows are asked for the span of a segmentation node.
let span_for_seg_node = if is_annis_33 {
get_field(&line, 12, "span", &node_tab_path)?
} else {
None
};
if let Some(span_for_seg_node) = span_for_seg_node {
updates.add_event(UpdateEvent::AddNodeLabel {
node_name: node_path,
anno_ns: ANNIS_NS.to_owned(),
anno_name: "tok".to_owned(),
anno_value: span_for_seg_node.to_string(),
})?;
} else {
// Remember the node so that `load_node_anno_tab` can take the span from the annotation
// named like the segmentation.
missing_seg_span.insert(node_nr, segmentation_name.clone())?;
}
let index = TextProperty {
segmentation: segmentation_name,
val: seg_index,
corpus_id,
text_id,
};
// Segmentation nodes share the index table with the tokens, separated by the
// segmentation name in the key.
textpos_table.token_by_index.insert(index, node_nr)?;
} } }
}
progress.info(format!(
"creating index for content of {}",
&node_tab_path.to_string_lossy()
))?;
// Compact all tables after the bulk insert.
id_to_node_name.compact()?;
nodes_by_text.compact()?;
missing_seg_span.compact()?;
textpos_table.node_to_left.compact()?;
textpos_table.node_to_right.compact()?;
textpos_table.node_to_left_char.compact()?;
textpos_table.node_to_right_char.compact()?;
textpos_table.token_to_index.compact()?;
textpos_table.token_by_index.compact()?;
textpos_table.token_by_left_textpos.compact()?;
textpos_table.token_by_right_textpos.compact()?;
// Ordering edges are only calculated if at least one token or segmentation index exists.
if !(textpos_table.token_by_index.is_empty())? {
calculate_automatic_token_order(
updates,
&textpos_table.token_by_index,
&id_to_node_name,
progress,
)?;
}
Ok(NodeTabParseResult {
nodes_by_text,
missing_seg_span,
id_to_node_name,
textpos_table,
})
}
fn load_node_anno_tab(
path: &Path,
updates: &mut GraphUpdate,
missing_seg_span: &DiskMap<NodeID, String>,
id_to_node_name: &DiskMap<NodeID, String>,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<()> {
let mut node_anno_tab_path = PathBuf::from(path);
node_anno_tab_path.push(if is_annis_33 {
"node_annotation.annis"
} else {
"node_annotation.tab"
});
progress.info(format!(
"loading {}",
node_anno_tab_path.to_str().unwrap_or_default()
))?;
let mut node_anno_tab_csv = postgresql_import_reader(node_anno_tab_path.as_path())?;
for line in node_anno_tab_csv.records() {
let line = line?;
let col_id = get_field_not_null(&line, 0, "id", &node_anno_tab_path)?;
let node_id: NodeID = col_id.parse()?;
let node_name = id_to_node_name
.get(&node_id)?
.ok_or_else(|| anyhow!("node with ID {node_id} not found"))?;
let col_ns = get_field(&line, 1, "namespace", &node_anno_tab_path)?.unwrap_or_default();
let col_name = get_field_not_null(&line, 2, "name", &node_anno_tab_path)?;
let col_val = get_field(&line, 3, "value", &node_anno_tab_path)?;
if col_ns != "annis" || col_name != "tok" {
let has_valid_value = col_val.is_some();
let anno_val = &col_val.unwrap_or_else(|| INVALID_STRING.clone());
if let Some(seg) = missing_seg_span.get(&node_id)?
&& seg.as_str() == col_name.as_str()
&& has_valid_value
{
updates.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.to_string(),
anno_ns: ANNIS_NS.to_owned(),
anno_name: "tok".to_owned(),
anno_value: anno_val.to_string(),
})?;
}
updates.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.to_string(),
anno_ns: col_ns.to_string(),
anno_name: col_name.to_string(),
anno_value: anno_val.to_string(),
})?;
}
}
Ok(())
}
fn load_component_tab(
path: &Path,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<BTreeMap<u32, Component<AnnotationComponentType>>> {
let mut component_tab_path = PathBuf::from(path);
component_tab_path.push(if is_annis_33 {
"component.annis"
} else {
"component.tab"
});
progress.info(format!(
"loading {}",
component_tab_path.to_str().unwrap_or_default()
))?;
let mut component_by_id: BTreeMap<u32, Component<AnnotationComponentType>> = BTreeMap::new();
let mut component_tab_csv = postgresql_import_reader(component_tab_path.as_path())?;
for result in component_tab_csv.records() {
let line = result?;
let cid: u32 = get_field_not_null(&line, 0, "id", &component_tab_path)?.parse()?;
if let Some(col_type) = get_field(&line, 1, "type", &component_tab_path)? {
let layer = get_field(&line, 2, "layer", &component_tab_path)?.unwrap_or_default();
let name = get_field(&line, 3, "name", &component_tab_path)?.unwrap_or_default();
let ctype = component_type_from_short_name(&col_type)?;
component_by_id.insert(cid, Component::new(ctype, layer, name));
}
}
Ok(component_by_id)
}
/// Loads the node table and afterwards the node annotation table.
///
/// `load_node_tab` is run first; its `missing_seg_span` and `id_to_node_name`
/// outputs are handed to `load_node_anno_tab`. The remaining outputs
/// (`nodes_by_text`, `id_to_node_name`, `textpos_table`) are returned to the
/// caller as a [`LoadNodeResult`].
///
/// # Errors
/// Propagates any error raised by either of the two loading steps.
fn load_nodes(
    path: &Path,
    updates: &mut GraphUpdate,
    texts: &mut DiskMap<TextKey, Text>,
    corpus_table: &ParsedCorpusTable,
    is_annis_33: bool,
    progress: &ProgressReporter,
) -> Result<LoadNodeResult> {
    let parsed = load_node_tab(path, updates, texts, corpus_table, is_annis_33, progress)?;

    // The annotation table needs the lookups created while reading the node table.
    load_node_anno_tab(
        path,
        updates,
        &parsed.missing_seg_span,
        &parsed.id_to_node_name,
        is_annis_33,
        progress,
    )?;

    let result = LoadNodeResult {
        nodes_by_text: parsed.nodes_by_text,
        id_to_node_name: parsed.id_to_node_name,
        textpos_table: parsed.textpos_table,
    };
    Ok(result)
}
fn load_rank_tab(
path: &Path,
updates: &mut GraphUpdate,
component_by_id: &BTreeMap<u32, Component<AnnotationComponentType>>,
id_to_node_name: &DiskMap<NodeID, String>,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<LoadRankResult> {
let mut rank_tab_path = PathBuf::from(path);
rank_tab_path.push(if is_annis_33 {
"rank.annis"
} else {
"rank.tab"
});
progress.info(format!(
"loading {}",
rank_tab_path.to_str().unwrap_or_default()
))?;
let mut load_rank_result = LoadRankResult {
components_by_pre: DiskMap::default(),
edges_by_pre: DiskMap::default(),
text_coverage_edges: DiskMap::default(),
component_for_parentless_target_node: DiskMap::default(),
};
let mut rank_tab_csv = postgresql_import_reader(rank_tab_path.as_path())?;
let pos_node_ref = if is_annis_33 { 3 } else { 2 };
let pos_component_ref = if is_annis_33 { 4 } else { 3 };
let pos_parent = if is_annis_33 { 5 } else { 4 };
let mut pre_to_node_id: DiskMap<u32, NodeID> = DiskMap::default();
for result in rank_tab_csv.records() {
let line = result?;
let pre: u32 = get_field_not_null(&line, 0, "pre", &rank_tab_path)?.parse()?;
let node_id: NodeID =
get_field_not_null(&line, pos_node_ref, "node_ref", &rank_tab_path)?.parse()?;
pre_to_node_id.insert(pre, node_id)?;
}
let mut rank_tab_csv = postgresql_import_reader(rank_tab_path.as_path())?;
for result in rank_tab_csv.records() {
let line = result?;
let component_ref: u32 =
get_field_not_null(&line, pos_component_ref, "component_ref", &rank_tab_path)?
.parse()?;
let target: NodeID =
get_field_not_null(&line, pos_node_ref, "node_ref", &rank_tab_path)?.parse()?;
if let Some(parent_as_str) = get_field(&line, pos_parent, "parent", &rank_tab_path)? {
let parent: u32 = parent_as_str.parse()?;
if let Some(source) = pre_to_node_id.get(&parent)?
&& let Some(c) = component_by_id.get(&component_ref)
{
updates.add_event(UpdateEvent::AddEdge {
source_node: id_to_node_name
.get(&source)?
.ok_or_else(|| anyhow!("node with ID {source} not found"))?
.to_string(),
target_node: id_to_node_name
.get(&target)?
.ok_or_else(|| anyhow!("node with ID {target} not found"))?
.to_string(),
layer: c.layer.clone(),
component_type: c.get_type().to_string(),
component_name: c.name.clone(),
})?;
let pre: u32 = get_field_not_null(&line, 0, "pre", &rank_tab_path)?.parse()?;
let e = Edge {
source: *source,
target,
};
if c.get_type() == AnnotationComponentType::Coverage {
load_rank_result
.text_coverage_edges
.insert(e.clone(), true)?;
}
load_rank_result.components_by_pre.insert(pre, c.clone())?;
load_rank_result.edges_by_pre.insert(pre, e)?;
}
} else if let Some(c) = component_by_id.get(&component_ref)
&& c.get_type() == AnnotationComponentType::Coverage
{
load_rank_result
.component_for_parentless_target_node
.insert(target, c.clone())?;
}
}
progress.info(format!(
"creating index for content of {}",
&rank_tab_path.to_string_lossy()
))?;
load_rank_result.components_by_pre.compact()?;
load_rank_result.edges_by_pre.compact()?;
load_rank_result.text_coverage_edges.compact()?;
load_rank_result
.component_for_parentless_target_node
.compact()?;
Ok(load_rank_result)
}
fn load_edge_annotation(
path: &Path,
updates: &mut GraphUpdate,
rank_result: &LoadRankResult,
id_to_node_name: &DiskMap<NodeID, String>,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<()> {
let mut edge_anno_tab_path = PathBuf::from(path);
edge_anno_tab_path.push(if is_annis_33 {
"edge_annotation.annis"
} else {
"edge_annotation.tab"
});
progress.info(format!(
"loading {}",
edge_anno_tab_path.to_str().unwrap_or_default()
))?;
let mut edge_anno_tab_csv = postgresql_import_reader(edge_anno_tab_path.as_path())?;
for result in edge_anno_tab_csv.records() {
let line = result?;
let pre = get_field_not_null(&line, 0, "pre", &edge_anno_tab_path)?.parse::<u32>()?;
if let Some(c) = rank_result.components_by_pre.get(&pre)?
&& let Some(e) = rank_result.edges_by_pre.get(&pre)?
{
let ns = get_field(&line, 1, "namespace", &edge_anno_tab_path)?.unwrap_or_default();
let name = get_field_not_null(&line, 2, "name", &edge_anno_tab_path)?;
let val = get_field(&line, 3, "value", &edge_anno_tab_path)?
.unwrap_or_else(|| INVALID_STRING.clone());
updates.add_event(UpdateEvent::AddEdgeLabel {
source_node: id_to_node_name
.get(&e.source)?
.ok_or_else(|| anyhow!("node with ID {} not found", e.source))?
.to_string(),
target_node: id_to_node_name
.get(&e.target)?
.ok_or_else(|| anyhow!("node with ID {} not found", e.target))?
.to_string(),
layer: c.layer.clone(),
component_type: c.get_type().to_string(),
component_name: c.name.to_string(),
anno_ns: ns.to_string(),
anno_name: name.to_string(),
anno_value: val.to_string(),
})?;
}
}
Ok(())
}
fn load_corpus_annotation(
path: &Path,
is_annis_33: bool,
progress: &ProgressReporter,
) -> Result<BTreeMap<(u32, AnnoKey), std::string::String>> {
let mut corpus_id_to_anno = BTreeMap::new();
let mut corpus_anno_tab_path = PathBuf::from(path);
corpus_anno_tab_path.push(if is_annis_33 {
"corpus_annotation.annis"
} else {
"corpus_annotation.tab"
});
progress.info(format!(
"loading {}",
corpus_anno_tab_path.to_str().unwrap_or_default()
))?;
let mut corpus_anno_tab_csv = postgresql_import_reader(corpus_anno_tab_path.as_path())?;
for result in corpus_anno_tab_csv.records() {
let line = result?;
let id = get_field_not_null(&line, 0, "id", &corpus_anno_tab_path)?.parse::<u32>()?;
let ns = get_field(&line, 1, "namespace", &corpus_anno_tab_path)?.unwrap_or_default();
let name = get_field_not_null(&line, 2, "name", &corpus_anno_tab_path)?;
let val = get_field(&line, 3, "value", &corpus_anno_tab_path)?
.unwrap_or_else(|| INVALID_STRING.clone());
let anno_key = AnnoKey { ns, name };
corpus_id_to_anno.insert((id, anno_key), val.to_string());
}
Ok(corpus_id_to_anno)
}
/// Builds the `/`-separated path of all ancestors of the corpus `cid`.
///
/// Every corpus with a smaller pre-order and a larger post-order value than
/// `cid` is treated as an ancestor. Their normalized names are concatenated in
/// ascending pre-order. The result is empty when there is no such corpus.
///
/// # Errors
/// Fails if `cid` is not contained in `corpus_table.corpus_by_id`.
fn get_parent_path(cid: u32, corpus_table: &ParsedCorpusTable) -> Result<std::string::String> {
    let corpus = corpus_table
        .corpus_by_id
        .get(&cid)
        .ok_or_else(|| anyhow!("corpus with ID {cid} not found"))?;
    let (pre, post) = (corpus.pre, corpus.post);

    let mut ancestors = std::string::String::new();
    let mut is_first = true;
    for (_, candidate_id) in corpus_table.corpus_by_preorder.range(0..pre) {
        // Pre-order entries without a matching corpus are silently ignored.
        let Some(candidate) = corpus_table.corpus_by_id.get(candidate_id) else {
            continue;
        };
        if post < candidate.post {
            if !is_first {
                ancestors.push('/');
            }
            is_first = false;
            ancestors.push_str(&candidate.normalized_name);
        }
    }
    Ok(ancestors)
}
/// Returns the full path of the corpus `cid`: the path of its ancestors as
/// computed by `get_parent_path`, a `/`, and the corpus' own normalized name.
///
/// # Errors
/// Fails if `cid` is not contained in `corpus_table.corpus_by_id`.
fn get_corpus_path(cid: u32, corpus_table: &ParsedCorpusTable) -> Result<String> {
    let parent_path: String = get_parent_path(cid, corpus_table)?;
    let corpus = corpus_table
        .corpus_by_id
        .get(&cid)
        .ok_or_else(|| anyhow!("corpus with ID {cid} not found"))?;
    Ok(format!("{}/{}", parent_path, corpus.normalized_name))
}
/// Adds the corpus structure (top-level corpus, sub-corpora and texts) to `updates`.
///
/// The function works in three steps:
///
/// 1. A `corpus` node named after the top-level corpus is created and labelled
///    with `annis::relannis-version` (`3.3` or `3.2`, depending on
///    `is_annis_33`). If a corpus with pre-order `0` exists, its annotations
///    from `corpus_id_to_annos` are attached to this node and
///    `add_external_data_files` is called for it.
/// 2. For every other corpus in pre-order, a `corpus` node with the full corpus
///    path as name is created. It gets an `annis::doc` label when its post-order
///    directly follows its pre-order, receives its corpus annotations, is
///    connected by a `PartOf` edge to its parent path, and
///    `add_external_data_files` is called for it.
/// 3. For every text that has a corpus reference, a `datasource` node named
///    `<corpus path>#<percent-encoded text name>` is created and connected by a
///    `PartOf` edge to its corpus. Every node listed for that text in
///    `node_node_result.nodes_by_text` is connected to the text node in turn.
///
/// # Errors
/// Fails when an event cannot be added, a referenced corpus or node ID is
/// unknown, a map operation fails, or `add_external_data_files` fails.
fn add_subcorpora(
    updates: &mut GraphUpdate,
    corpus_table: &ParsedCorpusTable,
    node_node_result: &LoadNodeResult,
    texts: &DiskMap<TextKey, Text>,
    corpus_id_to_annos: &BTreeMap<(u32, AnnoKey), std::string::String>,
    is_annis_33: bool,
    path: &Path,
) -> Result<()> {
    // Step 1: the top-level corpus node.
    let toplevel_name = &corpus_table.toplevel_corpus_name;
    updates.add_event(UpdateEvent::AddNode {
        node_name: toplevel_name.as_str().into(),
        node_type: "corpus".into(),
    })?;
    let relannis_version = if is_annis_33 { "3.3" } else { "3.2" };
    updates.add_event(UpdateEvent::AddNodeLabel {
        node_name: toplevel_name.as_str().into(),
        anno_ns: ANNIS_NS.to_owned(),
        anno_name: "relannis-version".to_owned(),
        anno_value: relannis_version.to_owned(),
    })?;
    if let Some(cid) = corpus_table.corpus_by_preorder.get(&0) {
        // The map is ordered by corpus ID first, so all annotations of one corpus
        // are adjacent and start at the smallest possible key for that ID.
        let first_key = (
            *cid,
            AnnoKey {
                ns: "".into(),
                name: "".into(),
            },
        );
        let own_annos = corpus_id_to_annos
            .range(first_key..)
            .take_while(|((entry_cid, _), _)| entry_cid == cid);
        for ((_, anno_key), val) in own_annos {
            updates.add_event(UpdateEvent::AddNodeLabel {
                node_name: toplevel_name.as_str().into(),
                anno_ns: anno_key.ns.clone(),
                anno_name: anno_key.name.clone(),
                anno_value: val.clone(),
            })?;
        }
        add_external_data_files(path, toplevel_name, None, updates)?;
    }

    // Step 2: all remaining corpora, visited in ascending pre-order.
    for (pre, corpus_id) in corpus_table.corpus_by_preorder.iter() {
        if *pre == 0 {
            continue;
        }
        let corpus = corpus_table
            .corpus_by_id
            .get(corpus_id)
            .ok_or_else(|| anyhow!("corpus with ID {corpus_id} not found"))?;
        let corpus_name = &corpus.name;
        let subcorpus_full_name = get_corpus_path(*corpus_id, corpus_table)?;

        updates.add_event(UpdateEvent::AddNode {
            node_name: subcorpus_full_name.clone(),
            node_type: "corpus".to_owned(),
        })?;
        // A corpus whose post-order directly follows its pre-order gets the doc label.
        if corpus.pre + 1 == corpus.post {
            updates.add_event(UpdateEvent::AddNodeLabel {
                node_name: subcorpus_full_name.clone(),
                anno_ns: ANNIS_NS.to_owned(),
                anno_name: "doc".to_owned(),
                anno_value: corpus_name.as_str().into(),
            })?;
        }

        let first_key = (
            *corpus_id,
            AnnoKey {
                ns: "".into(),
                name: "".into(),
            },
        );
        let own_annos = corpus_id_to_annos
            .range(first_key..)
            .take_while(|((entry_cid, _), _)| entry_cid == corpus_id);
        for ((_, anno_key), val) in own_annos {
            updates.add_event(UpdateEvent::AddNodeLabel {
                node_name: subcorpus_full_name.clone(),
                anno_ns: anno_key.ns.clone(),
                anno_name: anno_key.name.clone(),
                anno_value: val.clone(),
            })?;
        }

        let parent_path = get_parent_path(*corpus_id, corpus_table)?;
        updates.add_event(UpdateEvent::AddEdge {
            source_node: subcorpus_full_name.clone(),
            target_node: parent_path,
            layer: ANNIS_NS.to_owned(),
            component_type: AnnotationComponentType::PartOf.to_string(),
            component_name: std::string::String::default(),
        })?;
        add_external_data_files(path, &subcorpus_full_name, Some(corpus_name), updates)?;
    }

    // Step 3: texts and the nodes that belong to them.
    for entry in texts.iter()? {
        let (text_key, text) = entry?;
        // Texts without a corpus reference are not added.
        let Some(corpus_ref) = text_key.corpus_ref else {
            continue;
        };
        let text_name = utf8_percent_encode(&text.name, NODE_NAME_ENCODE_SET).to_string();
        let subcorpus_full_name = get_corpus_path(corpus_ref, corpus_table)?;
        let text_full_name = format!("{subcorpus_full_name}#{text_name}");

        updates.add_event(UpdateEvent::AddNode {
            node_name: text_full_name.clone(),
            node_type: "datasource".to_owned(),
        })?;
        updates.add_event(UpdateEvent::AddEdge {
            source_node: text_full_name.clone(),
            target_node: subcorpus_full_name.clone(),
            layer: ANNIS_NS.to_owned(),
            component_type: AnnotationComponentType::PartOf.to_string(),
            component_name: std::string::String::default(),
        })?;

        // All entries of this text lie between the smallest and largest node ID.
        let min_key = NodeByTextEntry {
            corpus_ref,
            text_id: text_key.id,
            node_id: NodeID::MIN,
        };
        let max_key = NodeByTextEntry {
            corpus_ref,
            text_id: text_key.id,
            node_id: NodeID::MAX,
        };
        for item in node_node_result.nodes_by_text.range(min_key..=max_key) {
            let (text_entry, _) = item?;
            let n = text_entry.node_id;
            let source_node = node_node_result
                .id_to_node_name
                .get(&n)?
                .ok_or_else(|| anyhow!("node with ID {n} not found"))?
                .to_string();
            updates.add_event(UpdateEvent::AddEdge {
                source_node,
                target_node: text_full_name.clone(),
                layer: ANNIS_NS.to_owned(),
                component_type: AnnotationComponentType::PartOf.to_string(),
                component_name: std::string::String::default(),
            })?;
        }
    }
    Ok(())
}
/// Translates the one-letter component type used in the component table into
/// an [`AnnotationComponentType`]: `c` → `Coverage`, `d` → `Dominance`,
/// `p` → `Pointing`, `o` → `Ordering`.
///
/// # Errors
/// Returns an error naming the offending value for any other input.
fn component_type_from_short_name(short_type: &str) -> Result<AnnotationComponentType> {
    let ctype = match short_type {
        "c" => AnnotationComponentType::Coverage,
        "d" => AnnotationComponentType::Dominance,
        "p" => AnnotationComponentType::Pointing,
        "o" => AnnotationComponentType::Ordering,
        _ => return Err(anyhow!("invalid component type short name '{short_type}'")),
    };
    Ok(ctype)
}
// The `tests` module is declared here without a body and is only compiled for test builds.
#[cfg(test)]
mod tests;