use crate::{config::Format, Document};
use serde::{Deserialize, Serialize};
use std::io::Write;
/// Owned wrapper around a single text string.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Text {
    /// The wrapped text.
    pub text: String,
}
/// Entity annotations attached to one text in the spaCy export.
///
/// NOTE(review): spaCy's own training examples appear to name this key
/// `entities`; confirm that serialising it as `entity` is intended.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpacyEntity {
    /// `(start, end, label)` triples. Elsewhere in this file `start..end` is
    /// used to slice the document text, i.e. the offsets are treated as byte
    /// offsets.
    pub entity: Vec<(usize, usize, String)>,
}
impl Format {
pub fn save(&self, annotations: &Vec<Document>, path: &str) -> Result<String, std::io::Error> {
match self {
Format::Spacy => Format::spacy(annotations, path),
Format::Jsonl => Format::jsonl(annotations, path),
Format::Csv => Format::csv(annotations, path),
Format::Brat => Format::brat(annotations, path),
Format::Conll => Format::conll(annotations, path),
}
}
/// Returns `path` with the extension of its final component removed.
///
/// Only the file name is inspected, so dots in directory components
/// (`my.dir/out`, `./out`, `../out`) are left alone, and a file name that
/// merely starts with a dot (`.hidden`) is not treated as an extension.
/// When the file name has several dots only the last suffix is dropped
/// (`a.tar.gz` becomes `a.tar`). Paths without an extension are returned
/// unchanged.
fn remove_extension_from_path(path: &str) -> String {
    // `with_extension("")` strips the extension of the last component and
    // keeps everything before it untouched. The input is valid UTF-8, so the
    // lossy conversion never has to substitute anything.
    std::path::Path::new(path)
        .with_extension("")
        .to_string_lossy()
        .into_owned()
}
/// Writes `documents` to `{path}.json` as a single JSON value.
///
/// Every document becomes a `(text, SpacyEntity)` pair and the whole list is
/// serialised with `serde_json` in one go. Any extension on `path` is
/// replaced by `.json`.
///
/// Returns the destination path without its extension.
///
/// # Errors
///
/// Returns an error if serialisation fails or if the file cannot be created
/// or written. Serialisation happens before the file is created, so a
/// serialisation failure does not leave a truncated file behind.
fn spacy(documents: &Vec<Document>, path: &str) -> Result<String, std::io::Error> {
    let path = Format::remove_extension_from_path(path);

    let training_data: Vec<(String, SpacyEntity)> = documents
        .iter()
        .map(|document| {
            (
                (*document.text).to_string(),
                SpacyEntity {
                    entity: (*document.label).to_vec(),
                },
            )
        })
        .collect();

    // `serde_json::Error` converts into `std::io::Error`, so `?` replaces the
    // former `unwrap()` and the caller gets an error instead of a panic.
    let json = serde_json::to_string(&training_data)?;

    let mut file = std::fs::File::create(format!("{path}.json"))?;
    file.write_all(json.as_bytes())?;
    Ok(path)
}
/// Writes `documents` to `{path}.jsonl`, one JSON-serialised document per line.
///
/// Any extension on `path` is replaced by `.jsonl`. Returns the destination
/// path without its extension.
///
/// # Errors
///
/// Returns an error if the file cannot be created or written, or if a
/// document fails to serialise (documents written before the failure stay in
/// the file).
fn jsonl(documents: &Vec<Document>, path: &str) -> Result<String, std::io::Error> {
    let path = Format::remove_extension_from_path(path);
    let mut file = std::fs::File::create(format!("{path}.jsonl"))?;
    for document in documents {
        // `serde_json::Error` converts into `std::io::Error`; propagate it
        // instead of panicking.
        let mut line = serde_json::to_string(document)?;
        line.push('\n');
        file.write_all(line.as_bytes())?;
    }
    Ok(path)
}
/// Writes `documents` to `{path}.csv`, one serialised document per line.
///
/// Any extension on `path` is replaced by `.csv`. Returns the destination
/// path without its extension.
///
/// NOTE(review): each line is the `serde_json` serialisation of a document,
/// so the file content is JSON lines rather than comma-separated values.
/// That is kept as is because the intended column layout is not defined
/// here; confirm whether real CSV output is expected.
///
/// # Errors
///
/// Returns an error if the file cannot be created or written, or if a
/// document fails to serialise (documents written before the failure stay in
/// the file).
fn csv(documents: &Vec<Document>, path: &str) -> Result<String, std::io::Error> {
    let path = Format::remove_extension_from_path(path);
    let mut file = std::fs::File::create(format!("{path}.csv"))?;
    for document in documents {
        // `serde_json::Error` converts into `std::io::Error`; propagate it
        // instead of panicking.
        let mut line = serde_json::to_string(document)?;
        line.push('\n');
        file.write_all(line.as_bytes())?;
    }
    Ok(path)
}
/// Writes `documents` as a brat standoff pair: `{path}.txt` and `{path}.ann`.
///
/// The `.txt` file holds every document text followed by a newline. The
/// `.ann` file holds one text-bound annotation per label, laid out as
/// `T<id>\t<label> <start> <end>\t<covered text>`.
///
/// Because all documents share one `.txt` file:
/// * annotation ids keep counting across documents so they stay unique, and
/// * offsets are shifted by the amount of text written for the preceding
///   documents, so they point into the combined `.txt` file.
///
/// Label offsets are interpreted as byte offsets into the document text
/// (they are used to slice it) and are converted to character offsets for
/// the `.ann` file.
///
/// Any extension on `path` is dropped before `.ann` / `.txt` are appended.
/// Returns the destination path without an extension.
///
/// # Errors
///
/// Returns an error if either file cannot be created or written. Returns
/// `ErrorKind::InvalidData` if a label span is out of bounds, reversed, or
/// does not fall on character boundaries of its text; output written before
/// the offending label stays in the files.
fn brat(documents: &Vec<Document>, path: &str) -> Result<String, std::io::Error> {
    let path = Format::remove_extension_from_path(path);
    let mut file_ann = std::fs::File::create(format!("{path}.ann"))?;
    let mut file_txt = std::fs::File::create(format!("{path}.txt"))?;

    // Both counters run over the whole export, not per document.
    let mut next_id = 0usize;
    let mut document_offset = 0usize;

    for document in documents {
        let text: &str = &document.text;
        let labels: &[(usize, usize, String)] = &document.label;

        file_txt.write_all(text.as_bytes())?;
        file_txt.write_all(b"\n")?;

        for (start, end, label) in labels {
            // Checked slicing: a bad span becomes an error, not a panic.
            let entity = text.get(*start..*end).ok_or_else(|| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    format!("label span {start}..{end} is not a valid range of the document text"),
                )
            })?;

            // `get` succeeded, so `start` is a valid boundary and this slice
            // cannot panic.
            let span_start = document_offset + text[..*start].chars().count();
            let span_end = span_start + entity.chars().count();

            let line = format!("T{next_id}\t{label} {span_start} {span_end}\t{entity}\n");
            file_ann.write_all(line.as_bytes())?;
            next_id += 1;
        }

        // +1 accounts for the newline written after the text.
        document_offset += text.chars().count() + 1;
    }
    Ok(path)
}
/// Writes `documents` to `{path}.txt` in a CoNLL-like column layout.
///
/// Each document text is split on whitespace; every token is written as
/// `<token>\t<tag>` on its own line and each document is followed by a blank
/// line. A token is tagged with a label's name when the token's byte range
/// overlaps the label's `start..end` span, and with `O` otherwise. This means
/// * multi-word entities tag every token they cover,
/// * a repeated word is tagged only where the span actually is, and
/// * a span that covers just part of a token (e.g. without trailing
///   punctuation) still tags that token.
///
/// When several labels overlap the same token the last label wins. Tags are
/// the raw label names (no `B-`/`I-` prefixes). Label offsets are interpreted
/// as byte offsets into the document text; spans that overlap no token are
/// ignored.
///
/// Any extension on `path` is replaced by `.txt`. Returns the destination
/// path without its extension.
///
/// # Errors
///
/// Returns an error if the file cannot be created or written.
fn conll(documents: &Vec<Document>, path: &str) -> Result<String, std::io::Error> {
    let path = Format::remove_extension_from_path(path);
    let mut file = std::fs::File::create(format!("{path}.txt"))?;

    for document in documents {
        let text: &str = &document.text;
        let spans: &[(usize, usize, String)] = &document.label;

        // Byte range of every whitespace-separated token, in order. Uses the
        // same whitespace definition as `str::split_whitespace`.
        let mut tokens: Vec<(usize, usize)> = Vec::new();
        let mut token_start: Option<usize> = None;
        for (offset, ch) in text.char_indices() {
            if ch.is_whitespace() {
                if let Some(begin) = token_start.take() {
                    tokens.push((begin, offset));
                }
            } else if token_start.is_none() {
                token_start = Some(offset);
            }
        }
        if let Some(begin) = token_start {
            tokens.push((begin, text.len()));
        }

        let mut tags: Vec<&str> = vec!["O"; tokens.len()];
        for (start, end, label) in spans {
            for (tag, (token_begin, token_end)) in tags.iter_mut().zip(&tokens) {
                // Half-open ranges overlap iff each starts before the other
                // ends; empty or reversed spans therefore match nothing.
                if token_begin < end && start < token_end {
                    *tag = label.as_str();
                }
            }
        }

        for ((token_begin, token_end), tag) in tokens.iter().zip(&tags) {
            // Boundaries come from `char_indices`, so slicing cannot panic.
            let word = &text[*token_begin..*token_end];
            let line = format!("{word}\t{tag}\n");
            file.write_all(line.as_bytes())?;
        }
        // Blank line separates documents.
        file.write_all(b"\n")?;
    }
    Ok(path)
}
}