pub mod error;
pub mod types;
pub mod utils;
use std::{
collections::{HashMap, HashSet},
fs,
path::{Path, PathBuf},
};
use crate::corpus::{
error::CorpusLoaderError,
types::{Document, Target},
utils::{CorpusLoaderOptions, LoadValidator, build_value_pools, parse_document},
};
/// One provenance record stored in the `provenance` list of a [`PooledValue`].
///
/// All four fields are plain owned strings.
///
/// NOTE(review): the records are produced by `build_value_pools` in `utils`,
/// which is not visible from this file. Judging by the field names they
/// identify the originating document and the section/field the value was read
/// from — confirm against that helper before relying on it.
#[derive(Debug, Clone)]
pub struct ValueProvenance {
/// Presumably the `source_id` of the originating `Document` — TODO confirm.
pub source_id: String,
/// Presumably the originating document's header title — TODO confirm.
pub document_title: String,
/// Section name; the tests in this file expect it to equal the section of
/// the pool the value belongs to.
pub section: String,
/// Presumably the field name within `section` — TODO confirm.
pub field: String,
}
/// A string value paired with the [`ValueProvenance`] records attached to it.
///
/// NOTE(review): `provenance` being a `Vec` suggests a single value can be
/// traced back to more than one place; how entries are merged is decided by
/// `build_value_pools` in `utils` — confirm there.
#[derive(Debug, Clone)]
pub struct PooledValue {
/// The pooled value itself.
pub value: String,
/// Provenance records associated with `value`.
pub provenance: Vec<ValueProvenance>,
}
/// The values pooled under one `(section, field)` pair.
///
/// `Corpus::pooled_values_for_field_section` selects a pool by comparing both
/// `field` and `section`, so the same field name under two different sections
/// lives in two separate pools.
#[derive(Debug, Clone)]
pub struct ValuePool {
/// Section name this pool is keyed by.
pub section: String,
/// Field name this pool is keyed by.
pub field: String,
// Private: read access goes through `ValuePool::values`.
values: Vec<PooledValue>,
}
impl ValuePool {
    /// Borrows every value held by this pool as a slice.
    pub fn values(&self) -> &[PooledValue] {
        self.values.as_slice()
    }
}
/// A set of documents that share one target, plus the value pools derived
/// from them.
///
/// Instances are produced by `CorpusBuilder::build` and `Corpus::combine`;
/// both compute `pools` by calling `build_value_pools` on the final document
/// list. All fields are public, and nothing visible in this file recomputes
/// `pools` when `documents` is modified directly afterwards.
#[derive(Debug, Clone)]
pub struct Corpus {
/// Target shared by every document in the corpus.
pub target: Target,
/// The documents in the corpus, in the order they were admitted.
pub documents: Vec<Document>,
/// Value pools computed from `documents`.
pub pools: Vec<ValuePool>,
}
impl Corpus {
    /// Starts a [`CorpusBuilder`] for `target`.
    pub fn builder(target: impl Into<String>) -> CorpusBuilder {
        let target: String = target.into();
        CorpusBuilder::new(target)
    }

    /// Builds a corpus for `target` from the files at `paths`.
    ///
    /// `validator` and `opts` are forwarded to the builder when present;
    /// otherwise the builder keeps its defaults. Files are queued in slice
    /// order and loaded by [`CorpusBuilder::build`].
    ///
    /// # Errors
    ///
    /// Returns whatever [`CorpusBuilder::build`] returns.
    pub fn from_files(
        paths: &[impl AsRef<Path>],
        target: &str,
        opts: Option<CorpusLoaderOptions>,
        validator: Option<Box<dyn LoadValidator>>,
    ) -> Result<Corpus, CorpusLoaderError> {
        let configured = match validator {
            Some(validator) => Corpus::builder(target).with_validator(validator),
            None => Corpus::builder(target),
        };
        let configured = match opts {
            Some(opts) => configured.with_options(opts),
            None => configured,
        };
        paths
            .iter()
            .fold(configured, |builder, path| builder.add_file(path))
            .build()
    }

    /// Returns the corpus target as a string slice.
    pub fn target(&self) -> &str {
        self.target.as_str()
    }

    /// Merges `other` into `self`, producing a new corpus.
    ///
    /// The documents of `other` are appended after those of `self`. Every
    /// document's `source_id` is then reassigned from its `source_hash`: the
    /// first document with a given hash gets the bare hash, the n-th one
    /// (n >= 2) gets `"<hash>:<n>"`. The pools of both inputs are discarded
    /// and rebuilt from the merged document list.
    ///
    /// # Errors
    ///
    /// Returns `CorpusLoaderError::TargetMismatch` when the two corpora have
    /// different targets.
    pub fn combine(self, other: Corpus) -> Result<Self, CorpusLoaderError> {
        if self.target != other.target {
            return Err(CorpusLoaderError::TargetMismatch {
                expected: self.target,
                found: other.target,
            });
        }

        let Corpus {
            target,
            mut documents,
            ..
        } = self;
        documents.extend(other.documents);

        // Number of documents encountered so far for each content hash.
        let mut occurrences: HashMap<String, usize> = HashMap::new();
        for doc in documents.iter_mut() {
            let nth = occurrences.entry(doc.source_hash.clone()).or_insert(0);
            *nth += 1;
            doc.source_id = match *nth {
                1 => doc.source_hash.clone(),
                n => format!("{}:{}", doc.source_hash, n),
            };
        }

        let pools = build_value_pools(&documents);
        Ok(Self {
            target,
            documents,
            pools,
        })
    }

    /// Lists the `source_id` of every document, in document order.
    pub fn source_ids(&self) -> Vec<&str> {
        self.documents.iter().map(|doc| &*doc.source_id).collect()
    }

    /// Lists the `source_path` of every document, in document order.
    pub fn source_paths(&self) -> Vec<&str> {
        self.documents.iter().map(|doc| &*doc.source_path).collect()
    }

    /// Returns the first document whose `source_id` equals `source_id`, if any.
    pub fn find_source(&self, source_id: &str) -> Option<&Document> {
        let index = self
            .documents
            .iter()
            .position(|doc| doc.source_id == source_id)?;
        self.documents.get(index)
    }

    /// Returns the values of the first pool keyed by exactly this `field`
    /// and `section`, or `None` when no such pool exists.
    pub fn pooled_values_for_field_section(
        &self,
        field: &str,
        section: &str,
    ) -> Option<&[PooledValue]> {
        let pool = self
            .pools
            .iter()
            .find(|candidate| candidate.field == field && candidate.section == section)?;
        Some(pool.values())
    }
}
/// A source queued on a [`CorpusBuilder`] that has not been loaded yet.
enum PendingSource {
/// Path of a file; `CorpusBuilder::build` reads it with `fs::read_to_string`.
File(PathBuf),
/// In-memory source text; `build` uses `name` where a file source would use
/// its path (it becomes the document's `source_path`).
Str { name: String, raw: String },
}
/// Consuming builder for a [`Corpus`]; obtained from `Corpus::builder`.
///
/// Sources are only queued by the `add_*` methods; reading, parsing and
/// validation all happen in `build`.
pub struct CorpusBuilder {
// Target every admitted document must declare in its metadata.
target: String,
// Loader options; start as `CorpusLoaderOptions::default()`.
opts: CorpusLoaderOptions,
// Optional validator run against each document before it is admitted.
validator: Option<Box<dyn LoadValidator>>,
// Unparsed sources (files and in-memory strings), in insertion order.
pending: Vec<PendingSource>,
// Already-constructed documents added through `add_document`.
documents: Vec<Document>,
}
impl CorpusBuilder {
    /// Creates a builder for `target` with default options, no validator and
    /// nothing queued.
    pub(crate) fn new(target: String) -> Self {
        Self {
            target,
            opts: CorpusLoaderOptions::default(),
            validator: None,
            pending: Vec::new(),
            documents: Vec::new(),
        }
    }

    /// Queues a file to be loaded. The file is not read until [`build`](Self::build).
    pub fn add_file(mut self, path: impl AsRef<Path>) -> Self {
        self.pending
            .push(PendingSource::File(path.as_ref().to_path_buf()));
        self
    }

    /// Queues an in-memory source. `name` becomes the `source_path` of the
    /// resulting document; `raw` is the text handed to the parser.
    pub fn add_str(mut self, name: impl Into<String>, raw: impl Into<String>) -> Self {
        self.pending.push(PendingSource::Str {
            name: name.into(),
            raw: raw.into(),
        });
        self
    }

    /// Queues an already-constructed document. It is not re-parsed, but
    /// [`build`](Self::build) still applies the target, title and validator
    /// checks to it and reassigns its `source_id` from its `source_hash`.
    pub fn add_document(mut self, document: Document) -> Self {
        self.documents.push(document);
        self
    }

    /// Replaces the loader options.
    pub fn with_options(mut self, opts: CorpusLoaderOptions) -> Self {
        self.opts = opts;
        self
    }

    /// Installs a validator, replacing any previously installed one.
    pub fn with_validator(mut self, validator: impl LoadValidator + 'static) -> Self {
        self.validator = Some(Box::new(validator));
        self
    }

    /// Loads every queued source and assembles the [`Corpus`].
    ///
    /// Queued files and strings are processed first, in insertion order,
    /// followed by documents added through [`add_document`](Self::add_document).
    /// Each candidate goes through the same steps: target check, duplicate
    /// title check, source-id assignment, optional validation. Candidates
    /// skipped because of an option are simply left out, so the resulting
    /// corpus may contain fewer documents than were queued.
    ///
    /// A title is only reserved once its document has actually been admitted,
    /// so a source skipped under `skip_invalid_sources` does not cause a
    /// later valid source with the same title to be rejected as a duplicate.
    ///
    /// # Errors
    ///
    /// * `CorpusLoaderError::InvalidInput` when nothing was queued.
    /// * The error built by `CorpusLoaderError::read_for_path` when a file
    ///   cannot be read.
    /// * Any error returned by `parse_document`.
    /// * `CorpusLoaderError::TargetMismatch` when a source declares another
    ///   target and `skip_source_with_target_mismatch` is off.
    /// * `CorpusLoaderError::OptionViolation` when a title repeats and
    ///   `identical_title_allowed` is off.
    /// * The validator's error when validation fails and
    ///   `skip_invalid_sources` is off.
    pub fn build(self) -> Result<Corpus, CorpusLoaderError> {
        // Take the builder apart so queued strings and documents can be moved
        // into the corpus instead of being cloned.
        let Self {
            target,
            opts,
            validator,
            pending,
            documents: prebuilt,
        } = self;

        if pending.is_empty() && prebuilt.is_empty() {
            return Err(CorpusLoaderError::InvalidInput(
                "at least one source is required to build a corpus".to_string(),
            ));
        }

        let mut seen_source_ids: HashMap<String, usize> = HashMap::new();
        let mut seen_titles: HashSet<String> = HashSet::new();
        let mut documents: Vec<Document> = Vec::with_capacity(pending.len() + prebuilt.len());

        for source in pending {
            let (source_path, raw) = match source {
                PendingSource::File(path) => {
                    let raw = fs::read_to_string(&path)
                        .map_err(|e| CorpusLoaderError::read_for_path(&path, e))?;
                    (path.to_string_lossy().into_owned(), raw)
                }
                PendingSource::Str { name, raw } => (name, raw),
            };
            let (metadata, sections) = parse_document(&source_path, &raw)?;
            if !Self::check_header(
                &target,
                &opts,
                &seen_titles,
                &metadata.target,
                &metadata.title,
            )? {
                continue;
            }
            let source_hash = utils::hash_source_content(&target, &raw);
            // NOTE(review): `make_unique_source_id` receives the id map
            // mutably and presumably records the hash in it; if so, a
            // document the validator later skips still consumes an id slot.
            // Confirm against `utils` whether that matters.
            let source_id = utils::make_unique_source_id(&source_hash, &mut seen_source_ids);
            let doc = Document {
                metadata,
                sections,
                source_id,
                source_hash,
                source_path,
            };
            Self::admit(
                doc,
                validator.as_deref(),
                &opts,
                &mut seen_titles,
                &mut documents,
            )?;
        }

        for mut doc in prebuilt {
            if !Self::check_header(
                &target,
                &opts,
                &seen_titles,
                &doc.metadata.target,
                &doc.metadata.title,
            )? {
                continue;
            }
            doc.source_id = utils::make_unique_source_id(&doc.source_hash, &mut seen_source_ids);
            Self::admit(
                doc,
                validator.as_deref(),
                &opts,
                &mut seen_titles,
                &mut documents,
            )?;
        }

        let pools = build_value_pools(&documents);
        Ok(Corpus {
            target,
            documents,
            pools,
        })
    }

    /// Applies the target and duplicate-title rules to one candidate header.
    ///
    /// Returns `Ok(true)` when processing should continue, `Ok(false)` when
    /// the candidate must be skipped silently, and `Err` when it fails the
    /// whole build. `seen_titles` is only read here; titles are reserved by
    /// [`admit`](Self::admit).
    fn check_header(
        expected_target: &str,
        opts: &CorpusLoaderOptions,
        seen_titles: &HashSet<String>,
        found_target: &str,
        title: &str,
    ) -> Result<bool, CorpusLoaderError> {
        if found_target != expected_target {
            if opts.skip_source_with_target_mismatch {
                return Ok(false);
            }
            return Err(CorpusLoaderError::TargetMismatch {
                expected: expected_target.to_string(),
                found: found_target.to_string(),
            });
        }
        if !opts.identical_title_allowed && seen_titles.contains(title) {
            return Err(CorpusLoaderError::OptionViolation(format!(
                "duplicate header.title '{}' is not allowed when identical_title_allowed is false",
                title
            )));
        }
        Ok(true)
    }

    /// Runs the optional validator on `doc` and, when it is accepted,
    /// reserves its title and appends it to `documents`.
    ///
    /// A validation failure drops the document silently when
    /// `skip_invalid_sources` is set and is returned as the error otherwise.
    fn admit(
        doc: Document,
        validator: Option<&dyn LoadValidator>,
        opts: &CorpusLoaderOptions,
        seen_titles: &mut HashSet<String>,
        documents: &mut Vec<Document>,
    ) -> Result<(), CorpusLoaderError> {
        if let Some(validator) = validator {
            if let Err(e) = validator.validate(&doc) {
                if opts.skip_invalid_sources {
                    return Ok(());
                }
                return Err(e);
            }
        }
        // Titles are only tracked when duplicates are forbidden.
        if !opts.identical_title_allowed {
            seen_titles.insert(doc.metadata.title.clone());
        }
        documents.push(doc);
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::Corpus;
    use crate::corpus::types::{Document, DocumentMetadata, Section};

    /// Builds a section-less document fixture; `id` doubles as the title and
    /// as the stem of the fake source path.
    fn fixture_document(hash: &str, id: &str, target: &str) -> Document {
        let metadata = DocumentMetadata {
            title: id.to_string(),
            target: target.to_string(),
            desc: None,
            author: None,
            version: None,
            schema: None,
        };
        Document {
            source_id: id.to_string(),
            source_hash: hash.to_string(),
            source_path: format!("/{id}.toml"),
            metadata,
            sections: Vec::<Section>::new(),
        }
    }

    /// Assembles a corpus directly from fixtures, without any pools.
    fn corpus_of(target: &str, documents: Vec<Document>) -> Corpus {
        Corpus {
            target: target.to_string(),
            documents,
            pools: vec![],
        }
    }

    #[test]
    fn combine_appends_documents_and_renumbers_duplicate_hash_ids() {
        let left = corpus_of("weapon", vec![fixture_document("hash-a", "old-1", "weapon")]);
        let right = corpus_of(
            "weapon",
            vec![
                fixture_document("hash-a", "old-2", "weapon"),
                fixture_document("hash-b", "old-3", "weapon"),
            ],
        );

        let merged = left
            .combine(right)
            .expect("combine should succeed for matching targets");

        assert_eq!(merged.target, "weapon");
        assert_eq!(merged.documents.len(), 3);
        let ids: Vec<&str> = merged
            .documents
            .iter()
            .map(|document| document.source_id.as_str())
            .collect();
        assert_eq!(ids, ["hash-a", "hash-a:2", "hash-b"]);
    }

    #[test]
    fn combine_returns_error_for_different_targets() {
        let left = corpus_of("weapon", vec![fixture_document("hash-a", "old-1", "weapon")]);
        let right = corpus_of("person", vec![fixture_document("hash-b", "old-2", "person")]);

        let failure = left
            .combine(right)
            .expect_err("combine should fail for different targets");

        assert!(matches!(
            failure,
            crate::corpus::error::CorpusLoaderError::TargetMismatch { .. }
        ));
    }

    #[test]
    fn pooled_values_lookup_keeps_same_field_names_separate_per_section() {
        let first_raw = r#"
[header]
title = "doc one"
target = "weapon"
[name]
first = ["ash", "birch"]
[aliases]
first = ["ember"]
"#;
        let second_raw = r#"
[header]
title = "doc two"
target = "weapon"
[name]
first = ["cedar"]
"#;
        let corpus = Corpus::builder("weapon")
            .add_str("doc-1", first_raw)
            .add_str("doc-2", second_raw)
            .build()
            .expect("corpus should build");

        let name_first = corpus
            .pooled_values_for_field_section("first", "name")
            .expect("name.first pool should exist");
        let aliases_first = corpus
            .pooled_values_for_field_section("first", "aliases")
            .expect("aliases.first pool should exist");

        assert_eq!(name_first.len(), 3);
        assert_eq!(aliases_first.len(), 1);
        assert!(
            name_first
                .iter()
                .flat_map(|value| value.provenance.iter())
                .all(|record| record.section == "name")
        );
        assert!(
            aliases_first
                .iter()
                .flat_map(|value| value.provenance.iter())
                .all(|record| record.section == "aliases")
        );
    }

    #[test]
    fn pooled_values_lookup_returns_none_for_missing_section_field_pair() {
        let raw = r#"
[header]
title = "doc one"
target = "person"
[name]
first = ["al"]
"#;
        let corpus = Corpus::builder("person")
            .add_str("doc-1", raw)
            .build()
            .expect("corpus should build");

        let missing_field = corpus.pooled_values_for_field_section("last", "name");
        assert!(missing_field.is_none());
        let missing_section = corpus.pooled_values_for_field_section("first", "aliases");
        assert!(missing_section.is_none());
    }

    #[test]
    fn corpus_builder_add_document_successfully_appends_to_corpus() {
        let prebuilt = fixture_document("hash-a", "old-2", "weapon");
        let corpus = Corpus::builder("weapon")
            .add_document(prebuilt)
            .build()
            .expect("corpus should build");

        assert!(!corpus.documents.is_empty());
        assert_eq!(corpus.documents[0].metadata.target, "weapon");
    }
}