use crate::parser::ParseError;
/// Maximum number of relationships `parse_relationships` accepts from one
/// call. Exceeding it records a "too many relationships" error; the parsed
/// relationships are still returned in full.
const MAX_RELATIONSHIPS_PER_FILE: usize = 200;
/// Relationship types accepted in a `- Source -> Target: type` header.
///
/// The type is compared after normalization by `parse_rel_line` (lowercased,
/// spaces replaced with underscores). A type missing from this list records
/// an "unknown relationship type" error that prints the whole list.
/// Entries are kept in alphabetical order.
const KNOWN_REL_TYPES: &[&str] = &[
"affiliated_with",
"appointed_by",
"charged_in",
"contracted_with",
"convicted_in",
"donated_to",
"employed_by",
"endorsed_by",
"family_of",
"funded_by",
"investigated_by",
"lobbied_for",
"member_of",
"next",
"owns",
"related_to",
"represented_by",
"sponsored_by",
"sued_by",
"transferred_to",
];
/// Keys accepted on nested `- key: value` lines under a relationship.
///
/// `id` and `source` are stored in dedicated members of the relationship;
/// every other key listed here is validated by `validate_rel_field` and kept
/// as a `(key, value)` pair. A key missing from this list records an
/// "unknown relationship field" error and the line is dropped.
const REL_FIELDS: &[&str] = &[
"id",
"source",
"description",
"amount",
"currency",
"effective_date",
"expiry_date",
];
/// One parsed relationship, as produced by `parse_relationships`.
#[derive(Debug)]
// NOTE(review): presumably silenced because `rel_type` repeats the struct
// name and several fields share the `source_` prefix — confirm.
#[allow(clippy::struct_field_names)]
pub struct Rel {
/// Entity name written to the left of ` -> ` in the header, trimmed.
pub source_name: String,
/// Entity name written between ` -> ` and the last `:` in the header, trimmed.
pub target_name: String,
/// Relationship type, lowercased with spaces replaced by underscores.
/// It is not guaranteed to be a known type: an unknown type records an
/// error but the relationship is still produced.
pub rel_type: String,
/// Values of the nested `source` fields in order of appearance, or a copy
/// of the caller's default sources when the relationship has none.
pub source_urls: Vec<String>,
/// Remaining accepted nested fields as `(key, value)` pairs, in order of
/// appearance.
pub fields: Vec<(String, String)>,
/// Value of the nested `id` field, if present; a later `id` line replaces
/// an earlier one.
pub id: Option<String>,
/// Line number reported for the relationship's header line.
pub line: usize,
}
#[allow(clippy::too_many_lines)]
pub fn parse_relationships(
body: &str,
section_start_line: usize,
entity_names: &[&str],
default_sources: &[String],
errors: &mut Vec<ParseError>,
) -> Vec<Rel> {
let lines: Vec<&str> = body.lines().collect();
let mut rels: Vec<Rel> = Vec::new();
let mut current: Option<RelBuilder> = None;
for (i, line) in lines.iter().enumerate() {
let file_line = section_start_line + 1 + i;
let trimmed = line.trim();
if trimmed.starts_with("- ") && !line.starts_with(" ") {
if let Some(builder) = current.take() {
rels.push(builder.finish(default_sources));
}
let item = &trimmed[2..];
match parse_rel_line(item) {
Some((source, target, rel_type)) => {
if !KNOWN_REL_TYPES.contains(&rel_type.as_str()) {
errors.push(ParseError {
line: file_line,
message: format!(
"unknown relationship type {rel_type:?} (known: {})",
KNOWN_REL_TYPES.join(", ")
),
});
}
if !entity_names.contains(&source.as_str()) {
errors.push(ParseError {
line: file_line,
message: format!(
"entity {source:?} in relationship not defined in file"
),
});
}
if !entity_names.contains(&target.as_str()) {
errors.push(ParseError {
line: file_line,
message: format!(
"entity {target:?} in relationship not defined in file"
),
});
}
current = Some(RelBuilder {
source_name: source,
target_name: target,
rel_type,
source_urls: Vec::new(),
fields: Vec::new(),
id: None,
line: file_line,
});
}
None => {
errors.push(ParseError {
line: file_line,
message: format!(
"invalid relationship syntax: expected `- Source -> Target: type`, got {trimmed:?}"
),
});
}
}
continue;
}
if line.starts_with(" - ") && current.is_some() {
let nested = trimmed.strip_prefix("- ").unwrap_or(trimmed);
if let Some((key, value)) = parse_kv(nested) {
if !REL_FIELDS.contains(&key.as_str()) {
errors.push(ParseError {
line: file_line,
message: format!("unknown relationship field {key:?}"),
});
continue;
}
let builder = current.as_mut().unwrap_or_else(|| unreachable!());
if key == "id" {
builder.id = Some(value);
} else if key == "source" {
if !value.starts_with("https://") {
errors.push(ParseError {
line: file_line,
message: format!("relationship source URL must be HTTPS: {value:?}"),
});
}
builder.source_urls.push(value);
} else {
validate_rel_field(&key, &value, file_line, errors);
builder.fields.push((key, value));
}
} else {
errors.push(ParseError {
line: file_line,
message: format!(
"invalid nested field syntax: expected `- key: value`, got {trimmed:?}"
),
});
}
}
}
if let Some(builder) = current.take() {
rels.push(builder.finish(default_sources));
}
if rels.len() > MAX_RELATIONSHIPS_PER_FILE {
errors.push(ParseError {
line: section_start_line,
message: format!(
"too many relationships (max {MAX_RELATIONSHIPS_PER_FILE}, got {})",
rels.len()
),
});
}
rels
}
/// A relationship that is still being read: created from a header line and
/// filled in by the nested field lines that follow it. `finish` turns it into
/// a `Rel`.
struct RelBuilder {
/// Source entity name taken from the header.
source_name: String,
/// Target entity name taken from the header.
target_name: String,
/// Normalized relationship type taken from the header.
rel_type: String,
/// Values of the nested `source` fields seen so far; empty means the
/// default sources are used when the relationship is finished.
source_urls: Vec<String>,
/// Other accepted nested fields seen so far, in order of appearance.
fields: Vec<(String, String)>,
/// Value of the nested `id` field, if one has been seen.
id: Option<String>,
/// Line number reported for the header line.
line: usize,
}
impl RelBuilder {
    /// Consumes the builder and produces the final [`Rel`].
    ///
    /// When no `source` field was collected, the relationship receives a copy
    /// of `default_sources`; otherwise the collected URLs are used as they
    /// are. All other members are moved over unchanged.
    fn finish(self, default_sources: &[String]) -> Rel {
        let Self {
            source_name,
            target_name,
            rel_type,
            source_urls: collected,
            fields,
            id,
            line,
        } = self;

        let source_urls = match collected.is_empty() {
            true => default_sources.to_vec(),
            false => collected,
        };

        Rel {
            source_name,
            target_name,
            rel_type,
            source_urls,
            fields,
            id,
            line,
        }
    }
}
/// Splits a relationship header of the form `Source -> Target: type` into
/// `(source, target, rel_type)`.
///
/// The split happens at the first ` -> ` and at the last `:` after it, so a
/// target name may itself contain `:` or ` -> `. Source and target are
/// trimmed; the type is trimmed, lowercased, and has its spaces replaced by
/// underscores.
///
/// Returns `None` when the arrow or the colon is missing, or when any of the
/// three parts is empty after trimming.
fn parse_rel_line(item: &str) -> Option<(String, String, String)> {
    let (left, right) = item.split_once(" -> ")?;
    let (target_part, type_part) = right.rsplit_once(':')?;

    let source = left.trim();
    let target = target_part.trim();
    let rel_type = type_part.trim().to_lowercase().replace(' ', "_");

    let any_blank = [source, target, rel_type.as_str()]
        .iter()
        .any(|part| part.is_empty());
    if any_blank {
        None
    } else {
        Some((source.to_string(), target.to_string(), rel_type))
    }
}
/// Splits `key: value` at the first `:` and trims both sides.
///
/// Returns `None` when there is no `:` or when the key is empty after
/// trimming. An empty value is allowed, and any further `:` characters stay
/// in the value.
fn parse_kv(s: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = s.split_once(':')?;
    let key = raw_key.trim();
    (!key.is_empty()).then(|| (key.to_string(), raw_value.trim().to_string()))
}
/// Validates the value of a relationship field and appends any problems to
/// `errors`, tagged with `line`.
///
/// Only `description`, `amount`, `currency`, `effective_date` and
/// `expiry_date` are checked; any other key returns immediately.
///
/// Two checks are performed:
///
/// * **Length** — `description` may hold 1000 characters, `amount` 50, and
///   the remaining keys 10. The length is counted in `char`s, matching the
///   "chars" wording of the error message, so non-ASCII text is not
///   penalized for its UTF-8 byte size.
/// * **Date shape** — a non-empty `effective_date` / `expiry_date` must look
///   like `YYYY`, `YYYY-MM` or `YYYY-MM-DD`. Only the shape is checked:
///   digits and dashes in the right positions. Month and day ranges are not
///   validated, so `2020-13-99` passes.
///
/// A date that is too long produces both a length error and a shape error.
fn validate_rel_field(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
    let max = match key {
        "description" => 1000,
        "amount" => 50,
        "currency" | "effective_date" | "expiry_date" => 10,
        _ => return,
    };

    // Count characters, not bytes: `str::len` would report 2000 for a
    // 1000-character description written in a two-byte script.
    let char_count = value.chars().count();
    if char_count > max {
        errors.push(ParseError {
            line,
            message: format!("relationship field {key:?} exceeds {max} chars (got {char_count})"),
        });
    }

    if matches!(key, "effective_date" | "expiry_date") && !value.is_empty() {
        // Positions 4 and 7 hold the dashes; every other position is a digit.
        // The length check restricts the value to the three allowed shapes.
        let valid = matches!(char_count, 4 | 7 | 10)
            && value.chars().enumerate().all(|(i, c)| match i {
                4 | 7 => c == '-',
                _ => c.is_ascii_digit(),
            });
        if !valid {
            errors.push(ParseError {
                line,
                message: format!(
                    "relationship field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"
                ),
            });
        }
    }
}
// Unit tests for the relationship parser. Each test feeds a small section
// body to `parse_relationships` (or a helper) and checks the returned
// relationships and/or the recorded errors.
#[cfg(test)]
mod tests {
use super::*;
// A single header with known entities parses cleanly and inherits the
// default sources.
#[test]
fn parse_basic_relationship() {
let body = "\n- Alice -> Bob: employed_by\n";
let names = vec!["Alice", "Bob"];
let sources = vec!["https://example.com/src".to_string()];
let mut errors = Vec::new();
let rels = parse_relationships(body, 50, &names, &sources, &mut errors);
assert!(errors.is_empty(), "errors: {errors:?}");
assert_eq!(rels.len(), 1);
assert_eq!(rels[0].source_name, "Alice");
assert_eq!(rels[0].target_name, "Bob");
assert_eq!(rels[0].rel_type, "employed_by");
assert_eq!(rels[0].source_urls, vec!["https://example.com/src"]);
}
// A nested `source` field replaces the default sources for that relationship.
#[test]
fn parse_relationship_with_source_override() {
let body = [
"",
"- Alice -> Bob: related_to",
" - source: https://specific.com/article",
"",
]
.join("\n");
let names = vec!["Alice", "Bob"];
let sources = vec!["https://default.com".to_string()];
let mut errors = Vec::new();
let rels = parse_relationships(&body, 10, &names, &sources, &mut errors);
assert!(errors.is_empty(), "errors: {errors:?}");
assert_eq!(rels[0].source_urls, vec!["https://specific.com/article"]);
}
// Accepted nested fields other than `id` and `source` are collected into
// `fields`.
#[test]
fn parse_relationship_with_fields() {
let body = [
"",
"- Alice -> Corp: donated_to",
" - amount: EUR 50,000",
" - currency: EUR",
" - effective_date: 2020-01",
" - description: Campaign donation",
"",
]
.join("\n");
let names = vec!["Alice", "Corp"];
let mut errors = Vec::new();
let rels = parse_relationships(&body, 10, &names, &[], &mut errors);
assert!(errors.is_empty(), "errors: {errors:?}");
assert_eq!(rels[0].fields.len(), 4);
}
// A type outside `KNOWN_REL_TYPES` is reported.
#[test]
fn reject_unknown_rel_type() {
let body = "\n- Alice -> Bob: best_friends\n";
let names = vec!["Alice", "Bob"];
let mut errors = Vec::new();
parse_relationships(body, 1, &names, &[], &mut errors);
assert!(
errors
.iter()
.any(|e| e.message.contains("unknown relationship type"))
);
}
// An entity name missing from `entity_names` is reported.
#[test]
fn reject_unresolved_entity() {
let body = "\n- Alice -> Unknown: employed_by\n";
let names = vec!["Alice"];
let mut errors = Vec::new();
parse_relationships(body, 1, &names, &[], &mut errors);
assert!(
errors
.iter()
.any(|e| e.message.contains("not defined in file"))
);
}
// A nested `source` URL that does not start with `https://` is reported.
#[test]
fn reject_non_https_source_override() {
let body = [
"",
"- Alice -> Bob: related_to",
" - source: http://insecure.com",
"",
]
.join("\n");
let names = vec!["Alice", "Bob"];
let mut errors = Vec::new();
parse_relationships(&body, 1, &names, &[], &mut errors);
assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
}
// A nested key outside `REL_FIELDS` is reported.
#[test]
fn reject_unknown_rel_field() {
let body = ["", "- Alice -> Bob: related_to", " - foobar: value", ""].join("\n");
let names = vec!["Alice", "Bob"];
let mut errors = Vec::new();
parse_relationships(&body, 1, &names, &[], &mut errors);
assert!(
errors
.iter()
.any(|e| e.message.contains("unknown relationship field"))
);
}
// Consecutive headers each produce their own relationship.
#[test]
fn multiple_relationships() {
let body = [
"",
"- Alice -> Bob: employed_by",
"- Bob -> Corp: member_of",
"- Corp -> Alice: sued_by",
"",
]
.join("\n");
let names = vec!["Alice", "Bob", "Corp"];
let mut errors = Vec::new();
let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
assert!(errors.is_empty(), "errors: {errors:?}");
assert_eq!(rels.len(), 3);
}
// The header helper handles names that contain spaces.
#[test]
fn parse_rel_line_syntax() {
let result = parse_rel_line("Mark Bonnick -> Arsenal FC: employed_by");
assert_eq!(
result,
Some((
"Mark Bonnick".into(),
"Arsenal FC".into(),
"employed_by".into()
))
);
}
// The header helper rejects input with a missing arrow, source or target.
#[test]
fn parse_rel_line_invalid() {
assert!(parse_rel_line("not a relationship").is_none());
assert!(parse_rel_line("-> Target: type").is_none());
assert!(parse_rel_line("Source -> : type").is_none());
}
// A date field that does not have a YYYY[-MM[-DD]] shape is reported.
#[test]
fn relationship_date_validation() {
let body = [
"",
"- Alice -> Bob: related_to",
" - effective_date: not-a-date",
"",
]
.join("\n");
let names = vec!["Alice", "Bob"];
let mut errors = Vec::new();
parse_relationships(&body, 1, &names, &[], &mut errors);
assert!(errors.iter().any(|e| e.message.contains("YYYY")));
}
// Repeated nested `source` fields accumulate instead of replacing each other.
#[test]
fn multiple_source_overrides() {
let body = [
"",
"- Alice -> Bob: related_to",
" - source: https://first.com",
" - source: https://second.com",
"",
]
.join("\n");
let names = vec!["Alice", "Bob"];
let mut errors = Vec::new();
let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
assert!(errors.is_empty(), "errors: {errors:?}");
assert_eq!(rels[0].source_urls.len(), 2);
}
}