use std::fmt;
use crate::parser::{ParseError, SectionKind};
/// Upper bound on how many entities a single `parse_entities` call may yield; exceeding it
/// records a `ParseError`, but the parsed entities are still returned.
const MAX_ENTITIES_PER_FILE: usize = 50;
/// Longest entity name `build_entity` accepts before it records a `ParseError`.
const MAX_NAME_LEN: usize = 300;
/// Kind of entity described by a section.
///
/// Each variant corresponds to one `SectionKind` (see `Label::from_section`) and selects which
/// label-specific fields `validate_fields` accepts.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Entity from a `SectionKind::Actors` section.
    Actor,
    /// Entity from a `SectionKind::Institutions` section.
    Institution,
    /// Entity from a `SectionKind::Events` section.
    PublicRecord,
}
/// Renders a label as its lowercase snake_case name (`actor`, `institution`, `public_record`).
impl fmt::Display for Label {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the static text first, then emit it with a single write.
        let text = match self {
            Self::Actor => "actor",
            Self::Institution => "institution",
            Self::PublicRecord => "public_record",
        };
        f.write_str(text)
    }
}
impl Label {
    /// Maps a section kind to the label of the entities it contains.
    ///
    /// Returns `None` for every section kind other than `Actors`, `Institutions` and `Events`.
    pub fn from_section(kind: SectionKind) -> Option<Self> {
        let label = match kind {
            SectionKind::Actors => Self::Actor,
            SectionKind::Institutions => Self::Institution,
            SectionKind::Events => Self::PublicRecord,
            // Any other section kind does not hold entities.
            _ => return None,
        };
        Some(label)
    }
}
/// One parsed entity: a `### Name` heading plus the `- key: value` bullets below it.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Heading text after `### `, with surrounding whitespace trimmed.
    pub name: String,
    /// Kind of entity, derived from the section it was parsed from.
    pub label: Label,
    /// Fields in source order. The `id` bullet is removed and a `type` bullet is renamed to the
    /// label-specific key before the entity is built.
    pub fields: Vec<(String, FieldValue)>,
    /// Value of the `id` bullet when present and non-empty, or the id supplied by the caller of
    /// `parse_entity_file_body`.
    pub id: Option<String>,
    /// Line number of the entity heading.
    pub line: usize,
}
/// Value of one entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value; indented continuation lines are appended separated by `\n`.
    Single(String),
    /// A list value, written either comma-separated on one line (list fields only) or as nested
    /// bullets under a bullet with an empty value.
    List(Vec<String>),
}
/// Parses the body of a single-entity file.
///
/// The body is prefixed with a synthetic `### {name}` heading and handed to [`parse_entities`],
/// with the starting line chosen so that the heading maps to `title_line`. The resulting entity
/// always carries the caller-supplied `id` and `title_line`, overriding any `id` bullet found in
/// the body. Problems are appended to `errors`.
///
/// If no entity comes back from the parser, an entity with `name`, `label` and no fields is
/// returned instead.
pub fn parse_entity_file_body(
    name: &str,
    body: &str,
    label: Label,
    id: Option<String>,
    title_line: usize,
    errors: &mut Vec<ParseError>,
) -> Entity {
    // Inverse of `Label::from_section`.
    let section_kind = match label {
        Label::Actor => SectionKind::Actors,
        Label::Institution => SectionKind::Institutions,
        Label::PublicRecord => SectionKind::Events,
    };
    let wrapped = format!("### {name}\n{body}");
    let section_start = title_line.saturating_sub(1);

    // NOTE(review): `pop()` yields the LAST parsed entity. If `body` itself contains a `### `
    // heading, that later entity is returned rather than the one for `name` — confirm intended.
    match parse_entities(&wrapped, section_kind, section_start, errors).pop() {
        Some(parsed) => Entity {
            id,
            line: title_line,
            ..parsed
        },
        None => Entity {
            name: name.to_string(),
            label,
            fields: Vec::new(),
            id,
            line: title_line,
        },
    }
}
/// Parses every entity in a section body.
///
/// Recognised line shapes:
/// - `### Name` starts a new entity (and finishes the previous one);
/// - `- key: value` adds a field; list fields (`aliases`, `urls`) containing a comma are split
///   into a list;
/// - `- key:` with an empty value opens a nested list, whose items are the following lines that
///   start with ` - `;
/// - an indented non-bullet line continues the previous single-valued field on a new line.
///
/// Line numbers in errors are `section_start_line + 1 + index` of the offending line. Errors are
/// appended to `errors`; parsing always continues. Returns an empty vector when `section_kind`
/// has no corresponding [`Label`].
// The line-oriented state machine is kept in one function, hence the lint exemption.
#[allow(clippy::too_many_lines)]
pub fn parse_entities(
    body: &str,
    section_kind: SectionKind,
    section_start_line: usize,
    errors: &mut Vec<ParseError>,
) -> Vec<Entity> {
    let label = match Label::from_section(section_kind) {
        Some(label) => label,
        None => return Vec::new(),
    };

    let mut parsed: Vec<Entity> = Vec::new();
    // State of the entity currently being collected.
    let mut open_name: Option<String> = None;
    let mut open_line: usize = 0;
    let mut open_fields: Vec<(String, FieldValue)> = Vec::new();
    // State of a nested list opened by a `- key:` bullet with no value.
    let mut list_key: Option<String> = None;
    let mut list_items: Vec<String> = Vec::new();

    for (offset, raw) in body.lines().enumerate() {
        let file_line = section_start_line + 1 + offset;

        // A heading closes whatever was open and starts the next entity.
        if let Some(heading) = strip_h3(raw) {
            flush_pending_list(&mut list_key, &mut list_items, &mut open_fields);
            if let Some(finished) = open_name.take() {
                parsed.push(build_entity(
                    finished,
                    label,
                    open_line,
                    &mut open_fields,
                    errors,
                ));
            }
            open_name = Some(heading.to_string());
            open_line = file_line;
            open_fields.clear();
            continue;
        }

        let trimmed = raw.trim();

        // Nothing but blank lines may precede the first heading.
        if open_name.is_none() {
            if !trimmed.is_empty() {
                errors.push(ParseError {
                    line: file_line,
                    message: "content before first entity heading (### Name)".into(),
                });
            }
            continue;
        }

        if let Some(item) = trimmed.strip_prefix("- ") {
            // An indented bullet while a list is open is an item of that list.
            let is_nested_item = raw.starts_with(" - ") && list_key.is_some();
            if is_nested_item {
                list_items.push(item.trim().to_string());
                continue;
            }

            // Any other bullet ends the open list before being interpreted itself.
            flush_pending_list(&mut list_key, &mut list_items, &mut open_fields);
            match parse_bullet(item) {
                Some((key, value)) if value.is_empty() => {
                    list_key = Some(key);
                    list_items.clear();
                }
                Some((key, value)) if is_list_field(&key) && value.contains(',') => {
                    let items: Vec<String> = value
                        .split(',')
                        .map(|part| part.trim().to_string())
                        .filter(|part| !part.is_empty())
                        .collect();
                    open_fields.push((key, FieldValue::List(items)));
                }
                Some((key, value)) => open_fields.push((key, FieldValue::Single(value))),
                None => errors.push(ParseError {
                    line: file_line,
                    message: format!(
                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
                    ),
                }),
            }
            continue;
        }

        // Indented free text: continuation of the previous single-valued field.
        let is_continuation =
            raw.starts_with(" ") && !trimmed.is_empty() && !trimmed.starts_with('-');
        if is_continuation {
            if list_key.is_some() {
                errors.push(ParseError {
                    line: file_line,
                    message: "unexpected indented text in list context".into(),
                });
            } else if let Some((_, FieldValue::Single(text))) = open_fields.last_mut() {
                text.push('\n');
                text.push_str(trimmed);
            }
            continue;
        }

        // Any other non-blank line only terminates an open list.
        if !trimmed.is_empty() {
            flush_pending_list(&mut list_key, &mut list_items, &mut open_fields);
        }
    }

    // End of input closes the last list and the last entity.
    flush_pending_list(&mut list_key, &mut list_items, &mut open_fields);
    if let Some(finished) = open_name.take() {
        parsed.push(build_entity(
            finished,
            label,
            open_line,
            &mut open_fields,
            errors,
        ));
    }

    let total = parsed.len();
    if total > MAX_ENTITIES_PER_FILE {
        errors.push(ParseError {
            line: section_start_line,
            message: format!(
                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
                total
            ),
        });
    }
    parsed
}
/// Closes an open nested list, if any.
///
/// When `pending_key` is set, it is cleared and the collected `pending_items` are moved into
/// `fields` as a `FieldValue::List` under that key (leaving `pending_items` empty). When no key
/// is pending, nothing is touched.
fn flush_pending_list(
    pending_key: &mut Option<String>,
    pending_items: &mut Vec<String>,
    fields: &mut Vec<(String, FieldValue)>,
) {
    let Some(list_key) = pending_key.take() else {
        return;
    };
    let collected = std::mem::take(pending_items);
    fields.push((list_key, FieldValue::List(collected)));
}
/// Finalises one entity from its heading and collected fields.
///
/// Steps, in order:
/// 1. validates the name (non-blank, at most `MAX_NAME_LEN` characters);
/// 2. removes the `id` bullet from `fields` and keeps its value as the entity id;
/// 3. renames a `type` bullet to the label-specific key;
/// 4. validates the remaining fields.
///
/// All problems are appended to `errors` with the heading's `line`; the entity is returned
/// regardless. `fields` is left empty so the caller can reuse the buffer for the next entity.
fn build_entity(
    name: String,
    label: Label,
    line: usize,
    fields: &mut Vec<(String, FieldValue)>,
    errors: &mut Vec<ParseError>,
) -> Entity {
    if name.trim().is_empty() {
        errors.push(ParseError {
            line,
            message: "entity name must not be empty".into(),
        });
    } else {
        // Count characters, not bytes: the limit and the message are expressed in chars, and
        // `String::len` would over-count names containing non-ASCII text.
        let name_chars = name.chars().count();
        if name_chars > MAX_NAME_LEN {
            errors.push(ParseError {
                line,
                message: format!("entity name exceeds {MAX_NAME_LEN} chars (got {name_chars})"),
            });
        }
    }

    // `id` must be extracted before validation: it is not in any allowed-field table.
    let id = extract_id_field(fields);
    apply_type_shorthand(fields, label);
    validate_fields(fields, label, line, errors);

    Entity {
        name,
        label,
        fields: std::mem::take(fields),
        id,
        line,
    }
}
/// Removes the first `id` field from `fields` and returns its value.
///
/// The field is removed whenever it exists, but a value is only returned for a non-empty
/// single value; an empty value or a list yields `None`. Later `id` fields are left in place.
fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
    let index = fields.iter().position(|(key, _)| key == "id")?;
    match fields.remove(index).1 {
        FieldValue::Single(text) if !text.is_empty() => Some(text),
        FieldValue::Single(_) | FieldValue::List(_) => None,
    }
}
/// Expands the `type` shorthand key into the label-specific field name.
///
/// Every field keyed `type` becomes `institution_type` for institutions and `document_type`
/// for public records; for actors the key stays `type`.
fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
    // The replacement depends only on the label, so resolve it once up front.
    let expanded = match label {
        Label::Institution => "institution_type",
        Label::PublicRecord => "document_type",
        Label::Actor => "type",
    };
    for (key, _) in fields.iter_mut() {
        if key.as_str() == "type" {
            *key = expanded.to_string();
        }
    }
}
/// Splits the text of a bullet (without its `- ` marker) into a trimmed `(key, value)` pair.
///
/// The split happens at the first `:`; later colons stay in the value. Returns `None` when
/// there is no colon or the key is blank. The value may be empty.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    if key.is_empty() {
        return None;
    }
    Some((key.to_owned(), raw_value.trim().to_owned()))
}
/// Reports whether `key` names a field whose comma-separated value is split into a list.
fn is_list_field(key: &str) -> bool {
    ["aliases", "urls"].contains(&key)
}
/// Returns the trimmed heading text if `line` is a level-3 heading (`### Name`).
///
/// Leading whitespace before `###` is tolerated. The marker must be followed by a space, and
/// the text after it must not start with another `#`.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
/// Field keys accepted for every label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "occurred_at",
    "urls",
    "description",
];
/// Additional field keys accepted only for `Label::Actor`.
const ACTOR_FIELDS: &[&str] = &[
    "date_of_birth",
    "place_of_birth",
    "nationality",
    "occupation",
];
/// Additional field keys accepted only for `Label::Institution`.
const INSTITUTION_FIELDS: &[&str] = &[
    "institution_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
];
/// Additional field keys accepted only for `Label::PublicRecord`.
const PUBLIC_RECORD_FIELDS: &[&str] = &[
    "document_type",
    "case_number",
    "filing_date",
    "issuing_authority",
];
/// Known values for the `occupation` field. Other values must use the `custom:` prefix.
/// The order here is the order shown in the "known:" part of the error message.
const OCCUPATION_VALUES: &[&str] = &[
    "politician",
    "executive",
    "journalist",
    "lawyer",
    "footballer",
    "activist",
    "civil_servant",
    "military",
    "academic",
    "lobbyist",
];
/// Known values for the `institution_type` field. Other values must use the `custom:` prefix.
const INSTITUTION_TYPE_VALUES: &[&str] = &[
    "football_club",
    "political_party",
    "corporation",
    "government_agency",
    "court",
    "law_enforcement",
    "ngo",
    "media",
    "regulatory_body",
    "military",
    "university",
    "trade_union",
    "lobby_group",
    "sports_body",
];
/// Known values for the `document_type` field. Other values must use the `custom:` prefix.
const DOCUMENT_TYPE_VALUES: &[&str] = &[
    "court_ruling",
    "criminal_charge",
    "contract",
    "legislation",
    "filing",
    "investigation",
    "termination",
    "transfer",
    "election_result",
    "financial_disclosure",
    "sanctions",
    "permit",
    "audit_report",
];
/// Validation rules for one single-valued field, as returned by `field_constraint`.
struct FieldConstraint {
    /// Maximum accepted length of the value.
    max_len: usize,
    /// Known values for enumerated fields; `None` when the field is free text.
    enum_values: Option<&'static [&'static str]>,
}
/// Looks up the validation rules for a single-valued field.
///
/// Returns `None` for keys without a constraint entry (including the list fields `aliases` and
/// `urls`, which are validated separately).
fn field_constraint(key: &str) -> Option<FieldConstraint> {
    // Each arm yields `(max_len, enum_values)`; the struct is assembled once below.
    let (max_len, enum_values) = match key {
        "description" => (2000, None),
        "thumbnail" | "thumbnail_source" => (2048, None),
        // Long enough for `YYYY-MM-DD`.
        "occurred_at" | "date_of_birth" | "founded_date" | "filing_date" => (10, None),
        "place_of_birth" | "jurisdiction" | "headquarters" | "issuing_authority" => (200, None),
        "occupation" => (100, Some(OCCUPATION_VALUES)),
        "institution_type" => (100, Some(INSTITUTION_TYPE_VALUES)),
        "document_type" => (100, Some(DOCUMENT_TYPE_VALUES)),
        "qualifier" | "nationality" | "case_number" | "registration_number" => (100, None),
        _ => return None,
    };
    Some(FieldConstraint {
        max_len,
        enum_values,
    })
}
/// Maximum number of entries allowed in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias.
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of entries allowed in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL.
const MAX_URL_LEN: usize = 2048;
fn validate_fields(
fields: &[(String, FieldValue)],
label: Label,
line: usize,
errors: &mut Vec<ParseError>,
) {
let label_fields: &[&str] = match label {
Label::Actor => ACTOR_FIELDS,
Label::Institution => INSTITUTION_FIELDS,
Label::PublicRecord => PUBLIC_RECORD_FIELDS,
};
for (key, value) in fields {
if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
errors.push(ParseError {
line,
message: format!("unknown field {key:?} for {label}"),
});
continue;
}
match value {
FieldValue::Single(val) => {
if let Some(constraint) = field_constraint(key) {
if val.len() > constraint.max_len {
errors.push(ParseError {
line,
message: format!(
"field {key:?} exceeds {} chars (got {})",
constraint.max_len,
val.len()
),
});
}
if let Some(allowed) = constraint.enum_values {
validate_enum_value(key, val, allowed, line, errors);
}
if matches!(
key.as_str(),
"occurred_at" | "date_of_birth" | "founded_date" | "filing_date"
) && !val.is_empty()
{
validate_date_format(key, val, line, errors);
}
if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
&& !val.is_empty()
&& !val.starts_with("https://")
{
errors.push(ParseError {
line,
message: format!("field {key:?} must be HTTPS URL"),
});
}
}
}
FieldValue::List(items) => match key.as_str() {
"aliases" => {
if items.len() > MAX_ALIASES {
errors.push(ParseError {
line,
message: format!(
"aliases exceeds {MAX_ALIASES} items (got {})",
items.len()
),
});
}
for item in items {
if item.len() > MAX_ALIAS_LEN {
errors.push(ParseError {
line,
message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
});
}
}
}
"urls" => {
if items.len() > MAX_URLS {
errors.push(ParseError {
line,
message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
});
}
for item in items {
if item.len() > MAX_URL_LEN {
errors.push(ParseError {
line,
message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
});
}
if !item.starts_with("https://") {
errors.push(ParseError {
line,
message: format!("url must be HTTPS: {item:?}"),
});
}
}
}
_ => {}
},
}
}
}
/// Validates the value of an enumerated field.
///
/// Two forms are accepted:
/// - `custom:<text>`, where `<text>` is 1 to 100 characters long;
/// - a known value from `allowed`, compared after lowercasing and replacing spaces with `_`
///   (so `Civil Servant` matches `civil_servant`).
///
/// Anything else appends a `ParseError` at `line` that lists the known values.
fn validate_enum_value(
    key: &str,
    value: &str,
    allowed: &[&str],
    line: usize,
    errors: &mut Vec<ParseError>,
) {
    if let Some(custom) = value.strip_prefix("custom:") {
        // Count characters rather than bytes so the limit matches the "chars" wording of the
        // message and non-ASCII custom values are not rejected prematurely.
        let custom_chars = custom.chars().count();
        if !(1..=100).contains(&custom_chars) {
            errors.push(ParseError {
                line,
                message: format!(
                    "field {key:?} custom value must be 1-100 chars, got {custom_chars}"
                ),
            });
        }
        return;
    }

    let normalized = value.to_lowercase().replace(' ', "_");
    if !allowed.contains(&normalized.as_str()) {
        errors.push(ParseError {
            line,
            message: format!(
                "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
                allowed.join(", ")
            ),
        });
    }
}
/// Validates that `value` is a date written as `YYYY`, `YYYY-MM` or `YYYY-MM-DD`.
///
/// Two checks are made, each appending at most one `ParseError` at `line`:
/// 1. shape — only ASCII digits with `-` at positions 4 and 7, and a total length of 4, 7 or 10;
/// 2. range — the month must be 01-12 and the day must exist in that month (leap years
///    included), so values such as `2024-13-01` or `2023-02-29` are rejected.
fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
    // Number of days in `month` (1-12) of `year`, Gregorian rules.
    fn days_in_month(year: u32, month: u32) -> u32 {
        match month {
            4 | 6 | 9 | 11 => 30,
            2 if year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) => 29,
            2 => 28,
            _ => 31,
        }
    }

    // Decimal value of a short run of ASCII digits (at most four, so it cannot overflow).
    fn number(digits: &[u8]) -> u32 {
        digits
            .iter()
            .fold(0, |acc, &digit| acc * 10 + u32::from(digit - b'0'))
    }

    let bytes = value.as_bytes();
    // Any non-ASCII character fails this check because none of its bytes is a digit or `-`.
    let well_formed = matches!(bytes.len(), 4 | 7 | 10)
        && bytes.iter().enumerate().all(|(index, byte)| match index {
            4 | 7 => *byte == b'-',
            _ => byte.is_ascii_digit(),
        });
    if !well_formed {
        errors.push(ParseError {
            line,
            message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
        });
        return;
    }

    // The shape check guarantees pure ASCII, so these byte ranges are exactly the components.
    let year = number(&bytes[..4]);
    let month = bytes.get(5..7).map(number);
    let day = bytes.get(8..10).map(number);
    let in_range = match (month, day) {
        (None, _) => true,
        (Some(month), None) => (1..=12).contains(&month),
        (Some(month), Some(day)) => {
            (1..=12).contains(&month) && (1..=days_in_month(year, month)).contains(&day)
        }
    };
    if !in_range {
        errors.push(ParseError {
            line,
            message: format!("field {key:?} is not a valid calendar date, got {value:?}"),
        });
    }
}
// Unit tests for the entity parser: happy-path parsing of each label plus one test per
// validation rule.
#[cfg(test)]
mod tests {
    use super::*;
    // An actor with scalar fields and a description continued on an indented line.
    #[test]
    fn parse_actor_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: British",
            "- occupation: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            " (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Actors, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Actor);
        assert_eq!(e.fields.len(), 5);
        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        // The continuation line is joined to the first line with a newline.
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }
    // `type` expands to `institution_type`; aliases use the comma form, urls the nested form.
    #[test]
    fn parse_institution_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: football_club",
            "- jurisdiction: England",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            " - https://www.arsenal.com",
            " - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Institutions, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Institution);
        let it = e.fields.iter().find(|(k, _)| k == "institution_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("football_club".into()))
        );
        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );
        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }
    // In an Events section, entities are public records and `type` expands to `document_type`.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: termination",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let e = &entities[0];
        assert_eq!(e.label, Label::PublicRecord);
        let dt = e.fields.iter().find(|(k, _)| k == "document_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("termination".into()))
        );
    }
    // A key that is in no field table is reported.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }
    // A key that belongs to a different label is also reported as unknown.
    #[test]
    fn reject_wrong_label_field() {
        let body = "### Test\n- institution_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }
    // Enumerated fields reject values outside the known list.
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- occupation: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("invalid occupation"))
        );
    }
    // The `custom:` prefix allows values outside the known list.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- occupation: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }
    // Free-form dates are rejected with a message naming the expected formats.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }
    // Year, year-month and full dates are all accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::Actors, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }
    // URLs in a nested list must use HTTPS.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }
    // Thumbnails must use HTTPS.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }
    // Consecutive headings produce separate entities in source order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: Dutch",
            "",
            "### Bob",
            "- nationality: British",
            "",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }
    // A 201-character nationality exceeds that field's 100-character limit.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::Actors, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }
    // Eleven comma-separated aliases exceed the limit of ten.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }
}