1use std::collections::HashSet;
2
3use crate::parser::{ParseError, SourceEntry};
4
/// Upper bound on the number of relationships one `parse_relationships` call may yield
/// before a "too many relationships" error is recorded. The parsed relationships are
/// still returned in full when the bound is exceeded.
const MAX_RELATIONSHIPS_PER_FILE: usize = 200;
7
/// Relationship type identifiers accepted on a `- Source -> Target: type` line.
///
/// Types are compared after normalisation (lower-cased, spaces replaced by `_`); any
/// type not listed here is reported as an "unknown relationship type" error, and the
/// full list is echoed in that error message.
const KNOWN_REL_TYPES: &[&str] = &[
    "employed_by",
    "member_of",
    "leads",
    "founded",
    "owns",
    "subsidiary_of",
    "charged_with",
    "convicted_of",
    "investigated_by",
    "prosecuted_by",
    "defended_by",
    "testified_in",
    "sentenced_to",
    "appealed",
    "acquitted_of",
    "pardoned_by",
    "arrested_by",
    "paid_to",
    "received_from",
    "funded_by",
    "awarded_contract",
    "approved_budget",
    "seized_from",
    "appointed_by",
    "approved_by",
    "regulated_by",
    "licensed_by",
    "lobbied",
    "family_of",
    "associate_of",
    "preceded_by",
    "documents",
    "authorizes",
    "references",
    "part_of",
    "involved_in",
    "sourced_by",
];
57
/// Keys allowed on the nested `- key: value` lines beneath a relationship.
///
/// `id` and `source` are stored in dedicated slots of the relationship; every other
/// key listed here is validated and kept as a generic `(key, value)` field. Keys not
/// listed are reported as "unknown relationship field".
const REL_FIELDS: &[&str] = &[
    "id",
    "source",
    "description",
    "amount",
    "currency",
    "valid_from",
    "valid_until",
];
68
/// One relationship parsed from a `- Source -> Target: type` line together with the
/// nested fields that followed it.
///
/// `Clone`, `PartialEq` and `Eq` are derived so values can be duplicated and compared
/// directly (for example in assertions); every field is a plain owned value.
#[derive(Debug, Clone, PartialEq, Eq)]
// NOTE(review): presumably needed because `rel_type` repeats the struct name and
// `type` alone is a keyword — confirm before removing the allow.
#[allow(clippy::struct_field_names)]
pub struct Rel {
    /// Entity name written on the left of `->`.
    pub source_name: String,
    /// Entity name written between `->` and the last `:`.
    pub target_name: String,
    /// Relationship type, lower-cased with spaces replaced by underscores.
    pub rel_type: String,
    /// URLs from nested `source` fields, or the default source URLs when none were given.
    pub source_urls: Vec<String>,
    /// Nested fields other than `id` and `source`, in the order they appeared.
    pub fields: Vec<(String, String)>,
    /// Value of the nested `id` field, if one was present.
    pub id: Option<String>,
    /// Line number reported for the relationship's top-level line.
    pub line: usize,
}
83
84#[allow(clippy::implicit_hasher)]
89#[allow(clippy::too_many_lines)]
90pub fn parse_relationships(
91 body: &str,
92 section_start_line: usize,
93 entity_names: &HashSet<&str>,
94 default_sources: &[SourceEntry],
95 errors: &mut Vec<ParseError>,
96) -> Vec<Rel> {
97 let lines: Vec<&str> = body.lines().collect();
98 let mut rels: Vec<Rel> = Vec::new();
99
100 let mut current: Option<RelBuilder> = None;
102
103 for (i, line) in lines.iter().enumerate() {
104 let file_line = section_start_line + 1 + i;
105 let trimmed = line.trim();
106
107 if trimmed.starts_with("- ") && !line.starts_with(" ") {
109 if let Some(builder) = current.take() {
111 rels.push(builder.finish(default_sources));
112 }
113
114 let item = &trimmed[2..];
115 match parse_rel_line(item) {
116 Some((source, target, rel_type)) => {
117 if !KNOWN_REL_TYPES.contains(&rel_type.as_str()) {
119 errors.push(ParseError {
120 line: file_line,
121 message: format!(
122 "unknown relationship type {rel_type:?} (known: {})",
123 KNOWN_REL_TYPES.join(", ")
124 ),
125 });
126 }
127
128 if !entity_names.contains(&source.as_str()) {
130 errors.push(ParseError {
131 line: file_line,
132 message: format!(
133 "entity {source:?} in relationship not defined in file"
134 ),
135 });
136 }
137 if !entity_names.contains(&target.as_str()) {
138 errors.push(ParseError {
139 line: file_line,
140 message: format!(
141 "entity {target:?} in relationship not defined in file"
142 ),
143 });
144 }
145
146 current = Some(RelBuilder {
147 source_name: source,
148 target_name: target,
149 rel_type,
150 source_urls: Vec::new(),
151 fields: Vec::new(),
152 id: None,
153 line: file_line,
154 });
155 }
156 None => {
157 errors.push(ParseError {
158 line: file_line,
159 message: format!(
160 "invalid relationship syntax: expected `- Source -> Target: type`, got {trimmed:?}"
161 ),
162 });
163 }
164 }
165 continue;
166 }
167
168 if line.starts_with(" - ") && current.is_some() {
170 let nested = trimmed.strip_prefix("- ").unwrap_or(trimmed);
171 if let Some((key, value)) = parse_kv(nested) {
172 if !REL_FIELDS.contains(&key.as_str()) {
173 errors.push(ParseError {
174 line: file_line,
175 message: format!("unknown relationship field {key:?}"),
176 });
177 continue;
178 }
179
180 let builder = current.as_mut().unwrap_or_else(|| unreachable!());
181
182 if key == "id" {
183 builder.id = Some(value);
184 } else if key == "source" {
185 if !value.starts_with("https://") {
186 errors.push(ParseError {
187 line: file_line,
188 message: format!("relationship source URL must be HTTPS: {value:?}"),
189 });
190 }
191 builder.source_urls.push(value);
192 } else {
193 validate_rel_field(&key, &value, file_line, errors);
195 builder.fields.push((key, value));
196 }
197 } else {
198 errors.push(ParseError {
199 line: file_line,
200 message: format!(
201 "invalid nested field syntax: expected `- key: value`, got {trimmed:?}"
202 ),
203 });
204 }
205 }
206
207 }
209
210 if let Some(builder) = current.take() {
212 rels.push(builder.finish(default_sources));
213 }
214
215 if rels.len() > MAX_RELATIONSHIPS_PER_FILE {
217 errors.push(ParseError {
218 line: section_start_line,
219 message: format!(
220 "too many relationships (max {MAX_RELATIONSHIPS_PER_FILE}, got {})",
221 rels.len()
222 ),
223 });
224 }
225
226 rels
227}
228
229struct RelBuilder {
230 source_name: String,
231 target_name: String,
232 rel_type: String,
233 source_urls: Vec<String>,
234 fields: Vec<(String, String)>,
235 id: Option<String>,
236 line: usize,
237}
238
239impl RelBuilder {
240 fn finish(self, default_sources: &[SourceEntry]) -> Rel {
241 let source_urls = if self.source_urls.is_empty() {
242 default_sources
243 .iter()
244 .map(|s| s.url().to_string())
245 .collect()
246 } else {
247 self.source_urls
248 };
249
250 Rel {
251 source_name: self.source_name,
252 target_name: self.target_name,
253 rel_type: self.rel_type,
254 source_urls,
255 fields: self.fields,
256 id: self.id,
257 line: self.line,
258 }
259 }
260}
261
/// Splits `Source -> Target: type` into `(source, target, rel_type)`.
///
/// The source is everything before the first ` -> `; the type is everything after the
/// last `:` of the remainder, lower-cased with spaces turned into underscores. Returns
/// `None` when the arrow or colon is missing or when any of the three parts is empty
/// after trimming.
fn parse_rel_line(item: &str) -> Option<(String, String, String)> {
    let (lhs, rest) = item.split_once(" -> ")?;
    // Split on the last colon so a target name may itself contain colons.
    let (rhs, kind) = rest.rsplit_once(':')?;

    let source = lhs.trim();
    let target = rhs.trim();
    let rel_type = kind.trim().to_lowercase().replace(' ', "_");

    let any_part_missing = [source, target, rel_type.as_str()]
        .iter()
        .any(|part| part.is_empty());
    if any_part_missing {
        return None;
    }

    Some((source.to_owned(), target.to_owned(), rel_type))
}
281
/// Splits `key: value` at the first colon and trims both halves.
///
/// Returns `None` when there is no colon or the key is empty; an empty value is allowed.
fn parse_kv(s: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = s.split_once(':')?;
    match raw_key.trim() {
        "" => None,
        key => Some((key.to_owned(), raw_value.trim().to_owned())),
    }
}
291
292fn validate_rel_field(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
293 let max = match key {
294 "description" => 1000,
295 "amount" => 50,
296 "currency" | "valid_from" | "valid_until" => 10,
297 _ => return,
298 };
299
300 if value.len() > max {
301 errors.push(ParseError {
302 line,
303 message: format!(
304 "relationship field {key:?} exceeds {max} chars (got {})",
305 value.len()
306 ),
307 });
308 }
309
310 if matches!(key, "valid_from" | "valid_until") && !value.is_empty() {
312 let valid = matches!(value.len(), 4 | 7 | 10)
313 && value.chars().enumerate().all(|(i, c)| match i {
314 4 | 7 => c == '-',
315 _ => c.is_ascii_digit(),
316 });
317 if !valid {
318 errors.push(ParseError {
319 line,
320 message: format!(
321 "relationship field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"
322 ),
323 });
324 }
325 }
326}
327
// Unit tests for the relationship parser. Fixtures are built line by line; indentation
// inside the string literals is significant because it separates top-level
// relationship lines from nested field lines.
#[cfg(test)]
mod tests {
    use super::*;

    // A single well-formed relationship inherits the default source URL.
    #[test]
    fn parse_basic_relationship() {
        let body = "\n- Alice -> Bob: employed_by\n";
        let names = HashSet::from(["Alice", "Bob"]);
        let sources = vec![SourceEntry::Url("https://example.com/src".into())];
        let mut errors = Vec::new();

        let rels = parse_relationships(body, 50, &names, &sources, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 1);
        assert_eq!(rels[0].source_name, "Alice");
        assert_eq!(rels[0].target_name, "Bob");
        assert_eq!(rels[0].rel_type, "employed_by");
        assert_eq!(rels[0].source_urls, vec!["https://example.com/src"]);
    }

    // A nested `source` field replaces the default sources rather than adding to them.
    #[test]
    fn parse_relationship_with_source_override() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " - source: https://specific.com/article",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let sources = vec![SourceEntry::Url("https://default.com".into())];
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 10, &names, &sources, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].source_urls, vec!["https://specific.com/article"]);
    }

    // Known non-`id`/`source` fields are all collected into `fields`.
    #[test]
    fn parse_relationship_with_fields() {
        let body = [
            "",
            "- Alice -> Corp: paid_to",
            " - amount: EUR 50,000",
            " - currency: EUR",
            " - valid_from: 2020-01",
            " - description: Campaign donation",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Corp"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 10, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].fields.len(), 4);
    }

    // A type outside `KNOWN_REL_TYPES` is reported.
    #[test]
    fn reject_unknown_rel_type() {
        let body = "\n- Alice -> Bob: best_friends\n";
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(body, 1, &names, &[], &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("unknown relationship type"))
        );
    }

    // A target that is not among the known entity names is reported.
    #[test]
    fn reject_unresolved_entity() {
        let body = "\n- Alice -> Unknown: employed_by\n";
        let names = HashSet::from(["Alice"]);
        let mut errors = Vec::new();

        parse_relationships(body, 1, &names, &[], &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("not defined in file"))
        );
    }

    // A `source` override that is not an `https://` URL is reported.
    #[test]
    fn reject_non_https_source_override() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " - source: http://insecure.com",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // A nested key outside `REL_FIELDS` is reported.
    #[test]
    fn reject_unknown_rel_field() {
        let body = ["", "- Alice -> Bob: associate_of", " - foobar: value", ""].join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("unknown relationship field"))
        );
    }

    // Consecutive top-level lines each produce their own relationship.
    #[test]
    fn multiple_relationships() {
        let body = [
            "",
            "- Alice -> Bob: employed_by",
            "- Bob -> Corp: member_of",
            "- Corp -> Alice: charged_with",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob", "Corp"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 3);
    }

    // Names containing spaces survive the split into source, target and type.
    #[test]
    fn parse_rel_line_syntax() {
        let result = parse_rel_line("Mark Bonnick -> Arsenal FC: employed_by");
        assert_eq!(
            result,
            Some((
                "Mark Bonnick".into(),
                "Arsenal FC".into(),
                "employed_by".into()
            ))
        );
    }

    // Missing arrow, missing source and missing target are all rejected.
    #[test]
    fn parse_rel_line_invalid() {
        assert!(parse_rel_line("not a relationship").is_none());
        assert!(parse_rel_line("-> Target: type").is_none());
        assert!(parse_rel_line("Source -> : type").is_none());
    }

    // A `valid_from` value that is not date-shaped is reported.
    #[test]
    fn relationship_date_validation() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " - valid_from: not-a-date",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    // Repeated `source` fields accumulate instead of overwriting each other.
    #[test]
    fn multiple_source_overrides() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " - source: https://first.com",
            " - source: https://second.com",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].source_urls.len(), 2);
    }
}