1use std::collections::HashSet;
2
3use crate::parser::{ParseError, SourceEntry};
4
/// Upper bound on the number of relationships one `parse_relationships` call
/// may produce. Going over it adds a "too many relationships" error; the
/// relationships themselves are still returned.
const MAX_RELATIONSHIPS_PER_FILE: usize = 200;
7
/// Relationship types accepted on a `- Source -> Target: type` line.
///
/// `parse_relationships` compares the normalized type (lowercased, spaces
/// replaced by underscores) against this list and reports anything else as an
/// "unknown relationship type" error. The list is printed in that error
/// message in exactly this order.
const KNOWN_REL_TYPES: &[&str] = &[
    "employed_by",
    "member_of",
    "leads",
    "founded",
    "owns",
    "subsidiary_of",
    "charged_with",
    "convicted_of",
    "investigated_by",
    "prosecuted_by",
    "defended_by",
    "testified_in",
    "sentenced_to",
    "appealed",
    "acquitted_of",
    "pardoned_by",
    "arrested_by",
    "paid_to",
    "received_from",
    "funded_by",
    "awarded_contract",
    "approved_budget",
    "seized_from",
    "appointed_by",
    "approved_by",
    "regulated_by",
    "licensed_by",
    "lobbied",
    "family_of",
    "associate_of",
    "preceded_by",
    "documents",
    "authorizes",
    "references",
    "part_of",
    "sourced_by",
];
56
/// Keys accepted in the nested `- key: value` lines below a relationship.
///
/// Any other key is reported as an "unknown relationship field" error and the
/// line is skipped. `id` and `source` are stored in dedicated `Rel` members;
/// the remaining keys end up in `Rel::fields`.
const REL_FIELDS: &[&str] = &[
    "id",
    "source",
    "description",
    "amount",
    "currency",
    "valid_from",
    "valid_until",
];
67
/// One relationship between two named entities, as returned by
/// `parse_relationships`.
#[derive(Debug)]
// NOTE(review): presumably needed because of the `source_*` / `*_name` field
// naming — confirm which fields trigger the lint before removing.
#[allow(clippy::struct_field_names)]
pub struct Rel {
    /// Trimmed text before the first ` -> ` on the relationship line.
    pub source_name: String,
    /// Trimmed text between ` -> ` and the last `:` on the relationship line.
    pub target_name: String,
    /// Relationship type, lowercased with spaces replaced by underscores.
    pub rel_type: String,
    /// URLs from the relationship's own `source` fields, or the URLs of the
    /// default sources when it has none.
    pub source_urls: Vec<String>,
    /// `(key, value)` pairs of the nested fields other than `id` and
    /// `source`, in the order they appear.
    pub fields: Vec<(String, String)>,
    /// Value of the nested `id` field, if one was given.
    pub id: Option<String>,
    /// Line number reported for the relationship's `- Source -> Target: type`
    /// line.
    pub line: usize,
}
82
83#[allow(clippy::implicit_hasher)]
88#[allow(clippy::too_many_lines)]
89pub fn parse_relationships(
90 body: &str,
91 section_start_line: usize,
92 entity_names: &HashSet<&str>,
93 default_sources: &[SourceEntry],
94 errors: &mut Vec<ParseError>,
95) -> Vec<Rel> {
96 let lines: Vec<&str> = body.lines().collect();
97 let mut rels: Vec<Rel> = Vec::new();
98
99 let mut current: Option<RelBuilder> = None;
101
102 for (i, line) in lines.iter().enumerate() {
103 let file_line = section_start_line + 1 + i;
104 let trimmed = line.trim();
105
106 if trimmed.starts_with("- ") && !line.starts_with(" ") {
108 if let Some(builder) = current.take() {
110 rels.push(builder.finish(default_sources));
111 }
112
113 let item = &trimmed[2..];
114 match parse_rel_line(item) {
115 Some((source, target, rel_type)) => {
116 if !KNOWN_REL_TYPES.contains(&rel_type.as_str()) {
118 errors.push(ParseError {
119 line: file_line,
120 message: format!(
121 "unknown relationship type {rel_type:?} (known: {})",
122 KNOWN_REL_TYPES.join(", ")
123 ),
124 });
125 }
126
127 if !entity_names.contains(&source.as_str()) {
129 errors.push(ParseError {
130 line: file_line,
131 message: format!(
132 "entity {source:?} in relationship not defined in file"
133 ),
134 });
135 }
136 if !entity_names.contains(&target.as_str()) {
137 errors.push(ParseError {
138 line: file_line,
139 message: format!(
140 "entity {target:?} in relationship not defined in file"
141 ),
142 });
143 }
144
145 current = Some(RelBuilder {
146 source_name: source,
147 target_name: target,
148 rel_type,
149 source_urls: Vec::new(),
150 fields: Vec::new(),
151 id: None,
152 line: file_line,
153 });
154 }
155 None => {
156 errors.push(ParseError {
157 line: file_line,
158 message: format!(
159 "invalid relationship syntax: expected `- Source -> Target: type`, got {trimmed:?}"
160 ),
161 });
162 }
163 }
164 continue;
165 }
166
167 if line.starts_with(" - ") && current.is_some() {
169 let nested = trimmed.strip_prefix("- ").unwrap_or(trimmed);
170 if let Some((key, value)) = parse_kv(nested) {
171 if !REL_FIELDS.contains(&key.as_str()) {
172 errors.push(ParseError {
173 line: file_line,
174 message: format!("unknown relationship field {key:?}"),
175 });
176 continue;
177 }
178
179 let builder = current.as_mut().unwrap_or_else(|| unreachable!());
180
181 if key == "id" {
182 builder.id = Some(value);
183 } else if key == "source" {
184 if !value.starts_with("https://") {
185 errors.push(ParseError {
186 line: file_line,
187 message: format!("relationship source URL must be HTTPS: {value:?}"),
188 });
189 }
190 builder.source_urls.push(value);
191 } else {
192 validate_rel_field(&key, &value, file_line, errors);
194 builder.fields.push((key, value));
195 }
196 } else {
197 errors.push(ParseError {
198 line: file_line,
199 message: format!(
200 "invalid nested field syntax: expected `- key: value`, got {trimmed:?}"
201 ),
202 });
203 }
204 }
205
206 }
208
209 if let Some(builder) = current.take() {
211 rels.push(builder.finish(default_sources));
212 }
213
214 if rels.len() > MAX_RELATIONSHIPS_PER_FILE {
216 errors.push(ParseError {
217 line: section_start_line,
218 message: format!(
219 "too many relationships (max {MAX_RELATIONSHIPS_PER_FILE}, got {})",
220 rels.len()
221 ),
222 });
223 }
224
225 rels
226}
227
/// Relationship under construction: created from a relationship header line
/// and filled in while its nested field lines are read. `finish` turns it
/// into a `Rel`.
struct RelBuilder {
    /// Source entity name taken from the header line.
    source_name: String,
    /// Target entity name taken from the header line.
    target_name: String,
    /// Normalized relationship type taken from the header line.
    rel_type: String,
    /// Values of the nested `source` fields seen so far.
    source_urls: Vec<String>,
    /// Nested `(key, value)` fields other than `id` and `source`.
    fields: Vec<(String, String)>,
    /// Value of the nested `id` field, if seen.
    id: Option<String>,
    /// Line number reported for the header line.
    line: usize,
}
237
238impl RelBuilder {
239 fn finish(self, default_sources: &[SourceEntry]) -> Rel {
240 let source_urls = if self.source_urls.is_empty() {
241 default_sources
242 .iter()
243 .map(|s| s.url().to_string())
244 .collect()
245 } else {
246 self.source_urls
247 };
248
249 Rel {
250 source_name: self.source_name,
251 target_name: self.target_name,
252 rel_type: self.rel_type,
253 source_urls,
254 fields: self.fields,
255 id: self.id,
256 line: self.line,
257 }
258 }
259}
260
/// Splits the text of a relationship line (`Source -> Target: type`, without
/// the leading `- `) into `(source, target, rel_type)`.
///
/// The first ` -> ` separates the source from the rest, and the last `:` of
/// the rest separates the target from the type, so a target may itself
/// contain colons. Source and target are trimmed; the type is trimmed,
/// lowercased, and has its spaces replaced by underscores.
///
/// Returns `None` when the arrow or the colon is missing, or when any of the
/// three parts is empty after trimming.
fn parse_rel_line(item: &str) -> Option<(String, String, String)> {
    let (lhs, rhs) = item.split_once(" -> ")?;
    let (target_part, type_part) = rhs.rsplit_once(':')?;

    let source = lhs.trim();
    let target = target_part.trim();
    let rel_type = type_part.trim().to_lowercase().replace(' ', "_");

    let all_present = !(source.is_empty() || target.is_empty() || rel_type.is_empty());
    all_present.then(|| (source.to_string(), target.to_string(), rel_type))
}
280
/// Splits `key: value` at the first colon and trims both halves.
///
/// Only the first colon counts, so values such as URLs keep their own colons.
/// Returns `None` when there is no colon or the key is empty after trimming;
/// an empty value is allowed.
fn parse_kv(s: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = s.split_once(':')?;
    match raw_key.trim() {
        "" => None,
        key => Some((key.to_string(), raw_value.trim().to_string())),
    }
}
290
291fn validate_rel_field(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
292 let max = match key {
293 "description" => 1000,
294 "amount" => 50,
295 "currency" | "valid_from" | "valid_until" => 10,
296 _ => return,
297 };
298
299 if value.len() > max {
300 errors.push(ParseError {
301 line,
302 message: format!(
303 "relationship field {key:?} exceeds {max} chars (got {})",
304 value.len()
305 ),
306 });
307 }
308
309 if matches!(key, "valid_from" | "valid_until") && !value.is_empty() {
311 let valid = matches!(value.len(), 4 | 7 | 10)
312 && value.chars().enumerate().all(|(i, c)| match i {
313 4 | 7 => c == '-',
314 _ => c.is_ascii_digit(),
315 });
316 if !valid {
317 errors.push(ParseError {
318 line,
319 message: format!(
320 "relationship field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"
321 ),
322 });
323 }
324 }
325}
326
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `parse_relationships` with the given entity names and default
    /// sources, returning the parsed relationships together with every
    /// reported error.
    fn run(
        body: &str,
        start_line: usize,
        names: &[&str],
        sources: &[SourceEntry],
    ) -> (Vec<Rel>, Vec<ParseError>) {
        let known: HashSet<&str> = names.iter().copied().collect();
        let mut errors = Vec::new();
        let rels = parse_relationships(body, start_line, &known, sources, &mut errors);
        (rels, errors)
    }

    /// True when at least one error message contains `needle`.
    fn has_error(errors: &[ParseError], needle: &str) -> bool {
        errors.iter().any(|e| e.message.contains(needle))
    }

    #[test]
    fn parse_basic_relationship() {
        let sources = vec![SourceEntry::Url("https://example.com/src".into())];
        let (rels, errors) = run(
            "\n- Alice -> Bob: employed_by\n",
            50,
            &["Alice", "Bob"],
            &sources,
        );

        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 1);
        let rel = &rels[0];
        assert_eq!(rel.source_name, "Alice");
        assert_eq!(rel.target_name, "Bob");
        assert_eq!(rel.rel_type, "employed_by");
        assert_eq!(rel.source_urls, vec!["https://example.com/src"]);
    }

    #[test]
    fn parse_relationship_with_source_override() {
        let body = concat!(
            "\n",
            "- Alice -> Bob: associate_of\n",
            " - source: https://specific.com/article\n",
        );
        let sources = vec![SourceEntry::Url("https://default.com".into())];
        let (rels, errors) = run(body, 10, &["Alice", "Bob"], &sources);

        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].source_urls, vec!["https://specific.com/article"]);
    }

    #[test]
    fn parse_relationship_with_fields() {
        let body = concat!(
            "\n",
            "- Alice -> Corp: paid_to\n",
            " - amount: EUR 50,000\n",
            " - currency: EUR\n",
            " - valid_from: 2020-01\n",
            " - description: Campaign donation\n",
        );
        let (rels, errors) = run(body, 10, &["Alice", "Corp"], &[]);

        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].fields.len(), 4);
    }

    #[test]
    fn reject_unknown_rel_type() {
        let (_, errors) = run("\n- Alice -> Bob: best_friends\n", 1, &["Alice", "Bob"], &[]);
        assert!(has_error(&errors, "unknown relationship type"));
    }

    #[test]
    fn reject_unresolved_entity() {
        let (_, errors) = run("\n- Alice -> Unknown: employed_by\n", 1, &["Alice"], &[]);
        assert!(has_error(&errors, "not defined in file"));
    }

    #[test]
    fn reject_non_https_source_override() {
        let body = concat!(
            "\n",
            "- Alice -> Bob: associate_of\n",
            " - source: http://insecure.com\n",
        );
        let (_, errors) = run(body, 1, &["Alice", "Bob"], &[]);
        assert!(has_error(&errors, "HTTPS"));
    }

    #[test]
    fn reject_unknown_rel_field() {
        let body = concat!("\n", "- Alice -> Bob: associate_of\n", " - foobar: value\n");
        let (_, errors) = run(body, 1, &["Alice", "Bob"], &[]);
        assert!(has_error(&errors, "unknown relationship field"));
    }

    #[test]
    fn multiple_relationships() {
        let body = concat!(
            "\n",
            "- Alice -> Bob: employed_by\n",
            "- Bob -> Corp: member_of\n",
            "- Corp -> Alice: charged_with\n",
        );
        let (rels, errors) = run(body, 1, &["Alice", "Bob", "Corp"], &[]);

        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 3);
    }

    #[test]
    fn parse_rel_line_syntax() {
        let expected = (
            String::from("Mark Bonnick"),
            String::from("Arsenal FC"),
            String::from("employed_by"),
        );
        assert_eq!(
            parse_rel_line("Mark Bonnick -> Arsenal FC: employed_by"),
            Some(expected)
        );
    }

    #[test]
    fn parse_rel_line_invalid() {
        for bad in ["not a relationship", "-> Target: type", "Source -> : type"] {
            assert!(parse_rel_line(bad).is_none());
        }
    }

    #[test]
    fn relationship_date_validation() {
        let body = concat!(
            "\n",
            "- Alice -> Bob: associate_of\n",
            " - valid_from: not-a-date\n",
        );
        let (_, errors) = run(body, 1, &["Alice", "Bob"], &[]);
        assert!(has_error(&errors, "YYYY"));
    }

    #[test]
    fn multiple_source_overrides() {
        let body = concat!(
            "\n",
            "- Alice -> Bob: associate_of\n",
            " - source: https://first.com\n",
            " - source: https://second.com\n",
        );
        let (rels, errors) = run(body, 1, &["Alice", "Bob"], &[]);

        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].source_urls.len(), 2);
    }
}