1use std::collections::HashSet;
2
3use crate::parser::{ParseError, SourceEntry};
4
/// Largest number of relationships `parse_relationships` accepts before it
/// reports a "too many relationships" error for the section.
const MAX_RELATIONSHIPS_PER_FILE: usize = 200;
7
/// Relationship types accepted after the final `:` of a relationship line.
///
/// Types are compared after normalisation (lowercased, spaces replaced by
/// underscores); anything not listed here is reported as an unknown type.
const KNOWN_REL_TYPES: &[&str] = &[
    // Organisational roles and ownership
    "employed_by",
    "member_of",
    "leads",
    "founded",
    "owns",
    "subsidiary_of",
    // Legal proceedings
    "charged_with",
    "convicted_of",
    "investigated_by",
    "prosecuted_by",
    "defended_by",
    "testified_in",
    "sentenced_to",
    "appealed",
    "acquitted_of",
    "pardoned_by",
    "arrested_by",
    // Money and assets
    "paid_to",
    "received_from",
    "funded_by",
    "awarded_contract",
    "approved_budget",
    "seized_from",
    // Appointment, regulation and influence
    "appointed_by",
    "approved_by",
    "regulated_by",
    "licensed_by",
    "lobbied",
    // Personal connections and succession
    "family_of",
    "associate_of",
    "preceded_by",
    // Documents and generic links
    "documents",
    "authorizes",
    "references",
    "related_to",
    "part_of",
    "involved_in",
    "sourced_by",
];
58
/// Keys allowed on the indented `key: value` lines below a relationship.
///
/// `id` and `source` are stored in dedicated slots of the relationship; the
/// remaining keys are validated and kept as generic fields.
const REL_FIELDS: &[&str] = &[
    "id",
    "source",
    "description",
    "amounts",
    "valid_from",
    "valid_until",
];
68
/// One parsed relationship between two named entities.
///
/// Besides `Debug`, the type derives `Clone`, `PartialEq` and `Eq` so that
/// callers and tests can copy and compare whole relationships directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Rel {
    /// Entity name written to the left of ` -> `.
    pub source_name: String,
    /// Entity name written between ` -> ` and the last `:` of the line.
    pub target_name: String,
    /// Relationship type, lowercased with spaces replaced by underscores.
    pub rel_type: String,
    /// URLs from the relationship's `source:` fields, or the URLs of the
    /// default sources when the relationship has no `source:` field.
    pub source_urls: Vec<String>,
    /// Remaining `key: value` fields (everything except `id` and `source`),
    /// in the order they appeared.
    pub fields: Vec<(String, String)>,
    /// Value of the `id:` field, if one was given.
    pub id: Option<String>,
    /// File line on which the relationship item was written.
    pub line: usize,
}
82
83#[allow(clippy::too_many_lines)]
88pub fn parse_relationships(
89 body: &str,
90 section_start_line: usize,
91 entity_names: &HashSet<&str>,
92 default_sources: &[SourceEntry],
93 errors: &mut Vec<ParseError>,
94) -> Vec<Rel> {
95 let lines: Vec<&str> = body.lines().collect();
96 let mut rels: Vec<Rel> = Vec::new();
97
98 let mut current: Option<RelBuilder> = None;
100
101 for (i, line) in lines.iter().enumerate() {
102 let file_line = section_start_line + 1 + i;
103 let trimmed = line.trim();
104
105 if trimmed.starts_with("- ") && !line.starts_with(" ") {
107 if let Some(builder) = current.take() {
109 rels.push(builder.finish(default_sources));
110 }
111
112 let item = &trimmed[2..];
113 match parse_rel_line(item) {
114 Some((source, target, rel_type)) => {
115 if !KNOWN_REL_TYPES.contains(&rel_type.as_str()) {
117 errors.push(ParseError {
118 line: file_line,
119 message: format!(
120 "unknown relationship type {rel_type:?} (known: {})",
121 KNOWN_REL_TYPES.join(", ")
122 ),
123 });
124 }
125
126 if !entity_names.contains(&source.as_str()) {
128 errors.push(ParseError {
129 line: file_line,
130 message: format!(
131 "entity {source:?} in relationship not defined in file"
132 ),
133 });
134 }
135 if !entity_names.contains(&target.as_str()) {
136 errors.push(ParseError {
137 line: file_line,
138 message: format!(
139 "entity {target:?} in relationship not defined in file"
140 ),
141 });
142 }
143
144 current = Some(RelBuilder {
145 source_name: source,
146 target_name: target,
147 rel_type,
148 source_urls: Vec::new(),
149 fields: Vec::new(),
150 id: None,
151 line: file_line,
152 });
153 }
154 None => {
155 errors.push(ParseError {
156 line: file_line,
157 message: format!(
158 "invalid relationship syntax: expected `- Source -> Target: type`, got {trimmed:?}"
159 ),
160 });
161 }
162 }
163 continue;
164 }
165
166 if line.starts_with(" ") && current.is_some() {
168 if let Some((key, value)) = parse_kv(trimmed) {
169 if !REL_FIELDS.contains(&key.as_str()) {
170 errors.push(ParseError {
171 line: file_line,
172 message: format!("unknown relationship field {key:?}"),
173 });
174 continue;
175 }
176
177 let builder = current.as_mut().unwrap_or_else(|| unreachable!());
178
179 if key == "id" {
180 builder.id = Some(value);
181 } else if key == "source" {
182 if !value.starts_with("https://") {
183 errors.push(ParseError {
184 line: file_line,
185 message: format!("relationship source URL must be HTTPS: {value:?}"),
186 });
187 }
188 builder.source_urls.push(value);
189 } else {
190 validate_rel_field(&key, &value, file_line, errors);
192 builder.fields.push((key, value));
193 }
194 } else {
195 errors.push(ParseError {
196 line: file_line,
197 message: format!(
198 "invalid field syntax: expected `key: value`, got {trimmed:?}"
199 ),
200 });
201 }
202 }
203
204 }
206
207 if let Some(builder) = current.take() {
209 rels.push(builder.finish(default_sources));
210 }
211
212 if rels.len() > MAX_RELATIONSHIPS_PER_FILE {
214 errors.push(ParseError {
215 line: section_start_line,
216 message: format!(
217 "too many relationships (max {MAX_RELATIONSHIPS_PER_FILE}, got {})",
218 rels.len()
219 ),
220 });
221 }
222
223 rels
224}
225
226struct RelBuilder {
227 source_name: String,
228 target_name: String,
229 rel_type: String,
230 source_urls: Vec<String>,
231 fields: Vec<(String, String)>,
232 id: Option<String>,
233 line: usize,
234}
235
236impl RelBuilder {
237 fn finish(self, default_sources: &[SourceEntry]) -> Rel {
238 let source_urls = if self.source_urls.is_empty() {
239 default_sources
240 .iter()
241 .map(|s| s.url().to_string())
242 .collect()
243 } else {
244 self.source_urls
245 };
246
247 Rel {
248 source_name: self.source_name,
249 target_name: self.target_name,
250 rel_type: self.rel_type,
251 source_urls,
252 fields: self.fields,
253 id: self.id,
254 line: self.line,
255 }
256 }
257}
258
/// Splits a relationship item of the form `Source -> Target: type`.
///
/// The source ends at the first ` -> `; the type starts after the last `:`,
/// so a target name may itself contain colons or arrows. The type is
/// trimmed, lowercased and has its spaces replaced by underscores.
/// Returns `None` when the arrow or colon is missing or any part is empty.
fn parse_rel_line(item: &str) -> Option<(String, String, String)> {
    let (lhs, rest) = item.split_once(" -> ")?;
    let (middle, kind) = rest.rsplit_once(':')?;

    let source = lhs.trim();
    let target = middle.trim();
    let rel_type = kind.trim().to_lowercase().replace(' ', "_");

    let any_blank = [source, target, rel_type.as_str()]
        .iter()
        .any(|part| part.is_empty());
    if any_blank {
        None
    } else {
        Some((source.to_owned(), target.to_owned(), rel_type))
    }
}
278
/// Splits `s` at its first `:` into a trimmed `(key, value)` pair.
///
/// Returns `None` when there is no colon or the key is empty; an empty
/// value is allowed.
fn parse_kv(s: &str) -> Option<(String, String)> {
    s.split_once(':')
        .map(|(raw_key, raw_value)| (raw_key.trim(), raw_value.trim()))
        .filter(|(key, _)| !key.is_empty())
        .map(|(key, value)| (key.to_owned(), value.to_owned()))
}
288
289fn validate_rel_field(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
290 let max = match key {
291 "description" => 1000,
292 "amounts" => 200,
293 "valid_from" | "valid_until" => 10,
294 _ => return,
295 };
296
297 if value.len() > max {
298 errors.push(ParseError {
299 line,
300 message: format!(
301 "relationship field {key:?} exceeds {max} chars (got {})",
302 value.len()
303 ),
304 });
305 }
306
307 if matches!(key, "valid_from" | "valid_until") && !value.is_empty() {
309 let valid = matches!(value.len(), 4 | 7 | 10)
310 && value.chars().enumerate().all(|(i, c)| match i {
311 4 | 7 => c == '-',
312 _ => c.is_ascii_digit(),
313 });
314 if !valid {
315 errors.push(ParseError {
316 line,
317 message: format!(
318 "relationship field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"
319 ),
320 });
321 }
322 }
323}
324
#[cfg(test)]
mod tests {
    use super::*;

    // A relationship without its own sources inherits the default sources.
    #[test]
    fn parse_basic_relationship() {
        let body = "\n- Alice -> Bob: employed_by\n";
        let names = HashSet::from(["Alice", "Bob"]);
        let sources = vec![SourceEntry::Url("https://example.com/src".into())];
        let mut errors = Vec::new();

        let rels = parse_relationships(body, 50, &names, &sources, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 1);
        assert_eq!(rels[0].source_name, "Alice");
        assert_eq!(rels[0].target_name, "Bob");
        assert_eq!(rels[0].rel_type, "employed_by");
        assert_eq!(rels[0].source_urls, vec!["https://example.com/src"]);
    }

    // A `source:` field replaces the default sources.
    #[test]
    fn parse_relationship_with_source_override() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " source: https://specific.com/article",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let sources = vec![SourceEntry::Url("https://default.com".into())];
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 10, &names, &sources, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].source_urls, vec!["https://specific.com/article"]);
    }

    #[test]
    fn parse_relationship_with_fields() {
        let body = [
            "",
            "- Alice -> Corp: paid_to",
            " amounts: 50000 EUR",
            " valid_from: 2020-01",
            " description: Campaign donation",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Corp"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 10, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].fields.len(), 3);
    }

    #[test]
    fn reject_unknown_rel_type() {
        let body = "\n- Alice -> Bob: best_friends\n";
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(body, 1, &names, &[], &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("unknown relationship type"))
        );
    }

    #[test]
    fn reject_unresolved_entity() {
        let body = "\n- Alice -> Unknown: employed_by\n";
        let names = HashSet::from(["Alice"]);
        let mut errors = Vec::new();

        parse_relationships(body, 1, &names, &[], &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("not defined in file"))
        );
    }

    #[test]
    fn reject_non_https_source_override() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " source: http://insecure.com",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    #[test]
    fn reject_unknown_rel_field() {
        let body = ["", "- Alice -> Bob: associate_of", " foobar: value", ""].join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("unknown relationship field"))
        );
    }

    #[test]
    fn multiple_relationships() {
        let body = [
            "",
            "- Alice -> Bob: employed_by",
            "- Bob -> Corp: member_of",
            "- Corp -> Alice: charged_with",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob", "Corp"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels.len(), 3);
    }

    #[test]
    fn parse_rel_line_syntax() {
        let result = parse_rel_line("Mark Bonnick -> Arsenal FC: employed_by");
        assert_eq!(
            result,
            Some((
                "Mark Bonnick".into(),
                "Arsenal FC".into(),
                "employed_by".into()
            ))
        );
    }

    #[test]
    fn parse_rel_line_invalid() {
        assert!(parse_rel_line("not a relationship").is_none());
        assert!(parse_rel_line("-> Target: type").is_none());
        assert!(parse_rel_line("Source -> : type").is_none());
    }

    #[test]
    fn relationship_date_validation() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " valid_from: not-a-date",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    #[test]
    fn multiple_source_overrides() {
        let body = [
            "",
            "- Alice -> Bob: associate_of",
            " source: https://first.com",
            " source: https://second.com",
            "",
        ]
        .join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].source_urls.len(), 2);
    }

    // A list item without ` -> ` is reported and produces no relationship.
    #[test]
    fn reject_invalid_relationship_syntax() {
        let body = "\n- Alice Bob employed_by\n";
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(body, 1, &names, &[], &mut errors);
        assert!(rels.is_empty());
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("invalid relationship syntax"))
        );
    }

    // An indented line without a colon is reported; the relationship is kept.
    #[test]
    fn reject_invalid_field_syntax() {
        let body = ["", "- Alice -> Bob: associate_of", "  no colon here", ""].join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
        assert_eq!(rels.len(), 1);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("invalid field syntax"))
        );
    }

    // The first body line is `section_start_line + 1`, so the item on the
    // second body line of a section starting at 10 is line 12.
    #[test]
    fn lines_are_counted_from_section_start() {
        let body = "\n- Alice -> Bob: best_friends\n";
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(body, 10, &names, &[], &mut errors);
        assert_eq!(errors.len(), 1, "errors: {errors:?}");
        assert_eq!(errors[0].line, 12);
        assert_eq!(rels.len(), 1);
        assert_eq!(rels[0].line, 12);
    }

    // `id` goes to its dedicated slot rather than the generic field list.
    #[test]
    fn id_field_is_captured() {
        let body = ["", "- Alice -> Bob: associate_of", "  id: rel-1", ""].join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].id.as_deref(), Some("rel-1"));
        assert!(rels[0].fields.is_empty());
    }

    // Types are lowercased and spaces become underscores before the lookup.
    #[test]
    fn rel_type_is_normalized() {
        let body = "\n- Alice -> Bob: Employed By\n";
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(body, 1, &names, &[], &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(rels[0].rel_type, "employed_by");
    }

    // Going over the cap is reported once; the relationships are still returned.
    #[test]
    fn reject_too_many_relationships() {
        let count = MAX_RELATIONSHIPS_PER_FILE + 1;
        let body = vec!["- Alice -> Bob: associate_of"; count].join("\n");
        let names = HashSet::from(["Alice", "Bob"]);
        let mut errors = Vec::new();

        let rels = parse_relationships(&body, 1, &names, &[], &mut errors);
        assert_eq!(rels.len(), count);
        assert_eq!(errors.len(), 1, "errors: {errors:?}");
        assert!(errors[0].message.contains("too many relationships"));
    }
}