1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Largest number of entities `parse_entities` returns from one body without
/// also recording a "too many entities" error.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Upper bound on an entity name; `build_entity` compares it against
/// `name.len()`, i.e. the UTF-8 byte length of the heading text.
const MAX_NAME_LEN: usize = 300;
10
/// Category assigned to a parsed entity.
///
/// The `Display` implementation renders each variant as a lowercase
/// `snake_case` identifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Rendered as `actor`.
    Actor,
    /// Rendered as `institution`.
    Institution,
    /// Rendered as `public_record`.
    PublicRecord,
}

impl fmt::Display for Label {
    /// Writes the variant's `snake_case` name verbatim; width, fill and
    /// alignment flags on the formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = match *self {
            Self::Actor => "actor",
            Self::Institution => "institution",
            Self::PublicRecord => "public_record",
        };
        f.write_str(text)
    }
}
28
29impl Label {
30 pub fn from_section(kind: SectionKind) -> Option<Self> {
31 match kind {
32 SectionKind::Actors => Some(Self::Actor),
33 SectionKind::Institutions => Some(Self::Institution),
34 SectionKind::Events => Some(Self::PublicRecord),
35 _ => None,
36 }
37 }
38}
39
/// One entity parsed from a `### Name` heading and the bullets below it.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Heading text after `### `, trimmed.
    pub name: String,
    /// Category derived from the section the entity was parsed in.
    pub label: Label,
    /// Remaining fields in source order. `build_entity` removes the first
    /// `id` field and renames the `type` shorthand before storing them.
    pub fields: Vec<(String, FieldValue)>,
    /// Value of the `id` field when it was a non-empty single value, or the
    /// id handed to `parse_entity_file_body` by its caller.
    pub id: Option<String>,
    /// Line number of the entity heading, used for error reporting.
    pub line: usize,
}
51
/// Value of one entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value; indented continuation lines are appended to it,
    /// separated by `\n`.
    Single(String),
    /// A list, built either from nested `- item` bullets or from a
    /// comma-separated inline value of a list field.
    List(Vec<String>),
}
58
59pub fn parse_entity_file_body(
64 name: &str,
65 body: &str,
66 label: Label,
67 id: Option<String>,
68 title_line: usize,
69 errors: &mut Vec<ParseError>,
70) -> Entity {
71 let section_kind = match label {
72 Label::Actor => SectionKind::Actors,
73 Label::Institution => SectionKind::Institutions,
74 Label::PublicRecord => SectionKind::Events,
75 };
76
77 let wrapped = format!("### {name}\n{body}");
79 let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
80
81 if let Some(mut entity) = entities.pop() {
82 entity.id = id;
83 entity.line = title_line;
84 entity
85 } else {
86 Entity {
87 name: name.to_string(),
88 label,
89 fields: Vec::new(),
90 id,
91 line: title_line,
92 }
93 }
94}
95
96#[allow(clippy::too_many_lines)]
100pub fn parse_entities(
101 body: &str,
102 section_kind: SectionKind,
103 section_start_line: usize,
104 errors: &mut Vec<ParseError>,
105) -> Vec<Entity> {
106 let Some(label) = Label::from_section(section_kind) else {
107 return Vec::new();
108 };
109
110 let lines: Vec<&str> = body.lines().collect();
111 let mut entities: Vec<Entity> = Vec::new();
112 let mut current_name: Option<String> = None;
113 let mut current_line: usize = 0;
114 let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
115 let mut pending_list_key: Option<String> = None;
117 let mut pending_list_items: Vec<String> = Vec::new();
118
119 for (i, line) in lines.iter().enumerate() {
120 let file_line = section_start_line + 1 + i; if let Some(name) = strip_h3(line) {
124 flush_pending_list(
126 &mut pending_list_key,
127 &mut pending_list_items,
128 &mut current_fields,
129 );
130
131 if let Some(entity_name) = current_name.take() {
133 let entity = build_entity(
134 entity_name,
135 label,
136 current_line,
137 &mut current_fields,
138 errors,
139 );
140 entities.push(entity);
141 }
142
143 current_name = Some(name.to_string());
144 current_line = file_line;
145 current_fields.clear();
146 continue;
147 }
148
149 if current_name.is_none() {
151 if !line.trim().is_empty() {
152 errors.push(ParseError {
153 line: file_line,
154 message: "content before first entity heading (### Name)".into(),
155 });
156 }
157 continue;
158 }
159
160 let trimmed = line.trim();
161
162 if let Some(item) = trimmed.strip_prefix("- ") {
164 if line.starts_with(" - ") && pending_list_key.is_some() {
165 pending_list_items.push(item.trim().to_string());
167 continue;
168 }
169
170 flush_pending_list(
172 &mut pending_list_key,
173 &mut pending_list_items,
174 &mut current_fields,
175 );
176
177 if let Some((key, value)) = parse_bullet(item) {
179 if value.is_empty() {
180 pending_list_key = Some(key);
182 pending_list_items.clear();
183 } else if is_list_field(&key) && value.contains(',') {
184 let items: Vec<String> = value
186 .split(',')
187 .map(|s| s.trim().to_string())
188 .filter(|s| !s.is_empty())
189 .collect();
190 current_fields.push((key, FieldValue::List(items)));
191 } else {
192 current_fields.push((key, FieldValue::Single(value)));
193 }
194 } else {
195 errors.push(ParseError {
196 line: file_line,
197 message: format!(
198 "invalid field syntax: expected `- key: value`, got {trimmed:?}"
199 ),
200 });
201 }
202 continue;
203 }
204
205 if line.starts_with(" ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
207 if pending_list_key.is_some() {
208 errors.push(ParseError {
210 line: file_line,
211 message: "unexpected indented text in list context".into(),
212 });
213 } else if let Some(last) = current_fields.last_mut() {
214 if let FieldValue::Single(ref mut val) = last.1 {
216 val.push('\n');
217 val.push_str(trimmed);
218 }
219 }
220 continue;
221 }
222
223 if !trimmed.is_empty() {
225 flush_pending_list(
227 &mut pending_list_key,
228 &mut pending_list_items,
229 &mut current_fields,
230 );
231 }
232 }
233
234 flush_pending_list(
236 &mut pending_list_key,
237 &mut pending_list_items,
238 &mut current_fields,
239 );
240
241 if let Some(entity_name) = current_name.take() {
242 let entity = build_entity(
243 entity_name,
244 label,
245 current_line,
246 &mut current_fields,
247 errors,
248 );
249 entities.push(entity);
250 }
251
252 if entities.len() > MAX_ENTITIES_PER_FILE {
254 errors.push(ParseError {
255 line: section_start_line,
256 message: format!(
257 "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
258 entities.len()
259 ),
260 });
261 }
262
263 entities
264}
265
266fn flush_pending_list(
267 pending_key: &mut Option<String>,
268 pending_items: &mut Vec<String>,
269 fields: &mut Vec<(String, FieldValue)>,
270) {
271 if let Some(key) = pending_key.take() {
272 fields.push((key, FieldValue::List(std::mem::take(pending_items))));
273 }
274}
275
276fn build_entity(
277 name: String,
278 label: Label,
279 line: usize,
280 fields: &mut Vec<(String, FieldValue)>,
281 errors: &mut Vec<ParseError>,
282) -> Entity {
283 if name.trim().is_empty() {
285 errors.push(ParseError {
286 line,
287 message: "entity name must not be empty".into(),
288 });
289 } else if name.len() > MAX_NAME_LEN {
290 errors.push(ParseError {
291 line,
292 message: format!(
293 "entity name exceeds {MAX_NAME_LEN} chars (got {})",
294 name.len()
295 ),
296 });
297 }
298
299 let id = extract_id_field(fields);
301
302 apply_type_shorthand(fields, label);
304
305 validate_fields(fields, label, line, errors);
307
308 Entity {
309 name,
310 label,
311 fields: std::mem::take(fields),
312 id,
313 line,
314 }
315}
316
317fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
319 let pos = fields.iter().position(|(k, _)| k == "id")?;
320 let (_, value) = fields.remove(pos);
321 match value {
322 FieldValue::Single(s) if !s.is_empty() => Some(s),
323 _ => None,
324 }
325}
326
327fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
329 for field in fields.iter_mut() {
330 if field.0 == "type" {
331 field.0 = match label {
332 Label::Institution => "institution_type".to_string(),
333 Label::PublicRecord => "document_type".to_string(),
334 Label::Actor => "type".to_string(), };
336 }
337 }
338}
339
/// Splits the text of a bullet (after `- `) into a trimmed key and value.
///
/// The split happens at the first `:`, so colons inside the value (for
/// example in URLs) are preserved. Returns `None` when there is no colon or
/// when the key is empty after trimming; the value may be empty.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    if key.is_empty() {
        None
    } else {
        Some((key.to_owned(), raw_value.trim().to_owned()))
    }
}
350
/// Reports whether `key` names a field whose inline value may be written as
/// a comma-separated list (`aliases` or `urls`). The comparison is exact and
/// case-sensitive.
fn is_list_field(key: &str) -> bool {
    ["aliases", "urls"].contains(&key)
}
355
/// Returns the trimmed heading text when `line` is a level-3 heading.
///
/// Leading whitespace is ignored and the heading must start with `### `
/// (three hashes and a space). Text that itself begins with `#` is rejected,
/// which excludes deeper headings such as `### # x`. The returned text may be
/// empty.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
367
/// Field keys accepted on every entity, whatever its label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "occurred_at",
    "urls",
    "description",
];

/// Additional field keys accepted only on `Label::Actor` entities.
const ACTOR_FIELDS: &[&str] = &[
    "date_of_birth",
    "place_of_birth",
    "nationality",
    "occupation",
];

/// Additional field keys accepted only on `Label::Institution` entities.
/// `institution_type` is what the `type` shorthand expands to.
const INSTITUTION_FIELDS: &[&str] = &[
    "institution_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
];

/// Additional field keys accepted only on `Label::PublicRecord` entities.
/// `document_type` is what the `type` shorthand expands to.
const PUBLIC_RECORD_FIELDS: &[&str] = &[
    "document_type",
    "case_number",
    "filing_date",
    "issuing_authority",
];
402
/// Known values for the `occupation` field. Input is lower-cased and spaces
/// are replaced by `_` before comparison; other values need a `custom:` prefix.
const OCCUPATION_VALUES: &[&str] = &[
    "politician",
    "executive",
    "journalist",
    "lawyer",
    "footballer",
    "activist",
    "civil_servant",
    "military",
    "academic",
    "lobbyist",
];

/// Known values for the `institution_type` field (same normalisation and
/// `custom:` escape as `occupation`).
const INSTITUTION_TYPE_VALUES: &[&str] = &[
    "football_club",
    "political_party",
    "corporation",
    "government_agency",
    "court",
    "law_enforcement",
    "ngo",
    "media",
    "regulatory_body",
    "military",
    "university",
    "trade_union",
    "lobby_group",
    "sports_body",
];

/// Known values for the `document_type` field (same normalisation and
/// `custom:` escape as `occupation`).
const DOCUMENT_TYPE_VALUES: &[&str] = &[
    "court_ruling",
    "criminal_charge",
    "contract",
    "legislation",
    "filing",
    "investigation",
    "termination",
    "transfer",
    "election_result",
    "financial_disclosure",
    "sanctions",
    "permit",
    "audit_report",
];
449
450struct FieldConstraint {
452 max_len: usize,
453 enum_values: Option<&'static [&'static str]>,
455}
456
457fn field_constraint(key: &str) -> Option<FieldConstraint> {
458 match key {
459 "description" => Some(FieldConstraint {
460 max_len: 2000,
461 enum_values: None,
462 }),
463 "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
464 max_len: 2048,
465 enum_values: None,
466 }),
467 "occurred_at" | "date_of_birth" | "founded_date" | "filing_date" => Some(FieldConstraint {
468 max_len: 10,
469 enum_values: None,
470 }),
471 "place_of_birth" | "jurisdiction" | "headquarters" | "issuing_authority" => {
472 Some(FieldConstraint {
473 max_len: 200,
474 enum_values: None,
475 })
476 }
477 "occupation" => Some(FieldConstraint {
478 max_len: 100,
479 enum_values: Some(OCCUPATION_VALUES),
480 }),
481 "institution_type" => Some(FieldConstraint {
482 max_len: 100,
483 enum_values: Some(INSTITUTION_TYPE_VALUES),
484 }),
485 "document_type" => Some(FieldConstraint {
486 max_len: 100,
487 enum_values: Some(DOCUMENT_TYPE_VALUES),
488 }),
489 "qualifier" | "nationality" | "case_number" | "registration_number" => {
490 Some(FieldConstraint {
491 max_len: 100,
492 enum_values: None,
493 })
494 }
495 _ => None,
497 }
498}
499
/// Maximum number of entries in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `String::len`.
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of entries in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `String::len`.
const MAX_URL_LEN: usize = 2048;
505
506fn validate_fields(
507 fields: &[(String, FieldValue)],
508 label: Label,
509 line: usize,
510 errors: &mut Vec<ParseError>,
511) {
512 let label_fields: &[&str] = match label {
513 Label::Actor => ACTOR_FIELDS,
514 Label::Institution => INSTITUTION_FIELDS,
515 Label::PublicRecord => PUBLIC_RECORD_FIELDS,
516 };
517
518 for (key, value) in fields {
519 if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
521 errors.push(ParseError {
522 line,
523 message: format!("unknown field {key:?} for {label}"),
524 });
525 continue;
526 }
527
528 match value {
529 FieldValue::Single(val) => {
530 if let Some(constraint) = field_constraint(key) {
531 if val.len() > constraint.max_len {
532 errors.push(ParseError {
533 line,
534 message: format!(
535 "field {key:?} exceeds {} chars (got {})",
536 constraint.max_len,
537 val.len()
538 ),
539 });
540 }
541
542 if let Some(allowed) = constraint.enum_values {
544 validate_enum_value(key, val, allowed, line, errors);
545 }
546
547 if matches!(
549 key.as_str(),
550 "occurred_at" | "date_of_birth" | "founded_date" | "filing_date"
551 ) && !val.is_empty()
552 {
553 validate_date_format(key, val, line, errors);
554 }
555
556 if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
558 && !val.is_empty()
559 && !val.starts_with("https://")
560 {
561 errors.push(ParseError {
562 line,
563 message: format!("field {key:?} must be HTTPS URL"),
564 });
565 }
566 }
567 }
568 FieldValue::List(items) => match key.as_str() {
569 "aliases" => {
570 if items.len() > MAX_ALIASES {
571 errors.push(ParseError {
572 line,
573 message: format!(
574 "aliases exceeds {MAX_ALIASES} items (got {})",
575 items.len()
576 ),
577 });
578 }
579 for item in items {
580 if item.len() > MAX_ALIAS_LEN {
581 errors.push(ParseError {
582 line,
583 message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
584 });
585 }
586 }
587 }
588 "urls" => {
589 if items.len() > MAX_URLS {
590 errors.push(ParseError {
591 line,
592 message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
593 });
594 }
595 for item in items {
596 if item.len() > MAX_URL_LEN {
597 errors.push(ParseError {
598 line,
599 message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
600 });
601 }
602 if !item.starts_with("https://") {
603 errors.push(ParseError {
604 line,
605 message: format!("url must be HTTPS: {item:?}"),
606 });
607 }
608 }
609 }
610 _ => {}
611 },
612 }
613 }
614}
615
616fn validate_enum_value(
617 key: &str,
618 value: &str,
619 allowed: &[&str],
620 line: usize,
621 errors: &mut Vec<ParseError>,
622) {
623 if let Some(custom) = value.strip_prefix("custom:") {
625 if custom.is_empty() || custom.len() > 100 {
626 errors.push(ParseError {
627 line,
628 message: format!(
629 "field {key:?} custom value must be 1-100 chars, got {}",
630 custom.len()
631 ),
632 });
633 }
634 return;
635 }
636
637 let normalized = value.to_lowercase().replace(' ', "_");
638 if !allowed.contains(&normalized.as_str()) {
639 errors.push(ParseError {
640 line,
641 message: format!(
642 "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
643 allowed.join(", ")
644 ),
645 });
646 }
647}
648
649fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
650 let valid = matches!(value.len(), 4 | 7 | 10)
652 && value.chars().enumerate().all(|(i, c)| match i {
653 4 | 7 => c == '-',
654 _ => c.is_ascii_digit(),
655 });
656
657 if !valid {
658 errors.push(ParseError {
659 line,
660 message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
661 });
662 }
663}
664
// Unit tests for the entity parser: each test feeds a small section body to
// `parse_entities` and inspects the returned entities and collected errors.
#[cfg(test)]
mod tests {
    use super::*;

    // An actor with single-valued fields; the indented line after
    // `description` must be appended to it with a newline separator.
    #[test]
    fn parse_actor_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: British",
            "- occupation: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            " (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Actors, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Actor);
        assert_eq!(e.fields.len(), 5);

        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    // An institution: `type` expands to `institution_type`, inline
    // comma-separated aliases and nested-bullet urls both become lists.
    #[test]
    fn parse_institution_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: football_club",
            "- jurisdiction: England",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            " - https://www.arsenal.com",
            " - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Institutions, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Institution);

        let it = e.fields.iter().find(|(k, _)| k == "institution_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("football_club".into()))
        );

        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    // An event section yields `PublicRecord` entities and expands `type` to
    // `document_type`.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: termination",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::PublicRecord);
        let dt = e.fields.iter().find(|(k, _)| k == "document_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("termination".into()))
        );
    }

    // A key that is in no allow-list is reported.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // A key that is valid for institutions is unknown on an actor.
    #[test]
    fn reject_wrong_label_field() {
        let body = "### Test\n- institution_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // An occupation outside the known list, without `custom:`, is reported.
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- occupation: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("invalid occupation"))
        );
    }

    // The `custom:` prefix allows a value outside the known list.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- occupation: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    // Free-text dates are reported with the expected-format message.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    // All three supported precisions are accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::Actors, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    // A nested-list URL without the https scheme is reported.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // A thumbnail without the https scheme is reported.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // Two headings produce two entities, in source order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: Dutch",
            "",
            "### Bob",
            "- nationality: British",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    // `nationality` is limited to 100; a 201-character value is well past it.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::Actors, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    // Eleven inline aliases exceed the limit of ten.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::Actors, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }
}