1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Number of entities `parse_entities` accepts from one body before it reports
/// a "too many entities" error (the entities are still returned).
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Maximum entity name length, compared against `String::len` (bytes) in
/// `build_entity`.
const MAX_NAME_LEN: usize = 300;
10
/// Kind of entity produced by this parser.
///
/// Each variant corresponds to one `SectionKind` (see `Label::from_section`)
/// and selects which label-specific fields are accepted during validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Entity declared in a people section.
    Person,
    /// Entity declared in an organizations section.
    Organization,
    /// Entity declared in an events section.
    Event,
    /// Entity declared in a documents section.
    Document,
    /// Entity declared in an assets section.
    Asset,
}
20
21impl fmt::Display for Label {
22 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23 match self {
24 Self::Person => write!(f, "person"),
25 Self::Organization => write!(f, "organization"),
26 Self::Event => write!(f, "event"),
27 Self::Document => write!(f, "document"),
28 Self::Asset => write!(f, "asset"),
29 }
30 }
31}
32
33impl Label {
34 pub fn from_section(kind: SectionKind) -> Option<Self> {
35 match kind {
36 SectionKind::People => Some(Self::Person),
37 SectionKind::Organizations => Some(Self::Organization),
38 SectionKind::Events => Some(Self::Event),
39 SectionKind::Documents => Some(Self::Document),
40 SectionKind::Assets => Some(Self::Asset),
41 _ => None,
42 }
43 }
44}
45
/// One parsed entity: a `### Name` heading plus the bullet fields below it.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Entity name taken from the heading text.
    pub name: String,
    /// Kind of entity, derived from the section the entity was parsed from.
    pub label: Label,
    /// Remaining fields in source order, after the `id` field has been removed
    /// and the `type` shorthand has been expanded (see `build_entity`).
    pub fields: Vec<(String, FieldValue)>,
    /// Value of the entity's `id` field when present and non-empty, or the id
    /// supplied by the caller of `parse_entity_file_body`.
    pub id: Option<String>,
    /// Line number of the entity heading.
    pub line: usize,
    /// Always empty when built by this module.
    /// NOTE(review): presumably populated by a later stage; confirm.
    pub tags: Vec<String>,
    /// Always `None` when built by this module.
    /// NOTE(review): presumably populated by a later stage; confirm.
    pub slug: Option<String>,
}
62
/// Value of one entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value written inline (`- key: value`); indented continuation
    /// lines are appended to it separated by `\n`.
    Single(String),
    /// A list value, written either as nested `- item` bullets under an empty
    /// `- key:` line or as a comma-separated inline value for list fields.
    List(Vec<String>),
}
69
70pub fn parse_entity_file_body(
75 name: &str,
76 body: &str,
77 label: Label,
78 id: Option<String>,
79 title_line: usize,
80 errors: &mut Vec<ParseError>,
81) -> Entity {
82 let section_kind = match label {
83 Label::Person => SectionKind::People,
84 Label::Organization => SectionKind::Organizations,
85 Label::Event => SectionKind::Events,
86 Label::Document => SectionKind::Documents,
87 Label::Asset => SectionKind::Assets,
88 };
89
90 let wrapped = format!("### {name}\n{body}");
92 let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
93
94 if let Some(mut entity) = entities.pop() {
95 entity.id = id;
96 entity.line = title_line;
97 entity
98 } else {
99 Entity {
100 name: name.to_string(),
101 label,
102 fields: Vec::new(),
103 id,
104 line: title_line,
105 tags: Vec::new(),
106 slug: None,
107 }
108 }
109}
110
/// Parses every `### Name` entity block in a section body.
///
/// * `body` – text of the section, without the section heading itself.
/// * `section_kind` – kind of the section; if it has no entity label the
///   function returns an empty vector without reporting anything.
/// * `section_start_line` – line number preceding the first line of `body`;
///   the first body line is numbered `section_start_line + 1`.
/// * `errors` – receives syntax errors found here and validation errors
///   produced while each entity is built.
///
/// Returns the entities in source order. Exceeding `MAX_ENTITIES_PER_FILE`
/// is reported as an error, but all parsed entities are still returned.
// The parser is a single line-by-line state machine, so it is kept in one function.
#[allow(clippy::too_many_lines)]
pub fn parse_entities(
    body: &str,
    section_kind: SectionKind,
    section_start_line: usize,
    errors: &mut Vec<ParseError>,
) -> Vec<Entity> {
    let Some(label) = Label::from_section(section_kind) else {
        return Vec::new();
    };

    let lines: Vec<&str> = body.lines().collect();
    let mut entities: Vec<Entity> = Vec::new();
    // State of the entity currently being collected.
    let mut current_name: Option<String> = None;
    let mut current_line: usize = 0;
    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
    // Set after a `- key:` bullet with an empty value; nested bullets are
    // collected into `pending_list_items` until the list is flushed.
    let mut pending_list_key: Option<String> = None;
    let mut pending_list_items: Vec<String> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        let file_line = section_start_line + 1 + i;

        // A new heading closes the entity collected so far and starts the next.
        if let Some(name) = strip_h3(line) {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some(entity_name) = current_name.take() {
                let entity = build_entity(
                    entity_name,
                    label,
                    current_line,
                    &mut current_fields,
                    errors,
                );
                entities.push(entity);
            }

            current_name = Some(name.to_string());
            current_line = file_line;
            current_fields.clear();
            continue;
        }

        // Blank lines before the first heading are fine; anything else is an error.
        if current_name.is_none() {
            if !line.trim().is_empty() {
                errors.push(ParseError {
                    line: file_line,
                    message: "content before first entity heading (### Name)".into(),
                });
            }
            continue;
        }

        let trimmed = line.trim();

        if let Some(item) = trimmed.strip_prefix("- ") {
            // An indented bullet while a list is open is a list item, taken
            // verbatim (it is not parsed as `key: value`).
            if line.starts_with(" - ") && pending_list_key.is_some() {
                pending_list_items.push(item.trim().to_string());
                continue;
            }

            // Any other bullet ends the open list, if there is one.
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some((key, value)) = parse_bullet(item) {
                if value.is_empty() {
                    // `- key:` with nothing after the colon opens a nested list.
                    pending_list_key = Some(key);
                    pending_list_items.clear();
                } else if is_list_field(&key) && value.contains(',') {
                    // Inline comma-separated list; empty entries are dropped.
                    let items: Vec<String> = value
                        .split(',')
                        .map(|s| s.trim().to_string())
                        .filter(|s| !s.is_empty())
                        .collect();
                    current_fields.push((key, FieldValue::List(items)));
                } else {
                    current_fields.push((key, FieldValue::Single(value)));
                }
            } else {
                errors.push(ParseError {
                    line: file_line,
                    message: format!(
                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
                    ),
                });
            }
            continue;
        }

        // Indented non-bullet text continues the previous single-valued field.
        if line.starts_with(" ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
            if pending_list_key.is_some() {
                errors.push(ParseError {
                    line: file_line,
                    message: "unexpected indented text in list context".into(),
                });
            } else if let Some(last) = current_fields.last_mut() {
                // Continuation text after a list field, or before any field,
                // is dropped without an error.
                if let FieldValue::Single(ref mut val) = last.1 {
                    val.push('\n');
                    val.push_str(trimmed);
                }
            }
            continue;
        }

        // Any other non-blank line ends an open list; the line itself is ignored.
        if !trimmed.is_empty() {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );
        }
    }

    // End of input: close the open list and the last entity.
    flush_pending_list(
        &mut pending_list_key,
        &mut pending_list_items,
        &mut current_fields,
    );

    if let Some(entity_name) = current_name.take() {
        let entity = build_entity(
            entity_name,
            label,
            current_line,
            &mut current_fields,
            errors,
        );
        entities.push(entity);
    }

    if entities.len() > MAX_ENTITIES_PER_FILE {
        errors.push(ParseError {
            line: section_start_line,
            message: format!(
                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
                entities.len()
            ),
        });
    }

    entities
}
280
281fn flush_pending_list(
282 pending_key: &mut Option<String>,
283 pending_items: &mut Vec<String>,
284 fields: &mut Vec<(String, FieldValue)>,
285) {
286 if let Some(key) = pending_key.take() {
287 fields.push((key, FieldValue::List(std::mem::take(pending_items))));
288 }
289}
290
291fn build_entity(
292 name: String,
293 label: Label,
294 line: usize,
295 fields: &mut Vec<(String, FieldValue)>,
296 errors: &mut Vec<ParseError>,
297) -> Entity {
298 if name.trim().is_empty() {
300 errors.push(ParseError {
301 line,
302 message: "entity name must not be empty".into(),
303 });
304 } else if name.len() > MAX_NAME_LEN {
305 errors.push(ParseError {
306 line,
307 message: format!(
308 "entity name exceeds {MAX_NAME_LEN} chars (got {})",
309 name.len()
310 ),
311 });
312 }
313
314 let id = extract_id_field(fields);
316
317 apply_type_shorthand(fields, label);
319
320 normalize_enum_fields(fields);
322
323 validate_fields(fields, label, line, errors);
325
326 Entity {
327 name,
328 label,
329 fields: std::mem::take(fields),
330 id,
331 line,
332 tags: Vec::new(),
333 slug: None,
334 }
335}
336
337fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
339 let pos = fields.iter().position(|(k, _)| k == "id")?;
340 let (_, value) = fields.remove(pos);
341 match value {
342 FieldValue::Single(s) if !s.is_empty() => Some(s),
343 _ => None,
344 }
345}
346
347fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
349 for field in fields.iter_mut() {
350 if field.0 == "type" {
351 field.0 = match label {
352 Label::Organization => "org_type".to_string(),
353 Label::Event => "event_type".to_string(),
354 Label::Document => "doc_type".to_string(),
355 Label::Asset => "asset_type".to_string(),
356 Label::Person => "type".to_string(), };
358 }
359 }
360}
361
/// Splits the text of a bullet (without the leading `- `) into `(key, value)`.
///
/// The split happens at the first `:`; both sides are trimmed. Returns `None`
/// when there is no colon or when the key is empty. The value may be empty.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    match raw_key.trim() {
        "" => None,
        key => Some((key.to_string(), raw_value.trim().to_string())),
    }
}
372
/// Reports whether `key` is a field whose inline value is split on commas
/// into a list (`aliases`, `urls` and `role`).
fn is_list_field(key: &str) -> bool {
    ["aliases", "urls", "role"].contains(&key)
}
377
/// Returns the trimmed heading text when `line` is a level-3 heading.
///
/// Leading whitespace is ignored. The line must start with `### ` (three
/// hashes and a space) and the text after it must not start with another `#`.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|heading| !heading.starts_with('#'))
        .map(str::trim)
}
389
/// Field keys accepted on every entity, whatever its label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional field keys accepted on `Label::Person` entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "place_of_birth",
    "status",
];

/// Additional field keys accepted on `Label::Organization` entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional field keys accepted on `Label::Event` entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional field keys accepted on `Label::Document` entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional field keys accepted on `Label::Asset` entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
424
425use crate::domain;
427
// Known enum values, taken from the `KNOWN` tables of the `domain` module.
// The `*_STATUS_VALUES` tables are selected per label in `validate_status`.
const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
437
/// Validation rules for one single-valued field, as returned by
/// `field_constraint`.
struct FieldConstraint {
    /// Maximum value length, compared against `str::len` (bytes).
    max_len: usize,
    /// Known values for enum-like fields; `None` for free-text fields.
    enum_values: Option<&'static [&'static str]>,
}
444
445fn field_constraint(key: &str) -> Option<FieldConstraint> {
446 match key {
447 "description" => Some(FieldConstraint {
448 max_len: 2000,
449 enum_values: None,
450 }),
451 "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
452 max_len: 2048,
453 enum_values: None,
454 }),
455 "occurred_at" | "date_of_birth" | "founded_date" | "issued_at" | "opened_at"
456 | "closed_at" => Some(FieldConstraint {
457 max_len: 10,
458 enum_values: None,
459 }),
460 "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
461 Some(FieldConstraint {
462 max_len: 200,
463 enum_values: None,
464 })
465 }
466 "jurisdiction" => Some(FieldConstraint {
467 max_len: 203, enum_values: None,
470 }),
471 "role" => Some(FieldConstraint {
472 max_len: 100,
473 enum_values: Some(ROLE_VALUES),
474 }),
475 "org_type" => Some(FieldConstraint {
476 max_len: 100,
477 enum_values: Some(ORG_TYPE_VALUES),
478 }),
479 "event_type" => Some(FieldConstraint {
480 max_len: 100,
481 enum_values: Some(EVENT_TYPE_VALUES),
482 }),
483 "doc_type" => Some(FieldConstraint {
484 max_len: 100,
485 enum_values: Some(DOC_TYPE_VALUES),
486 }),
487 "asset_type" => Some(FieldConstraint {
488 max_len: 100,
489 enum_values: Some(ASSET_TYPE_VALUES),
490 }),
491 "severity" => Some(FieldConstraint {
492 max_len: 20,
493 enum_values: Some(SEVERITY_VALUES),
494 }),
495 "status" => Some(FieldConstraint {
496 max_len: 30,
499 enum_values: None,
500 }),
501 "qualifier" | "nationality" | "case_number" | "registration_number" => {
502 Some(FieldConstraint {
503 max_len: 100,
504 enum_values: None,
505 })
506 }
507 _ => None,
509 }
510}
511
/// Maximum number of items in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `String::len` (bytes).
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of items in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `String::len` (bytes).
const MAX_URL_LEN: usize = 2048;
517
518fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
522 for (key, value) in fields.iter_mut() {
523 let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
524
525 match value {
526 FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
527 let normalized = val.to_lowercase().replace(' ', "_");
528 if normalized != *val {
529 *val = normalized;
530 }
531 }
532 FieldValue::List(items) if is_enum => {
533 for item in items.iter_mut() {
534 if !item.starts_with("custom:") {
535 let normalized = item.to_lowercase().replace(' ', "_");
536 if normalized != *item {
537 *item = normalized;
538 }
539 }
540 }
541 }
542 _ => {}
543 }
544 }
545}
546
547#[allow(clippy::too_many_lines)]
548fn validate_fields(
549 fields: &[(String, FieldValue)],
550 label: Label,
551 line: usize,
552 errors: &mut Vec<ParseError>,
553) {
554 let label_fields: &[&str] = match label {
555 Label::Person => PERSON_FIELDS,
556 Label::Organization => ORGANIZATION_FIELDS,
557 Label::Event => EVENT_FIELDS,
558 Label::Document => DOCUMENT_FIELDS,
559 Label::Asset => ASSET_FIELDS,
560 };
561
562 for (key, value) in fields {
563 if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
565 errors.push(ParseError {
566 line,
567 message: format!("unknown field {key:?} for {label}"),
568 });
569 continue;
570 }
571
572 match value {
573 FieldValue::Single(val) => {
574 if let Some(constraint) = field_constraint(key) {
575 if val.len() > constraint.max_len {
576 errors.push(ParseError {
577 line,
578 message: format!(
579 "field {key:?} exceeds {} chars (got {})",
580 constraint.max_len,
581 val.len()
582 ),
583 });
584 }
585
586 if let Some(allowed) = constraint.enum_values {
588 validate_enum_value(key, val, allowed, line, errors);
589 }
590
591 if matches!(
593 key.as_str(),
594 "occurred_at"
595 | "date_of_birth"
596 | "founded_date"
597 | "issued_at"
598 | "opened_at"
599 | "closed_at"
600 ) && !val.is_empty()
601 {
602 validate_date_format(key, val, line, errors);
603 }
604
605 if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
607 && !val.is_empty()
608 && !val.starts_with("https://")
609 {
610 errors.push(ParseError {
611 line,
612 message: format!("field {key:?} must be HTTPS URL"),
613 });
614 }
615 }
616
617 if key == "status" {
619 validate_status(val, label, line, errors);
620 }
621
622 if key == "jurisdiction" && !val.is_empty() {
624 validate_jurisdiction(val, line, errors);
625 }
626
627 if key == "value" && !val.is_empty() {
629 validate_money(val, line, errors);
630 }
631 }
632 FieldValue::List(items) => match key.as_str() {
633 "aliases" => {
634 if items.len() > MAX_ALIASES {
635 errors.push(ParseError {
636 line,
637 message: format!(
638 "aliases exceeds {MAX_ALIASES} items (got {})",
639 items.len()
640 ),
641 });
642 }
643 for item in items {
644 if item.len() > MAX_ALIAS_LEN {
645 errors.push(ParseError {
646 line,
647 message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
648 });
649 }
650 }
651 }
652 "urls" => {
653 if items.len() > MAX_URLS {
654 errors.push(ParseError {
655 line,
656 message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
657 });
658 }
659 for item in items {
660 if item.len() > MAX_URL_LEN {
661 errors.push(ParseError {
662 line,
663 message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
664 });
665 }
666 if !item.starts_with("https://") {
667 errors.push(ParseError {
668 line,
669 message: format!("url must be HTTPS: {item:?}"),
670 });
671 }
672 }
673 }
674 "role" => {
675 if items.len() > MAX_ROLES {
676 errors.push(ParseError {
677 line,
678 message: format!(
679 "role exceeds {MAX_ROLES} items (got {})",
680 items.len()
681 ),
682 });
683 }
684 for item in items {
685 validate_enum_value("role", item, ROLE_VALUES, line, errors);
686 }
687 }
688 _ => {}
689 },
690 }
691 }
692
693 if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
695 errors.push(ParseError {
696 line,
697 message: "organization entity missing required field \"org_type\"".into(),
698 });
699 }
700}
701
/// Maximum number of items in a `role` list.
const MAX_ROLES: usize = 10;
704
705fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
707 let allowed: &[&str] = match label {
708 Label::Person => PERSON_STATUS_VALUES,
709 Label::Organization => ORG_STATUS_VALUES,
710 Label::Asset => ASSET_STATUS_VALUES,
711 _ => {
712 errors.push(ParseError {
713 line,
714 message: format!("field \"status\" is not valid for {label}"),
715 });
716 return;
717 }
718 };
719
720 let normalized = value.to_lowercase().replace(' ', "_");
721 if !allowed.contains(&normalized.as_str()) {
722 errors.push(ParseError {
723 line,
724 message: format!(
725 "invalid status {value:?} for {label} (known: {})",
726 allowed.join(", ")
727 ),
728 });
729 }
730}
731
732fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
734 if let Some(slash_pos) = value.find('/') {
735 let country = &value[..slash_pos];
736 let subdivision = &value[slash_pos + 1..];
737 if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
738 errors.push(ParseError {
739 line,
740 message: format!(
741 "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
742 ),
743 });
744 }
745 if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
746 errors.push(ParseError {
747 line,
748 message: format!(
749 "jurisdiction subdivision must be 1-{} chars",
750 domain::MAX_SUBDIVISION_LEN
751 ),
752 });
753 }
754 } else {
755 if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
757 errors.push(ParseError {
758 line,
759 message: format!(
760 "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
761 ),
762 });
763 }
764 }
765}
766
767fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
770 let parts: Vec<&str> = value.splitn(3, ' ').collect();
772 if parts.len() < 3 {
773 errors.push(ParseError {
774 line,
775 message: format!(
776 "invalid money format: expected `amount currency \"display\"`, got {value:?}"
777 ),
778 });
779 return;
780 }
781
782 if parts[0].parse::<i64>().is_err() {
784 errors.push(ParseError {
785 line,
786 message: format!("money amount must be an integer, got {:?}", parts[0]),
787 });
788 }
789
790 let currency = parts[1];
792 if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
793 errors.push(ParseError {
794 line,
795 message: format!(
796 "money currency must be 3-letter uppercase ISO code, got {currency:?}"
797 ),
798 });
799 }
800
801 let display = parts[2];
803 if !display.starts_with('"') || !display.ends_with('"') {
804 errors.push(ParseError {
805 line,
806 message: format!("money display must be quoted, got {display:?}"),
807 });
808 } else {
809 let inner = &display[1..display.len() - 1];
810 if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
811 errors.push(ParseError {
812 line,
813 message: format!(
814 "money display exceeds {} chars (got {})",
815 domain::MAX_MONEY_DISPLAY_LEN,
816 inner.len()
817 ),
818 });
819 }
820 }
821}
822
823fn validate_enum_value(
824 key: &str,
825 value: &str,
826 allowed: &[&str],
827 line: usize,
828 errors: &mut Vec<ParseError>,
829) {
830 if let Some(custom) = value.strip_prefix("custom:") {
832 if custom.is_empty() || custom.len() > 100 {
833 errors.push(ParseError {
834 line,
835 message: format!(
836 "field {key:?} custom value must be 1-100 chars, got {}",
837 custom.len()
838 ),
839 });
840 }
841 return;
842 }
843
844 let normalized = value.to_lowercase().replace(' ', "_");
845 if !allowed.contains(&normalized.as_str()) {
846 errors.push(ParseError {
847 line,
848 message: format!(
849 "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
850 allowed.join(", ")
851 ),
852 });
853 }
854}
855
856fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
857 let valid = matches!(value.len(), 4 | 7 | 10)
859 && value.chars().enumerate().all(|(i, c)| match i {
860 4 | 7 => c == '-',
861 _ => c.is_ascii_digit(),
862 });
863
864 if !valid {
865 errors.push(ParseError {
866 line,
867 message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
868 });
869 }
870}
871
// Unit tests for the entity parser. Every test feeds a small markdown body to
// `parse_entities` and inspects the returned entities and collected errors.
#[cfg(test)]
mod tests {
    use super::*;

    // Full person entity, including a description with a continuation line.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            " (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // The continuation line is joined to the first line with a newline.
        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    // An inline comma-separated `role` becomes a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid status")));
    }

    // `type` is expanded to `org_type`; covers inline and nested lists.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            " - https://www.arsenal.com",
            " - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // The shorthand key was renamed.
        let it = e.fields.iter().find(|(k, _)| k == "org_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // Inline comma-separated list.
        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // Nested bullet list.
        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let j = entities[0].fields.iter().find(|(k, _)| k == "jurisdiction");
        assert_eq!(
            j.map(|(_, v)| v),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("jurisdiction")));
    }

    // `type` is expanded to `event_type` for events.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        let dt = e.fields.iter().find(|(k, _)| k == "event_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    // Asset with a well-formed money value.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("money")));
    }

    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // A field that is valid for another label is unknown for persons.
    #[test]
    fn reject_wrong_label_field() {
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid role")));
    }

    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let val = entities[0]
            .fields
            .iter()
            .find(|(k, _)| k == "role")
            .map(|(_, v)| match v {
                FieldValue::Single(s) => s.as_str(),
                _ => "",
            });
        assert_eq!(val, Some("civil_servant"));
    }

    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    // All three supported date precisions are accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    // `nationality` is limited to 100 characters.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    // Eleven aliases exceed the limit of ten.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }

    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| { e.message.contains("missing required field \"org_type\"") })
        );
    }

    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}