1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Largest number of entities `parse_entities` returns without also reporting a
/// "too many entities" error.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Upper bound on an entity name's length, compared against `str::len` in `build_entity`.
const MAX_NAME_LEN: usize = 300;
10
/// Kind of entity produced by this parser; each variant corresponds to one `SectionKind`
/// handled by `Label::from_section`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Entity from a `SectionKind::People` section.
    Person,
    /// Entity from a `SectionKind::Organizations` section.
    Organization,
    /// Entity from a `SectionKind::Events` section.
    Event,
    /// Entity from a `SectionKind::Documents` section.
    Document,
    /// Entity from a `SectionKind::Assets` section.
    Asset,
}
20
21impl fmt::Display for Label {
22 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23 match self {
24 Self::Person => write!(f, "person"),
25 Self::Organization => write!(f, "organization"),
26 Self::Event => write!(f, "event"),
27 Self::Document => write!(f, "document"),
28 Self::Asset => write!(f, "asset"),
29 }
30 }
31}
32
33impl Label {
34 pub fn from_section(kind: SectionKind) -> Option<Self> {
35 match kind {
36 SectionKind::People => Some(Self::Person),
37 SectionKind::Organizations => Some(Self::Organization),
38 SectionKind::Events => Some(Self::Event),
39 SectionKind::Documents => Some(Self::Document),
40 SectionKind::Assets => Some(Self::Asset),
41 _ => None,
42 }
43 }
44}
45
/// One parsed entity: a `### Name` heading together with the fields listed under it.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Heading text after `### `, with surrounding whitespace trimmed.
    pub name: String,
    /// Entity kind, derived from the section (or supplied by the caller for entity files).
    pub label: Label,
    /// Parsed fields in source order as `(key, value)` pairs. The `id` field is removed
    /// from this list and `type` is renamed to the label-specific key before validation.
    pub fields: Vec<(String, FieldValue)>,
    /// Value of the entity's `id` field when it was a non-empty single value; entity files
    /// overwrite this with the caller-supplied id.
    pub id: Option<String>,
    /// Line number of the entity heading as computed by the parser.
    pub line: usize,
    /// Always created empty in this module; presumably filled in by a later stage (confirm).
    pub tags: Vec<String>,
}
59
/// Value of a single entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value; indented continuation lines are appended to it separated by `\n`.
    Single(String),
    /// A list built either from nested bullet items or from a comma-separated value of a
    /// list field (see `is_list_field`).
    List(Vec<String>),
}
66
67pub fn parse_entity_file_body(
72 name: &str,
73 body: &str,
74 label: Label,
75 id: Option<String>,
76 title_line: usize,
77 errors: &mut Vec<ParseError>,
78) -> Entity {
79 let section_kind = match label {
80 Label::Person => SectionKind::People,
81 Label::Organization => SectionKind::Organizations,
82 Label::Event => SectionKind::Events,
83 Label::Document => SectionKind::Documents,
84 Label::Asset => SectionKind::Assets,
85 };
86
87 let wrapped = format!("### {name}\n{body}");
89 let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
90
91 if let Some(mut entity) = entities.pop() {
92 entity.id = id;
93 entity.line = title_line;
94 entity
95 } else {
96 Entity {
97 name: name.to_string(),
98 label,
99 fields: Vec::new(),
100 id,
101 line: title_line,
102 tags: Vec::new(),
103 }
104 }
105}
106
/// Parses every `### Name` entity found in a section body.
///
/// Each heading starts a new entity; the bullet lines below it become its fields. Problems are
/// appended to `errors` and parsing continues, so the returned list may contain entities that
/// also produced errors.
///
/// * `body` - text of the section, without the section heading itself.
/// * `section_kind` - kind of the section; kinds without a [`Label`] yield an empty list.
/// * `section_start_line` - line number such that the first line of `body` is reported as
///   `section_start_line + 1`.
/// * `errors` - receives every problem found.
// One linear pass over the lines with shared mutable state; kept as a single function.
#[allow(clippy::too_many_lines)]
pub fn parse_entities(
    body: &str,
    section_kind: SectionKind,
    section_start_line: usize,
    errors: &mut Vec<ParseError>,
) -> Vec<Entity> {
    let Some(label) = Label::from_section(section_kind) else {
        return Vec::new();
    };

    let lines: Vec<&str> = body.lines().collect();
    let mut entities: Vec<Entity> = Vec::new();
    // State of the entity currently being collected.
    let mut current_name: Option<String> = None;
    let mut current_line: usize = 0;
    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
    // Set after a `- key:` bullet with an empty value; nested items accumulate below.
    let mut pending_list_key: Option<String> = None;
    let mut pending_list_items: Vec<String> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        let file_line = section_start_line + 1 + i;
        // A heading closes the previous entity (if any) and opens a new one.
        if let Some(name) = strip_h3(line) {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some(entity_name) = current_name.take() {
                let entity = build_entity(
                    entity_name,
                    label,
                    current_line,
                    &mut current_fields,
                    errors,
                );
                entities.push(entity);
            }

            current_name = Some(name.to_string());
            current_line = file_line;
            current_fields.clear();
            continue;
        }

        // Anything non-blank before the first heading belongs to no entity.
        if current_name.is_none() {
            if !line.trim().is_empty() {
                errors.push(ParseError {
                    line: file_line,
                    message: "content before first entity heading (### Name)".into(),
                });
            }
            continue;
        }

        let trimmed = line.trim();

        if let Some(item) = trimmed.strip_prefix("- ") {
            // A bullet whose raw line carries the nested-item prefix extends the pending list.
            if line.starts_with(" - ") && pending_list_key.is_some() {
                pending_list_items.push(item.trim().to_string());
                continue;
            }

            // Any other bullet ends the pending list before being parsed itself.
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some((key, value)) = parse_bullet(item) {
                if value.is_empty() {
                    // `- key:` with nothing after the colon opens a nested list.
                    pending_list_key = Some(key);
                    pending_list_items.clear();
                } else if is_list_field(&key) && value.contains(',') {
                    // List fields may be written inline as comma-separated values;
                    // empty pieces are dropped.
                    let items: Vec<String> = value
                        .split(',')
                        .map(|s| s.trim().to_string())
                        .filter(|s| !s.is_empty())
                        .collect();
                    current_fields.push((key, FieldValue::List(items)));
                } else {
                    current_fields.push((key, FieldValue::Single(value)));
                }
            } else {
                errors.push(ParseError {
                    line: file_line,
                    message: format!(
                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
                    ),
                });
            }
            continue;
        }

        // Indented non-bullet text continues the previous single-valued field.
        if line.starts_with(" ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
            if pending_list_key.is_some() {
                errors.push(ParseError {
                    line: file_line,
                    message: "unexpected indented text in list context".into(),
                });
            } else if let Some(last) = current_fields.last_mut() {
                // Continuation after a list field (or with no field yet) is dropped silently.
                if let FieldValue::Single(ref mut val) = last.1 {
                    val.push('\n');
                    val.push_str(trimmed);
                }
            }
            continue;
        }

        // Any other non-blank line ends a pending list; the text itself is not recorded.
        if !trimmed.is_empty() {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );
        }
    }

    // End of input: close the pending list and the last entity.
    flush_pending_list(
        &mut pending_list_key,
        &mut pending_list_items,
        &mut current_fields,
    );

    if let Some(entity_name) = current_name.take() {
        let entity = build_entity(
            entity_name,
            label,
            current_line,
            &mut current_fields,
            errors,
        );
        entities.push(entity);
    }

    // The limit is reported as an error; the entities are still all returned.
    if entities.len() > MAX_ENTITIES_PER_FILE {
        errors.push(ParseError {
            line: section_start_line,
            message: format!(
                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
                entities.len()
            ),
        });
    }

    entities
}
276
277fn flush_pending_list(
278 pending_key: &mut Option<String>,
279 pending_items: &mut Vec<String>,
280 fields: &mut Vec<(String, FieldValue)>,
281) {
282 if let Some(key) = pending_key.take() {
283 fields.push((key, FieldValue::List(std::mem::take(pending_items))));
284 }
285}
286
287fn build_entity(
288 name: String,
289 label: Label,
290 line: usize,
291 fields: &mut Vec<(String, FieldValue)>,
292 errors: &mut Vec<ParseError>,
293) -> Entity {
294 if name.trim().is_empty() {
296 errors.push(ParseError {
297 line,
298 message: "entity name must not be empty".into(),
299 });
300 } else if name.len() > MAX_NAME_LEN {
301 errors.push(ParseError {
302 line,
303 message: format!(
304 "entity name exceeds {MAX_NAME_LEN} chars (got {})",
305 name.len()
306 ),
307 });
308 }
309
310 let id = extract_id_field(fields);
312
313 apply_type_shorthand(fields, label);
315
316 normalize_enum_fields(fields);
318
319 validate_fields(fields, label, line, errors);
321
322 Entity {
323 name,
324 label,
325 fields: std::mem::take(fields),
326 id,
327 line,
328 tags: Vec::new(),
329 }
330}
331
332fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
334 let pos = fields.iter().position(|(k, _)| k == "id")?;
335 let (_, value) = fields.remove(pos);
336 match value {
337 FieldValue::Single(s) if !s.is_empty() => Some(s),
338 _ => None,
339 }
340}
341
342fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
344 for field in fields.iter_mut() {
345 if field.0 == "type" {
346 field.0 = match label {
347 Label::Organization => "org_type".to_string(),
348 Label::Event => "event_type".to_string(),
349 Label::Document => "doc_type".to_string(),
350 Label::Asset => "asset_type".to_string(),
351 Label::Person => "type".to_string(), };
353 }
354 }
355}
356
/// Splits the text of a bullet (without the leading `- `) into a trimmed `(key, value)` pair.
///
/// The split happens at the first `:`, so values may contain colons. Returns `None` when
/// there is no colon or the key is blank; an empty value is allowed.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    if key.is_empty() {
        return None;
    }
    Some((key.to_owned(), raw_value.trim().to_owned()))
}
367
/// Tells whether `key` names a field whose comma-separated value is split into a list.
fn is_list_field(key: &str) -> bool {
    const LIST_KEYS: [&str; 3] = ["aliases", "urls", "role"];
    LIST_KEYS.contains(&key)
}
372
/// Returns the trimmed heading text when `line` is a level-3 heading (`### text`).
///
/// Leading whitespace before the hashes is tolerated. Text that itself begins with `#`
/// directly after `### ` is rejected.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
384
/// Field keys accepted on every entity regardless of its label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional field keys accepted on `Label::Person` entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "place_of_birth",
    "status",
];

/// Additional field keys accepted on `Label::Organization` entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional field keys accepted on `Label::Event` entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional field keys accepted on `Label::Document` entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional field keys accepted on `Label::Asset` entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
419
420use crate::domain;
422
// Known enum values, taken from the `KNOWN` lists of the corresponding `domain` types.
// The three `*_STATUS_VALUES` lists are selected per label in `validate_status`.
const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
432
/// Per-key validation rule for single-valued fields, produced by `field_constraint`.
struct FieldConstraint {
    /// Maximum accepted length of the value, compared against `str::len`.
    max_len: usize,
    /// Allowed values when the field is an enumeration; `None` for free-form fields.
    enum_values: Option<&'static [&'static str]>,
}
439
440fn field_constraint(key: &str) -> Option<FieldConstraint> {
441 match key {
442 "description" => Some(FieldConstraint {
443 max_len: 2000,
444 enum_values: None,
445 }),
446 "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
447 max_len: 2048,
448 enum_values: None,
449 }),
450 "occurred_at" | "date_of_birth" | "founded_date" | "issued_at" | "opened_at"
451 | "closed_at" => Some(FieldConstraint {
452 max_len: 10,
453 enum_values: None,
454 }),
455 "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
456 Some(FieldConstraint {
457 max_len: 200,
458 enum_values: None,
459 })
460 }
461 "jurisdiction" => Some(FieldConstraint {
462 max_len: 203, enum_values: None,
465 }),
466 "role" => Some(FieldConstraint {
467 max_len: 100,
468 enum_values: Some(ROLE_VALUES),
469 }),
470 "org_type" => Some(FieldConstraint {
471 max_len: 100,
472 enum_values: Some(ORG_TYPE_VALUES),
473 }),
474 "event_type" => Some(FieldConstraint {
475 max_len: 100,
476 enum_values: Some(EVENT_TYPE_VALUES),
477 }),
478 "doc_type" => Some(FieldConstraint {
479 max_len: 100,
480 enum_values: Some(DOC_TYPE_VALUES),
481 }),
482 "asset_type" => Some(FieldConstraint {
483 max_len: 100,
484 enum_values: Some(ASSET_TYPE_VALUES),
485 }),
486 "severity" => Some(FieldConstraint {
487 max_len: 20,
488 enum_values: Some(SEVERITY_VALUES),
489 }),
490 "status" => Some(FieldConstraint {
491 max_len: 30,
494 enum_values: None,
495 }),
496 "qualifier" | "nationality" | "case_number" | "registration_number" => {
497 Some(FieldConstraint {
498 max_len: 100,
499 enum_values: None,
500 })
501 }
502 _ => None,
504 }
505}
506
/// Maximum number of items in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `str::len`.
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of items in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `str::len`.
const MAX_URL_LEN: usize = 2048;
512
513fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
517 for (key, value) in fields.iter_mut() {
518 let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
519
520 match value {
521 FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
522 let normalized = val.to_lowercase().replace(' ', "_");
523 if normalized != *val {
524 *val = normalized;
525 }
526 }
527 FieldValue::List(items) if is_enum => {
528 for item in items.iter_mut() {
529 if !item.starts_with("custom:") {
530 let normalized = item.to_lowercase().replace(' ', "_");
531 if normalized != *item {
532 *item = normalized;
533 }
534 }
535 }
536 }
537 _ => {}
538 }
539 }
540}
541
542#[allow(clippy::too_many_lines)]
543fn validate_fields(
544 fields: &[(String, FieldValue)],
545 label: Label,
546 line: usize,
547 errors: &mut Vec<ParseError>,
548) {
549 let label_fields: &[&str] = match label {
550 Label::Person => PERSON_FIELDS,
551 Label::Organization => ORGANIZATION_FIELDS,
552 Label::Event => EVENT_FIELDS,
553 Label::Document => DOCUMENT_FIELDS,
554 Label::Asset => ASSET_FIELDS,
555 };
556
557 for (key, value) in fields {
558 if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
560 errors.push(ParseError {
561 line,
562 message: format!("unknown field {key:?} for {label}"),
563 });
564 continue;
565 }
566
567 match value {
568 FieldValue::Single(val) => {
569 if let Some(constraint) = field_constraint(key) {
570 if val.len() > constraint.max_len {
571 errors.push(ParseError {
572 line,
573 message: format!(
574 "field {key:?} exceeds {} chars (got {})",
575 constraint.max_len,
576 val.len()
577 ),
578 });
579 }
580
581 if let Some(allowed) = constraint.enum_values {
583 validate_enum_value(key, val, allowed, line, errors);
584 }
585
586 if matches!(
588 key.as_str(),
589 "occurred_at"
590 | "date_of_birth"
591 | "founded_date"
592 | "issued_at"
593 | "opened_at"
594 | "closed_at"
595 ) && !val.is_empty()
596 {
597 validate_date_format(key, val, line, errors);
598 }
599
600 if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
602 && !val.is_empty()
603 && !val.starts_with("https://")
604 {
605 errors.push(ParseError {
606 line,
607 message: format!("field {key:?} must be HTTPS URL"),
608 });
609 }
610 }
611
612 if key == "status" {
614 validate_status(val, label, line, errors);
615 }
616
617 if key == "jurisdiction" && !val.is_empty() {
619 validate_jurisdiction(val, line, errors);
620 }
621
622 if key == "value" && !val.is_empty() {
624 validate_money(val, line, errors);
625 }
626 }
627 FieldValue::List(items) => match key.as_str() {
628 "aliases" => {
629 if items.len() > MAX_ALIASES {
630 errors.push(ParseError {
631 line,
632 message: format!(
633 "aliases exceeds {MAX_ALIASES} items (got {})",
634 items.len()
635 ),
636 });
637 }
638 for item in items {
639 if item.len() > MAX_ALIAS_LEN {
640 errors.push(ParseError {
641 line,
642 message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
643 });
644 }
645 }
646 }
647 "urls" => {
648 if items.len() > MAX_URLS {
649 errors.push(ParseError {
650 line,
651 message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
652 });
653 }
654 for item in items {
655 if item.len() > MAX_URL_LEN {
656 errors.push(ParseError {
657 line,
658 message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
659 });
660 }
661 if !item.starts_with("https://") {
662 errors.push(ParseError {
663 line,
664 message: format!("url must be HTTPS: {item:?}"),
665 });
666 }
667 }
668 }
669 "role" => {
670 if items.len() > MAX_ROLES {
671 errors.push(ParseError {
672 line,
673 message: format!(
674 "role exceeds {MAX_ROLES} items (got {})",
675 items.len()
676 ),
677 });
678 }
679 for item in items {
680 validate_enum_value("role", item, ROLE_VALUES, line, errors);
681 }
682 }
683 _ => {}
684 },
685 }
686 }
687
688 if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
690 errors.push(ParseError {
691 line,
692 message: "organization entity missing required field \"org_type\"".into(),
693 });
694 }
695}
696
/// Maximum number of items in a `role` list.
const MAX_ROLES: usize = 10;
699
700fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
702 let allowed: &[&str] = match label {
703 Label::Person => PERSON_STATUS_VALUES,
704 Label::Organization => ORG_STATUS_VALUES,
705 Label::Asset => ASSET_STATUS_VALUES,
706 _ => {
707 errors.push(ParseError {
708 line,
709 message: format!("field \"status\" is not valid for {label}"),
710 });
711 return;
712 }
713 };
714
715 let normalized = value.to_lowercase().replace(' ', "_");
716 if !allowed.contains(&normalized.as_str()) {
717 errors.push(ParseError {
718 line,
719 message: format!(
720 "invalid status {value:?} for {label} (known: {})",
721 allowed.join(", ")
722 ),
723 });
724 }
725}
726
727fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
729 if let Some(slash_pos) = value.find('/') {
730 let country = &value[..slash_pos];
731 let subdivision = &value[slash_pos + 1..];
732 if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
733 errors.push(ParseError {
734 line,
735 message: format!(
736 "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
737 ),
738 });
739 }
740 if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
741 errors.push(ParseError {
742 line,
743 message: format!(
744 "jurisdiction subdivision must be 1-{} chars",
745 domain::MAX_SUBDIVISION_LEN
746 ),
747 });
748 }
749 } else {
750 if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
752 errors.push(ParseError {
753 line,
754 message: format!(
755 "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
756 ),
757 });
758 }
759 }
760}
761
762fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
765 let parts: Vec<&str> = value.splitn(3, ' ').collect();
767 if parts.len() < 3 {
768 errors.push(ParseError {
769 line,
770 message: format!(
771 "invalid money format: expected `amount currency \"display\"`, got {value:?}"
772 ),
773 });
774 return;
775 }
776
777 if parts[0].parse::<i64>().is_err() {
779 errors.push(ParseError {
780 line,
781 message: format!("money amount must be an integer, got {:?}", parts[0]),
782 });
783 }
784
785 let currency = parts[1];
787 if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
788 errors.push(ParseError {
789 line,
790 message: format!(
791 "money currency must be 3-letter uppercase ISO code, got {currency:?}"
792 ),
793 });
794 }
795
796 let display = parts[2];
798 if !display.starts_with('"') || !display.ends_with('"') {
799 errors.push(ParseError {
800 line,
801 message: format!("money display must be quoted, got {display:?}"),
802 });
803 } else {
804 let inner = &display[1..display.len() - 1];
805 if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
806 errors.push(ParseError {
807 line,
808 message: format!(
809 "money display exceeds {} chars (got {})",
810 domain::MAX_MONEY_DISPLAY_LEN,
811 inner.len()
812 ),
813 });
814 }
815 }
816}
817
818fn validate_enum_value(
819 key: &str,
820 value: &str,
821 allowed: &[&str],
822 line: usize,
823 errors: &mut Vec<ParseError>,
824) {
825 if let Some(custom) = value.strip_prefix("custom:") {
827 if custom.is_empty() || custom.len() > 100 {
828 errors.push(ParseError {
829 line,
830 message: format!(
831 "field {key:?} custom value must be 1-100 chars, got {}",
832 custom.len()
833 ),
834 });
835 }
836 return;
837 }
838
839 let normalized = value.to_lowercase().replace(' ', "_");
840 if !allowed.contains(&normalized.as_str()) {
841 errors.push(ParseError {
842 line,
843 message: format!(
844 "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
845 allowed.join(", ")
846 ),
847 });
848 }
849}
850
851fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
852 let valid = matches!(value.len(), 4 | 7 | 10)
854 && value.chars().enumerate().all(|(i, c)| match i {
855 4 | 7 => c == '-',
856 _ => c.is_ascii_digit(),
857 });
858
859 if !valid {
860 errors.push(ParseError {
861 line,
862 message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
863 });
864 }
865}
866
// Unit tests driving `parse_entities` end to end: one group per entity kind, followed by
// validation failures (unknown fields, enums, dates, URLs, limits, required fields).
#[cfg(test)]
mod tests {
    use super::*;

    // A person with single-valued fields and a description continued on an indented line.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            " (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // The continuation line is joined to the description with a newline.
        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    // A comma-separated `role` value becomes a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    // A known person status is accepted.
    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    // An unknown person status is reported.
    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid status")));
    }

    // `type` is expanded to `org_type`; inline and nested list syntaxes both produce lists.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            " - https://www.arsenal.com",
            " - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // Shorthand `type` was renamed.
        let it = e.fields.iter().find(|(k, _)| k == "org_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // Inline comma-separated list.
        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // Nested bullet list.
        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    // `CC/Subdivision` jurisdictions are accepted and stored verbatim.
    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let j = entities[0].fields.iter().find(|(k, _)| k == "jurisdiction");
        assert_eq!(
            j.map(|(_, v)| v),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    // A jurisdiction that is not a two-letter code is reported.
    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("jurisdiction")));
    }

    // `type` is expanded to `event_type` for events.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        let dt = e.fields.iter().find(|(k, _)| k == "event_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    // An event with a severity and a full date parses without errors.
    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    // A document with all document-specific fields.
    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    // An asset with a well-formed money value and a known status.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    // A free-text `value` is not a valid money value.
    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("money")));
    }

    // A key that no label knows is reported.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // A key belonging to another label (org_type on a person) is reported as unknown.
    #[test]
    fn reject_wrong_label_field() {
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // An enum value outside the known list is reported.
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid role")));
    }

    // `custom:` values bypass the known list.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    // A single enum value is stored in canonical form.
    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let val = entities[0]
            .fields
            .iter()
            .find(|(k, _)| k == "role")
            .map(|(_, v)| match v {
                FieldValue::Single(s) => s.as_str(),
                _ => "",
            });
        assert_eq!(val, Some("civil_servant"));
    }

    // Every item of an enum list is stored in canonical form.
    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    // A date that is not in one of the numeric layouts is reported.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    // All three date precisions are accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    // A nested URL item must be HTTPS.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // A thumbnail must be HTTPS.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // Consecutive headings produce separate entities in source order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    // A value longer than the field's limit (100 for nationality) is reported.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    // Eleven aliases exceed the limit of ten.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }

    // Organizations without `org_type` are reported.
    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| { e.message.contains("missing required field \"org_type\"") })
        );
    }

    // Organizations with `org_type` pass.
    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}