1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Maximum number of entities one `parse_entities` call may produce before a
/// "too many entities in section" error is recorded. The entities themselves
/// are still returned; only an error is added.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Maximum entity name length. `build_entity` compares this against
/// `String::len`, i.e. the UTF-8 byte length of the name.
const MAX_NAME_LEN: usize = 300;
10
/// Kind of entity described by a `### Name` heading.
///
/// Each variant is produced from one `SectionKind` by `Label::from_section`
/// and is rendered by the `Display` impl as a lowercase singular noun.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Displayed as `person`; produced from `SectionKind::People`.
    Person,
    /// Displayed as `organization`; produced from `SectionKind::Organizations`.
    Organization,
    /// Displayed as `event`; produced from `SectionKind::Events`.
    Event,
    /// Displayed as `document`; produced from `SectionKind::Documents`.
    Document,
    /// Displayed as `asset`; produced from `SectionKind::Assets`.
    Asset,
}
20
21impl fmt::Display for Label {
22 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23 match self {
24 Self::Person => write!(f, "person"),
25 Self::Organization => write!(f, "organization"),
26 Self::Event => write!(f, "event"),
27 Self::Document => write!(f, "document"),
28 Self::Asset => write!(f, "asset"),
29 }
30 }
31}
32
33impl Label {
34 pub fn from_section(kind: SectionKind) -> Option<Self> {
35 match kind {
36 SectionKind::People => Some(Self::Person),
37 SectionKind::Organizations => Some(Self::Organization),
38 SectionKind::Events => Some(Self::Event),
39 SectionKind::Documents => Some(Self::Document),
40 SectionKind::Assets => Some(Self::Asset),
41 _ => None,
42 }
43 }
44}
45
/// One parsed entity: a `### Name` heading together with the `- key: value`
/// fields listed beneath it.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Heading text following `### `, with surrounding whitespace trimmed.
    pub name: String,
    /// Entity kind, derived from the section kind being parsed.
    pub label: Label,
    /// Fields in source order. The `id` field has been removed and the
    /// `type` shorthand has already been renamed (see `build_entity`).
    pub fields: Vec<(String, FieldValue)>,
    /// Non-empty single value of the `id` field, if any.
    /// `parse_entity_file_body` overwrites this with the id passed by its caller.
    pub id: Option<String>,
    /// Line number recorded for the entity heading.
    pub line: usize,
    /// Always created empty in this module; presumably filled in by a later
    /// stage (TODO confirm).
    pub tags: Vec<String>,
    /// Always created as `None` in this module; presumably filled in by a
    /// later stage (TODO confirm).
    pub slug: Option<String>,
}
62
/// Value of one entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value. Indented continuation lines are appended to it,
    /// separated by `\n`.
    Single(String),
    /// A list, built either from a comma-separated value of a list field or
    /// from nested `- item` lines following a `- key:` bullet with no value.
    List(Vec<String>),
}
69
70pub fn parse_entity_file_body(
75 name: &str,
76 body: &str,
77 label: Label,
78 id: Option<String>,
79 title_line: usize,
80 errors: &mut Vec<ParseError>,
81) -> Entity {
82 let section_kind = match label {
83 Label::Person => SectionKind::People,
84 Label::Organization => SectionKind::Organizations,
85 Label::Event => SectionKind::Events,
86 Label::Document => SectionKind::Documents,
87 Label::Asset => SectionKind::Assets,
88 };
89
90 let wrapped = format!("### {name}\n{body}");
92 let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
93
94 if let Some(mut entity) = entities.pop() {
95 entity.id = id;
96 entity.line = title_line;
97 entity
98 } else {
99 Entity {
100 name: name.to_string(),
101 label,
102 fields: Vec::new(),
103 id,
104 line: title_line,
105 tags: Vec::new(),
106 slug: None,
107 }
108 }
109}
110
/// Parses every `### Name` entity found in `body`.
///
/// * `body` - text of one section, without the section heading.
/// * `section_kind` - kind of the section; decides the label of each entity.
/// * `section_start_line` - line offset; line `i` (0-based) of `body` is
///   reported as `section_start_line + 1 + i`.
/// * `errors` - sink for syntax and validation problems.
///
/// Returns the entities in source order. Returns an empty vector without
/// reading `body` when `section_kind` has no corresponding [`Label`].
#[allow(clippy::too_many_lines)]
pub fn parse_entities(
    body: &str,
    section_kind: SectionKind,
    section_start_line: usize,
    errors: &mut Vec<ParseError>,
) -> Vec<Entity> {
    let Some(label) = Label::from_section(section_kind) else {
        return Vec::new();
    };

    let lines: Vec<&str> = body.lines().collect();
    let mut entities: Vec<Entity> = Vec::new();
    // State of the entity currently being collected.
    let mut current_name: Option<String> = None;
    let mut current_line: usize = 0;
    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
    // Key of a `- key:` bullet with no value, waiting for nested `- item` lines.
    let mut pending_list_key: Option<String> = None;
    let mut pending_list_items: Vec<String> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        let file_line = section_start_line + 1 + i;

        // A new heading closes the previous entity (if any) and opens the next.
        if let Some(name) = strip_h3(line) {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some(entity_name) = current_name.take() {
                let entity = build_entity(
                    entity_name,
                    label,
                    current_line,
                    &mut current_fields,
                    errors,
                );
                entities.push(entity);
            }

            current_name = Some(name.to_string());
            current_line = file_line;
            current_fields.clear();
            continue;
        }

        // Anything non-blank before the first heading is reported and skipped.
        if current_name.is_none() {
            if !line.trim().is_empty() {
                errors.push(ParseError {
                    line: file_line,
                    message: "content before first entity heading (### Name)".into(),
                });
            }
            continue;
        }

        let trimmed = line.trim();

        if let Some(item) = trimmed.strip_prefix("- ") {
            // A bullet whose raw line starts with the literal ` - ` is a
            // nested item of the list that is currently pending.
            if line.starts_with(" - ") && pending_list_key.is_some() {
                pending_list_items.push(item.trim().to_string());
                continue;
            }

            // Any other bullet ends the pending list.
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some((key, value)) = parse_bullet(item) {
                if value.is_empty() {
                    // `- key:` with no value starts a nested list.
                    pending_list_key = Some(key);
                    pending_list_items.clear();
                } else if is_list_field(&key) && value.contains(',') {
                    // Inline comma-separated list; empty pieces are dropped.
                    let items: Vec<String> = value
                        .split(',')
                        .map(|s| s.trim().to_string())
                        .filter(|s| !s.is_empty())
                        .collect();
                    current_fields.push((key, FieldValue::List(items)));
                } else {
                    current_fields.push((key, FieldValue::Single(value)));
                }
            } else {
                errors.push(ParseError {
                    line: file_line,
                    message: format!(
                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
                    ),
                });
            }
            continue;
        }

        // Indented, non-bullet text continues the most recent field.
        if line.starts_with(" ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
            if pending_list_key.is_some() {
                errors.push(ParseError {
                    line: file_line,
                    message: "unexpected indented text in list context".into(),
                });
            } else if let Some(last) = current_fields.last_mut() {
                match last.1 {
                    FieldValue::Single(ref mut val) => {
                        val.push('\n');
                        val.push_str(trimmed);
                    }
                    FieldValue::List(ref mut items) => {
                        // The continuation is joined to the last item with a
                        // space and the result is split on commas again, so
                        // an item may wrap across lines.
                        let tail = items.pop().unwrap_or_default();
                        let joined = if tail.is_empty() {
                            trimmed.to_string()
                        } else {
                            format!("{tail} {trimmed}")
                        };
                        for part in joined.split(',') {
                            let part = part.trim().to_string();
                            if !part.is_empty() {
                                items.push(part);
                            }
                        }
                    }
                }
            }
            continue;
        }

        // Any other non-blank line ends a pending list; the line itself is
        // otherwise ignored.
        if !trimmed.is_empty() {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );
        }
    }

    // End of input: close the pending list and the last open entity.
    flush_pending_list(
        &mut pending_list_key,
        &mut pending_list_items,
        &mut current_fields,
    );

    if let Some(entity_name) = current_name.take() {
        let entity = build_entity(
            entity_name,
            label,
            current_line,
            &mut current_fields,
            errors,
        );
        entities.push(entity);
    }

    // Exceeding the limit is reported but the entities are still returned.
    if entities.len() > MAX_ENTITIES_PER_FILE {
        errors.push(ParseError {
            line: section_start_line,
            message: format!(
                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
                entities.len()
            ),
        });
    }

    entities
}
298
299fn flush_pending_list(
300 pending_key: &mut Option<String>,
301 pending_items: &mut Vec<String>,
302 fields: &mut Vec<(String, FieldValue)>,
303) {
304 if let Some(key) = pending_key.take() {
305 fields.push((key, FieldValue::List(std::mem::take(pending_items))));
306 }
307}
308
309fn build_entity(
310 name: String,
311 label: Label,
312 line: usize,
313 fields: &mut Vec<(String, FieldValue)>,
314 errors: &mut Vec<ParseError>,
315) -> Entity {
316 if name.trim().is_empty() {
318 errors.push(ParseError {
319 line,
320 message: "entity name must not be empty".into(),
321 });
322 } else if name.len() > MAX_NAME_LEN {
323 errors.push(ParseError {
324 line,
325 message: format!(
326 "entity name exceeds {MAX_NAME_LEN} chars (got {})",
327 name.len()
328 ),
329 });
330 }
331
332 let id = extract_id_field(fields);
334
335 apply_type_shorthand(fields, label);
337
338 normalize_enum_fields(fields);
340
341 validate_fields(fields, label, line, errors);
343
344 Entity {
345 name,
346 label,
347 fields: std::mem::take(fields),
348 id,
349 line,
350 tags: Vec::new(),
351 slug: None,
352 }
353}
354
355fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
357 let pos = fields.iter().position(|(k, _)| k == "id")?;
358 let (_, value) = fields.remove(pos);
359 match value {
360 FieldValue::Single(s) if !s.is_empty() => Some(s),
361 _ => None,
362 }
363}
364
365fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
367 for field in fields.iter_mut() {
368 if field.0 == "type" {
369 field.0 = match label {
370 Label::Organization => "org_type".to_string(),
371 Label::Event => "event_type".to_string(),
372 Label::Document => "doc_type".to_string(),
373 Label::Asset => "asset_type".to_string(),
374 Label::Person => "type".to_string(), };
376 }
377 }
378}
379
/// Splits the text of a bullet (without the leading `- `) into key and value.
///
/// The split happens at the first `:`, so the value may itself contain
/// colons. Both parts are trimmed. Returns `None` when there is no colon or
/// the key is blank; an empty value is allowed.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    (!key.is_empty()).then(|| (key.to_string(), raw_value.trim().to_string()))
}
390
/// Returns `true` for the keys whose inline value is split on commas into a
/// list: `aliases`, `urls` and `role`.
fn is_list_field(key: &str) -> bool {
    ["aliases", "urls", "role"].contains(&key)
}
395
/// Returns the trimmed heading text when `line` is a level-3 heading.
///
/// Leading whitespace is ignored. The line must then start with `### `
/// (three hashes and a space), and the text after that prefix must not
/// start with another `#`.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
407
/// Field keys accepted on every entity, whatever its label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional field keys accepted on `Label::Person` entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "date_of_death",
    "place_of_birth",
    "status",
];

/// Additional field keys accepted on `Label::Organization` entities.
/// `org_type` is also required (checked in `validate_fields`).
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional field keys accepted on `Label::Event` entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional field keys accepted on `Label::Document` entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional field keys accepted on `Label::Asset` entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
443
444use crate::domain;
446
// Allow-lists for enum-like fields, re-exported from the `domain` module's
// `KNOWN` tables. The first six are wired to field keys in
// `field_constraint`; the three status lists are selected per label in
// `validate_status`.
const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
456
/// Validation rules for a single-valued field, looked up by key through
/// `field_constraint`.
struct FieldConstraint {
    /// Maximum value length, compared against `str::len` (UTF-8 bytes).
    max_len: usize,
    /// Allowed values for enum-like fields; `None` for free-text fields.
    enum_values: Option<&'static [&'static str]>,
}
463
464fn field_constraint(key: &str) -> Option<FieldConstraint> {
465 match key {
466 "description" => Some(FieldConstraint {
467 max_len: 2000,
468 enum_values: None,
469 }),
470 "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
471 max_len: 2048,
472 enum_values: None,
473 }),
474 "occurred_at" | "date_of_birth" | "date_of_death" | "founded_date" | "issued_at" | "opened_at"
475 | "closed_at" => Some(FieldConstraint {
476 max_len: 10,
477 enum_values: None,
478 }),
479 "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
480 Some(FieldConstraint {
481 max_len: 200,
482 enum_values: None,
483 })
484 }
485 "jurisdiction" => Some(FieldConstraint {
486 max_len: 203, enum_values: None,
489 }),
490 "role" => Some(FieldConstraint {
491 max_len: 100,
492 enum_values: Some(ROLE_VALUES),
493 }),
494 "org_type" => Some(FieldConstraint {
495 max_len: 100,
496 enum_values: Some(ORG_TYPE_VALUES),
497 }),
498 "event_type" => Some(FieldConstraint {
499 max_len: 100,
500 enum_values: Some(EVENT_TYPE_VALUES),
501 }),
502 "doc_type" => Some(FieldConstraint {
503 max_len: 100,
504 enum_values: Some(DOC_TYPE_VALUES),
505 }),
506 "asset_type" => Some(FieldConstraint {
507 max_len: 100,
508 enum_values: Some(ASSET_TYPE_VALUES),
509 }),
510 "severity" => Some(FieldConstraint {
511 max_len: 20,
512 enum_values: Some(SEVERITY_VALUES),
513 }),
514 "status" => Some(FieldConstraint {
515 max_len: 30,
518 enum_values: None,
519 }),
520 "qualifier" | "nationality" | "case_number" | "registration_number" => {
521 Some(FieldConstraint {
522 max_len: 100,
523 enum_values: None,
524 })
525 }
526 _ => None,
528 }
529}
530
/// Maximum number of items in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `String::len` (UTF-8 bytes).
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of items in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `String::len` (UTF-8 bytes).
const MAX_URL_LEN: usize = 2048;
536
537fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
541 for (key, value) in fields.iter_mut() {
542 let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
543
544 match value {
545 FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
546 let normalized = val.to_lowercase().replace(' ', "_");
547 if normalized != *val {
548 *val = normalized;
549 }
550 }
551 FieldValue::List(items) if is_enum => {
552 for item in items.iter_mut() {
553 if !item.starts_with("custom:") {
554 let normalized = item.to_lowercase().replace(' ', "_");
555 if normalized != *item {
556 *item = normalized;
557 }
558 }
559 }
560 }
561 _ => {}
562 }
563 }
564}
565
566fn validate_fields(
567 fields: &[(String, FieldValue)],
568 label: Label,
569 line: usize,
570 errors: &mut Vec<ParseError>,
571) {
572 let label_fields: &[&str] = match label {
573 Label::Person => PERSON_FIELDS,
574 Label::Organization => ORGANIZATION_FIELDS,
575 Label::Event => EVENT_FIELDS,
576 Label::Document => DOCUMENT_FIELDS,
577 Label::Asset => ASSET_FIELDS,
578 };
579
580 for (key, value) in fields {
581 if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
582 errors.push(ParseError {
583 line,
584 message: format!("unknown field {key:?} for {label}"),
585 });
586 continue;
587 }
588
589 match value {
590 FieldValue::Single(val) => validate_single_field(key, val, label, line, errors),
591 FieldValue::List(items) => validate_list_field(key, items, line, errors),
592 }
593 }
594
595 if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
597 errors.push(ParseError {
598 line,
599 message: "organization entity missing required field \"org_type\"".into(),
600 });
601 }
602}
603
604fn validate_single_field(
606 key: &str,
607 val: &str,
608 label: Label,
609 line: usize,
610 errors: &mut Vec<ParseError>,
611) {
612 if let Some(constraint) = field_constraint(key) {
613 if val.len() > constraint.max_len {
614 errors.push(ParseError {
615 line,
616 message: format!(
617 "field {key:?} exceeds {} chars (got {})",
618 constraint.max_len,
619 val.len()
620 ),
621 });
622 }
623
624 if let Some(allowed) = constraint.enum_values {
625 validate_enum_value(key, val, allowed, line, errors);
626 }
627
628 if matches!(
629 key,
630 "occurred_at"
631 | "date_of_birth"
632 | "date_of_death"
633 | "founded_date"
634 | "issued_at"
635 | "opened_at"
636 | "closed_at"
637 ) && !val.is_empty()
638 {
639 validate_date_format(key, val, line, errors);
640 }
641
642 if matches!(key, "thumbnail" | "thumbnail_source")
643 && !val.is_empty()
644 && !val.starts_with("https://")
645 {
646 errors.push(ParseError {
647 line,
648 message: format!("field {key:?} must be HTTPS URL"),
649 });
650 }
651 }
652
653 if key == "status" {
654 validate_status(val, label, line, errors);
655 }
656
657 if key == "jurisdiction" && !val.is_empty() {
658 validate_jurisdiction(val, line, errors);
659 }
660
661 if key == "value" && !val.is_empty() {
662 validate_money(val, line, errors);
663 }
664}
665
666fn validate_list_field(key: &str, items: &[String], line: usize, errors: &mut Vec<ParseError>) {
668 match key {
669 "aliases" => {
670 if items.len() > MAX_ALIASES {
671 errors.push(ParseError {
672 line,
673 message: format!(
674 "aliases exceeds {MAX_ALIASES} items (got {})",
675 items.len()
676 ),
677 });
678 }
679 for item in items {
680 if item.len() > MAX_ALIAS_LEN {
681 errors.push(ParseError {
682 line,
683 message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
684 });
685 }
686 }
687 }
688 "urls" => {
689 if items.len() > MAX_URLS {
690 errors.push(ParseError {
691 line,
692 message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
693 });
694 }
695 for item in items {
696 if item.len() > MAX_URL_LEN {
697 errors.push(ParseError {
698 line,
699 message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
700 });
701 }
702 if !item.starts_with("https://") {
703 errors.push(ParseError {
704 line,
705 message: format!("url must be HTTPS: {item:?}"),
706 });
707 }
708 }
709 }
710 "role" => {
711 if items.len() > MAX_ROLES {
712 errors.push(ParseError {
713 line,
714 message: format!("role exceeds {MAX_ROLES} items (got {})", items.len()),
715 });
716 }
717 for item in items {
718 validate_enum_value("role", item, ROLE_VALUES, line, errors);
719 }
720 }
721 _ => {}
722 }
723}
724
/// Maximum number of items in a `role` list (checked in `validate_list_field`).
const MAX_ROLES: usize = 10;
727
728fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
730 let allowed: &[&str] = match label {
731 Label::Person => PERSON_STATUS_VALUES,
732 Label::Organization => ORG_STATUS_VALUES,
733 Label::Asset => ASSET_STATUS_VALUES,
734 _ => {
735 errors.push(ParseError {
736 line,
737 message: format!("field \"status\" is not valid for {label}"),
738 });
739 return;
740 }
741 };
742
743 let normalized = value.to_lowercase().replace(' ', "_");
744 if !allowed.contains(&normalized.as_str()) {
745 errors.push(ParseError {
746 line,
747 message: format!(
748 "invalid status {value:?} for {label} (known: {})",
749 allowed.join(", ")
750 ),
751 });
752 }
753}
754
755fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
757 if let Some(slash_pos) = value.find('/') {
758 let country = &value[..slash_pos];
759 let subdivision = &value[slash_pos + 1..];
760 if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
761 errors.push(ParseError {
762 line,
763 message: format!(
764 "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
765 ),
766 });
767 }
768 if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
769 errors.push(ParseError {
770 line,
771 message: format!(
772 "jurisdiction subdivision must be 1-{} chars",
773 domain::MAX_SUBDIVISION_LEN
774 ),
775 });
776 }
777 } else {
778 if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
780 errors.push(ParseError {
781 line,
782 message: format!(
783 "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
784 ),
785 });
786 }
787 }
788}
789
790fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
793 let parts: Vec<&str> = value.splitn(3, ' ').collect();
795 if parts.len() < 3 {
796 errors.push(ParseError {
797 line,
798 message: format!(
799 "invalid money format: expected `amount currency \"display\"`, got {value:?}"
800 ),
801 });
802 return;
803 }
804
805 if parts[0].parse::<i64>().is_err() {
807 errors.push(ParseError {
808 line,
809 message: format!("money amount must be an integer, got {:?}", parts[0]),
810 });
811 }
812
813 let currency = parts[1];
815 if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
816 errors.push(ParseError {
817 line,
818 message: format!(
819 "money currency must be 3-letter uppercase ISO code, got {currency:?}"
820 ),
821 });
822 }
823
824 let display = parts[2];
826 if !display.starts_with('"') || !display.ends_with('"') {
827 errors.push(ParseError {
828 line,
829 message: format!("money display must be quoted, got {display:?}"),
830 });
831 } else {
832 let inner = &display[1..display.len() - 1];
833 if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
834 errors.push(ParseError {
835 line,
836 message: format!(
837 "money display exceeds {} chars (got {})",
838 domain::MAX_MONEY_DISPLAY_LEN,
839 inner.len()
840 ),
841 });
842 }
843 }
844}
845
846fn validate_enum_value(
847 key: &str,
848 value: &str,
849 allowed: &[&str],
850 line: usize,
851 errors: &mut Vec<ParseError>,
852) {
853 if let Some(custom) = value.strip_prefix("custom:") {
855 if custom.is_empty() || custom.len() > 100 {
856 errors.push(ParseError {
857 line,
858 message: format!(
859 "field {key:?} custom value must be 1-100 chars, got {}",
860 custom.len()
861 ),
862 });
863 }
864 return;
865 }
866
867 let normalized = value.to_lowercase().replace(' ', "_");
868 if !allowed.contains(&normalized.as_str()) {
869 errors.push(ParseError {
870 line,
871 message: format!(
872 "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
873 allowed.join(", ")
874 ),
875 });
876 }
877}
878
879fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
880 let valid = matches!(value.len(), 4 | 7 | 10)
882 && value.chars().enumerate().all(|(i, c)| match i {
883 4 | 7 => c == '-',
884 _ => c.is_ascii_digit(),
885 });
886
887 if !valid {
888 errors.push(ParseError {
889 line,
890 message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
891 });
892 }
893}
894
// Unit tests for the entity parser. Each test feeds a small markdown snippet
// to `parse_entities` and inspects the returned entities and/or the
// collected errors.
#[cfg(test)]
mod tests {
    use super::*;

    // A person with scalar fields; the indented line continues `description`.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            " (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // The continuation line is appended after a newline.
        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    // A comma-separated `role` value becomes a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    // A known person status is accepted.
    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    // An unknown person status is reported.
    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid status")));
    }

    // `type` is renamed to `org_type`; inline and nested lists both parse.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            " - https://www.arsenal.com",
            " - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // The shorthand key was expanded.
        let it = e.fields.iter().find(|(k, _)| k == "org_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // Inline comma-separated list.
        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // Nested bullet list.
        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    // `CC/Subdivision` jurisdictions are accepted and stored unchanged.
    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let j = entities[0].fields.iter().find(|(k, _)| k == "jurisdiction");
        assert_eq!(
            j.map(|(_, v)| v),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    // A jurisdiction that is not a two-letter code is reported.
    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("jurisdiction")));
    }

    // `type` is renamed to `event_type` for events.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        let dt = e.fields.iter().find(|(k, _)| k == "event_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    // An event with a known severity parses without errors.
    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    // A document with all of its label-specific fields.
    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    // An asset with a well-formed money value and a known status.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    // A `value` that is not `amount currency "display"` is reported.
    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("money")));
    }

    // A key that is in no field table is reported.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // A key valid for another label is unknown for this one.
    #[test]
    fn reject_wrong_label_field() {
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // A value outside the allow-list is reported.
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid role")));
    }

    // `custom:` values bypass the allow-list.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    // Spaces in enum values are stored as underscores.
    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let val = entities[0]
            .fields
            .iter()
            .find(|(k, _)| k == "role")
            .map(|(_, v)| match v {
                FieldValue::Single(s) => s.as_str(),
                _ => "",
            });
        assert_eq!(val, Some("civil_servant"));
    }

    // Normalisation applies to each item of a list.
    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    // A date that is not in one of the three accepted shapes is reported.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    // Year, year-month and full dates are all accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    // URLs in a `urls` list must use https.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // Thumbnails must use https.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // Two headings yield two entities, in source order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    // A value longer than the field's limit is reported with that limit.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    // Eleven aliases exceed the limit of ten.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }

    // Organizations without `org_type` are reported.
    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| { e.message.contains("missing required field \"org_type\"") })
        );
    }

    // Organizations with `org_type` pass.
    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}