1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Largest number of entities `parse_entities` accepts from one call before it
/// reports a "too many entities" error (the entities are still returned).
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Upper bound for an entity name. `build_entity` compares it against
/// `String::len`, i.e. the UTF-8 byte length of the name.
const MAX_NAME_LEN: usize = 300;
10
/// Kind of entity produced by this parser.
///
/// Each variant corresponds to one `SectionKind` (see `Label::from_section`)
/// and selects which label-specific fields are accepted during validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Entity from a `SectionKind::People` section.
    Person,
    /// Entity from a `SectionKind::Organizations` section.
    Organization,
    /// Entity from a `SectionKind::Events` section.
    Event,
    /// Entity from a `SectionKind::Documents` section.
    Document,
    /// Entity from a `SectionKind::Assets` section.
    Asset,
}
20
21impl fmt::Display for Label {
22 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23 match self {
24 Self::Person => write!(f, "person"),
25 Self::Organization => write!(f, "organization"),
26 Self::Event => write!(f, "event"),
27 Self::Document => write!(f, "document"),
28 Self::Asset => write!(f, "asset"),
29 }
30 }
31}
32
33impl Label {
34 pub fn from_section(kind: SectionKind) -> Option<Self> {
35 match kind {
36 SectionKind::People => Some(Self::Person),
37 SectionKind::Organizations => Some(Self::Organization),
38 SectionKind::Events => Some(Self::Event),
39 SectionKind::Documents => Some(Self::Document),
40 SectionKind::Assets => Some(Self::Asset),
41 _ => None,
42 }
43 }
44}
45
/// One parsed entity: a `### Name` heading plus the bullet fields under it.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Heading text after `### `, trimmed.
    pub name: String,
    /// Kind of entity, derived from the section being parsed.
    pub label: Label,
    /// Remaining `(key, value)` pairs in source order. The `id` field is
    /// removed from this list and `type` is renamed to its label-specific key.
    pub fields: Vec<(String, FieldValue)>,
    /// Value of the `id` bullet when present and non-empty, or the id supplied
    /// by the caller of `parse_entity_file_body`.
    pub id: Option<String>,
    /// File line of the entity heading.
    pub line: usize,
    /// Always created empty by this module; presumably filled in by a later
    /// stage (confirm against callers).
    pub tags: Vec<String>,
    /// Always `None` when created by this module; presumably assigned by a
    /// later stage (confirm against callers).
    pub slug: Option<String>,
}
62
/// Value of one entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value; continuation lines are appended after a `\n`.
    Single(String),
    /// Several values, produced either by nested `- item` bullets under a
    /// `- key:` line or by a comma-separated value of a list field.
    List(Vec<String>),
}
69
70pub fn parse_entity_file_body(
75 name: &str,
76 body: &str,
77 label: Label,
78 id: Option<String>,
79 title_line: usize,
80 errors: &mut Vec<ParseError>,
81) -> Entity {
82 let section_kind = match label {
83 Label::Person => SectionKind::People,
84 Label::Organization => SectionKind::Organizations,
85 Label::Event => SectionKind::Events,
86 Label::Document => SectionKind::Documents,
87 Label::Asset => SectionKind::Assets,
88 };
89
90 let wrapped = format!("### {name}\n{body}");
92 let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
93
94 if let Some(mut entity) = entities.pop() {
95 entity.id = id;
96 entity.line = title_line;
97 entity
98 } else {
99 Entity {
100 name: name.to_string(),
101 label,
102 fields: Vec::new(),
103 id,
104 line: title_line,
105 tags: Vec::new(),
106 slug: None,
107 }
108 }
109}
110
/// Parses every entity in the body of one section.
///
/// * `body` - section text; each `### Name` heading starts an entity and the
///   `- key: value` bullets below it become its fields.
/// * `section_kind` - decides the [`Label`] of the entities; kinds without a
///   label yield an empty vector and no errors.
/// * `section_start_line` - line offset; line `i` of `body` (0-based) is
///   reported as file line `section_start_line + 1 + i`.
/// * `errors` - receives syntax and validation problems; parsing continues
///   after each of them.
///
/// Returns the entities in source order. Exceeding `MAX_ENTITIES_PER_FILE`
/// only adds an error; all entities are still returned.
#[allow(clippy::too_many_lines)]
pub fn parse_entities(
    body: &str,
    section_kind: SectionKind,
    section_start_line: usize,
    errors: &mut Vec<ParseError>,
) -> Vec<Entity> {
    let Some(label) = Label::from_section(section_kind) else {
        return Vec::new();
    };

    let lines: Vec<&str> = body.lines().collect();
    let mut entities: Vec<Entity> = Vec::new();
    // State of the entity currently being collected.
    let mut current_name: Option<String> = None;
    let mut current_line: usize = 0;
    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
    // Set while a `- key:` bullet with an empty value is waiting for nested items.
    let mut pending_list_key: Option<String> = None;
    let mut pending_list_items: Vec<String> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        let file_line = section_start_line + 1 + i;

        // A heading closes the previous entity (if any) and opens a new one.
        if let Some(name) = strip_h3(line) {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some(entity_name) = current_name.take() {
                let entity = build_entity(
                    entity_name,
                    label,
                    current_line,
                    &mut current_fields,
                    errors,
                );
                entities.push(entity);
            }

            current_name = Some(name.to_string());
            current_line = file_line;
            current_fields.clear();
            continue;
        }

        // Anything but blank lines ahead of the first heading is an error.
        if current_name.is_none() {
            if !line.trim().is_empty() {
                errors.push(ParseError {
                    line: file_line,
                    message: "content before first entity heading (### Name)".into(),
                });
            }
            continue;
        }

        let trimmed = line.trim();

        if let Some(item) = trimmed.strip_prefix("- ") {
            // An indented bullet while a list is open is a nested list item.
            if line.starts_with(" - ") && pending_list_key.is_some() {
                pending_list_items.push(item.trim().to_string());
                continue;
            }

            // Any other bullet ends the open list before being parsed itself.
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            if let Some((key, value)) = parse_bullet(item) {
                if value.is_empty() {
                    // `- key:` opens a list that nested bullets will fill.
                    pending_list_key = Some(key);
                    pending_list_items.clear();
                } else if is_list_field(&key) && value.contains(',') {
                    // Inline comma-separated list; empty segments are dropped.
                    let items: Vec<String> = value
                        .split(',')
                        .map(|s| s.trim().to_string())
                        .filter(|s| !s.is_empty())
                        .collect();
                    current_fields.push((key, FieldValue::List(items)));
                } else {
                    current_fields.push((key, FieldValue::Single(value)));
                }
            } else {
                errors.push(ParseError {
                    line: file_line,
                    message: format!(
                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
                    ),
                });
            }
            continue;
        }

        // Indented text that is not a bullet continues the previous field.
        if line.starts_with(" ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
            if pending_list_key.is_some() {
                errors.push(ParseError {
                    line: file_line,
                    message: "unexpected indented text in list context".into(),
                });
            } else if let Some(last) = current_fields.last_mut() {
                match last.1 {
                    FieldValue::Single(ref mut val) => {
                        val.push('\n');
                        val.push_str(trimmed);
                    }
                    FieldValue::List(ref mut items) => {
                        // Glue the continuation onto the last item with a
                        // space, then split the result on commas again.
                        let tail = items.pop().unwrap_or_default();
                        let joined = if tail.is_empty() {
                            trimmed.to_string()
                        } else {
                            format!("{tail} {trimmed}")
                        };
                        for part in joined.split(',') {
                            let part = part.trim().to_string();
                            if !part.is_empty() {
                                items.push(part);
                            }
                        }
                    }
                }
            }
            continue;
        }

        // Any other non-blank line ends an open list; the line itself is not
        // stored. Blank lines leave the list open.
        if !trimmed.is_empty() {
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );
        }
    }

    // End of input: close the open list and the last entity.
    flush_pending_list(
        &mut pending_list_key,
        &mut pending_list_items,
        &mut current_fields,
    );

    if let Some(entity_name) = current_name.take() {
        let entity = build_entity(
            entity_name,
            label,
            current_line,
            &mut current_fields,
            errors,
        );
        entities.push(entity);
    }

    if entities.len() > MAX_ENTITIES_PER_FILE {
        errors.push(ParseError {
            line: section_start_line,
            message: format!(
                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
                entities.len()
            ),
        });
    }

    entities
}
298
299fn flush_pending_list(
300 pending_key: &mut Option<String>,
301 pending_items: &mut Vec<String>,
302 fields: &mut Vec<(String, FieldValue)>,
303) {
304 if let Some(key) = pending_key.take() {
305 fields.push((key, FieldValue::List(std::mem::take(pending_items))));
306 }
307}
308
309fn build_entity(
310 name: String,
311 label: Label,
312 line: usize,
313 fields: &mut Vec<(String, FieldValue)>,
314 errors: &mut Vec<ParseError>,
315) -> Entity {
316 if name.trim().is_empty() {
318 errors.push(ParseError {
319 line,
320 message: "entity name must not be empty".into(),
321 });
322 } else if name.len() > MAX_NAME_LEN {
323 errors.push(ParseError {
324 line,
325 message: format!(
326 "entity name exceeds {MAX_NAME_LEN} chars (got {})",
327 name.len()
328 ),
329 });
330 }
331
332 let id = extract_id_field(fields);
334
335 apply_type_shorthand(fields, label);
337
338 normalize_enum_fields(fields);
340
341 validate_fields(fields, label, line, errors);
343
344 Entity {
345 name,
346 label,
347 fields: std::mem::take(fields),
348 id,
349 line,
350 tags: Vec::new(),
351 slug: None,
352 }
353}
354
355fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
357 let pos = fields.iter().position(|(k, _)| k == "id")?;
358 let (_, value) = fields.remove(pos);
359 match value {
360 FieldValue::Single(s) if !s.is_empty() => Some(s),
361 _ => None,
362 }
363}
364
365fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
367 for field in fields.iter_mut() {
368 if field.0 == "type" {
369 field.0 = match label {
370 Label::Organization => "org_type".to_string(),
371 Label::Event => "event_type".to_string(),
372 Label::Document => "doc_type".to_string(),
373 Label::Asset => "asset_type".to_string(),
374 Label::Person => "type".to_string(), };
376 }
377 }
378}
379
/// Splits the text of a bullet (without the leading `- `) into key and value.
///
/// The split happens at the first `:`; both halves are trimmed. Returns
/// `None` when there is no colon or the key is empty. The value may be empty.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = Some(raw_key.trim()).filter(|key| !key.is_empty())?;
    Some((key.to_owned(), raw_value.trim().to_owned()))
}
390
/// Reports whether a comma-separated value of `key` is split into a list.
fn is_list_field(key: &str) -> bool {
    const LIST_KEYS: [&str; 3] = ["aliases", "urls", "role"];
    LIST_KEYS.contains(&key)
}
395
/// Returns the trimmed heading text if `line` is a level-3 heading.
///
/// Leading whitespace is ignored. The line must start with `### ` and the
/// text after it must not begin with another `#`.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
407
/// Field keys accepted on entities of every label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional field keys accepted on `Label::Person` entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "place_of_birth",
    "status",
];

/// Additional field keys accepted on `Label::Organization` entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional field keys accepted on `Label::Event` entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional field keys accepted on `Label::Document` entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional field keys accepted on `Label::Asset` entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
442
443use crate::domain;
445
// Known values of each enumerated field, taken from the `KNOWN` lists of the
// corresponding `domain` types so this parser and the domain model agree.

/// Known values of the `role` field.
const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
/// Known values of the `org_type` field.
const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
/// Known values of the `event_type` field.
const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
/// Known values of the `doc_type` field.
const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
/// Known values of the `asset_type` field.
const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
/// Known values of the `severity` field.
const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
/// Known `status` values for persons.
const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
/// Known `status` values for organizations.
const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
/// Known `status` values for assets.
const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
455
/// Validation rules for a single-valued field, looked up by `field_constraint`.
struct FieldConstraint {
    /// Maximum length of the value, compared against `str::len` (UTF-8 bytes).
    max_len: usize,
    /// Known values when the field is an enumeration; `None` for free text.
    enum_values: Option<&'static [&'static str]>,
}
462
463fn field_constraint(key: &str) -> Option<FieldConstraint> {
464 match key {
465 "description" => Some(FieldConstraint {
466 max_len: 2000,
467 enum_values: None,
468 }),
469 "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
470 max_len: 2048,
471 enum_values: None,
472 }),
473 "occurred_at" | "date_of_birth" | "founded_date" | "issued_at" | "opened_at"
474 | "closed_at" => Some(FieldConstraint {
475 max_len: 10,
476 enum_values: None,
477 }),
478 "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
479 Some(FieldConstraint {
480 max_len: 200,
481 enum_values: None,
482 })
483 }
484 "jurisdiction" => Some(FieldConstraint {
485 max_len: 203, enum_values: None,
488 }),
489 "role" => Some(FieldConstraint {
490 max_len: 100,
491 enum_values: Some(ROLE_VALUES),
492 }),
493 "org_type" => Some(FieldConstraint {
494 max_len: 100,
495 enum_values: Some(ORG_TYPE_VALUES),
496 }),
497 "event_type" => Some(FieldConstraint {
498 max_len: 100,
499 enum_values: Some(EVENT_TYPE_VALUES),
500 }),
501 "doc_type" => Some(FieldConstraint {
502 max_len: 100,
503 enum_values: Some(DOC_TYPE_VALUES),
504 }),
505 "asset_type" => Some(FieldConstraint {
506 max_len: 100,
507 enum_values: Some(ASSET_TYPE_VALUES),
508 }),
509 "severity" => Some(FieldConstraint {
510 max_len: 20,
511 enum_values: Some(SEVERITY_VALUES),
512 }),
513 "status" => Some(FieldConstraint {
514 max_len: 30,
517 enum_values: None,
518 }),
519 "qualifier" | "nationality" | "case_number" | "registration_number" => {
520 Some(FieldConstraint {
521 max_len: 100,
522 enum_values: None,
523 })
524 }
525 _ => None,
527 }
528}
529
/// Maximum number of items in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `str::len` (UTF-8 bytes).
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of items in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `str::len` (UTF-8 bytes).
const MAX_URL_LEN: usize = 2048;
535
536fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
540 for (key, value) in fields.iter_mut() {
541 let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
542
543 match value {
544 FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
545 let normalized = val.to_lowercase().replace(' ', "_");
546 if normalized != *val {
547 *val = normalized;
548 }
549 }
550 FieldValue::List(items) if is_enum => {
551 for item in items.iter_mut() {
552 if !item.starts_with("custom:") {
553 let normalized = item.to_lowercase().replace(' ', "_");
554 if normalized != *item {
555 *item = normalized;
556 }
557 }
558 }
559 }
560 _ => {}
561 }
562 }
563}
564
565#[allow(clippy::too_many_lines)]
566fn validate_fields(
567 fields: &[(String, FieldValue)],
568 label: Label,
569 line: usize,
570 errors: &mut Vec<ParseError>,
571) {
572 let label_fields: &[&str] = match label {
573 Label::Person => PERSON_FIELDS,
574 Label::Organization => ORGANIZATION_FIELDS,
575 Label::Event => EVENT_FIELDS,
576 Label::Document => DOCUMENT_FIELDS,
577 Label::Asset => ASSET_FIELDS,
578 };
579
580 for (key, value) in fields {
581 if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
583 errors.push(ParseError {
584 line,
585 message: format!("unknown field {key:?} for {label}"),
586 });
587 continue;
588 }
589
590 match value {
591 FieldValue::Single(val) => {
592 if let Some(constraint) = field_constraint(key) {
593 if val.len() > constraint.max_len {
594 errors.push(ParseError {
595 line,
596 message: format!(
597 "field {key:?} exceeds {} chars (got {})",
598 constraint.max_len,
599 val.len()
600 ),
601 });
602 }
603
604 if let Some(allowed) = constraint.enum_values {
606 validate_enum_value(key, val, allowed, line, errors);
607 }
608
609 if matches!(
611 key.as_str(),
612 "occurred_at"
613 | "date_of_birth"
614 | "founded_date"
615 | "issued_at"
616 | "opened_at"
617 | "closed_at"
618 ) && !val.is_empty()
619 {
620 validate_date_format(key, val, line, errors);
621 }
622
623 if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
625 && !val.is_empty()
626 && !val.starts_with("https://")
627 {
628 errors.push(ParseError {
629 line,
630 message: format!("field {key:?} must be HTTPS URL"),
631 });
632 }
633 }
634
635 if key == "status" {
637 validate_status(val, label, line, errors);
638 }
639
640 if key == "jurisdiction" && !val.is_empty() {
642 validate_jurisdiction(val, line, errors);
643 }
644
645 if key == "value" && !val.is_empty() {
647 validate_money(val, line, errors);
648 }
649 }
650 FieldValue::List(items) => match key.as_str() {
651 "aliases" => {
652 if items.len() > MAX_ALIASES {
653 errors.push(ParseError {
654 line,
655 message: format!(
656 "aliases exceeds {MAX_ALIASES} items (got {})",
657 items.len()
658 ),
659 });
660 }
661 for item in items {
662 if item.len() > MAX_ALIAS_LEN {
663 errors.push(ParseError {
664 line,
665 message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
666 });
667 }
668 }
669 }
670 "urls" => {
671 if items.len() > MAX_URLS {
672 errors.push(ParseError {
673 line,
674 message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
675 });
676 }
677 for item in items {
678 if item.len() > MAX_URL_LEN {
679 errors.push(ParseError {
680 line,
681 message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
682 });
683 }
684 if !item.starts_with("https://") {
685 errors.push(ParseError {
686 line,
687 message: format!("url must be HTTPS: {item:?}"),
688 });
689 }
690 }
691 }
692 "role" => {
693 if items.len() > MAX_ROLES {
694 errors.push(ParseError {
695 line,
696 message: format!(
697 "role exceeds {MAX_ROLES} items (got {})",
698 items.len()
699 ),
700 });
701 }
702 for item in items {
703 validate_enum_value("role", item, ROLE_VALUES, line, errors);
704 }
705 }
706 _ => {}
707 },
708 }
709 }
710
711 if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
713 errors.push(ParseError {
714 line,
715 message: "organization entity missing required field \"org_type\"".into(),
716 });
717 }
718}
719
/// Maximum number of items in a `role` list.
const MAX_ROLES: usize = 10;
722
723fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
725 let allowed: &[&str] = match label {
726 Label::Person => PERSON_STATUS_VALUES,
727 Label::Organization => ORG_STATUS_VALUES,
728 Label::Asset => ASSET_STATUS_VALUES,
729 _ => {
730 errors.push(ParseError {
731 line,
732 message: format!("field \"status\" is not valid for {label}"),
733 });
734 return;
735 }
736 };
737
738 let normalized = value.to_lowercase().replace(' ', "_");
739 if !allowed.contains(&normalized.as_str()) {
740 errors.push(ParseError {
741 line,
742 message: format!(
743 "invalid status {value:?} for {label} (known: {})",
744 allowed.join(", ")
745 ),
746 });
747 }
748}
749
750fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
752 if let Some(slash_pos) = value.find('/') {
753 let country = &value[..slash_pos];
754 let subdivision = &value[slash_pos + 1..];
755 if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
756 errors.push(ParseError {
757 line,
758 message: format!(
759 "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
760 ),
761 });
762 }
763 if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
764 errors.push(ParseError {
765 line,
766 message: format!(
767 "jurisdiction subdivision must be 1-{} chars",
768 domain::MAX_SUBDIVISION_LEN
769 ),
770 });
771 }
772 } else {
773 if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
775 errors.push(ParseError {
776 line,
777 message: format!(
778 "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
779 ),
780 });
781 }
782 }
783}
784
785fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
788 let parts: Vec<&str> = value.splitn(3, ' ').collect();
790 if parts.len() < 3 {
791 errors.push(ParseError {
792 line,
793 message: format!(
794 "invalid money format: expected `amount currency \"display\"`, got {value:?}"
795 ),
796 });
797 return;
798 }
799
800 if parts[0].parse::<i64>().is_err() {
802 errors.push(ParseError {
803 line,
804 message: format!("money amount must be an integer, got {:?}", parts[0]),
805 });
806 }
807
808 let currency = parts[1];
810 if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
811 errors.push(ParseError {
812 line,
813 message: format!(
814 "money currency must be 3-letter uppercase ISO code, got {currency:?}"
815 ),
816 });
817 }
818
819 let display = parts[2];
821 if !display.starts_with('"') || !display.ends_with('"') {
822 errors.push(ParseError {
823 line,
824 message: format!("money display must be quoted, got {display:?}"),
825 });
826 } else {
827 let inner = &display[1..display.len() - 1];
828 if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
829 errors.push(ParseError {
830 line,
831 message: format!(
832 "money display exceeds {} chars (got {})",
833 domain::MAX_MONEY_DISPLAY_LEN,
834 inner.len()
835 ),
836 });
837 }
838 }
839}
840
841fn validate_enum_value(
842 key: &str,
843 value: &str,
844 allowed: &[&str],
845 line: usize,
846 errors: &mut Vec<ParseError>,
847) {
848 if let Some(custom) = value.strip_prefix("custom:") {
850 if custom.is_empty() || custom.len() > 100 {
851 errors.push(ParseError {
852 line,
853 message: format!(
854 "field {key:?} custom value must be 1-100 chars, got {}",
855 custom.len()
856 ),
857 });
858 }
859 return;
860 }
861
862 let normalized = value.to_lowercase().replace(' ', "_");
863 if !allowed.contains(&normalized.as_str()) {
864 errors.push(ParseError {
865 line,
866 message: format!(
867 "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
868 allowed.join(", ")
869 ),
870 });
871 }
872}
873
874fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
875 let valid = matches!(value.len(), 4 | 7 | 10)
877 && value.chars().enumerate().all(|(i, c)| match i {
878 4 | 7 => c == '-',
879 _ => c.is_ascii_digit(),
880 });
881
882 if !valid {
883 errors.push(ParseError {
884 line,
885 message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
886 });
887 }
888}
889
// Unit tests for `parse_entities`: one section body per test, checking either
// the parsed fields or the presence of a specific validation error.
#[cfg(test)]
mod tests {
    use super::*;

    // A person with scalar fields; the indented line continues `description`.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            " (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // The continuation is joined to the first line with a newline.
        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    // A comma-separated `role` value becomes a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    // A known person status is accepted.
    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    // An unknown person status is reported.
    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid status")));
    }

    // `type` is renamed to `org_type`; inline and nested lists are both parsed.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            " - https://www.arsenal.com",
            " - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // The `type` shorthand is stored under `org_type`.
        let it = e.fields.iter().find(|(k, _)| k == "org_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // Inline comma-separated list.
        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // Nested bullet list.
        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    // `CODE/Subdivision` jurisdictions are accepted and stored verbatim.
    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let j = entities[0].fields.iter().find(|(k, _)| k == "jurisdiction");
        assert_eq!(
            j.map(|(_, v)| v),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    // A jurisdiction that is not a two-letter code is reported.
    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("jurisdiction")));
    }

    // `type` is renamed to `event_type` for events.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        let dt = e.fields.iter().find(|(k, _)| k == "event_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    // An event with a known severity produces no errors.
    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    // A document with all of its label-specific fields.
    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    // An asset with a well-formed money value.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    // A free-text `value` is not a valid money value.
    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("money")));
    }

    // A key that no label knows is reported.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // A key that belongs to another label is reported as unknown.
    #[test]
    fn reject_wrong_label_field() {
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    // A value outside the known enum values is reported.
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid role")));
    }

    // `custom:` values bypass the known-value check.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    // Enum values are stored with spaces replaced by underscores.
    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let val = entities[0]
            .fields
            .iter()
            .find(|(k, _)| k == "role")
            .map(|(_, v)| match v {
                FieldValue::Single(s) => s.as_str(),
                _ => "",
            });
        assert_eq!(val, Some("civil_servant"));
    }

    // Normalization is applied to every item of an enum list.
    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    // A date that is not in one of the numeric formats is reported.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    // Year, year-month and full dates are all accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    // URLs in a list must use HTTPS.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // Thumbnails must use HTTPS.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    // Each heading starts a separate entity, returned in source order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    // A value longer than the field's limit (100 for `nationality`) is reported.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    // Eleven aliases exceed the limit of ten.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }

    // Organizations without `org_type` are reported.
    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| { e.message.contains("missing required field \"org_type\"") })
        );
    }

    // Organizations with `org_type` pass validation.
    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}