Skip to main content

weave_content/
entity.rs

1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Maximum entities per file.
///
/// Checked against the number of entities produced by one `parse_entities`
/// call; exceeding it is reported as a parse error, the entities are still
/// returned.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Maximum length of an entity name.
///
/// Compared against `String::len`, i.e. the length is measured in bytes.
const MAX_NAME_LEN: usize = 300;
10
/// Label derived from the section an entity appears in.
///
/// Each variant is produced from one `SectionKind` by `Label::from_section`
/// and is rendered as a lowercase word by the `Display` impl.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Entity from a `SectionKind::People` section.
    Person,
    /// Entity from a `SectionKind::Organizations` section.
    Organization,
    /// Entity from a `SectionKind::Events` section.
    Event,
    /// Entity from a `SectionKind::Documents` section.
    Document,
    /// Entity from a `SectionKind::Assets` section.
    Asset,
}
20
21impl fmt::Display for Label {
22    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23        match self {
24            Self::Person => write!(f, "person"),
25            Self::Organization => write!(f, "organization"),
26            Self::Event => write!(f, "event"),
27            Self::Document => write!(f, "document"),
28            Self::Asset => write!(f, "asset"),
29        }
30    }
31}
32
33impl Label {
34    pub fn from_section(kind: SectionKind) -> Option<Self> {
35        match kind {
36            SectionKind::People => Some(Self::Person),
37            SectionKind::Organizations => Some(Self::Organization),
38            SectionKind::Events => Some(Self::Event),
39            SectionKind::Documents => Some(Self::Document),
40            SectionKind::Assets => Some(Self::Asset),
41            _ => None,
42        }
43    }
44}
45
/// A parsed entity with its name, label, and field map.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Entity name: the text of the `### Name` heading, or the name supplied
    /// by the caller when parsing a standalone entity file.
    pub name: String,
    /// Label derived from the section (or directory) the entity belongs to.
    pub label: Label,
    /// Parsed `- key: value` fields in source order. The `id` field is
    /// removed from this list and `type:` shorthand keys are already expanded.
    pub fields: Vec<(String, FieldValue)>,
    /// Stored NULID from `- id:` field (None if not yet generated).
    pub id: Option<String>,
    /// Line number (1-indexed) of the H3 heading.
    pub line: usize,
    /// Tags from front matter (empty for inline entities).
    pub tags: Vec<String>,
    /// File-path slug (e.g. `people/id/harvey-moeis`). Only set for
    /// registry entities that have standalone files.
    pub slug: Option<String>,
}
62
/// A field value: either a single string or a list of strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value written inline (`- key: value`); indented continuation
    /// lines are appended to it separated by `\n`.
    Single(String),
    /// A list value, written either as nested `  - item` bullets under
    /// `- key:` or as a comma-separated value on a list-type field.
    List(Vec<String>),
}
69
70/// Parse a single entity from a standalone entity file body.
71/// The body is the text after the H1 heading (bullet fields, no H3 headings).
72/// `label` is determined by the file's directory (people/ or organizations/).
73/// `id` comes from the front matter (may be None).
74pub fn parse_entity_file_body(
75    name: &str,
76    body: &str,
77    label: Label,
78    id: Option<String>,
79    title_line: usize,
80    errors: &mut Vec<ParseError>,
81) -> Entity {
82    let section_kind = match label {
83        Label::Person => SectionKind::People,
84        Label::Organization => SectionKind::Organizations,
85        Label::Event => SectionKind::Events,
86        Label::Document => SectionKind::Documents,
87        Label::Asset => SectionKind::Assets,
88    };
89
90    // Wrap the body with a fake H3 heading so we can reuse parse_entities
91    let wrapped = format!("### {name}\n{body}");
92    let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
93
94    if let Some(mut entity) = entities.pop() {
95        entity.id = id;
96        entity.line = title_line;
97        entity
98    } else {
99        Entity {
100            name: name.to_string(),
101            label,
102            fields: Vec::new(),
103            id,
104            line: title_line,
105            tags: Vec::new(),
106            slug: None,
107        }
108    }
109}
110
111/// Parse entities from an entity section (People, Organizations, Events).
112/// The `body` is the text between the H2 heading and the next H2 heading.
113/// `section_start_line` is the line number of the H2 heading in the original file.
114#[allow(clippy::too_many_lines)]
115pub fn parse_entities(
116    body: &str,
117    section_kind: SectionKind,
118    section_start_line: usize,
119    errors: &mut Vec<ParseError>,
120) -> Vec<Entity> {
121    let Some(label) = Label::from_section(section_kind) else {
122        return Vec::new();
123    };
124
125    let lines: Vec<&str> = body.lines().collect();
126    let mut entities: Vec<Entity> = Vec::new();
127    let mut current_name: Option<String> = None;
128    let mut current_line: usize = 0;
129    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
130    // Track multi-line value continuation and nested list building
131    let mut pending_list_key: Option<String> = None;
132    let mut pending_list_items: Vec<String> = Vec::new();
133
134    for (i, line) in lines.iter().enumerate() {
135        let file_line = section_start_line + 1 + i; // +1 because body starts after the H2 heading line
136
137        // Check for H3 heading
138        if let Some(name) = strip_h3(line) {
139            // Flush pending list
140            flush_pending_list(
141                &mut pending_list_key,
142                &mut pending_list_items,
143                &mut current_fields,
144            );
145
146            // Flush previous entity
147            if let Some(entity_name) = current_name.take() {
148                let entity = build_entity(
149                    entity_name,
150                    label,
151                    current_line,
152                    &mut current_fields,
153                    errors,
154                );
155                entities.push(entity);
156            }
157
158            current_name = Some(name.to_string());
159            current_line = file_line;
160            current_fields.clear();
161            continue;
162        }
163
164        // Only parse bullet fields if we're inside an entity (after an H3)
165        if current_name.is_none() {
166            if !line.trim().is_empty() {
167                errors.push(ParseError {
168                    line: file_line,
169                    message: "content before first entity heading (### Name)".into(),
170                });
171            }
172            continue;
173        }
174
175        let trimmed = line.trim();
176
177        // Nested list item: `  - value` (2-space indent + dash)
178        if let Some(item) = trimmed.strip_prefix("- ") {
179            if line.starts_with("  - ") && pending_list_key.is_some() {
180                // Nested list item for pending list key
181                pending_list_items.push(item.trim().to_string());
182                continue;
183            }
184
185            // Flush pending list before processing new top-level bullet
186            flush_pending_list(
187                &mut pending_list_key,
188                &mut pending_list_items,
189                &mut current_fields,
190            );
191
192            // Top-level bullet: `- key: value` or `- key:`
193            if let Some((key, value)) = parse_bullet(item) {
194                if value.is_empty() {
195                    // Start a nested list: `- urls:`
196                    pending_list_key = Some(key);
197                    pending_list_items.clear();
198                } else if is_list_field(&key) && value.contains(',') {
199                    // Comma-separated list: `- aliases: A, B, C`
200                    let items: Vec<String> = value
201                        .split(',')
202                        .map(|s| s.trim().to_string())
203                        .filter(|s| !s.is_empty())
204                        .collect();
205                    current_fields.push((key, FieldValue::List(items)));
206                } else {
207                    current_fields.push((key, FieldValue::Single(value)));
208                }
209            } else {
210                errors.push(ParseError {
211                    line: file_line,
212                    message: format!(
213                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
214                    ),
215                });
216            }
217            continue;
218        }
219
220        // Multi-line value continuation (2-space indent, not a bullet)
221        if line.starts_with("  ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
222            if pending_list_key.is_some() {
223                // Could be continuation inside a list context -- treat as error
224                errors.push(ParseError {
225                    line: file_line,
226                    message: "unexpected indented text in list context".into(),
227                });
228            } else if let Some(last) = current_fields.last_mut() {
229                // Append to last single-value field
230                if let FieldValue::Single(ref mut val) = last.1 {
231                    val.push('\n');
232                    val.push_str(trimmed);
233                }
234            }
235            continue;
236        }
237
238        // Blank line or other content -- ignore
239        if !trimmed.is_empty() {
240            // Flush pending list on non-indented non-bullet content
241            flush_pending_list(
242                &mut pending_list_key,
243                &mut pending_list_items,
244                &mut current_fields,
245            );
246        }
247    }
248
249    // Flush final pending list and entity
250    flush_pending_list(
251        &mut pending_list_key,
252        &mut pending_list_items,
253        &mut current_fields,
254    );
255
256    if let Some(entity_name) = current_name.take() {
257        let entity = build_entity(
258            entity_name,
259            label,
260            current_line,
261            &mut current_fields,
262            errors,
263        );
264        entities.push(entity);
265    }
266
267    // Boundary check
268    if entities.len() > MAX_ENTITIES_PER_FILE {
269        errors.push(ParseError {
270            line: section_start_line,
271            message: format!(
272                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
273                entities.len()
274            ),
275        });
276    }
277
278    entities
279}
280
281fn flush_pending_list(
282    pending_key: &mut Option<String>,
283    pending_items: &mut Vec<String>,
284    fields: &mut Vec<(String, FieldValue)>,
285) {
286    if let Some(key) = pending_key.take() {
287        fields.push((key, FieldValue::List(std::mem::take(pending_items))));
288    }
289}
290
291fn build_entity(
292    name: String,
293    label: Label,
294    line: usize,
295    fields: &mut Vec<(String, FieldValue)>,
296    errors: &mut Vec<ParseError>,
297) -> Entity {
298    // Validate name
299    if name.trim().is_empty() {
300        errors.push(ParseError {
301            line,
302            message: "entity name must not be empty".into(),
303        });
304    } else if name.len() > MAX_NAME_LEN {
305        errors.push(ParseError {
306            line,
307            message: format!(
308                "entity name exceeds {MAX_NAME_LEN} chars (got {})",
309                name.len()
310            ),
311        });
312    }
313
314    // Extract id field before validation (not a schema field)
315    let id = extract_id_field(fields);
316
317    // Apply type: shorthand
318    apply_type_shorthand(fields, label);
319
320    // Normalize enum field values (lowercase, spaces → underscores)
321    normalize_enum_fields(fields);
322
323    // Validate fields against schema
324    validate_fields(fields, label, line, errors);
325
326    Entity {
327        name,
328        label,
329        fields: std::mem::take(fields),
330        id,
331        line,
332        tags: Vec::new(),
333        slug: None,
334    }
335}
336
337/// Extract and remove the `id` field from the field list.
338fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
339    let pos = fields.iter().position(|(k, _)| k == "id")?;
340    let (_, value) = fields.remove(pos);
341    match value {
342        FieldValue::Single(s) if !s.is_empty() => Some(s),
343        _ => None,
344    }
345}
346
347/// Replace `type:` shorthand with the label-specific field name.
348fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
349    for field in fields.iter_mut() {
350        if field.0 == "type" {
351            field.0 = match label {
352                Label::Organization => "org_type".to_string(),
353                Label::Event => "event_type".to_string(),
354                Label::Document => "doc_type".to_string(),
355                Label::Asset => "asset_type".to_string(),
356                Label::Person => "type".to_string(), // will be caught as unknown
357            };
358        }
359    }
360}
361
/// Split a bullet item (the text after `- `) into a trimmed `(key, value)`
/// pair at the first `:`.
///
/// Returns `None` when there is no colon or the key is blank. The value may
/// be empty.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    if key.is_empty() {
        return None;
    }
    Some((key.to_owned(), raw_value.trim().to_owned()))
}
372
/// Report whether `key` names a field that may hold a list
/// (`aliases`, `urls` or `role`).
fn is_list_field(key: &str) -> bool {
    const LIST_FIELDS: [&str; 3] = ["aliases", "urls", "role"];
    LIST_FIELDS.contains(&key)
}
377
/// Return the trimmed heading text when `line` is an H3 heading (`### Text`,
/// optionally indented); otherwise `None`.
///
/// Text that itself begins with `#` (e.g. `### #tag`) is not accepted.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
389
390// --- Field validation ---
391
/// Known fields per label (common + label-specific).
///
/// `COMMON_FIELDS` lists the fields accepted on every entity; a field is
/// treated as known when it appears here or in the table for the entity's
/// label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional fields accepted on `Label::Person` entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "place_of_birth",
    "status",
];

/// Additional fields accepted on `Label::Organization` entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional fields accepted on `Label::Event` entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional fields accepted on `Label::Document` entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional fields accepted on `Label::Asset` entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
424
425/// Known enum values — delegated to domain module constants.
426use crate::domain;
427
/// Allowed values for the `role` field.
const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
/// Allowed values for the `org_type` field.
const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
/// Allowed values for the `event_type` field.
const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
/// Allowed values for the `doc_type` field.
const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
/// Allowed values for the `asset_type` field.
const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
/// Allowed values for the `severity` field.
const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
/// Allowed `status` values on person entities.
const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
/// Allowed `status` values on organization entities.
const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
/// Allowed `status` values on asset entities.
const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
437
/// Validation constraint for a single-valued field: its maximum length and,
/// for enum fields, the set of known values.
struct FieldConstraint {
    /// Maximum value length, compared against `str::len` (bytes).
    max_len: usize,
    /// If Some, the field is an enum with these known values.
    enum_values: Option<&'static [&'static str]>,
}
444
445fn field_constraint(key: &str) -> Option<FieldConstraint> {
446    match key {
447        "description" => Some(FieldConstraint {
448            max_len: 2000,
449            enum_values: None,
450        }),
451        "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
452            max_len: 2048,
453            enum_values: None,
454        }),
455        "occurred_at" | "date_of_birth" | "founded_date" | "issued_at" | "opened_at"
456        | "closed_at" => Some(FieldConstraint {
457            max_len: 10,
458            enum_values: None,
459        }),
460        "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
461            Some(FieldConstraint {
462                max_len: 200,
463                enum_values: None,
464            })
465        }
466        "jurisdiction" => Some(FieldConstraint {
467            // jurisdiction: ID or ID/South Sulawesi (country + optional subdivision)
468            max_len: 203, // 2 + 1 + 200
469            enum_values: None,
470        }),
471        "role" => Some(FieldConstraint {
472            max_len: 100,
473            enum_values: Some(ROLE_VALUES),
474        }),
475        "org_type" => Some(FieldConstraint {
476            max_len: 100,
477            enum_values: Some(ORG_TYPE_VALUES),
478        }),
479        "event_type" => Some(FieldConstraint {
480            max_len: 100,
481            enum_values: Some(EVENT_TYPE_VALUES),
482        }),
483        "doc_type" => Some(FieldConstraint {
484            max_len: 100,
485            enum_values: Some(DOC_TYPE_VALUES),
486        }),
487        "asset_type" => Some(FieldConstraint {
488            max_len: 100,
489            enum_values: Some(ASSET_TYPE_VALUES),
490        }),
491        "severity" => Some(FieldConstraint {
492            max_len: 20,
493            enum_values: Some(SEVERITY_VALUES),
494        }),
495        "status" => Some(FieldConstraint {
496            // Status validation is context-dependent (Person vs Org vs Asset),
497            // handled separately in validate_fields.
498            max_len: 30,
499            enum_values: None,
500        }),
501        "qualifier" | "nationality" | "case_number" | "registration_number" => {
502            Some(FieldConstraint {
503                max_len: 100,
504                enum_values: None,
505            })
506        }
507        // List fields validated separately
508        _ => None,
509    }
510}
511
/// Maximum items in list fields.
///
/// Maximum number of entries in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `String::len` (bytes).
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of entries in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `String::len` (bytes).
const MAX_URL_LEN: usize = 2048;
517
518/// Normalize enum field values in-place: lowercase and replace spaces with
519/// underscores. Values with the `custom:` prefix are left unchanged.
520/// Handles both single-value and list-value enum fields.
521fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
522    for (key, value) in fields.iter_mut() {
523        let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
524
525        match value {
526            FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
527                let normalized = val.to_lowercase().replace(' ', "_");
528                if normalized != *val {
529                    *val = normalized;
530                }
531            }
532            FieldValue::List(items) if is_enum => {
533                for item in items.iter_mut() {
534                    if !item.starts_with("custom:") {
535                        let normalized = item.to_lowercase().replace(' ', "_");
536                        if normalized != *item {
537                            *item = normalized;
538                        }
539                    }
540                }
541            }
542            _ => {}
543        }
544    }
545}
546
547#[allow(clippy::too_many_lines)]
548fn validate_fields(
549    fields: &[(String, FieldValue)],
550    label: Label,
551    line: usize,
552    errors: &mut Vec<ParseError>,
553) {
554    let label_fields: &[&str] = match label {
555        Label::Person => PERSON_FIELDS,
556        Label::Organization => ORGANIZATION_FIELDS,
557        Label::Event => EVENT_FIELDS,
558        Label::Document => DOCUMENT_FIELDS,
559        Label::Asset => ASSET_FIELDS,
560    };
561
562    for (key, value) in fields {
563        // Check if field is known
564        if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
565            errors.push(ParseError {
566                line,
567                message: format!("unknown field {key:?} for {label}"),
568            });
569            continue;
570        }
571
572        match value {
573            FieldValue::Single(val) => {
574                if let Some(constraint) = field_constraint(key) {
575                    if val.len() > constraint.max_len {
576                        errors.push(ParseError {
577                            line,
578                            message: format!(
579                                "field {key:?} exceeds {} chars (got {})",
580                                constraint.max_len,
581                                val.len()
582                            ),
583                        });
584                    }
585
586                    // Validate enum values
587                    if let Some(allowed) = constraint.enum_values {
588                        validate_enum_value(key, val, allowed, line, errors);
589                    }
590
591                    // Validate date format
592                    if matches!(
593                        key.as_str(),
594                        "occurred_at"
595                            | "date_of_birth"
596                            | "founded_date"
597                            | "issued_at"
598                            | "opened_at"
599                            | "closed_at"
600                    ) && !val.is_empty()
601                    {
602                        validate_date_format(key, val, line, errors);
603                    }
604
605                    // Validate URL fields
606                    if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
607                        && !val.is_empty()
608                        && !val.starts_with("https://")
609                    {
610                        errors.push(ParseError {
611                            line,
612                            message: format!("field {key:?} must be HTTPS URL"),
613                        });
614                    }
615                }
616
617                // Context-dependent status validation
618                if key == "status" {
619                    validate_status(val, label, line, errors);
620                }
621
622                // Validate jurisdiction format: `XX` or `XX/Subdivision`
623                if key == "jurisdiction" && !val.is_empty() {
624                    validate_jurisdiction(val, line, errors);
625                }
626
627                // Validate money format: `amount currency "display"`
628                if key == "value" && !val.is_empty() {
629                    validate_money(val, line, errors);
630                }
631            }
632            FieldValue::List(items) => match key.as_str() {
633                "aliases" => {
634                    if items.len() > MAX_ALIASES {
635                        errors.push(ParseError {
636                            line,
637                            message: format!(
638                                "aliases exceeds {MAX_ALIASES} items (got {})",
639                                items.len()
640                            ),
641                        });
642                    }
643                    for item in items {
644                        if item.len() > MAX_ALIAS_LEN {
645                            errors.push(ParseError {
646                                line,
647                                message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
648                            });
649                        }
650                    }
651                }
652                "urls" => {
653                    if items.len() > MAX_URLS {
654                        errors.push(ParseError {
655                            line,
656                            message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
657                        });
658                    }
659                    for item in items {
660                        if item.len() > MAX_URL_LEN {
661                            errors.push(ParseError {
662                                line,
663                                message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
664                            });
665                        }
666                        if !item.starts_with("https://") {
667                            errors.push(ParseError {
668                                line,
669                                message: format!("url must be HTTPS: {item:?}"),
670                            });
671                        }
672                    }
673                }
674                "role" => {
675                    if items.len() > MAX_ROLES {
676                        errors.push(ParseError {
677                            line,
678                            message: format!(
679                                "role exceeds {MAX_ROLES} items (got {})",
680                                items.len()
681                            ),
682                        });
683                    }
684                    for item in items {
685                        validate_enum_value("role", item, ROLE_VALUES, line, errors);
686                    }
687                }
688                _ => {}
689            },
690        }
691    }
692
693    // Required field checks
694    if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
695        errors.push(ParseError {
696            line,
697            message: "organization entity missing required field \"org_type\"".into(),
698        });
699    }
700}
701
/// Maximum roles per person.
///
/// Applied to the number of entries when `role` is given as a list.
const MAX_ROLES: usize = 10;
704
705/// Validate status value based on entity label context.
706fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
707    let allowed: &[&str] = match label {
708        Label::Person => PERSON_STATUS_VALUES,
709        Label::Organization => ORG_STATUS_VALUES,
710        Label::Asset => ASSET_STATUS_VALUES,
711        _ => {
712            errors.push(ParseError {
713                line,
714                message: format!("field \"status\" is not valid for {label}"),
715            });
716            return;
717        }
718    };
719
720    let normalized = value.to_lowercase().replace(' ', "_");
721    if !allowed.contains(&normalized.as_str()) {
722        errors.push(ParseError {
723            line,
724            message: format!(
725                "invalid status {value:?} for {label} (known: {})",
726                allowed.join(", ")
727            ),
728        });
729    }
730}
731
732/// Validate jurisdiction format: `XX` or `XX/Subdivision`.
733fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
734    if let Some(slash_pos) = value.find('/') {
735        let country = &value[..slash_pos];
736        let subdivision = &value[slash_pos + 1..];
737        if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
738            errors.push(ParseError {
739                line,
740                message: format!(
741                    "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
742                ),
743            });
744        }
745        if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
746            errors.push(ParseError {
747                line,
748                message: format!(
749                    "jurisdiction subdivision must be 1-{} chars",
750                    domain::MAX_SUBDIVISION_LEN
751                ),
752            });
753        }
754    } else {
755        // Just country code
756        if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
757            errors.push(ParseError {
758                line,
759                message: format!(
760                    "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
761                ),
762            });
763        }
764    }
765}
766
767/// Validate money DSL format: `amount currency "display"`.
768/// Example: `500000000000 IDR "Rp 500 billion"`
769fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
770    // Split: amount currency "display"
771    let parts: Vec<&str> = value.splitn(3, ' ').collect();
772    if parts.len() < 3 {
773        errors.push(ParseError {
774            line,
775            message: format!(
776                "invalid money format: expected `amount currency \"display\"`, got {value:?}"
777            ),
778        });
779        return;
780    }
781
782    // Validate amount is a valid integer
783    if parts[0].parse::<i64>().is_err() {
784        errors.push(ParseError {
785            line,
786            message: format!("money amount must be an integer, got {:?}", parts[0]),
787        });
788    }
789
790    // Validate currency is 3-letter uppercase
791    let currency = parts[1];
792    if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
793        errors.push(ParseError {
794            line,
795            message: format!(
796                "money currency must be 3-letter uppercase ISO code, got {currency:?}"
797            ),
798        });
799    }
800
801    // Validate display is quoted
802    let display = parts[2];
803    if !display.starts_with('"') || !display.ends_with('"') {
804        errors.push(ParseError {
805            line,
806            message: format!("money display must be quoted, got {display:?}"),
807        });
808    } else {
809        let inner = &display[1..display.len() - 1];
810        if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
811            errors.push(ParseError {
812                line,
813                message: format!(
814                    "money display exceeds {} chars (got {})",
815                    domain::MAX_MONEY_DISPLAY_LEN,
816                    inner.len()
817                ),
818            });
819        }
820    }
821}
822
823fn validate_enum_value(
824    key: &str,
825    value: &str,
826    allowed: &[&str],
827    line: usize,
828    errors: &mut Vec<ParseError>,
829) {
830    // custom: prefix is always valid (if non-empty after prefix, max 100 chars)
831    if let Some(custom) = value.strip_prefix("custom:") {
832        if custom.is_empty() || custom.len() > 100 {
833            errors.push(ParseError {
834                line,
835                message: format!(
836                    "field {key:?} custom value must be 1-100 chars, got {}",
837                    custom.len()
838                ),
839            });
840        }
841        return;
842    }
843
844    let normalized = value.to_lowercase().replace(' ', "_");
845    if !allowed.contains(&normalized.as_str()) {
846        errors.push(ParseError {
847            line,
848            message: format!(
849                "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
850                allowed.join(", ")
851            ),
852        });
853    }
854}
855
856fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
857    // Valid formats: YYYY, YYYY-MM, YYYY-MM-DD
858    let valid = matches!(value.len(), 4 | 7 | 10)
859        && value.chars().enumerate().all(|(i, c)| match i {
860            4 | 7 => c == '-',
861            _ => c.is_ascii_digit(),
862        });
863
864    if !valid {
865        errors.push(ParseError {
866            line,
867            message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
868        });
869    }
870}
871
/// Unit tests for entity parsing and per-field validation.
///
/// Each test feeds a small Markdown body to `parse_entities` and checks either
/// the parsed entities or the error messages collected in `errors`.
#[cfg(test)]
mod tests {
    use super::*;

    /// A person entry yields one entity with five fields; the indented
    /// continuation line is joined onto `description` with a newline.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            "  (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // Check multi-line description
        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    /// A comma-separated `role` value is parsed into a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    /// A known person `status` value is accepted without errors.
    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Also confirm the entity was actually produced, not just that no error was raised.
        assert_eq!(entities.len(), 1);
    }

    /// An unrecognised person `status` value reports an "invalid status" error.
    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid status")));
    }

    /// In an organization section, `type:` is stored as `org_type`, comma-separated
    /// `aliases` become a list, and nested `urls` items become a list.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            "  - https://www.arsenal.com",
            "  - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // type: should have been expanded to org_type:
        let it = e.fields.iter().find(|(k, _)| k == "org_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // aliases as comma-separated
        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // urls as nested list
        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    /// A `jurisdiction` of the form `CC/Subdivision` is accepted and stored verbatim.
    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let j = entities[0].fields.iter().find(|(k, _)| k == "jurisdiction");
        assert_eq!(
            j.map(|(_, v)| v),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    /// A free-text `jurisdiction` such as "England" produces a jurisdiction error.
    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("jurisdiction")));
    }

    /// In an event section, `type:` is stored as `event_type`.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        let dt = e.fields.iter().find(|(k, _)| k == "event_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    /// An event carrying `event_type`, `severity`, and `occurred_at` parses cleanly.
    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Also confirm the entity was actually produced, not just that no error was raised.
        assert_eq!(entities.len(), 1);
    }

    /// A document entry parses into a single `Label::Document` entity.
    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    /// An asset entry with a money-formatted `value` parses into a single
    /// `Label::Asset` entity.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    /// A free-text asset `value` is reported with a message mentioning "money".
    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("money")));
    }

    /// A field name that is not recognised reports "unknown field".
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    /// A field belonging to another label is reported as "unknown field".
    #[test]
    fn reject_wrong_label_field() {
        // org_type on a person
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    /// A `role` outside the known set reports "invalid role".
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid role")));
    }

    /// A `custom:`-prefixed `role` value is accepted.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    /// A `role` written with spaces is stored with underscores instead.
    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let val = entities[0]
            .fields
            .iter()
            .find(|(k, _)| k == "role")
            .map(|(_, v)| match v {
                FieldValue::Single(s) => s.as_str(),
                _ => "",
            });
        assert_eq!(val, Some("civil_servant"));
    }

    /// Each item of a comma-separated `role` list is stored with underscores.
    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    /// A date written in words is rejected with a message naming the `YYYY` formats.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    /// Year, year-month, and full-date precisions are all accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    /// An `http://` entry under `urls` is reported with a message mentioning "HTTPS".
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n  - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    /// An `http://` `thumbnail` is reported with a message mentioning "HTTPS".
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    /// Two H3 headings in one body produce two entities in document order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    /// A 201-character `nationality` value triggers the "exceeds 100 chars" error.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    /// Eleven `aliases` entries trigger the "exceeds 10" error.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }

    /// An organization without `org_type` reports the missing required field.
    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| { e.message.contains("missing required field \"org_type\"") })
        );
    }

    /// An organization that supplies `org_type` passes validation.
    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}