// Source: weave_content/entity.rs

1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Maximum entities per file.
///
/// `parse_entities` reports an error when a single parsed section yields
/// more entities than this.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Maximum length of an entity name.
///
/// Compared against `String::len`, i.e. the UTF-8 byte length of the name.
const MAX_NAME_LEN: usize = 300;
10
/// Label derived from the section an entity appears in.
///
/// Each variant is produced from one `SectionKind` by `Label::from_section`
/// and is rendered as a lowercase word by the `Display` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Entity from the People section (displayed as `person`).
    Person,
    /// Entity from the Organizations section (displayed as `organization`).
    Organization,
    /// Entity from the Events section (displayed as `event`).
    Event,
    /// Entity from the Documents section (displayed as `document`).
    Document,
    /// Entity from the Assets section (displayed as `asset`).
    Asset,
}
20
21impl fmt::Display for Label {
22    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23        match self {
24            Self::Person => write!(f, "person"),
25            Self::Organization => write!(f, "organization"),
26            Self::Event => write!(f, "event"),
27            Self::Document => write!(f, "document"),
28            Self::Asset => write!(f, "asset"),
29        }
30    }
31}
32
33impl Label {
34    pub fn from_section(kind: SectionKind) -> Option<Self> {
35        match kind {
36            SectionKind::People => Some(Self::Person),
37            SectionKind::Organizations => Some(Self::Organization),
38            SectionKind::Events => Some(Self::Event),
39            SectionKind::Documents => Some(Self::Document),
40            SectionKind::Assets => Some(Self::Asset),
41            _ => None,
42        }
43    }
44}
45
/// A parsed entity with its name, label, and field map.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Heading text of the entity.
    pub name: String,
    /// Label derived from the section (or supplied by the caller for
    /// standalone entity files).
    pub label: Label,
    /// Parsed `- key: value` fields in the order they were encountered.
    /// The `id` field is removed from this list and stored in `id` instead.
    pub fields: Vec<(String, FieldValue)>,
    /// Stored NULID from `- id:` field (None if not yet generated).
    pub id: Option<String>,
    /// Line number (1-indexed) of the H3 heading.
    pub line: usize,
    /// Tags from front matter (empty for inline entities).
    /// The parsers in this file always leave this empty.
    pub tags: Vec<String>,
}
59
/// A field value: either a single string or a list of strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value from `- key: value`; indented continuation lines are
    /// appended to it separated by `\n`.
    Single(String),
    /// A list built from nested `  - item` bullets under `- key:`, or from a
    /// comma-separated value of a list field.
    List(Vec<String>),
}
66
67/// Parse a single entity from a standalone entity file body.
68/// The body is the text after the H1 heading (bullet fields, no H3 headings).
69/// `label` is determined by the file's directory (people/ or organizations/).
70/// `id` comes from the front matter (may be None).
71pub fn parse_entity_file_body(
72    name: &str,
73    body: &str,
74    label: Label,
75    id: Option<String>,
76    title_line: usize,
77    errors: &mut Vec<ParseError>,
78) -> Entity {
79    let section_kind = match label {
80        Label::Person => SectionKind::People,
81        Label::Organization => SectionKind::Organizations,
82        Label::Event => SectionKind::Events,
83        Label::Document => SectionKind::Documents,
84        Label::Asset => SectionKind::Assets,
85    };
86
87    // Wrap the body with a fake H3 heading so we can reuse parse_entities
88    let wrapped = format!("### {name}\n{body}");
89    let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
90
91    if let Some(mut entity) = entities.pop() {
92        entity.id = id;
93        entity.line = title_line;
94        entity
95    } else {
96        Entity {
97            name: name.to_string(),
98            label,
99            fields: Vec::new(),
100            id,
101            line: title_line,
102            tags: Vec::new(),
103        }
104    }
105}
106
107/// Parse entities from an entity section (People, Organizations, Events).
108/// The `body` is the text between the H2 heading and the next H2 heading.
109/// `section_start_line` is the line number of the H2 heading in the original file.
110#[allow(clippy::too_many_lines)]
111pub fn parse_entities(
112    body: &str,
113    section_kind: SectionKind,
114    section_start_line: usize,
115    errors: &mut Vec<ParseError>,
116) -> Vec<Entity> {
117    let Some(label) = Label::from_section(section_kind) else {
118        return Vec::new();
119    };
120
121    let lines: Vec<&str> = body.lines().collect();
122    let mut entities: Vec<Entity> = Vec::new();
123    let mut current_name: Option<String> = None;
124    let mut current_line: usize = 0;
125    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
126    // Track multi-line value continuation and nested list building
127    let mut pending_list_key: Option<String> = None;
128    let mut pending_list_items: Vec<String> = Vec::new();
129
130    for (i, line) in lines.iter().enumerate() {
131        let file_line = section_start_line + 1 + i; // +1 because body starts after the H2 heading line
132
133        // Check for H3 heading
134        if let Some(name) = strip_h3(line) {
135            // Flush pending list
136            flush_pending_list(
137                &mut pending_list_key,
138                &mut pending_list_items,
139                &mut current_fields,
140            );
141
142            // Flush previous entity
143            if let Some(entity_name) = current_name.take() {
144                let entity = build_entity(
145                    entity_name,
146                    label,
147                    current_line,
148                    &mut current_fields,
149                    errors,
150                );
151                entities.push(entity);
152            }
153
154            current_name = Some(name.to_string());
155            current_line = file_line;
156            current_fields.clear();
157            continue;
158        }
159
160        // Only parse bullet fields if we're inside an entity (after an H3)
161        if current_name.is_none() {
162            if !line.trim().is_empty() {
163                errors.push(ParseError {
164                    line: file_line,
165                    message: "content before first entity heading (### Name)".into(),
166                });
167            }
168            continue;
169        }
170
171        let trimmed = line.trim();
172
173        // Nested list item: `  - value` (2-space indent + dash)
174        if let Some(item) = trimmed.strip_prefix("- ") {
175            if line.starts_with("  - ") && pending_list_key.is_some() {
176                // Nested list item for pending list key
177                pending_list_items.push(item.trim().to_string());
178                continue;
179            }
180
181            // Flush pending list before processing new top-level bullet
182            flush_pending_list(
183                &mut pending_list_key,
184                &mut pending_list_items,
185                &mut current_fields,
186            );
187
188            // Top-level bullet: `- key: value` or `- key:`
189            if let Some((key, value)) = parse_bullet(item) {
190                if value.is_empty() {
191                    // Start a nested list: `- urls:`
192                    pending_list_key = Some(key);
193                    pending_list_items.clear();
194                } else if is_list_field(&key) && value.contains(',') {
195                    // Comma-separated list: `- aliases: A, B, C`
196                    let items: Vec<String> = value
197                        .split(',')
198                        .map(|s| s.trim().to_string())
199                        .filter(|s| !s.is_empty())
200                        .collect();
201                    current_fields.push((key, FieldValue::List(items)));
202                } else {
203                    current_fields.push((key, FieldValue::Single(value)));
204                }
205            } else {
206                errors.push(ParseError {
207                    line: file_line,
208                    message: format!(
209                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
210                    ),
211                });
212            }
213            continue;
214        }
215
216        // Multi-line value continuation (2-space indent, not a bullet)
217        if line.starts_with("  ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
218            if pending_list_key.is_some() {
219                // Could be continuation inside a list context -- treat as error
220                errors.push(ParseError {
221                    line: file_line,
222                    message: "unexpected indented text in list context".into(),
223                });
224            } else if let Some(last) = current_fields.last_mut() {
225                // Append to last single-value field
226                if let FieldValue::Single(ref mut val) = last.1 {
227                    val.push('\n');
228                    val.push_str(trimmed);
229                }
230            }
231            continue;
232        }
233
234        // Blank line or other content -- ignore
235        if !trimmed.is_empty() {
236            // Flush pending list on non-indented non-bullet content
237            flush_pending_list(
238                &mut pending_list_key,
239                &mut pending_list_items,
240                &mut current_fields,
241            );
242        }
243    }
244
245    // Flush final pending list and entity
246    flush_pending_list(
247        &mut pending_list_key,
248        &mut pending_list_items,
249        &mut current_fields,
250    );
251
252    if let Some(entity_name) = current_name.take() {
253        let entity = build_entity(
254            entity_name,
255            label,
256            current_line,
257            &mut current_fields,
258            errors,
259        );
260        entities.push(entity);
261    }
262
263    // Boundary check
264    if entities.len() > MAX_ENTITIES_PER_FILE {
265        errors.push(ParseError {
266            line: section_start_line,
267            message: format!(
268                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
269                entities.len()
270            ),
271        });
272    }
273
274    entities
275}
276
277fn flush_pending_list(
278    pending_key: &mut Option<String>,
279    pending_items: &mut Vec<String>,
280    fields: &mut Vec<(String, FieldValue)>,
281) {
282    if let Some(key) = pending_key.take() {
283        fields.push((key, FieldValue::List(std::mem::take(pending_items))));
284    }
285}
286
287fn build_entity(
288    name: String,
289    label: Label,
290    line: usize,
291    fields: &mut Vec<(String, FieldValue)>,
292    errors: &mut Vec<ParseError>,
293) -> Entity {
294    // Validate name
295    if name.trim().is_empty() {
296        errors.push(ParseError {
297            line,
298            message: "entity name must not be empty".into(),
299        });
300    } else if name.len() > MAX_NAME_LEN {
301        errors.push(ParseError {
302            line,
303            message: format!(
304                "entity name exceeds {MAX_NAME_LEN} chars (got {})",
305                name.len()
306            ),
307        });
308    }
309
310    // Extract id field before validation (not a schema field)
311    let id = extract_id_field(fields);
312
313    // Apply type: shorthand
314    apply_type_shorthand(fields, label);
315
316    // Normalize enum field values (lowercase, spaces → underscores)
317    normalize_enum_fields(fields);
318
319    // Validate fields against schema
320    validate_fields(fields, label, line, errors);
321
322    Entity {
323        name,
324        label,
325        fields: std::mem::take(fields),
326        id,
327        line,
328        tags: Vec::new(),
329    }
330}
331
332/// Extract and remove the `id` field from the field list.
333fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
334    let pos = fields.iter().position(|(k, _)| k == "id")?;
335    let (_, value) = fields.remove(pos);
336    match value {
337        FieldValue::Single(s) if !s.is_empty() => Some(s),
338        _ => None,
339    }
340}
341
342/// Replace `type:` shorthand with the label-specific field name.
343fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
344    for field in fields.iter_mut() {
345        if field.0 == "type" {
346            field.0 = match label {
347                Label::Organization => "org_type".to_string(),
348                Label::Event => "event_type".to_string(),
349                Label::Document => "doc_type".to_string(),
350                Label::Asset => "asset_type".to_string(),
351                Label::Person => "type".to_string(), // will be caught as unknown
352            };
353        }
354    }
355}
356
/// Split a bullet item (the text after `- `) into `(key, value)` at the
/// first colon, trimming both sides.
///
/// Returns `None` when the item has no colon or the key is empty. The value
/// may be empty (`- urls:`) and may itself contain colons.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    if key.is_empty() {
        None
    } else {
        Some((key.to_string(), raw_value.trim().to_string()))
    }
}
367
/// Whether `key` names a field whose comma-separated value is split into a
/// list (`aliases`, `urls` or `role`).
fn is_list_field(key: &str) -> bool {
    const LIST_FIELDS: [&str; 3] = ["aliases", "urls", "role"];
    LIST_FIELDS.contains(&key)
}
372
/// Return the trimmed heading text if `line` is an H3 heading (`### Text`,
/// optionally indented), otherwise `None`.
///
/// Text that itself begins with `#` is rejected so deeper headings are never
/// mistaken for H3.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
384
385// --- Field validation ---
386
/// Fields accepted on every entity, whatever its label. A field is known for
/// a label when it appears here or in that label's own list below.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional fields accepted on `Label::Person` entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "place_of_birth",
    "status",
];

/// Additional fields accepted on `Label::Organization` entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional fields accepted on `Label::Event` entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional fields accepted on `Label::Document` entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional fields accepted on `Label::Asset` entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
419
420/// Known enum values — delegated to domain module constants.
421use crate::domain;
422
// Local aliases for the known-value lists declared on the domain types.
// The first six are attached to fields by `field_constraint`; the three
// status lists are selected per label by `validate_status`.
const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
432
/// Validation constraint for a single-valued field: its maximum length and,
/// for enum fields, the set of known values.
struct FieldConstraint {
    /// Maximum accepted length, compared against `str::len` (bytes).
    max_len: usize,
    /// If Some, the field is an enum with these known values.
    enum_values: Option<&'static [&'static str]>,
}
439
440fn field_constraint(key: &str) -> Option<FieldConstraint> {
441    match key {
442        "description" => Some(FieldConstraint {
443            max_len: 2000,
444            enum_values: None,
445        }),
446        "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
447            max_len: 2048,
448            enum_values: None,
449        }),
450        "occurred_at" | "date_of_birth" | "founded_date" | "issued_at" | "opened_at"
451        | "closed_at" => Some(FieldConstraint {
452            max_len: 10,
453            enum_values: None,
454        }),
455        "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
456            Some(FieldConstraint {
457                max_len: 200,
458                enum_values: None,
459            })
460        }
461        "jurisdiction" => Some(FieldConstraint {
462            // jurisdiction: ID or ID/South Sulawesi (country + optional subdivision)
463            max_len: 203, // 2 + 1 + 200
464            enum_values: None,
465        }),
466        "role" => Some(FieldConstraint {
467            max_len: 100,
468            enum_values: Some(ROLE_VALUES),
469        }),
470        "org_type" => Some(FieldConstraint {
471            max_len: 100,
472            enum_values: Some(ORG_TYPE_VALUES),
473        }),
474        "event_type" => Some(FieldConstraint {
475            max_len: 100,
476            enum_values: Some(EVENT_TYPE_VALUES),
477        }),
478        "doc_type" => Some(FieldConstraint {
479            max_len: 100,
480            enum_values: Some(DOC_TYPE_VALUES),
481        }),
482        "asset_type" => Some(FieldConstraint {
483            max_len: 100,
484            enum_values: Some(ASSET_TYPE_VALUES),
485        }),
486        "severity" => Some(FieldConstraint {
487            max_len: 20,
488            enum_values: Some(SEVERITY_VALUES),
489        }),
490        "status" => Some(FieldConstraint {
491            // Status validation is context-dependent (Person vs Org vs Asset),
492            // handled separately in validate_fields.
493            max_len: 30,
494            enum_values: None,
495        }),
496        "qualifier" | "nationality" | "case_number" | "registration_number" => {
497            Some(FieldConstraint {
498                max_len: 100,
499                enum_values: None,
500            })
501        }
502        // List fields validated separately
503        _ => None,
504    }
505}
506
/// Maximum number of items in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `String::len` (bytes).
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of items in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `String::len` (bytes).
const MAX_URL_LEN: usize = 2048;
512
513/// Normalize enum field values in-place: lowercase and replace spaces with
514/// underscores. Values with the `custom:` prefix are left unchanged.
515/// Handles both single-value and list-value enum fields.
516fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
517    for (key, value) in fields.iter_mut() {
518        let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
519
520        match value {
521            FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
522                let normalized = val.to_lowercase().replace(' ', "_");
523                if normalized != *val {
524                    *val = normalized;
525                }
526            }
527            FieldValue::List(items) if is_enum => {
528                for item in items.iter_mut() {
529                    if !item.starts_with("custom:") {
530                        let normalized = item.to_lowercase().replace(' ', "_");
531                        if normalized != *item {
532                            *item = normalized;
533                        }
534                    }
535                }
536            }
537            _ => {}
538        }
539    }
540}
541
542#[allow(clippy::too_many_lines)]
543fn validate_fields(
544    fields: &[(String, FieldValue)],
545    label: Label,
546    line: usize,
547    errors: &mut Vec<ParseError>,
548) {
549    let label_fields: &[&str] = match label {
550        Label::Person => PERSON_FIELDS,
551        Label::Organization => ORGANIZATION_FIELDS,
552        Label::Event => EVENT_FIELDS,
553        Label::Document => DOCUMENT_FIELDS,
554        Label::Asset => ASSET_FIELDS,
555    };
556
557    for (key, value) in fields {
558        // Check if field is known
559        if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
560            errors.push(ParseError {
561                line,
562                message: format!("unknown field {key:?} for {label}"),
563            });
564            continue;
565        }
566
567        match value {
568            FieldValue::Single(val) => {
569                if let Some(constraint) = field_constraint(key) {
570                    if val.len() > constraint.max_len {
571                        errors.push(ParseError {
572                            line,
573                            message: format!(
574                                "field {key:?} exceeds {} chars (got {})",
575                                constraint.max_len,
576                                val.len()
577                            ),
578                        });
579                    }
580
581                    // Validate enum values
582                    if let Some(allowed) = constraint.enum_values {
583                        validate_enum_value(key, val, allowed, line, errors);
584                    }
585
586                    // Validate date format
587                    if matches!(
588                        key.as_str(),
589                        "occurred_at"
590                            | "date_of_birth"
591                            | "founded_date"
592                            | "issued_at"
593                            | "opened_at"
594                            | "closed_at"
595                    ) && !val.is_empty()
596                    {
597                        validate_date_format(key, val, line, errors);
598                    }
599
600                    // Validate URL fields
601                    if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
602                        && !val.is_empty()
603                        && !val.starts_with("https://")
604                    {
605                        errors.push(ParseError {
606                            line,
607                            message: format!("field {key:?} must be HTTPS URL"),
608                        });
609                    }
610                }
611
612                // Context-dependent status validation
613                if key == "status" {
614                    validate_status(val, label, line, errors);
615                }
616
617                // Validate jurisdiction format: `XX` or `XX/Subdivision`
618                if key == "jurisdiction" && !val.is_empty() {
619                    validate_jurisdiction(val, line, errors);
620                }
621
622                // Validate money format: `amount currency "display"`
623                if key == "value" && !val.is_empty() {
624                    validate_money(val, line, errors);
625                }
626            }
627            FieldValue::List(items) => match key.as_str() {
628                "aliases" => {
629                    if items.len() > MAX_ALIASES {
630                        errors.push(ParseError {
631                            line,
632                            message: format!(
633                                "aliases exceeds {MAX_ALIASES} items (got {})",
634                                items.len()
635                            ),
636                        });
637                    }
638                    for item in items {
639                        if item.len() > MAX_ALIAS_LEN {
640                            errors.push(ParseError {
641                                line,
642                                message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
643                            });
644                        }
645                    }
646                }
647                "urls" => {
648                    if items.len() > MAX_URLS {
649                        errors.push(ParseError {
650                            line,
651                            message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
652                        });
653                    }
654                    for item in items {
655                        if item.len() > MAX_URL_LEN {
656                            errors.push(ParseError {
657                                line,
658                                message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
659                            });
660                        }
661                        if !item.starts_with("https://") {
662                            errors.push(ParseError {
663                                line,
664                                message: format!("url must be HTTPS: {item:?}"),
665                            });
666                        }
667                    }
668                }
669                "role" => {
670                    if items.len() > MAX_ROLES {
671                        errors.push(ParseError {
672                            line,
673                            message: format!(
674                                "role exceeds {MAX_ROLES} items (got {})",
675                                items.len()
676                            ),
677                        });
678                    }
679                    for item in items {
680                        validate_enum_value("role", item, ROLE_VALUES, line, errors);
681                    }
682                }
683                _ => {}
684            },
685        }
686    }
687
688    // Required field checks
689    if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
690        errors.push(ParseError {
691            line,
692            message: "organization entity missing required field \"org_type\"".into(),
693        });
694    }
695}
696
/// Maximum roles per person; `validate_fields` reports an error when a
/// `role` list has more items than this.
const MAX_ROLES: usize = 10;
699
700/// Validate status value based on entity label context.
701fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
702    let allowed: &[&str] = match label {
703        Label::Person => PERSON_STATUS_VALUES,
704        Label::Organization => ORG_STATUS_VALUES,
705        Label::Asset => ASSET_STATUS_VALUES,
706        _ => {
707            errors.push(ParseError {
708                line,
709                message: format!("field \"status\" is not valid for {label}"),
710            });
711            return;
712        }
713    };
714
715    let normalized = value.to_lowercase().replace(' ', "_");
716    if !allowed.contains(&normalized.as_str()) {
717        errors.push(ParseError {
718            line,
719            message: format!(
720                "invalid status {value:?} for {label} (known: {})",
721                allowed.join(", ")
722            ),
723        });
724    }
725}
726
727/// Validate jurisdiction format: `XX` or `XX/Subdivision`.
728fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
729    if let Some(slash_pos) = value.find('/') {
730        let country = &value[..slash_pos];
731        let subdivision = &value[slash_pos + 1..];
732        if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
733            errors.push(ParseError {
734                line,
735                message: format!(
736                    "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
737                ),
738            });
739        }
740        if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
741            errors.push(ParseError {
742                line,
743                message: format!(
744                    "jurisdiction subdivision must be 1-{} chars",
745                    domain::MAX_SUBDIVISION_LEN
746                ),
747            });
748        }
749    } else {
750        // Just country code
751        if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
752            errors.push(ParseError {
753                line,
754                message: format!(
755                    "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
756                ),
757            });
758        }
759    }
760}
761
762/// Validate money DSL format: `amount currency "display"`.
763/// Example: `500000000000 IDR "Rp 500 billion"`
764fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
765    // Split: amount currency "display"
766    let parts: Vec<&str> = value.splitn(3, ' ').collect();
767    if parts.len() < 3 {
768        errors.push(ParseError {
769            line,
770            message: format!(
771                "invalid money format: expected `amount currency \"display\"`, got {value:?}"
772            ),
773        });
774        return;
775    }
776
777    // Validate amount is a valid integer
778    if parts[0].parse::<i64>().is_err() {
779        errors.push(ParseError {
780            line,
781            message: format!("money amount must be an integer, got {:?}", parts[0]),
782        });
783    }
784
785    // Validate currency is 3-letter uppercase
786    let currency = parts[1];
787    if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
788        errors.push(ParseError {
789            line,
790            message: format!(
791                "money currency must be 3-letter uppercase ISO code, got {currency:?}"
792            ),
793        });
794    }
795
796    // Validate display is quoted
797    let display = parts[2];
798    if !display.starts_with('"') || !display.ends_with('"') {
799        errors.push(ParseError {
800            line,
801            message: format!("money display must be quoted, got {display:?}"),
802        });
803    } else {
804        let inner = &display[1..display.len() - 1];
805        if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
806            errors.push(ParseError {
807                line,
808                message: format!(
809                    "money display exceeds {} chars (got {})",
810                    domain::MAX_MONEY_DISPLAY_LEN,
811                    inner.len()
812                ),
813            });
814        }
815    }
816}
817
818fn validate_enum_value(
819    key: &str,
820    value: &str,
821    allowed: &[&str],
822    line: usize,
823    errors: &mut Vec<ParseError>,
824) {
825    // custom: prefix is always valid (if non-empty after prefix, max 100 chars)
826    if let Some(custom) = value.strip_prefix("custom:") {
827        if custom.is_empty() || custom.len() > 100 {
828            errors.push(ParseError {
829                line,
830                message: format!(
831                    "field {key:?} custom value must be 1-100 chars, got {}",
832                    custom.len()
833                ),
834            });
835        }
836        return;
837    }
838
839    let normalized = value.to_lowercase().replace(' ', "_");
840    if !allowed.contains(&normalized.as_str()) {
841        errors.push(ParseError {
842            line,
843            message: format!(
844                "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
845                allowed.join(", ")
846            ),
847        });
848    }
849}
850
851fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
852    // Valid formats: YYYY, YYYY-MM, YYYY-MM-DD
853    let valid = matches!(value.len(), 4 | 7 | 10)
854        && value.chars().enumerate().all(|(i, c)| match i {
855            4 | 7 => c == '-',
856            _ => c.is_ascii_digit(),
857        });
858
859    if !valid {
860        errors.push(ParseError {
861            line,
862            message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
863        });
864    }
865}
866
#[cfg(test)]
mod tests {
    //! Unit tests for entity parsing and per-field validation.
    //!
    //! Each test feeds a small Markdown body to `parse_entities` and inspects the
    //! returned entities and/or the collected `ParseError`s.

    use super::*;

    /// Returns the value stored under `key` on `entity`, or `None` if the field is absent.
    fn field<'a>(entity: &'a Entity, key: &str) -> Option<&'a FieldValue> {
        entity
            .fields
            .iter()
            .find(|(name, _)| name == key)
            .map(|(_, value)| value)
    }

    /// True when any collected error message contains `needle`.
    fn has_error(errors: &[ParseError], needle: &str) -> bool {
        errors.iter().any(|err| err.message.contains(needle))
    }

    /// Builds the expected single-string field value.
    fn single(text: &str) -> FieldValue {
        FieldValue::Single(text.to_string())
    }

    /// Builds the expected list field value.
    fn list(items: &[&str]) -> FieldValue {
        FieldValue::List(items.iter().map(|item| (*item).to_string()).collect())
    }

    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            "  (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let person = &entities[0];
        assert_eq!(person.name, "Mark Bonnick");
        assert_eq!(person.label, Label::Person);
        assert_eq!(person.fields.len(), 5);

        // The indented continuation line is joined onto the description with a newline.
        assert_eq!(
            field(person, "description"),
            Some(&single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal."
            ))
        );
    }

    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(
            field(&entities[0], "role"),
            Some(&list(&["politician", "executive"]))
        );
    }

    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Make sure the entity was actually produced, not just that no error was raised.
        assert_eq!(entities.len(), 1);
    }

    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "invalid status"));
    }

    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            "  - https://www.arsenal.com",
            "  - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let org = &entities[0];
        assert_eq!(org.name, "Arsenal FC");
        assert_eq!(org.label, Label::Organization);

        // `type:` should have been expanded to `org_type:`.
        assert_eq!(field(org, "org_type"), Some(&single("sports_club")));

        // Aliases given as a comma-separated value.
        assert_eq!(
            field(org, "aliases"),
            Some(&list(&["Arsenal", "The Gunners", "Arsenal Football Club"]))
        );

        // URLs given as a nested list.
        assert_eq!(
            field(org, "urls"),
            Some(&list(&[
                "https://www.arsenal.com",
                "https://en.wikipedia.org/wiki/Arsenal_F.C.",
            ]))
        );
    }

    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(
            field(&entities[0], "jurisdiction"),
            Some(&single("ID/West Java"))
        );
    }

    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(has_error(&errors, "jurisdiction"));
    }

    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let event = &entities[0];
        assert_eq!(event.label, Label::Event);
        // `type:` is stored under `event_type` for events.
        assert_eq!(field(event, "event_type"), Some(&single("dismissal")));
    }

    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Make sure the entity was actually produced, not just that no error was raised.
        assert_eq!(entities.len(), 1);
    }

    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(has_error(&errors, "money"));
    }

    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "unknown field"));
    }

    #[test]
    fn reject_wrong_label_field() {
        // `org_type` is used on a person here, so it must be reported as unknown.
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "unknown field"));
    }

    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "invalid role"));
    }

    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        // A list value maps to "" so the comparison below fails rather than panics.
        let role = field(&entities[0], "role").map(|value| match value {
            FieldValue::Single(text) => text.as_str(),
            FieldValue::List(_) => "",
        });
        assert_eq!(role, Some("civil_servant"));
    }

    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(
            field(&entities[0], "role"),
            Some(&list(&["civil_servant", "law_enforcement"]))
        );
    }

    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "YYYY"));
    }

    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n  - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "HTTPS"));
    }

    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "HTTPS"));
    }

    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    #[test]
    fn field_max_length_violation() {
        // 201 chars is well past the 100-char limit named in the expected message.
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "exceeds 100 chars"));
    }

    #[test]
    fn too_many_aliases() {
        // Eleven aliases: one more than the limit of 10 named in the expected message.
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "exceeds 10"));
    }

    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(has_error(&errors, "missing required field \"org_type\""));
    }

    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}