// kg/validate.rs

1use std::collections::{HashMap, HashSet};
2use std::path::Path;
3
4use crate::graph::GraphFile;
5
/// Outcome of a validation run.
///
/// `errors` holds findings that make the graph invalid; `warnings` holds
/// quality findings that do not. `validate_graph` fills both lists and sorts
/// each one before returning.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ValidationReport {
    /// Blocking findings, one human-readable message per entry.
    pub errors: Vec<String>,
    /// Non-blocking findings, one human-readable message per entry.
    pub warnings: Vec<String>,
}
10
11// ---------------------------------------------------------------------------
12// Static ontology data
13// ---------------------------------------------------------------------------
14
/// Built-in node type names.
///
/// `is_valid_node_type` accepts every entry here in addition to custom
/// tokens. The single-letter entries `"D"` and `"F"` have no row in
/// `TYPE_TO_PREFIX` / `TYPE_TO_CODE`; `canonicalize_node_id_for_type` checks
/// their id suffixes with the generated-node suffix rule instead of the
/// snake_case rule.
pub const VALID_TYPES: &[&str] = &[
    "Concept",
    "Process",
    "DataStore",
    "Interface",
    "Rule",
    "Feature",
    "Decision",
    "Convention",
    "Note",
    "Bug",
    "D",
    "F",
];
29
/// Built-in edge relation names.
///
/// `is_valid_relation` accepts every entry here in addition to custom tokens.
/// Not every relation has a row in `EDGE_TYPE_RULES`; relations without one
/// carry no source/target type constraint.
pub const VALID_RELATIONS: &[&str] = &[
    "HAS",
    "STORED_IN",
    "TRIGGERS",
    "CREATED_BY",
    "AFFECTED_BY",
    "AVAILABLE_IN",
    "DOCUMENTED_IN",
    "DEPENDS_ON",
    "TRANSITIONS",
    "DECIDED_BY",
    "GOVERNED_BY",
    "USES",
    "READS_FROM",
];
45
/// Dictionary of recognized node provenance codes.
///
/// `validate_graph` emits a warning when a non-generated node has a non-blank
/// provenance that is not listed here. The meaning of each letter is not
/// defined in this file.
pub const VALID_PROVENANCE_CODES: &[&str] = &["U", "D", "A", "G"];
47
/// Recognized leading `<TYPE>` tokens of a source reference.
///
/// `validate_source_reference` rejects entries whose first whitespace-separated
/// token is not listed here, and `normalize_source_reference` prefixes such
/// entries with `DOC`.
pub const VALID_SOURCE_TYPES: &[&str] = &[
    "URL",
    "SVN",
    "SOURCECODE",
    "WIKI",
    "CONFLUENCE",
    "CONVERSATION",
    "GIT_COMMIT",
    "PULL_REQUEST",
    "ISSUE",
    "DOC",
    "LOG",
    "OTHER",
];
62
/// Maximum length (in bytes, as measured by `str::len`) of a custom node type token.
const MAX_CUSTOM_TYPE_LEN: usize = 48;
/// Maximum length (in bytes, as measured by `str::len`) of a custom relation token.
const MAX_CUSTOM_RELATION_LEN: usize = 64;
65
/// Maps node type -> expected id prefix.
///
/// The prefix is the lowercase marker placed before the `:` in a node id
/// (for example `concept:some_name`). The types `"D"` and `"F"` from
/// `VALID_TYPES` intentionally have no row here.
pub const TYPE_TO_PREFIX: &[(&str, &str)] = &[
    ("Concept", "concept"),
    ("Process", "process"),
    ("DataStore", "datastore"),
    ("Interface", "interface"),
    ("Rule", "rule"),
    ("Feature", "feature"),
    ("Decision", "decision"),
    ("Convention", "convention"),
    ("Note", "note"),
    ("Bug", "bug"),
];
79
/// Maps node type -> canonical short code used in IDs.
///
/// The code is an alternative marker accepted before the `:` in a node id
/// (for example `K:some_name` for a `Concept`). Rows cover the same types as
/// `TYPE_TO_PREFIX`.
pub const TYPE_TO_CODE: &[(&str, &str)] = &[
    ("Concept", "K"),
    ("Process", "P"),
    ("DataStore", "D"),
    ("Interface", "I"),
    ("Rule", "R"),
    ("Feature", "F"),
    ("Decision", "Z"),
    ("Convention", "C"),
    ("Note", "N"),
    ("Bug", "B"),
];
93
/// Per-relation endpoint constraints.
///
/// Each row is `(relation, valid_source_types, valid_target_types)`.
/// An empty slice means no constraint for that side. `validate_graph` only
/// applies a row when both endpoint types are listed in `VALID_TYPES`;
/// relations without a row are unconstrained.
pub const EDGE_TYPE_RULES: &[(&str, &[&str], &[&str])] = &[
    (
        "HAS",
        &["Concept", "Process", "Interface", "D", "F"],
        &[
            "Concept",
            "Feature",
            "DataStore",
            "Rule",
            "Interface",
            "D",
            "F",
        ],
    ),
    ("STORED_IN", &["Concept", "Process", "Rule"], &["DataStore"]),
    (
        "CREATED_BY",
        &["Concept", "DataStore", "Interface", "Decision"],
        &["Process"],
    ),
    (
        "TRIGGERS",
        &["Process", "Rule"],
        &["Process", "Bug", "Rule"],
    ),
    (
        "AFFECTED_BY",
        &[
            "Concept",
            "Process",
            "DataStore",
            "Interface",
            "Rule",
            "Feature",
            "Decision",
            "Bug",
        ],
        &[
            "Concept",
            "Process",
            "DataStore",
            "Interface",
            "Rule",
            "Feature",
            "Decision",
            "Convention",
            "Bug",
        ],
    ),
    (
        "AVAILABLE_IN",
        &["Feature", "DataStore", "Concept", "Process"],
        &["Interface"],
    ),
    (
        "DOCUMENTED_IN",
        &["Concept", "Process", "Decision", "Rule", "Feature", "Bug"],
        &["Interface", "Note"],
    ),
    (
        "DEPENDS_ON",
        &["Feature", "Process", "Interface"],
        &["Feature", "DataStore", "Interface", "Concept"],
    ),
    ("TRANSITIONS", &["Process", "Rule"], &["Process", "Rule"]),
    (
        "DECIDED_BY",
        &["Concept", "Process", "Interface"],
        &["Decision"],
    ),
    (
        "GOVERNED_BY",
        &["Process", "Interface", "DataStore"],
        &["Convention", "Rule"],
    ),
];
172
173// ---------------------------------------------------------------------------
174// Core validation
175// ---------------------------------------------------------------------------
176
177pub fn edge_type_rule(
178    relation: &str,
179) -> Option<(&'static [&'static str], &'static [&'static str])> {
180    EDGE_TYPE_RULES
181        .iter()
182        .find(|(rule_relation, _, _)| *rule_relation == relation)
183        .map(|(_, source_types, target_types)| (*source_types, *target_types))
184}
185
186pub fn canonical_type_code_for(node_type: &str) -> Option<&'static str> {
187    TYPE_TO_CODE
188        .iter()
189        .find(|(typ, _)| *typ == node_type)
190        .map(|(_, code)| *code)
191}
192
193fn type_for_prefix(prefix: &str) -> Option<&'static str> {
194    TYPE_TO_PREFIX
195        .iter()
196        .find(|(_, known_prefix)| *known_prefix == prefix)
197        .map(|(typ, _)| *typ)
198}
199
200fn type_for_code(code: &str) -> Option<&'static str> {
201    TYPE_TO_CODE
202        .iter()
203        .find(|(_, known_code)| *known_code == code)
204        .map(|(typ, _)| *typ)
205}
206
/// Checks the snake_case part of a node id (the text after the `:`).
///
/// The suffix must start with an ASCII lowercase letter and may continue with
/// ASCII lowercase letters, ASCII digits and `_` only. An empty suffix is
/// rejected.
fn valid_id_suffix(suffix: &str) -> bool {
    let mut rest = suffix.chars();
    match rest.next() {
        Some(first) if first.is_ascii_lowercase() => {
            rest.all(|c| matches!(c, 'a'..='z' | '0'..='9' | '_'))
        }
        _ => false,
    }
}
217
/// Checks the id suffix used by generated nodes (and the `D` / `F` types).
///
/// The suffix must be non-empty and free of line breaks. One trailing
/// `:<digits>` segment is allowed and ignored; what remains must be non-empty
/// and must not contain any other `:`.
fn valid_generated_node_suffix(suffix: &str) -> bool {
    if suffix.is_empty() {
        return false;
    }
    if suffix.chars().any(|ch| ch == '\n' || ch == '\r') {
        return false;
    }

    // Strip an optional numeric tail such as the `:12` in `path:12`.
    let stem = suffix
        .rsplit_once(':')
        .filter(|(_, index)| !index.is_empty() && index.bytes().all(|b| b.is_ascii_digit()))
        .map_or(suffix, |(stem, _)| stem);

    !(stem.is_empty() || stem.contains(':'))
}
232
/// Returns `true` when `node_type` starts with `G` and has at least one more
/// character after it; such types are treated as generated node types.
pub fn is_generated_node_type(node_type: &str) -> bool {
    node_type
        .strip_prefix('G')
        .is_some_and(|rest| !rest.is_empty())
}
236
/// Returns `true` when `value` starts with `G` and has at least one more
/// character after it; such relations are treated as generated relations.
pub fn is_generated_relation(value: &str) -> bool {
    match value.strip_prefix('G') {
        Some(rest) => !rest.is_empty(),
        None => false,
    }
}
240
/// Checks a user-defined type or relation token.
///
/// A token is valid when it is between 1 and `max_len` bytes long and consists
/// solely of ASCII graphic characters (`!` through `~`), which excludes all
/// whitespace and every non-ASCII character.
fn is_valid_custom_token(token: &str, max_len: usize) -> bool {
    let length_ok = (1..=max_len).contains(&token.len());
    // Every byte of a non-ASCII character is >= 0x80 and therefore not graphic,
    // so a byte-wise scan gives the same answer as a char-wise one.
    length_ok && token.bytes().all(|byte| byte.is_ascii_graphic())
}
250
251pub fn is_valid_node_type(value: &str) -> bool {
252    VALID_TYPES.contains(&value) || is_valid_custom_token(value, MAX_CUSTOM_TYPE_LEN)
253}
254
255pub fn is_valid_relation(value: &str) -> bool {
256    VALID_RELATIONS.contains(&value) || is_valid_custom_token(value, MAX_CUSTOM_RELATION_LEN)
257}
258
/// Parses `value` (surrounding whitespace ignored) as a similarity score.
///
/// Returns the number only when it parses as `f64` and lies in `0.0..=1.0`;
/// unparsable text, out-of-range values and NaN all yield `None`.
fn parse_similarity_score(value: &str) -> Option<f64> {
    value
        .trim()
        .parse::<f64>()
        .ok()
        .filter(|score| (0.0..=1.0).contains(score))
}
267
/// Returns `true` when `value` is an uppercase `C` followed by one or more
/// ASCII digits (for example `C1` or `C12`).
fn is_valid_score_component_label(value: &str) -> bool {
    match value.strip_prefix('C') {
        Some(index) => !index.is_empty() && index.bytes().all(|b| b.is_ascii_digit()),
        None => false,
    }
}
274
275pub fn validate_bidirectional_similarity_edge(
276    source_id: &str,
277    relation: &str,
278    target_id: &str,
279    detail: &str,
280    bidirectional: bool,
281) -> Result<(), String> {
282    if !bidirectional {
283        return Ok(());
284    }
285    if relation != "~" {
286        return Err(format!(
287            "bidirectional edge requires '~' relation: {} {} {}",
288            source_id, relation, target_id
289        ));
290    }
291    if source_id > target_id {
292        return Err(format!(
293            "bidirectional edge must be canonicalized (source <= target): {} ~ {}",
294            source_id, target_id
295        ));
296    }
297    if parse_similarity_score(detail).is_none() {
298        return Err(format!(
299            "bidirectional similarity edge requires score in range 0..1: {} ~ {}",
300            source_id, target_id
301        ));
302    }
303    Ok(())
304}
305
/// Returns `true` when `value` is a real UTC instant written exactly as
/// `YYYY-MM-DDTHH:MM:SSZ` (20 bytes).
///
/// Besides the fixed shape, the fields must be in range: month 1..=12, hour
/// 0..=23, minute and second 0..=59, and the day must exist in the given
/// month. February 29 is accepted only in Gregorian leap years, so inputs such
/// as `2023-02-29T00:00:00Z` or `2024-04-31T00:00:00Z` are rejected.
pub fn is_valid_iso_utc_timestamp(value: &str) -> bool {
    /// Reads `len` ASCII digits starting at `start`; `None` if any byte is not a digit.
    fn number(bytes: &[u8], start: usize, len: usize) -> Option<u32> {
        bytes
            .get(start..start + len)?
            .iter()
            .try_fold(0u32, |acc, b| {
                b.is_ascii_digit().then(|| acc * 10 + u32::from(*b - b'0'))
            })
    }

    /// Number of days in `month` (1-based) of `year` in the Gregorian calendar.
    fn days_in_month(year: u32, month: u32) -> u32 {
        match month {
            4 | 6 | 9 | 11 => 30,
            2 if year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) => 29,
            2 => 28,
            _ => 31,
        }
    }

    let bytes = value.as_bytes();
    if bytes.len() != 20 {
        return false;
    }
    // Working on bytes keeps this panic-free even for non-ASCII input of the
    // right length; such input simply fails a separator or digit check.
    let separators_ok = bytes[4] == b'-'
        && bytes[7] == b'-'
        && bytes[10] == b'T'
        && bytes[13] == b':'
        && bytes[16] == b':'
        && bytes[19] == b'Z';
    if !separators_ok {
        return false;
    }

    let (Some(year), Some(month), Some(day), Some(hour), Some(minute), Some(second)) = (
        number(bytes, 0, 4),
        number(bytes, 5, 2),
        number(bytes, 8, 2),
        number(bytes, 11, 2),
        number(bytes, 14, 2),
        number(bytes, 17, 2),
    ) else {
        return false;
    };

    (1..=12).contains(&month)
        && (1..=days_in_month(year, month)).contains(&day)
        && hour <= 23
        && minute <= 59
        && second <= 59
}
347
/// Returns `true` when `value` is a real calendar date written exactly as
/// `YYYY-MM-DD` (10 bytes).
///
/// The month must be 1..=12 and the day must exist in that month. February 29
/// is accepted only in Gregorian leap years, so inputs such as `2023-02-29`
/// or `2024-04-31` are rejected.
pub fn is_valid_iso_date(value: &str) -> bool {
    /// Reads `len` ASCII digits starting at `start`; `None` if any byte is not a digit.
    fn number(bytes: &[u8], start: usize, len: usize) -> Option<u32> {
        bytes
            .get(start..start + len)?
            .iter()
            .try_fold(0u32, |acc, b| {
                b.is_ascii_digit().then(|| acc * 10 + u32::from(*b - b'0'))
            })
    }

    /// Number of days in `month` (1-based) of `year` in the Gregorian calendar.
    fn days_in_month(year: u32, month: u32) -> u32 {
        match month {
            4 | 6 | 9 | 11 => 30,
            2 if year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) => 29,
            2 => 28,
            _ => 31,
        }
    }

    let bytes = value.as_bytes();
    // The length check runs first, so the fixed-index reads below cannot panic.
    if bytes.len() != 10 || bytes[4] != b'-' || bytes[7] != b'-' {
        return false;
    }

    let (Some(year), Some(month), Some(day)) =
        (number(bytes, 0, 4), number(bytes, 5, 2), number(bytes, 8, 2))
    else {
        return false;
    };

    (1..=12).contains(&month) && (1..=days_in_month(year, month)).contains(&day)
}
371
372pub fn validate_source_reference(value: &str) -> Result<(), String> {
373    let trimmed = value.trim();
374    if trimmed.is_empty() {
375        return Err("source entry cannot be empty".to_owned());
376    }
377
378    let parts: Vec<&str> = trimmed.split_whitespace().collect();
379    if parts.len() < 2 {
380        return Err(format!(
381            "source '{}' must have format '<TYPE> <LINK_OR_DATE> <OPTIONAL_DETAILS>'",
382            value
383        ));
384    }
385
386    let source_type = parts[0];
387    if !VALID_SOURCE_TYPES.contains(&source_type) {
388        return Err(format!(
389            "source '{}' uses invalid type '{}'; valid types: {}",
390            value,
391            source_type,
392            VALID_SOURCE_TYPES.join(", ")
393        ));
394    }
395
396    match source_type {
397        "CONVERSATION" => {
398            if !is_valid_iso_date(parts[1]) {
399                return Err(format!(
400                    "source '{}' must use date format YYYY-MM-DD for CONVERSATION",
401                    value
402                ));
403            }
404        }
405        "GIT_COMMIT" => {
406            if parts.len() < 3 {
407                return Err(format!(
408                    "source '{}' must use format 'GIT_COMMIT <REPO_URL_OR_NAME> <COMMIT_SHA> <OPTIONAL_DETAILS>'",
409                    value
410                ));
411            }
412        }
413        _ => {}
414    }
415
416    Ok(())
417}
418
419pub fn normalize_source_reference(value: &str) -> String {
420    let trimmed = value.trim();
421    if trimmed.is_empty() {
422        return String::new();
423    }
424    let source_type = trimmed.split_whitespace().next().unwrap_or_default();
425    if VALID_SOURCE_TYPES.contains(&source_type) {
426        return trimmed.to_owned();
427    }
428    format!("DOC {trimmed}")
429}
430
/// Returns `true` when `value` lies on the current importance scale, i.e. the
/// closed interval from 0.0 to 1.0. NaN is never valid.
pub fn is_valid_importance(value: f64) -> bool {
    let unit_interval = 0.0..=1.0;
    unit_interval.contains(&value)
}
434
/// Returns `true` when `value` looks like it comes from the old integer
/// importance scale: a whole number greater than 1 and at most 6.
///
/// `1.0` is deliberately excluded because it is also a valid value on the
/// current `0.0..=1.0` scale.
pub fn is_legacy_importance(value: f64) -> bool {
    let above_current_scale = value > 1.0;
    let within_legacy_scale = value <= 6.0;
    above_current_scale && within_legacy_scale && value.fract() == 0.0
}
438
439/// Normalize a node id to legacy `<type_prefix>:snake_case` when possible.
440///
441/// Accepted inputs include both canonical `TYPE_CODE:snake_case` and legacy
442/// `prefix:snake_case` forms. Unknown prefixes are returned unchanged.
443pub fn normalize_node_id(id: &str) -> String {
444    let Some((head, suffix)) = id.split_once(':') else {
445        return id.to_owned();
446    };
447    let Some(node_type) = type_for_code(head).or_else(|| type_for_prefix(head)) else {
448        return id.to_owned();
449    };
450    let Some(prefix) = TYPE_TO_PREFIX
451        .iter()
452        .find(|(typ, _)| *typ == node_type)
453        .map(|(_, prefix)| *prefix)
454    else {
455        return id.to_owned();
456    };
457    format!("{prefix}:{suffix}")
458}
459
460/// Validate and canonicalize a node id for a concrete node type.
461///
462/// Returns canonical `TYPE:snake_case` on success.
463pub fn canonicalize_node_id_for_type(id: &str, node_type: &str) -> Result<String, String> {
464    if is_generated_node_type(node_type) {
465        let suffix = match id.split_once(':') {
466            Some((head, suffix)) if head == node_type => suffix,
467            Some((head, _)) => {
468                return Err(format!(
469                    "node id '{}' has type marker '{}'; expected '{}' or a path-only id",
470                    id, head, node_type
471                ));
472            }
473            None => id,
474        };
475        if !valid_generated_node_suffix(suffix) {
476            return Err(format!(
477                "node id '{}' has invalid suffix for type '{}'",
478                id, node_type
479            ));
480        }
481        return Ok(format!("{node_type}:{suffix}"));
482    }
483
484    let Some((head, suffix)) = id.split_once(':') else {
485        return Err(format!(
486            "node id '{}' must be in format <type_code>:snake_case",
487            id
488        ));
489    };
490    let suffix_valid = if matches!(node_type, "D" | "F") {
491        valid_generated_node_suffix(suffix)
492    } else {
493        valid_id_suffix(suffix)
494    };
495    if !suffix_valid {
496        return Err(format!(
497            "node id '{}' has invalid suffix for type '{}'",
498            id, node_type
499        ));
500    }
501
502    if !is_valid_node_type(node_type) {
503        return Err(format!("invalid node type '{node_type}'"));
504    }
505
506    let Some(expected_code) = canonical_type_code_for(node_type) else {
507        if head == node_type {
508            return Ok(format!("{node_type}:{suffix}"));
509        }
510        return Err(format!(
511            "node id '{}' has type marker '{}'; expected '{}' for custom node type",
512            id, head, node_type
513        ));
514    };
515    let Some(expected_prefix) = TYPE_TO_PREFIX
516        .iter()
517        .find(|(typ, _)| *typ == node_type)
518        .map(|(_, prefix)| *prefix)
519    else {
520        return Err(format!("invalid node type '{node_type}'"));
521    };
522
523    if head == expected_code || head == expected_prefix {
524        return Ok(format!("{expected_prefix}:{suffix}"));
525    }
526
527    if let Some(actual_type) = type_for_code(head).or_else(|| type_for_prefix(head)) {
528        return Err(format!(
529            "node id '{}' has type marker '{}' (type '{}') but node_type is '{}'",
530            id, head, actual_type, node_type
531        ));
532    }
533
534    Err(format!(
535        "node id '{}' has unknown type marker '{}'; expected '{}' or '{}'",
536        id, head, expected_code, expected_prefix
537    ))
538}
539
/// Builds the message used when a node type may not be the source of a relation.
///
/// The allowed types are listed in the given order, separated by `, `.
pub fn format_edge_source_type_error(
    source_type: &str,
    relation: &str,
    allowed_source_types: &[impl AsRef<str>],
) -> String {
    let allowed: Vec<&str> = allowed_source_types
        .iter()
        .map(|name| name.as_ref())
        .collect();
    let allowed = allowed.join(", ");
    format!(
        "{} cannot be source of {} (allowed: {})",
        source_type, relation, allowed
    )
}
556
/// Builds the message used when a node type may not be the target of a relation.
///
/// The allowed types are listed in the given order, separated by `, `.
pub fn format_edge_target_type_error(
    target_type: &str,
    relation: &str,
    allowed_target_types: &[impl AsRef<str>],
) -> String {
    let allowed: Vec<&str> = allowed_target_types
        .iter()
        .map(|name| name.as_ref())
        .collect();
    let allowed = allowed.join(", ");
    format!(
        "{} cannot be target of {} (allowed: {})",
        target_type, relation, allowed
    )
}
573
574pub fn validate_graph(
575    graph: &GraphFile,
576    cwd: &Path,
577    deep: bool,
578    base_dir: Option<&str>,
579) -> ValidationReport {
580    let mut errors = Vec::new();
581    let mut warnings = Vec::new();
582
583    let type_to_prefix: HashMap<&str, &str> = TYPE_TO_PREFIX.iter().copied().collect();
584    let type_to_code: HashMap<&str, &str> = TYPE_TO_CODE.iter().copied().collect();
585    // -- metadata --
586    if graph.metadata.name.trim().is_empty() {
587        errors.push("metadata.name missing".to_owned());
588    }
589
590    // -- nodes --
591    let mut id_counts = HashMap::<&str, usize>::new();
592    for node in &graph.nodes {
593        *id_counts.entry(node.id.as_str()).or_insert(0) += 1;
594        let generated = is_generated_node_type(&node.r#type);
595
596        if !is_valid_node_type(&node.r#type) {
597            errors.push(format!("node {} has invalid type {}", node.id, node.r#type));
598        }
599        if node.name.trim().is_empty() && !generated && node.properties.provenance != "G" {
600            errors.push(format!("node {} missing name", node.id));
601        }
602        if !generated && node.source_files.is_empty() {
603            errors.push(format!("node {} missing source_files", node.id));
604        }
605
606        match canonicalize_node_id_for_type(&node.id, &node.r#type) {
607            Ok(_) => {}
608            Err(_) => {
609                if let Some((head, _)) = node.id.split_once(':') {
610                    if let (Some(expected_code), Some(expected_prefix)) = (
611                        type_to_code.get(node.r#type.as_str()),
612                        type_to_prefix.get(node.r#type.as_str()),
613                    ) {
614                        errors.push(format!(
615                            "node id {} invalid for type {} (expected {}:* or {}:*)",
616                            node.id, node.r#type, expected_code, expected_prefix
617                        ));
618                        if type_for_code(head).is_none() && type_for_prefix(head).is_none() {
619                            errors.push(format!(
620                                "node id {} has unknown type marker '{}'",
621                                node.id, head
622                            ));
623                        }
624                    } else {
625                        errors.push(format!(
626                            "node id {} invalid for custom type {} (expected {}:*)",
627                            node.id, node.r#type, node.r#type
628                        ));
629                    }
630                } else {
631                    errors.push(format!(
632                        "node id {} does not match prefix:snake_case",
633                        node.id
634                    ));
635                }
636            }
637        }
638
639        // quality warnings (skip Feature nodes)
640        if !generated && node.r#type != "Feature" && node.properties.provenance != "G" {
641            if node.properties.description.trim().is_empty() {
642                warnings.push(format!("node {} missing description", node.id));
643            }
644            if node.properties.key_facts.is_empty() {
645                warnings.push(format!("node {} missing key_facts", node.id));
646            }
647            if node.properties.provenance.trim().is_empty() {
648                warnings.push(format!("node {} missing provenance", node.id));
649            }
650        }
651        if let Some(confidence) = node.properties.confidence {
652            if !(0.0..=1.0).contains(&confidence) {
653                warnings.push(format!(
654                    "node {} confidence out of range: {}",
655                    node.id, confidence
656                ));
657            }
658        }
659        if !generated && is_legacy_importance(node.properties.importance) {
660            warnings.push(format!(
661                "node {} uses legacy importance scale (1..6): {}",
662                node.id, node.properties.importance
663            ));
664        } else if !generated && !is_valid_importance(node.properties.importance) {
665            errors.push(format!(
666                "node {} importance out of range: {}",
667                node.id, node.properties.importance
668            ));
669        }
670
671        if !generated
672            && !node.properties.provenance.trim().is_empty()
673            && !VALID_PROVENANCE_CODES.contains(&node.properties.provenance.as_str())
674        {
675            warnings.push(format!(
676                "node {} has non-dictionary provenance '{}' (expected one of: {})",
677                node.id,
678                node.properties.provenance,
679                VALID_PROVENANCE_CODES.join(", ")
680            ));
681        }
682
683        if !generated {
684            for source in &node.source_files {
685                if let Err(err) = validate_source_reference(source) {
686                    warnings.push(format!(
687                        "node {} has non-standard source '{}': {}",
688                        node.id, source, err
689                    ));
690                }
691            }
692        }
693    }
694    for (node_id, count) in &id_counts {
695        if *count > 1 {
696            errors.push(format!("duplicate node id: {} ({})", node_id, count));
697        }
698    }
699
700    // -- edges --
701    let node_type_map: HashMap<&str, &str> = graph
702        .nodes
703        .iter()
704        .map(|node| (node.id.as_str(), node.r#type.as_str()))
705        .collect();
706    let node_ids: HashSet<&str> = node_type_map.keys().copied().collect();
707    let mut touched = HashSet::new();
708    let mut edge_keys = HashSet::new();
709
710    for edge in &graph.edges {
711        if !is_valid_relation(&edge.relation) {
712            errors.push(format!(
713                "edge has invalid relation: {} {} {}",
714                edge.source_id, edge.relation, edge.target_id
715            ));
716        }
717        if !node_ids.contains(edge.source_id.as_str()) {
718            errors.push(format!(
719                "edge source missing: {} {} {}",
720                edge.source_id, edge.relation, edge.target_id
721            ));
722        }
723        if !node_ids.contains(edge.target_id.as_str()) {
724            errors.push(format!(
725                "edge target missing: {} {} {}",
726                edge.source_id, edge.relation, edge.target_id
727            ));
728        }
729
730        if let Err(err) = validate_bidirectional_similarity_edge(
731            &edge.source_id,
732            &edge.relation,
733            &edge.target_id,
734            &edge.properties.detail,
735            edge.properties.bidirectional,
736        ) {
737            errors.push(err);
738        }
739
740        for (label, score) in &edge.properties.score_components {
741            if !is_valid_score_component_label(label) {
742                errors.push(format!(
743                    "edge {} {} {} has invalid score component label '{}'",
744                    edge.source_id, edge.relation, edge.target_id, label
745                ));
746            }
747            if !(0.0..=1.0).contains(score) {
748                errors.push(format!(
749                    "edge {} {} {} score component '{}' out of range: {}",
750                    edge.source_id, edge.relation, edge.target_id, label, score
751                ));
752            }
753        }
754
755        // Enforce relation semantics from decision table rules.
756        if let (Some(src_type), Some(tgt_type)) = (
757            node_type_map.get(edge.source_id.as_str()),
758            node_type_map.get(edge.target_id.as_str()),
759        ) {
760            if VALID_TYPES.contains(src_type) && VALID_TYPES.contains(tgt_type) {
761                if let Some((valid_src, valid_tgt)) = edge_type_rule(edge.relation.as_str()) {
762                    if !valid_src.is_empty() && !valid_src.contains(src_type) {
763                        errors.push(format!(
764                            "edge {} {} {} invalid: {}",
765                            edge.source_id,
766                            edge.relation,
767                            edge.target_id,
768                            format_edge_source_type_error(
769                                src_type,
770                                edge.relation.as_str(),
771                                valid_src
772                            )
773                        ));
774                    }
775                    if !valid_tgt.is_empty() && !valid_tgt.contains(tgt_type) {
776                        errors.push(format!(
777                            "edge {} {} {} invalid: {}",
778                            edge.source_id,
779                            edge.relation,
780                            edge.target_id,
781                            format_edge_target_type_error(
782                                tgt_type,
783                                edge.relation.as_str(),
784                                valid_tgt
785                            )
786                        ));
787                    }
788                }
789            }
790        }
791
792        touched.insert(edge.source_id.as_str());
793        touched.insert(edge.target_id.as_str());
794        let key = format!("{}|{}|{}", edge.source_id, edge.relation, edge.target_id);
795        if !edge_keys.insert(key.clone()) {
796            errors.push(format!("duplicate edge: {}", key.replace('|', " ")));
797        }
798    }
799
800    // orphan nodes = errors (not connected to any edge)
801    for node in &graph.nodes {
802        if !touched.contains(node.id.as_str()) {
803            errors.push(format!("orphan node: {}", node.id));
804        }
805    }
806
807    // deep: verify source files exist on disk
808    if deep {
809        let base = base_dir
810            .map(|d| cwd.join(d))
811            .unwrap_or_else(|| cwd.to_path_buf());
812        for node in &graph.nodes {
813            for source in &node.source_files {
814                if !base.join(source).exists() {
815                    errors.push(format!("missing source file: {} -> {}", node.id, source));
816                }
817            }
818        }
819    }
820
821    errors.sort();
822    warnings.sort();
823    ValidationReport { errors, warnings }
824}
825
#[cfg(test)]
mod tests {
    use super::*;

    /// A custom type uses itself as the id marker.
    #[test]
    fn canonicalize_node_id_allows_custom_type_marker() {
        let outcome = canonicalize_node_id_for_type("~:dedupe_anchor", "~");
        assert_eq!(outcome.as_deref(), Ok("~:dedupe_anchor"));
    }

    /// A generated type keeps its marker and accepts a non-snake_case suffix.
    #[test]
    fn canonicalize_node_id_allows_generated_type_marker() {
        let outcome = canonicalize_node_id_for_type("GDIR:App", "GDIR");
        assert_eq!(outcome.as_deref(), Ok("GDIR:App"));
    }

    /// A marker that differs from the custom type is rejected.
    #[test]
    fn canonicalize_node_id_rejects_mismatched_custom_marker() {
        let message = canonicalize_node_id_for_type("custom:dedupe_anchor", "~")
            .expect_err("mismatched marker must be rejected");
        assert!(message.contains("expected '~' for custom node type"));
    }

    /// Custom tokens are accepted; empty and blank ones are not.
    #[test]
    fn relation_and_node_type_validation_accepts_custom_tokens() {
        assert!(is_valid_node_type("~"));
        assert!(is_valid_relation("~"));
        assert!(!is_valid_node_type(""));
        assert!(!is_valid_relation(" "));
    }

    /// Bidirectional edges need an in-range score and canonical endpoint order.
    #[test]
    fn bidirectional_similarity_validation_requires_score_and_canonical_order() {
        let check = |source: &str, target: &str, detail: &str| {
            validate_bidirectional_similarity_edge(source, "~", target, detail, true)
        };

        assert!(check("~:a", "~:b", "0.8").is_ok());

        let bad_score = check("~:a", "~:b", "1.8").unwrap_err();
        assert!(bad_score.contains("requires score in range 0..1"));

        let bad_order = check("~:b", "~:a", "0.8").unwrap_err();
        assert!(bad_order.contains("must be canonicalized"));
    }

    /// Score component labels are `C` followed by digits.
    #[test]
    fn score_component_label_validation_accepts_only_c_numeric() {
        for accepted in ["C1", "C2"] {
            assert!(is_valid_score_component_label(accepted));
        }
        for rejected in ["DESC", "C"] {
            assert!(!is_valid_score_component_label(rejected));
        }
    }
}
879}