Skip to main content

kg/
graph.rs

1use std::collections::{BTreeMap, HashMap};
2use std::fs;
3use std::io::Write;
4use std::path::{Path, PathBuf};
5use std::time::{SystemTime, UNIX_EPOCH};
6
7use anyhow::{Context, Result};
8use flate2::Compression;
9use flate2::write::GzEncoder;
10use serde::{Deserialize, Serialize};
11
// Id and type token of the graph-info node.
// NOTE(review): presumably a reserved metadata node that carries the facts named below —
// confirm against the use sites, which are not part of this section.
const GRAPH_INFO_NODE_ID: &str = "^:graph_info";
const GRAPH_INFO_NODE_TYPE: &str = "^";
// Prefix of the fact holding the graph UUID (the UUID presumably follows the `=`).
const GRAPH_UUID_FACT_PREFIX: &str = "graph_uuid=";
// Schema version number, and the prefix of the fact that presumably records it.
const GRAPH_SCHEMA_VERSION: u32 = 2;
const GRAPH_SCHEMA_VERSION_FACT_PREFIX: &str = "schema_version=";
// NOTE(review): presumably the `min_len` handed to `compress_kg_text`, i.e. the shortest
// repeated substring (in chars) worth a dictionary entry — confirm at the call site.
const KG_TEXT_COMPRESSION_MIN_LEN: usize = 7;
18
19/// Write `data` to `dest` atomically:
20/// 1. Write to `dest.tmp`
21/// 2. If `dest` already exists, copy it to `dest.bak`
22/// 3. Rename `dest.tmp` -> `dest`
23fn atomic_write(dest: &Path, data: &str) -> Result<()> {
24    let unique = SystemTime::now()
25        .duration_since(UNIX_EPOCH)
26        .unwrap_or_default()
27        .as_nanos();
28    let tmp = dest.with_extension(format!("tmp.{}.{}", std::process::id(), unique));
29    fs::write(&tmp, data).with_context(|| format!("failed to write tmp: {}", tmp.display()))?;
30    if dest.exists() {
31        let bak = backup_bak_path(dest)?;
32        if should_refresh_bak(&bak)? {
33            fs::copy(dest, &bak)
34                .with_context(|| format!("failed to create backup: {}", bak.display()))?;
35        }
36    }
37    fs::rename(&tmp, dest).with_context(|| format!("failed to rename tmp to {}", dest.display()))
38}
39
/// Age in seconds (5 minutes) at which `should_refresh_bak` reports the `.bak` copy as due
/// for a refresh.
const BACKUP_BAK_STALE_SECS: u64 = 5 * 60;
/// Minimum number of seconds (1 hour) since the newest timestamped backup before
/// `backup_graph_if_stale` writes another one.
const BACKUP_STALE_SECS: u64 = 60 * 60;
42
43fn should_refresh_bak(bak_path: &Path) -> Result<bool> {
44    if !bak_path.exists() {
45        return Ok(true);
46    }
47    let modified = fs::metadata(bak_path)
48        .and_then(|m| m.modified())
49        .with_context(|| format!("failed to read backup mtime: {}", bak_path.display()))?;
50    let age_secs = SystemTime::now()
51        .duration_since(modified)
52        .unwrap_or_default()
53        .as_secs();
54    Ok(age_secs >= BACKUP_BAK_STALE_SECS)
55}
56
57fn backup_graph_if_stale(path: &Path, data: &str) -> Result<()> {
58    let cache_dir = backup_cache_dir(path)?;
59    let stem = match path.file_stem().and_then(|s| s.to_str()) {
60        Some(stem) => stem,
61        None => return Ok(()),
62    };
63    let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("json");
64    let backup_prefix = format!("{stem}.{ext}");
65    let now = SystemTime::now()
66        .duration_since(UNIX_EPOCH)
67        .context("time went backwards")?
68        .as_secs();
69    if let Some(latest) = latest_backup_ts(&cache_dir, &backup_prefix)? {
70        if now.saturating_sub(latest) < BACKUP_STALE_SECS {
71            return Ok(());
72        }
73    }
74
75    let backup_path = cache_dir.join(format!("{backup_prefix}.bck.{now}.gz"));
76    let tmp_path = backup_path.with_extension("tmp");
77    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
78    encoder.write_all(data.as_bytes())?;
79    let encoded = encoder.finish()?;
80    fs::write(&tmp_path, encoded)
81        .with_context(|| format!("failed to write tmp: {}", tmp_path.display()))?;
82    fs::rename(&tmp_path, &backup_path)
83        .with_context(|| format!("failed to rename tmp to {}", backup_path.display()))?;
84    Ok(())
85}
86
87fn backup_cache_dir(path: &Path) -> Result<PathBuf> {
88    let dir = crate::cache_paths::cache_root_for_graph(path);
89    fs::create_dir_all(&dir)
90        .with_context(|| format!("failed to create cache directory: {}", dir.display()))?;
91    Ok(dir)
92}
93
94fn backup_bak_path(dest: &Path) -> Result<PathBuf> {
95    let cache_dir = backup_cache_dir(dest)?;
96    let stem = dest.file_stem().and_then(|s| s.to_str()).unwrap_or("graph");
97    let ext = dest.extension().and_then(|s| s.to_str()).unwrap_or("json");
98    Ok(cache_dir.join(format!("{stem}.{ext}.bak")))
99}
100
101fn latest_backup_ts(dir: &Path, stem: &str) -> Result<Option<u64>> {
102    let prefix = format!("{stem}.bck.");
103    let suffix = ".gz";
104    let mut latest = None;
105    for entry in fs::read_dir(dir).with_context(|| format!("read dir: {}", dir.display()))? {
106        let entry = entry?;
107        let name = entry.file_name();
108        let name = name.to_string_lossy();
109        if !name.starts_with(&prefix) || !name.ends_with(suffix) {
110            continue;
111        }
112        let ts_part = &name[prefix.len()..name.len() - suffix.len()];
113        if let Ok(ts) = ts_part.parse::<u64>() {
114            match latest {
115                Some(current) => {
116                    if ts > current {
117                        latest = Some(ts);
118                    }
119                }
120                None => latest = Some(ts),
121            }
122        }
123    }
124    Ok(latest)
125}
126
/// Maps a full node type name to its one-letter code.
///
/// Names without a code (including differently-cased spellings) are returned unchanged.
fn node_type_to_code(node_type: &str) -> &str {
    const TYPE_CODES: &[(&str, &str)] = &[
        ("Feature", "F"),
        ("Concept", "K"),
        ("Interface", "I"),
        ("Process", "P"),
        ("DataStore", "D"),
        ("Attribute", "A"),
        ("Entity", "Y"),
        ("Note", "N"),
        ("Rule", "R"),
        ("Convention", "C"),
        ("Bug", "B"),
        ("Decision", "Z"),
        ("OpenQuestion", "O"),
        ("Claim", "Q"),
        ("Insight", "W"),
        ("Reference", "M"),
        ("Term", "T"),
        ("Status", "S"),
        ("Doubt", "L"),
    ];

    for (name, code) in TYPE_CODES {
        if *name == node_type {
            return code;
        }
    }
    node_type
}
151
152fn encode_node_type_token(node_type: &str) -> String {
153    let code = node_type_to_code(node_type);
154    if code != node_type {
155        return code.to_owned();
156    }
157    if code_to_node_type(node_type) != node_type {
158        return format!("={node_type}");
159    }
160    node_type.to_owned()
161}
162
/// Maps a one-letter node type code back to the full type name.
///
/// Anything that is not exactly one of the known single-letter codes is returned unchanged.
fn code_to_node_type(code: &str) -> &str {
    match code.as_bytes() {
        [b'F'] => "Feature",
        [b'K'] => "Concept",
        [b'I'] => "Interface",
        [b'P'] => "Process",
        [b'D'] => "DataStore",
        [b'A'] => "Attribute",
        [b'Y'] => "Entity",
        [b'N'] => "Note",
        [b'R'] => "Rule",
        [b'C'] => "Convention",
        [b'B'] => "Bug",
        [b'Z'] => "Decision",
        [b'O'] => "OpenQuestion",
        [b'Q'] => "Claim",
        [b'W'] => "Insight",
        [b'M'] => "Reference",
        [b'T'] => "Term",
        [b'S'] => "Status",
        [b'L'] => "Doubt",
        _ => code,
    }
}
187
188fn decode_node_type_token(token: &str) -> String {
189    token
190        .strip_prefix('=')
191        .map(str::to_owned)
192        .unwrap_or_else(|| code_to_node_type(token).to_owned())
193}
194
/// Maps a relation name to its one-letter code.
///
/// Several relations have two accepted spellings (e.g. `DOCUMENTED_IN` / `DOCUMENTS`)
/// that share one code. Unknown relations are returned unchanged.
fn relation_to_code(relation: &str) -> &str {
    const RELATION_CODES: &[(&str, &[&str])] = &[
        ("D", &["DOCUMENTED_IN", "DOCUMENTS"]),
        ("H", &["HAS"]),
        ("T", &["TRIGGERS"]),
        ("A", &["AFFECTED_BY", "AFFECTS"]),
        ("R", &["READS_FROM", "READS"]),
        ("G", &["GOVERNED_BY", "GOVERNS"]),
        ("O", &["DEPENDS_ON"]),
        ("I", &["AVAILABLE_IN"]),
        ("S", &["SUPPORTS"]),
        ("U", &["SUMMARIZES"]),
        ("L", &["RELATED_TO"]),
        ("V", &["CONTRADICTS"]),
        ("C", &["CREATED_BY", "CREATES"]),
    ];

    for (code, spellings) in RELATION_CODES {
        if spellings.iter().any(|spelling| *spelling == relation) {
            return code;
        }
    }
    relation
}
213
/// Maps a one-letter relation code back to a relation name.
///
/// Codes shared by two spellings decode to the first one (`D` becomes `DOCUMENTED_IN`,
/// never `DOCUMENTS`). Anything that is not a known single-letter code is returned
/// unchanged.
fn code_to_relation(code: &str) -> &str {
    match code.as_bytes() {
        [b'D'] => "DOCUMENTED_IN",
        [b'H'] => "HAS",
        [b'T'] => "TRIGGERS",
        [b'A'] => "AFFECTED_BY",
        [b'R'] => "READS_FROM",
        [b'G'] => "GOVERNED_BY",
        [b'O'] => "DEPENDS_ON",
        [b'I'] => "AVAILABLE_IN",
        [b'S'] => "SUPPORTS",
        [b'U'] => "SUMMARIZES",
        [b'L'] => "RELATED_TO",
        [b'V'] => "CONTRADICTS",
        [b'C'] => "CREATED_BY",
        _ => code,
    }
}
232
/// Returns the two ids as an owned pair with the lexicographically smaller one first, so
/// that `(a, b)` and `(b, a)` produce the same result.
fn canonicalize_bidirectional_pair(a: &str, b: &str) -> (String, String) {
    let (smaller, larger) = if a <= b { (a, b) } else { (b, a) };
    (smaller.to_owned(), larger.to_owned())
}
240
/// Returns `true` when `value` is an uppercase `C` followed by one or more ASCII digits
/// (`C1`, `C42`, ...).
fn is_score_component_label(value: &str) -> bool {
    match value.strip_prefix('C') {
        Some(digits) => !digits.is_empty() && digits.chars().all(|ch| ch.is_ascii_digit()),
        None => false,
    }
}
247
248fn sort_case_insensitive(values: &[String]) -> Vec<String> {
249    let mut sorted = values.to_vec();
250    sorted.sort_by(|a, b| {
251        let la = a.to_ascii_lowercase();
252        let lb = b.to_ascii_lowercase();
253        la.cmp(&lb).then_with(|| a.cmp(b))
254    });
255    sorted
256}
257
/// Reverses `escape_kg_text`: turns `\n`, `\r` and `\\` escape sequences back into a
/// newline, a carriage return and a backslash.
///
/// Unknown escapes are kept verbatim (backslash included), and so is a lone trailing
/// backslash.
fn decode_kg_text(value: &str) -> String {
    let mut decoded = String::new();
    let mut after_backslash = false;

    for ch in value.chars() {
        if !after_backslash {
            if ch == '\\' {
                after_backslash = true;
            } else {
                decoded.push(ch);
            }
            continue;
        }

        after_backslash = false;
        match ch {
            'n' => decoded.push('\n'),
            'r' => decoded.push('\r'),
            '\\' => decoded.push('\\'),
            other => {
                decoded.push('\\');
                decoded.push(other);
            }
        }
    }

    if after_backslash {
        decoded.push('\\');
    }
    decoded
}
279
/// Escapes text so it fits on a single line: backslash becomes `\\`, newline becomes `\n`
/// and carriage return becomes `\r`. All other characters pass through unchanged.
fn escape_kg_text(value: &str) -> String {
    let mut escaped = String::new();
    for ch in value.chars() {
        let replacement = match ch {
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            _ => None,
        };
        match replacement {
            Some(sequence) => escaped.push_str(sequence),
            None => escaped.push(ch),
        }
    }
    escaped
}
292
/// Decodes the value part of a text field by undoing the KG text escaping.
///
/// Thin wrapper around `decode_kg_text`; the value is not trimmed.
fn parse_text_field(value: &str) -> String {
    decode_kg_text(value)
}
296
297fn push_text_line(out: &mut String, key: &str, value: &str) {
298    out.push_str(key);
299    out.push(' ');
300    out.push_str(&escape_kg_text(value));
301    out.push('\n');
302}
303
/// A repeated piece of text that `compress_kg_text` replaces with a dictionary token.
#[derive(Debug, Clone)]
struct KgCompressionCandidate {
    /// Dictionary token number; `discover_kg_compression_candidates` creates candidates
    /// with 0 and then numbers the surviving ones from 1.
    token: usize,
    /// The repeated text itself (never contains a backtick).
    value: String,
    /// Index (0-based, within the raw text's lines) of the earliest line containing `value`.
    first_line: usize,
    /// Char offset of `value` within that earliest line.
    first_col: usize,
}
311
/// Size figures reported by `compress_kg_text`.
#[derive(Debug, Default, Clone, Copy)]
struct KgCompressionStats {
    /// Byte length of the input text.
    original_bytes: usize,
    /// Byte length of the compressed rendering, reported even when it is not smaller and
    /// the input is returned instead.
    compressed_bytes: usize,
    /// Number of dictionary entries (candidates) that were generated.
    dictionary_entries: usize,
}
318
/// Position at which a seed substring starts.
#[derive(Debug, Clone)]
struct LineOccurrence {
    /// Line index as recorded in the `(index, line)` pairs of the source lines.
    line_idx: usize,
    /// Offset within that line, counted in chars (not bytes).
    col_idx: usize,
}
324
/// Parses a dictionary definition line of the form `` `<digits> <value> ``.
///
/// Returns the token (the digits, as text) and the value (everything after the first
/// space, possibly empty). Returns `None` when the line does not start with a backtick,
/// has no space, or has a token that is empty or not purely ASCII digits.
fn decode_kg_token_reference_line(line: &str) -> Option<(String, String)> {
    let (token, value) = line.strip_prefix('`')?.split_once(' ')?;
    let is_numeric = !token.is_empty() && token.bytes().all(|byte| byte.is_ascii_digit());
    if is_numeric {
        Some((token.to_owned(), value.to_owned()))
    } else {
        None
    }
}
333
/// Replaces every `` `<digits>` `` token in `line` with its entry from `dictionary`.
///
/// * Tokens missing from the dictionary are kept verbatim.
/// * A backtick that does not open a well-formed token (no digits, or no closing backtick)
///   is kept as a literal, and scanning resumes at the character right after it (or after
///   the digits that followed it). That character is examined again rather than skipped,
///   so a literal backtick directly in front of a token, as in ``` ``1`` ```, does not
///   hide the token.
fn expand_kg_tokens_in_line(line: &str, dictionary: &std::collections::HashMap<String, String>) -> String {
    let mut out = String::with_capacity(line.len());
    let mut rest = line;

    while let Some(tick) = rest.find('`') {
        out.push_str(&rest[..tick]);
        let after_tick = &rest[tick + 1..];

        // ASCII digits are single bytes, so the digit count is also a valid byte offset.
        let digit_count = after_tick
            .bytes()
            .take_while(|byte| byte.is_ascii_digit())
            .count();
        let (token, tail) = after_tick.split_at(digit_count);

        match tail.strip_prefix('`') {
            Some(remaining) if !token.is_empty() => {
                match dictionary.get(token) {
                    Some(value) => out.push_str(value),
                    None => {
                        out.push('`');
                        out.push_str(token);
                        out.push('`');
                    }
                }
                rest = remaining;
            }
            _ => {
                // Not a token: emit what was scanned and let the loop look at `tail` afresh.
                out.push('`');
                out.push_str(token);
                rest = tail;
            }
        }
    }

    out.push_str(rest);
    out
}
378
379fn expand_kg_tokens(raw: &str) -> String {
380    let mut dictionary = std::collections::HashMap::new();
381    let mut out = String::new();
382
383    for line in raw.lines() {
384        if let Some((token, value)) = decode_kg_token_reference_line(line) {
385            dictionary.insert(token, value);
386            continue;
387        }
388        out.push_str(&expand_kg_tokens_in_line(line, &dictionary));
389        out.push('\n');
390    }
391
392    out
393}
394
/// Extracts the type token from a node header line `@ <type>:<id>`.
///
/// Returns the trimmed text between the `@ ` prefix and the first `:`, or `None` when the
/// line lacks the prefix or the colon.
fn node_header_type_token(line: &str) -> Option<&str> {
    line.strip_prefix("@ ")
        .and_then(|rest| rest.split_once(':'))
        .map(|(type_token, _id)| type_token.trim())
}
400
401fn is_generated_node_block_header(line: &str) -> bool {
402    node_header_type_token(line)
403        .is_some_and(|token| token.starts_with('G'))
404}
405
406fn collect_generated_text_lines(raw: &str) -> Vec<(usize, String)> {
407    let mut lines = Vec::new();
408    let mut in_block = false;
409    let mut generated_block = false;
410
411    for (idx, line) in raw.lines().enumerate() {
412        let trimmed = line.trim();
413        if trimmed.is_empty() {
414            in_block = false;
415            generated_block = false;
416            continue;
417        }
418
419        if trimmed.starts_with("@ ") {
420            in_block = true;
421            generated_block = is_generated_node_block_header(trimmed);
422            continue;
423        }
424
425        if in_block && generated_block {
426            lines.push((idx, line.to_owned()));
427        }
428    }
429
430    lines
431}
432
433fn extend_repeated_seed(
434    seed: &str,
435    occurrences: &[LineOccurrence],
436    source_lines: &[(usize, String)],
437) -> Option<String> {
438    let seed_chars: Vec<char> = seed.chars().collect();
439    let mut candidate = seed_chars.clone();
440
441    loop {
442        let mut next_char: Option<char> = None;
443
444        for occurrence in occurrences {
445            let (_, line) = source_lines
446                .iter()
447                .find(|(line_idx, _)| *line_idx == occurrence.line_idx)?;
448            let chars: Vec<char> = line.chars().collect();
449            let next_index = occurrence.col_idx + candidate.len();
450            let Some(&ch) = chars.get(next_index) else {
451                return Some(candidate.into_iter().collect());
452            };
453            if ch == '`' {
454                return Some(candidate.into_iter().collect());
455            }
456            match next_char {
457                Some(prev) if prev != ch => return Some(candidate.into_iter().collect()),
458                None => next_char = Some(ch),
459                _ => {}
460            }
461        }
462
463        let Some(ch) = next_char else {
464            return Some(candidate.into_iter().collect());
465        };
466        candidate.push(ch);
467        if candidate.len() > seed_chars.len() + 256 {
468            return Some(candidate.into_iter().collect());
469        }
470    }
471}
472
/// Finds repeated substrings in `source_lines` that are worth replacing with dictionary
/// tokens.
///
/// Steps:
/// 1. Index every backtick-free window of `min_len` chars ("seed") with all positions at
///    which it occurs.
/// 2. For each seed seen at least twice, extend it via `extend_repeated_seed` and keep the
///    result if it is at least `min_len` chars long and contains no backtick.
/// 3. Drop every candidate that is a substring of a longer (or equally long, earlier
///    sorted) kept candidate.
/// 4. Order the survivors by first line, then by descending length, first column and
///    value, and number them from 1 in that order.
///
/// `source_lines` holds `(line index, line text)` pairs; lengths and columns are counted
/// in chars. The final ordering does not depend on hash map iteration order because every
/// sort ends with a tie-break on the (unique) value.
fn discover_kg_compression_candidates(
    source_lines: &[(usize, String)],
    min_len: usize,
) -> Vec<KgCompressionCandidate> {
    // seed text -> every (line, column) at which that seed starts
    let mut seeds: std::collections::HashMap<String, Vec<LineOccurrence>> =
        std::collections::HashMap::new();

    for (line_idx, line) in source_lines {
        let chars: Vec<char> = line.chars().collect();
        if chars.len() < min_len {
            continue;
        }

        for start in 0..=chars.len() - min_len {
            // Backticks delimit tokens in the compressed form, so they may not be part of
            // a dictionary value.
            if chars[start..start + min_len].iter().any(|ch| *ch == '`') {
                continue;
            }
            let seed: String = chars[start..start + min_len].iter().collect();
            seeds.entry(seed).or_default().push(LineOccurrence {
                line_idx: *line_idx,
                col_idx: start,
            });
        }
    }

    // extended value -> candidate, keeping the earliest position seen for that value
    let mut discovered: std::collections::HashMap<String, KgCompressionCandidate> =
        std::collections::HashMap::new();

    for (seed, occurrences) in seeds {
        // A seed that occurs only once cannot save anything.
        if occurrences.len() < 2 {
            continue;
        }

        let Some(value) = extend_repeated_seed(&seed, &occurrences, source_lines) else {
            continue;
        };
        if value.chars().count() < min_len || value.contains('`') {
            continue;
        }

        let first = occurrences
            .iter()
            .min_by_key(|occ| (occ.line_idx, occ.col_idx))
            .expect("at least one occurrence");

        discovered
            .entry(value.clone())
            .and_modify(|candidate| {
                let first_pos = (first.line_idx, first.col_idx);
                let current_pos = (candidate.first_line, candidate.first_col);
                if first_pos < current_pos {
                    candidate.first_line = first.line_idx;
                    candidate.first_col = first.col_idx;
                }
            })
            .or_insert(KgCompressionCandidate {
                // Placeholder; real token numbers are assigned at the end.
                token: 0,
                value,
                first_line: first.line_idx,
                first_col: first.col_idx,
            });
    }

    // Longest first, so that the containment filter below always compares a candidate
    // against values of at least its own length.
    let mut candidates: Vec<KgCompressionCandidate> = discovered.into_values().collect();
    candidates.sort_by(|a, b| {
        b.value
            .chars()
            .count()
            .cmp(&a.value.chars().count())
            .then_with(|| a.first_line.cmp(&b.first_line))
            .then_with(|| a.first_col.cmp(&b.first_col))
            .then_with(|| a.value.cmp(&b.value))
    });

    // Drop candidates that are contained in an already kept one.
    let mut filtered: Vec<KgCompressionCandidate> = Vec::new();
    'candidate: for candidate in candidates {
        for kept in &filtered {
            if kept.value.contains(&candidate.value) {
                continue 'candidate;
            }
        }
        filtered.push(candidate);
    }

    // Final order: by the line of first use, so token numbers grow through the text.
    filtered.sort_by(|a, b| {
        a.first_line
            .cmp(&b.first_line)
            .then_with(|| b.value.chars().count().cmp(&a.value.chars().count()))
            .then_with(|| a.first_col.cmp(&b.first_col))
            .then_with(|| a.value.cmp(&b.value))
    });

    // Tokens are 1-based.
    for (idx, candidate) in filtered.iter_mut().enumerate() {
        candidate.token = idx + 1;
    }

    filtered
}
571
572fn replace_kg_text_with_tokens(line: &str, candidates: &[KgCompressionCandidate]) -> String {
573    let chars: Vec<char> = line.chars().collect();
574    let mut out = String::new();
575    let mut idx = 0;
576
577    while idx < chars.len() {
578        let mut best: Option<&KgCompressionCandidate> = None;
579
580        for candidate in candidates {
581            let candidate_chars: Vec<char> = candidate.value.chars().collect();
582            if idx + candidate_chars.len() > chars.len() {
583                continue;
584            }
585            if chars[idx..idx + candidate_chars.len()] != candidate_chars[..] {
586                continue;
587            }
588            match best {
589                Some(current)
590                    if current.value.chars().count() >= candidate_chars.len() => {}
591                _ => best = Some(candidate),
592            }
593        }
594
595        if let Some(candidate) = best {
596            out.push('`');
597            out.push_str(&candidate.token.to_string());
598            out.push('`');
599            idx += candidate.value.chars().count();
600            continue;
601        }
602
603        out.push(chars[idx]);
604        idx += 1;
605    }
606
607    out
608}
609
610fn compress_kg_text(raw: &str, min_len: usize) -> (String, KgCompressionStats) {
611    let source_lines = collect_generated_text_lines(raw);
612    let candidates = discover_kg_compression_candidates(&source_lines, min_len);
613
614    let mut defs_by_line: std::collections::HashMap<usize, Vec<&KgCompressionCandidate>> =
615        std::collections::HashMap::new();
616    for candidate in &candidates {
617        defs_by_line.entry(candidate.first_line).or_default().push(candidate);
618    }
619    for defs in defs_by_line.values_mut() {
620        defs.sort_by(|a, b| {
621            b.value
622                .chars()
623                .count()
624                .cmp(&a.value.chars().count())
625                .then_with(|| a.token.cmp(&b.token))
626        });
627    }
628
629    let compressed_source_lines: std::collections::HashSet<usize> =
630        source_lines.iter().map(|(idx, _)| *idx).collect();
631    let mut compressed = String::new();
632
633    for (idx, line) in raw.lines().enumerate() {
634        if let Some(defs) = defs_by_line.get(&idx) {
635            for def in defs {
636                compressed.push('`');
637                compressed.push_str(&def.token.to_string());
638                compressed.push(' ');
639                compressed.push_str(&def.value);
640                compressed.push('\n');
641            }
642        }
643
644        let rendered = if compressed_source_lines.contains(&idx) {
645            replace_kg_text_with_tokens(line, &candidates)
646        } else {
647            line.to_owned()
648        };
649        compressed.push_str(&rendered);
650        compressed.push('\n');
651    }
652
653    let original_bytes = raw.len();
654    let compressed_bytes = compressed.len();
655    let dictionary_entries = candidates.len();
656
657    (
658        if compressed_bytes < original_bytes {
659            compressed
660        } else {
661            raw.to_owned()
662        },
663        KgCompressionStats {
664            original_bytes,
665            compressed_bytes,
666            dictionary_entries,
667        },
668    )
669}
670
/// Removes values that equal an earlier value when compared ASCII-case-insensitively.
///
/// The first spelling of each value is kept and the original order is preserved.
fn dedupe_case_insensitive(values: Vec<String>) -> Vec<String> {
    let mut seen_keys = std::collections::HashSet::new();
    values
        .into_iter()
        .filter(|value| seen_keys.insert(value.to_ascii_lowercase()))
        .collect()
}
682
/// Checks that `value` has the exact shape `YYYY-MM-DDTHH:MM:SSZ` with plausible field
/// ranges.
///
/// Accepted ranges: month 1-12, day 1-31, hour 0-23, minute 0-59, second 0-59. The day is
/// not checked against the month (`02-31` passes) and the year is unconstrained. Despite
/// the name, nothing is parsed into a value; the result only says whether the text is
/// valid.
fn parse_utc_timestamp(value: &str) -> bool {
    // `d` marks a position that must hold an ASCII digit; every other byte must match.
    const SHAPE: &[u8] = b"dddd-dd-ddTdd:dd:ddZ";

    let bytes = value.as_bytes();
    if bytes.len() != SHAPE.len() {
        return false;
    }
    let shape_matches = bytes.iter().zip(SHAPE).all(|(&actual, &expected)| {
        if expected == b'd' {
            actual.is_ascii_digit()
        } else {
            actual == expected
        }
    });
    if !shape_matches {
        return false;
    }

    // Both bytes are known to be ASCII digits at this point.
    let two_digits = |start: usize| -> u32 {
        u32::from(bytes[start] - b'0') * 10 + u32::from(bytes[start + 1] - b'0')
    };

    let month = two_digits(5);
    let day = two_digits(8);
    let hour = two_digits(11);
    let minute = two_digits(14);
    let second = two_digits(17);

    (1..=12).contains(&month)
        && (1..=31).contains(&day)
        && hour <= 23
        && minute <= 59
        && second <= 59
}
724
/// Interprets common textual spellings of a boolean.
///
/// After trimming, and ignoring ASCII case, `1`, `true`, `yes` and `on` give `Some(true)`;
/// `0`, `false`, `no` and `off` give `Some(false)`; anything else gives `None`.
fn parse_boolish(value: &str) -> Option<bool> {
    const TRUTHY: &[&str] = &["1", "true", "yes", "on"];
    const FALSY: &[&str] = &["0", "false", "no", "off"];

    let token = value.trim();
    let is_one_of = |words: &[&str]| words.iter().any(|word| token.eq_ignore_ascii_case(word));

    if is_one_of(TRUTHY) {
        Some(true)
    } else if is_one_of(FALSY) {
        Some(false)
    } else {
        None
    }
}
732
/// Reports whether strict format checking is requested through the `KG_STRICT_FORMAT`
/// environment variable.
///
/// Only `1`, `true`, `yes` and `on` (trimmed, ASCII case ignored) enable it; an unset,
/// unreadable or differently valued variable means `false`.
fn strict_kg_mode() -> bool {
    std::env::var("KG_STRICT_FORMAT")
        .map(|raw| {
            matches!(
                raw.trim().to_ascii_lowercase().as_str(),
                "1" | "true" | "yes" | "on"
            )
        })
        .unwrap_or(false)
}
742
/// Returns `line` trimmed and shortened for display in diagnostics.
///
/// At most 160 chars are kept; if anything was cut off, `...` is appended.
fn abbreviated_line(line: &str) -> String {
    const MAX_CHARS: usize = 160;

    let mut remaining = line.trim().chars();
    let mut shortened: String = remaining.by_ref().take(MAX_CHARS).collect();
    if remaining.next().is_some() {
        shortened.push_str("...");
    }
    shortened
}
756
757fn line_fragment(line: &str) -> String {
758    let snippet = abbreviated_line(line);
759    if snippet.is_empty() {
760        "fragment: <empty line>".to_owned()
761    } else {
762        format!("fragment: {snippet}")
763    }
764}
765
766fn json_error_detail(label: &str, path: &Path, raw: &str, error: &serde_json::Error) -> String {
767    let line_no = error.line();
768    let column = error.column();
769    let fragment = raw
770        .lines()
771        .nth(line_no.saturating_sub(1))
772        .map(line_fragment)
773        .unwrap_or_else(|| "fragment: <unavailable>".to_owned());
774    format!(
775        "{label}: {} at line {line_no}, column {column}: {error}\n{fragment}",
776        path.display()
777    )
778}
779
780fn validate_len(
781    line_no: usize,
782    field: &str,
783    value: &str,
784    raw_line: &str,
785    min: usize,
786    max: usize,
787    strict: bool,
788) -> Result<()> {
789    let len = value.chars().count();
790    if strict && (len < min || len > max) {
791        return Err(anyhow::anyhow!(
792            "invalid {field} length at line {line_no}: expected {min}..={max}, got {len}\n{}",
793            line_fragment(raw_line)
794        ));
795    }
796    Ok(())
797}
798
799fn enforce_field_order(
800    line_no: usize,
801    key: &str,
802    rank: u8,
803    last_rank: &mut u8,
804    section: &str,
805    raw_line: &str,
806    strict: bool,
807) -> Result<()> {
808    if strict && rank < *last_rank {
809        return Err(anyhow::anyhow!(
810            "invalid field order at line {line_no}: {key} in {section} block\n{}",
811            line_fragment(raw_line)
812        ));
813    }
814    if rank > *last_rank {
815        *last_rank = rank;
816    }
817    Ok(())
818}
819
/// Returns the value of a `<key> <value>` line, or `None` when `line` is not that field.
///
/// A line consisting of the key alone yields an empty value. Otherwise the key must be
/// followed by exactly one separating space; everything after it, including further
/// spaces, is the value.
fn field_value<'a>(line: &'a str, key: &str) -> Option<&'a str> {
    match line.strip_prefix(key)? {
        "" => Some(""),
        rest => rest.strip_prefix(' '),
    }
}
828
829fn fail_or_warn(strict: bool, warnings: &mut Vec<String>, message: String) -> Result<()> {
830    if strict {
831        Err(anyhow::anyhow!(message))
832    } else {
833        warnings.push(message);
834        Ok(())
835    }
836}
837
/// Test-only convenience wrapper around `parse_kg_with_warnings` that discards the
/// collected warnings and returns just the parsed graph.
#[cfg(test)]
fn parse_kg(raw: &str, graph_name: &str, strict: bool) -> Result<GraphFile> {
    let (graph, _warnings) = parse_kg_with_warnings(raw, graph_name, strict)?;
    Ok(graph)
}
842
843fn parse_kg_with_warnings(
844    raw: &str,
845    graph_name: &str,
846    strict: bool,
847) -> Result<(GraphFile, Vec<String>)> {
848    let mut graph = GraphFile::new(graph_name);
849    let mut warnings = Vec::new();
850    let mut current_node: Option<Node> = None;
851    let mut current_note: Option<Note> = None;
852    let mut current_edge_index: Option<usize> = None;
853    let mut last_node_rank: u8 = 0;
854    let mut last_note_rank: u8 = 0;
855    let mut last_edge_rank: u8 = 0;
856
857    for (idx, line) in raw.lines().enumerate() {
858        let line_no = idx + 1;
859        let raw_line = line.strip_suffix('\r').unwrap_or(line);
860        let trimmed = raw_line.trim();
861        if trimmed.is_empty() || trimmed.starts_with('#') {
862            continue;
863        }
864
865        if let Some(rest) = trimmed.strip_prefix("@ ") {
866            if let Some(note) = current_note.take() {
867                graph.notes.push(note);
868            }
869            if let Some(node) = current_node.take() {
870                graph.nodes.push(node);
871            }
872            let Some((type_code, node_id)) = rest.split_once(':') else {
873                fail_or_warn(
874                    strict,
875                    &mut warnings,
876                    format!("invalid node header at line {line_no}: {trimmed}"),
877                )?;
878                current_edge_index = None;
879                continue;
880            };
881            let decoded_type = decode_node_type_token(type_code.trim());
882            let parsed_id = {
883                let raw_id = node_id.trim();
884                if crate::validate::is_generated_node_type(&decoded_type) {
885                    if let Some((head, suffix)) = raw_id.split_once(':') {
886                        if head == decoded_type {
887                            suffix.to_owned()
888                        } else {
889                            raw_id.to_owned()
890                        }
891                    } else {
892                        raw_id.to_owned()
893                    }
894                } else if type_code.trim().starts_with('=') && raw_id.contains(':') {
895                    raw_id.to_owned()
896                } else if raw_id.contains(':') {
897                    crate::validate::normalize_node_id(raw_id)
898                } else if code_to_node_type(type_code.trim()) != type_code.trim() {
899                    crate::validate::normalize_node_id(&format!("{}:{raw_id}", type_code.trim()))
900                } else {
901                    format!("{}:{raw_id}", decoded_type)
902                }
903            };
904            current_node = Some(Node {
905                id: parsed_id,
906                r#type: decoded_type,
907                name: String::new(),
908                properties: NodeProperties::default(),
909                source_files: Vec::new(),
910            });
911            current_edge_index = None;
912            last_node_rank = 0;
913            last_edge_rank = 0;
914            continue;
915        }
916
917        if let Some(rest) = trimmed.strip_prefix("! ") {
918            if let Some(node) = current_node.take() {
919                graph.nodes.push(node);
920            }
921            if let Some(note) = current_note.take() {
922                graph.notes.push(note);
923            }
924            let mut parts = rest.split_whitespace();
925            let Some(id) = parts.next() else {
926                fail_or_warn(
927                    strict,
928                    &mut warnings,
929                    format!("invalid note header at line {line_no}: {trimmed}"),
930                )?;
931                current_edge_index = None;
932                continue;
933            };
934            let Some(node_id) = parts.next() else {
935                fail_or_warn(
936                    strict,
937                    &mut warnings,
938                    format!("invalid note header at line {line_no}: {trimmed}"),
939                )?;
940                current_edge_index = None;
941                continue;
942            };
943            current_note = Some(Note {
944                id: id.to_owned(),
945                node_id: node_id.to_owned(),
946                ..Default::default()
947            });
948            current_edge_index = None;
949            last_note_rank = 0;
950            continue;
951        }
952
953        if let Some(note) = current_note.as_mut() {
954            if let Some(rest) = field_value(raw_line, "b") {
955                enforce_field_order(
956                    line_no,
957                    "b",
958                    1,
959                    &mut last_note_rank,
960                    "note",
961                    raw_line,
962                    strict,
963                )?;
964                note.body = parse_text_field(rest);
965                continue;
966            }
967            if let Some(rest) = field_value(raw_line, "t") {
968                enforce_field_order(
969                    line_no,
970                    "t",
971                    2,
972                    &mut last_note_rank,
973                    "note",
974                    raw_line,
975                    strict,
976                )?;
977                let value = parse_text_field(rest);
978                if !value.is_empty() {
979                    note.tags.push(value);
980                }
981                continue;
982            }
983            if let Some(rest) = field_value(raw_line, "a") {
984                enforce_field_order(
985                    line_no,
986                    "a",
987                    3,
988                    &mut last_note_rank,
989                    "note",
990                    raw_line,
991                    strict,
992                )?;
993                note.author = parse_text_field(rest);
994                continue;
995            }
996            if let Some(rest) = field_value(raw_line, "e") {
997                enforce_field_order(
998                    line_no,
999                    "e",
1000                    4,
1001                    &mut last_note_rank,
1002                    "note",
1003                    raw_line,
1004                    strict,
1005                )?;
1006                note.created_at = rest.trim().to_owned();
1007                continue;
1008            }
1009            if let Some(rest) = field_value(raw_line, "p") {
1010                enforce_field_order(
1011                    line_no,
1012                    "p",
1013                    5,
1014                    &mut last_note_rank,
1015                    "note",
1016                    raw_line,
1017                    strict,
1018                )?;
1019                note.provenance = parse_text_field(rest);
1020                continue;
1021            }
1022            if let Some(rest) = field_value(raw_line, "s") {
1023                enforce_field_order(
1024                    line_no,
1025                    "s",
1026                    6,
1027                    &mut last_note_rank,
1028                    "note",
1029                    raw_line,
1030                    strict,
1031                )?;
1032                let value = parse_text_field(rest);
1033                if !value.is_empty() {
1034                    note.source_files.push(value);
1035                }
1036                continue;
1037            }
1038            fail_or_warn(
1039                strict,
1040                &mut warnings,
1041                format!("unrecognized note line at {line_no}: {trimmed}"),
1042            )?;
1043            continue;
1044        }
1045
1046        let Some(node) = current_node.as_mut() else {
1047            fail_or_warn(
1048                strict,
1049                &mut warnings,
1050                format!("unexpected line before first node at line {line_no}: {trimmed}"),
1051            )?;
1052            continue;
1053        };
1054
1055        if let Some(rest) = field_value(raw_line, "N") {
1056            enforce_field_order(
1057                line_no,
1058                "N",
1059                1,
1060                &mut last_node_rank,
1061                "node",
1062                raw_line,
1063                strict,
1064            )?;
1065            let value = parse_text_field(rest);
1066            validate_len(line_no, "N", &value, raw_line, 1, 120, strict)?;
1067            node.name = value;
1068            continue;
1069        }
1070        if let Some(rest) = field_value(raw_line, "D") {
1071            enforce_field_order(
1072                line_no,
1073                "D",
1074                2,
1075                &mut last_node_rank,
1076                "node",
1077                raw_line,
1078                strict,
1079            )?;
1080            let value = parse_text_field(rest);
1081            validate_len(line_no, "D", &value, raw_line, 1, 200, strict)?;
1082            node.properties.description = value;
1083            continue;
1084        }
1085        if let Some(rest) = field_value(raw_line, "A") {
1086            enforce_field_order(
1087                line_no,
1088                "A",
1089                3,
1090                &mut last_node_rank,
1091                "node",
1092                raw_line,
1093                strict,
1094            )?;
1095            let value = parse_text_field(rest);
1096            validate_len(line_no, "A", &value, raw_line, 1, 80, strict)?;
1097            node.properties.alias.push(value);
1098            continue;
1099        }
1100        if let Some(rest) = field_value(raw_line, "F") {
1101            enforce_field_order(
1102                line_no,
1103                "F",
1104                4,
1105                &mut last_node_rank,
1106                "node",
1107                raw_line,
1108                strict,
1109            )?;
1110            let value = parse_text_field(rest);
1111            validate_len(line_no, "F", &value, raw_line, 1, 200, strict)?;
1112            node.properties.key_facts.push(value);
1113            continue;
1114        }
1115        if let Some(rest) = field_value(raw_line, "E") {
1116            enforce_field_order(
1117                line_no,
1118                "E",
1119                5,
1120                &mut last_node_rank,
1121                "node",
1122                raw_line,
1123                strict,
1124            )?;
1125            let value = rest.trim();
1126            if !value.is_empty() && !parse_utc_timestamp(value) {
1127                fail_or_warn(
1128                    strict,
1129                    &mut warnings,
1130                    format!(
1131                        "invalid E timestamp at line {line_no}: expected YYYY-MM-DDTHH:MM:SSZ\n{}",
1132                        line_fragment(raw_line)
1133                    ),
1134                )?;
1135                continue;
1136            }
1137            node.properties.created_at = value.to_owned();
1138            continue;
1139        }
1140        if let Some(rest) = field_value(raw_line, "C") {
1141            enforce_field_order(
1142                line_no,
1143                "C",
1144                6,
1145                &mut last_node_rank,
1146                "node",
1147                raw_line,
1148                strict,
1149            )?;
1150            if !rest.trim().is_empty() {
1151                node.properties.confidence = rest.trim().parse::<f64>().ok();
1152            }
1153            continue;
1154        }
1155        if let Some(rest) = field_value(raw_line, "V") {
1156            enforce_field_order(
1157                line_no,
1158                "V",
1159                7,
1160                &mut last_node_rank,
1161                "node",
1162                raw_line,
1163                strict,
1164            )?;
1165            if let Ok(value) = rest.trim().parse::<f64>() {
1166                node.properties.importance = value;
1167            }
1168            continue;
1169        }
1170        if let Some(rest) = field_value(raw_line, "P") {
1171            enforce_field_order(
1172                line_no,
1173                "P",
1174                8,
1175                &mut last_node_rank,
1176                "node",
1177                raw_line,
1178                strict,
1179            )?;
1180            node.properties.provenance = parse_text_field(rest);
1181            continue;
1182        }
1183        if let Some(rest) = field_value(raw_line, "S") {
1184            enforce_field_order(
1185                line_no,
1186                "S",
1187                10,
1188                &mut last_node_rank,
1189                "node",
1190                raw_line,
1191                strict,
1192            )?;
1193            let value = parse_text_field(rest);
1194            validate_len(line_no, "S", &value, raw_line, 1, 200, strict)?;
1195            node.source_files.push(value);
1196            continue;
1197        }
1198
1199        if let Some(rest) = trimmed.strip_prefix("> ") {
1200            let mut parts = rest.split_whitespace();
1201            let Some(relation) = parts.next() else {
1202                fail_or_warn(
1203                    strict,
1204                    &mut warnings,
1205                    format!("missing relation in edge at line {line_no}: {trimmed}"),
1206                )?;
1207                current_edge_index = None;
1208                continue;
1209            };
1210            let Some(target_id) = parts.next() else {
1211                fail_or_warn(
1212                    strict,
1213                    &mut warnings,
1214                    format!("missing target id in edge at line {line_no}: {trimmed}"),
1215                )?;
1216                current_edge_index = None;
1217                continue;
1218            };
1219            graph.edges.push(Edge {
1220                source_id: node.id.clone(),
1221                relation: code_to_relation(relation).to_owned(),
1222                target_id: target_id.to_owned(),
1223                properties: EdgeProperties::default(),
1224            });
1225            current_edge_index = Some(graph.edges.len() - 1);
1226            last_edge_rank = 0;
1227            continue;
1228        }
1229
1230        if let Some(rest) = trimmed.strip_prefix("= ") {
1231            let mut parts = rest.split_whitespace();
1232            let Some(relation) = parts.next() else {
1233                fail_or_warn(
1234                    strict,
1235                    &mut warnings,
1236                    format!("missing relation in bidirectional edge at line {line_no}: {trimmed}"),
1237                )?;
1238                current_edge_index = None;
1239                continue;
1240            };
1241            let Some(target_id) = parts.next() else {
1242                fail_or_warn(
1243                    strict,
1244                    &mut warnings,
1245                    format!("missing target id in bidirectional edge at line {line_no}: {trimmed}"),
1246                )?;
1247                current_edge_index = None;
1248                continue;
1249            };
1250            let relation = code_to_relation(relation).to_owned();
1251            if relation != "~" {
1252                fail_or_warn(
1253                    strict,
1254                    &mut warnings,
1255                    format!(
1256                        "invalid bidirectional relation at line {line_no}: expected '~', got '{}'",
1257                        relation
1258                    ),
1259                )?;
1260                current_edge_index = None;
1261                continue;
1262            }
1263
1264            let target_id = target_id.to_owned();
1265            let (source_id, target_id) = canonicalize_bidirectional_pair(&node.id, &target_id);
1266            graph.edges.push(Edge {
1267                source_id,
1268                relation,
1269                target_id,
1270                properties: EdgeProperties {
1271                    bidirectional: true,
1272                    ..EdgeProperties::default()
1273                },
1274            });
1275            current_edge_index = Some(graph.edges.len() - 1);
1276            last_edge_rank = 0;
1277            continue;
1278        }
1279
1280        if let Some(rest) = field_value(raw_line, "d") {
1281            enforce_field_order(
1282                line_no,
1283                "d",
1284                1,
1285                &mut last_edge_rank,
1286                "edge",
1287                raw_line,
1288                strict,
1289            )?;
1290            let Some(edge_idx) = current_edge_index else {
1291                fail_or_warn(
1292                    strict,
1293                    &mut warnings,
1294                    format!(
1295                        "edge detail without preceding edge at line {line_no}\n{}",
1296                        line_fragment(raw_line)
1297                    ),
1298                )?;
1299                continue;
1300            };
1301            let trimmed_rest = rest.trim();
1302            let mut parts = trimmed_rest.split_whitespace();
1303            if let (Some(label), Some(raw_score), None) = (parts.next(), parts.next(), parts.next())
1304            {
1305                if is_score_component_label(label) {
1306                    let score = raw_score.parse::<f64>().map_err(|_| {
1307                        anyhow::anyhow!(
1308                            "invalid score component value at line {line_no}: expected number in '{}', got '{}'",
1309                            line_fragment(raw_line),
1310                            raw_score
1311                        )
1312                    })?;
1313                    graph.edges[edge_idx]
1314                        .properties
1315                        .score_components
1316                        .insert(label.to_owned(), score);
1317                    continue;
1318                }
1319            }
1320
1321            let value = parse_text_field(rest);
1322            validate_len(line_no, "d", &value, raw_line, 1, 200, strict)?;
1323            graph.edges[edge_idx].properties.detail = value;
1324            continue;
1325        }
1326
1327        if let Some(rest) = field_value(raw_line, "i") {
1328            enforce_field_order(
1329                line_no,
1330                "i",
1331                2,
1332                &mut last_edge_rank,
1333                "edge",
1334                raw_line,
1335                strict,
1336            )?;
1337            let Some(edge_idx) = current_edge_index else {
1338                fail_or_warn(
1339                    strict,
1340                    &mut warnings,
1341                    format!(
1342                        "edge valid_from without preceding edge at line {line_no}\n{}",
1343                        line_fragment(raw_line)
1344                    ),
1345                )?;
1346                continue;
1347            };
1348            let value = rest.trim();
1349            if !value.is_empty() && !parse_utc_timestamp(value) {
1350                fail_or_warn(
1351                    strict,
1352                    &mut warnings,
1353                    format!(
1354                        "invalid i timestamp at line {line_no}: expected YYYY-MM-DDTHH:MM:SSZ\n{}",
1355                        line_fragment(raw_line)
1356                    ),
1357                )?;
1358                continue;
1359            }
1360            graph.edges[edge_idx].properties.valid_from = value.to_owned();
1361            continue;
1362        }
1363
1364        if let Some(rest) = field_value(raw_line, "x") {
1365            enforce_field_order(
1366                line_no,
1367                "x",
1368                3,
1369                &mut last_edge_rank,
1370                "edge",
1371                raw_line,
1372                strict,
1373            )?;
1374            let Some(edge_idx) = current_edge_index else {
1375                fail_or_warn(
1376                    strict,
1377                    &mut warnings,
1378                    format!(
1379                        "edge valid_to without preceding edge at line {line_no}\n{}",
1380                        line_fragment(raw_line)
1381                    ),
1382                )?;
1383                continue;
1384            };
1385            let value = rest.trim();
1386            if !value.is_empty() && !parse_utc_timestamp(value) {
1387                fail_or_warn(
1388                    strict,
1389                    &mut warnings,
1390                    format!(
1391                        "invalid x timestamp at line {line_no}: expected YYYY-MM-DDTHH:MM:SSZ\n{}",
1392                        line_fragment(raw_line)
1393                    ),
1394                )?;
1395                continue;
1396            }
1397            graph.edges[edge_idx].properties.valid_to = value.to_owned();
1398            continue;
1399        }
1400
1401        if let Some(rest) = field_value(raw_line, "-") {
1402            let (key, value) = rest
1403                .split_once(char::is_whitespace)
1404                .map(|(key, value)| (key.trim(), value))
1405                .unwrap_or((rest.trim(), ""));
1406            let is_edge_custom = matches!(
1407                key,
1408                "edge_feedback_score" | "edge_feedback_count" | "edge_feedback_last_ts_ms"
1409            );
1410            if is_edge_custom {
1411                enforce_field_order(
1412                    line_no,
1413                    "-",
1414                    4,
1415                    &mut last_edge_rank,
1416                    "edge",
1417                    raw_line,
1418                    strict,
1419                )?;
1420            } else {
1421                enforce_field_order(
1422                    line_no,
1423                    "-",
1424                    9,
1425                    &mut last_node_rank,
1426                    "node",
1427                    raw_line,
1428                    strict,
1429                )?;
1430            }
1431            match key {
1432                "domain_area" => node.properties.domain_area = parse_text_field(value),
1433                "scan" => {
1434                    node.properties.scan = parse_boolish(value);
1435                }
1436                "scan_ignore_unknown" => {
1437                    node.properties.scan_ignore_unknown = parse_boolish(value);
1438                }
1439                "feedback_score" => {
1440                    node.properties.feedback_score = value.trim().parse::<f64>().unwrap_or(0.0)
1441                }
1442                "feedback_count" => {
1443                    node.properties.feedback_count = value.trim().parse::<u64>().unwrap_or(0)
1444                }
1445                "feedback_last_ts_ms" => {
1446                    node.properties.feedback_last_ts_ms = value.trim().parse::<u64>().ok()
1447                }
1448                "edge_feedback_score" => {
1449                    if let Some(edge_idx) = current_edge_index {
1450                        graph.edges[edge_idx].properties.feedback_score =
1451                            value.trim().parse::<f64>().unwrap_or(0.0);
1452                    }
1453                }
1454                "edge_feedback_count" => {
1455                    if let Some(edge_idx) = current_edge_index {
1456                        graph.edges[edge_idx].properties.feedback_count =
1457                            value.trim().parse::<u64>().unwrap_or(0);
1458                    }
1459                }
1460                "edge_feedback_last_ts_ms" => {
1461                    if let Some(edge_idx) = current_edge_index {
1462                        graph.edges[edge_idx].properties.feedback_last_ts_ms =
1463                            value.trim().parse::<u64>().ok();
1464                    }
1465                }
1466                _ => {}
1467            }
1468            continue;
1469        }
1470
1471        fail_or_warn(
1472            strict,
1473            &mut warnings,
1474            format!("unrecognized line at {line_no}: {trimmed}"),
1475        )?;
1476    }
1477
1478    if let Some(node) = current_node.take() {
1479        graph.nodes.push(node);
1480    }
1481    if let Some(note) = current_note.take() {
1482        graph.notes.push(note);
1483    }
1484
1485    for node in &mut graph.nodes {
1486        node.properties.alias =
1487            sort_case_insensitive(&dedupe_case_insensitive(node.properties.alias.clone()));
1488        node.properties.key_facts =
1489            sort_case_insensitive(&dedupe_case_insensitive(node.properties.key_facts.clone()));
1490        node.source_files =
1491            sort_case_insensitive(&dedupe_case_insensitive(node.source_files.clone()));
1492    }
1493
1494    graph.edges.sort_by(|a, b| {
1495        a.source_id
1496            .cmp(&b.source_id)
1497            .then_with(|| a.relation.cmp(&b.relation))
1498            .then_with(|| a.target_id.cmp(&b.target_id))
1499            .then_with(|| a.properties.bidirectional.cmp(&b.properties.bidirectional))
1500            .then_with(|| a.properties.detail.cmp(&b.properties.detail))
1501    });
1502
1503    for note in &mut graph.notes {
1504        note.tags = sort_case_insensitive(&dedupe_case_insensitive(note.tags.clone()));
1505        note.source_files =
1506            sort_case_insensitive(&dedupe_case_insensitive(note.source_files.clone()));
1507    }
1508    graph.notes.sort_by(|a, b| {
1509        a.id.cmp(&b.id)
1510            .then_with(|| a.node_id.cmp(&b.node_id))
1511            .then_with(|| a.created_at.cmp(&b.created_at))
1512    });
1513
1514    graph.refresh_counts();
1515    Ok((graph, warnings))
1516}
1517
1518fn serialize_kg(graph: &GraphFile) -> String {
1519    let mut out = String::new();
1520    let mut nodes = graph.nodes.clone();
1521    nodes.sort_by(|a, b| a.id.cmp(&b.id));
1522
1523    for node in nodes {
1524        let generated = crate::validate::is_generated_node_type(&node.r#type);
1525        out.push_str(&format!(
1526            "@ {}:{}\n",
1527            encode_node_type_token(&node.r#type),
1528            display_node_id(&node.id, &node.r#type)
1529        ));
1530        if !node.name.is_empty() {
1531            push_text_line(&mut out, "N", &node.name);
1532        }
1533        if !node.properties.description.is_empty() {
1534            push_text_line(&mut out, "D", &node.properties.description);
1535        }
1536
1537        for alias in sort_case_insensitive(&node.properties.alias) {
1538            push_text_line(&mut out, "A", &alias);
1539        }
1540        for fact in sort_case_insensitive(&node.properties.key_facts) {
1541            push_text_line(&mut out, "F", &fact);
1542        }
1543
1544        if !generated {
1545            if !node.properties.created_at.is_empty() {
1546                out.push_str(&format!("E {}\n", node.properties.created_at));
1547            }
1548            if let Some(confidence) = node.properties.confidence {
1549                out.push_str(&format!("C {}\n", confidence));
1550            }
1551            out.push_str(&format!("V {}\n", node.properties.importance));
1552            if !node.properties.provenance.is_empty() {
1553                push_text_line(&mut out, "P", &node.properties.provenance);
1554            }
1555            if !node.properties.domain_area.is_empty() {
1556                out.push_str("- domain_area ");
1557                out.push_str(&escape_kg_text(&node.properties.domain_area));
1558                out.push('\n');
1559            }
1560            if let Some(scan) = node.properties.scan {
1561                out.push_str(&format!("- scan {}\n", scan));
1562            }
1563            if let Some(scan_ignore_unknown) = node.properties.scan_ignore_unknown {
1564                out.push_str(&format!("- scan_ignore_unknown {}\n", scan_ignore_unknown));
1565            }
1566            if node.properties.feedback_score != 0.0 {
1567                out.push_str(&format!(
1568                    "- feedback_score {}\n",
1569                    node.properties.feedback_score
1570                ));
1571            }
1572            if node.properties.feedback_count != 0 {
1573                out.push_str(&format!(
1574                    "- feedback_count {}\n",
1575                    node.properties.feedback_count
1576                ));
1577            }
1578            if let Some(ts) = node.properties.feedback_last_ts_ms {
1579                out.push_str(&format!("- feedback_last_ts_ms {}\n", ts));
1580            }
1581
1582            for source in sort_case_insensitive(&node.source_files) {
1583                push_text_line(&mut out, "S", &source);
1584            }
1585        }
1586
1587        let mut edges: Vec<Edge> = graph
1588            .edges
1589            .iter()
1590            .filter(|edge| edge.source_id == node.id)
1591            .cloned()
1592            .collect();
1593        edges.sort_by(|a, b| {
1594            a.relation
1595                .cmp(&b.relation)
1596                .then_with(|| a.target_id.cmp(&b.target_id))
1597                .then_with(|| a.properties.bidirectional.cmp(&b.properties.bidirectional))
1598                .then_with(|| a.properties.detail.cmp(&b.properties.detail))
1599        });
1600
1601        for edge in edges {
1602            let op = if edge.properties.bidirectional && edge.relation == "~" {
1603                "="
1604            } else {
1605                ">"
1606            };
1607            out.push_str(&format!(
1608                "{} {} {}\n",
1609                op,
1610                relation_to_code(&edge.relation),
1611                canonical_node_id_for_storage(&edge.target_id)
1612            ));
1613            for (label, score) in &edge.properties.score_components {
1614                out.push_str(&format!("d {} {:.6}\n", label, score));
1615            }
1616            if !edge.properties.detail.is_empty() {
1617                push_text_line(&mut out, "d", &edge.properties.detail);
1618            }
1619            if !edge.properties.valid_from.is_empty() {
1620                out.push_str(&format!("i {}\n", edge.properties.valid_from));
1621            }
1622            if !edge.properties.valid_to.is_empty() {
1623                out.push_str(&format!("x {}\n", edge.properties.valid_to));
1624            }
1625            if edge.properties.feedback_score != 0.0 {
1626                out.push_str(&format!(
1627                    "- edge_feedback_score {}\n",
1628                    edge.properties.feedback_score
1629                ));
1630            }
1631            if edge.properties.feedback_count != 0 {
1632                out.push_str(&format!(
1633                    "- edge_feedback_count {}\n",
1634                    edge.properties.feedback_count
1635                ));
1636            }
1637            if let Some(ts) = edge.properties.feedback_last_ts_ms {
1638                out.push_str(&format!("- edge_feedback_last_ts_ms {}\n", ts));
1639            }
1640        }
1641
1642        out.push('\n');
1643    }
1644
1645    let mut notes = graph.notes.clone();
1646    notes.sort_by(|a, b| {
1647        a.id.cmp(&b.id)
1648            .then_with(|| a.node_id.cmp(&b.node_id))
1649            .then_with(|| a.created_at.cmp(&b.created_at))
1650    });
1651    for note in notes {
1652        out.push_str(&format!(
1653            "! {} {}\n",
1654            note.id,
1655            canonical_node_id_for_storage(&note.node_id)
1656        ));
1657        push_text_line(&mut out, "b", &note.body);
1658        for tag in sort_case_insensitive(&note.tags) {
1659            push_text_line(&mut out, "t", &tag);
1660        }
1661        if !note.author.is_empty() {
1662            push_text_line(&mut out, "a", &note.author);
1663        }
1664        if !note.created_at.is_empty() {
1665            out.push_str(&format!("e {}\n", note.created_at));
1666        }
1667        if !note.provenance.is_empty() {
1668            push_text_line(&mut out, "p", &note.provenance);
1669        }
1670        for source in sort_case_insensitive(&note.source_files) {
1671            push_text_line(&mut out, "s", &source);
1672        }
1673        out.push('\n');
1674    }
1675
1676    out
1677}
1678
/// A complete knowledge graph: its metadata plus the nodes, edges and notes it
/// contains.
///
/// When deserialized, `nodes`, `edges` and `notes` fall back to empty lists if
/// the input omits them; `metadata` is required.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphFile {
    /// Graph-level information (name, versions, description, counts).
    pub metadata: Metadata,
    /// All nodes of the graph.
    #[serde(default)]
    pub nodes: Vec<Node>,
    /// All edges of the graph; each edge names its endpoints by node id.
    #[serde(default)]
    pub edges: Vec<Edge>,
    /// Notes, each attached to a node through `Note::node_id`.
    #[serde(default)]
    pub notes: Vec<Note>,
}
1689
/// Graph-level metadata stored alongside the nodes and edges.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metadata {
    /// Name of the graph.
    pub name: String,
    /// Schema version of the graph; when the field is missing from
    /// deserialized input it is filled by `default_graph_schema_version()`.
    #[serde(default = "default_graph_schema_version")]
    pub schema_version: u32,
    /// Version string of the graph (`GraphFile::new` starts it at `"1.0"`).
    pub version: String,
    /// Free-text description of the graph.
    pub description: String,
    /// Number of nodes.
    /// NOTE(review): presumably kept in sync by `refresh_counts()` — confirm.
    pub node_count: usize,
    /// Number of edges.
    /// NOTE(review): presumably kept in sync by `refresh_counts()` — confirm.
    pub edge_count: usize,
}
1700
/// A single node of the knowledge graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Node {
    /// Identifier of the node; edges and notes refer to nodes by this id.
    pub id: String,
    /// Node type; serialized under the key `type`.
    #[serde(rename = "type")]
    pub r#type: String,
    /// Display name (the `N` line in `.kg` text).
    pub name: String,
    /// Descriptive and bookkeeping properties; defaults when omitted.
    #[serde(default)]
    pub properties: NodeProperties,
    /// Source files associated with the node (the `S` lines in `.kg` text).
    #[serde(default)]
    pub source_files: Vec<String>,
}
1712
/// Properties attached to a [`Node`].
///
/// Every field is optional on deserialization. Missing fields take their type's
/// default, except `importance`, which takes `default_importance()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeProperties {
    /// Description of the node (the `D` line in `.kg` text).
    #[serde(default)]
    pub description: String,
    /// Domain area (the `- domain_area` custom line in `.kg` text).
    #[serde(default)]
    pub domain_area: String,
    /// Provenance of the node (the `P` line in `.kg` text).
    #[serde(default)]
    pub provenance: String,
    /// Optional confidence value (the `C` line in `.kg` text).
    #[serde(default)]
    pub confidence: Option<f64>,
    /// Creation timestamp (the `E` line in `.kg` text); empty when unset.
    #[serde(default)]
    pub created_at: String,
    /// Importance value (the `V` line in `.kg` text).
    #[serde(default = "default_importance")]
    pub importance: f64,
    /// Key facts (the `F` lines in `.kg` text).
    #[serde(default)]
    pub key_facts: Vec<String>,
    /// Alternative names (the `A` lines in `.kg` text).
    #[serde(default)]
    pub alias: Vec<String>,
    /// Start of validity; empty when unset.
    /// NOTE(review): `serialize_kg` does not write this field — confirm whether
    /// it is meant to survive a `.kg` round trip.
    #[serde(default)]
    pub valid_from: String,
    /// End of validity; empty when unset.
    /// NOTE(review): `serialize_kg` does not write this field — confirm whether
    /// it is meant to survive a `.kg` round trip.
    #[serde(default)]
    pub valid_to: String,
    /// Optional `scan` flag (the `- scan` custom line in `.kg` text).
    #[serde(default)]
    pub scan: Option<bool>,
    /// Optional `scan_ignore_unknown` flag (the `- scan_ignore_unknown` custom
    /// line in `.kg` text).
    #[serde(default)]
    pub scan_ignore_unknown: Option<bool>,
    /// Feedback score (`- feedback_score`); written to `.kg` only when non-zero.
    #[serde(default)]
    pub feedback_score: f64,
    /// Feedback count (`- feedback_count`); written to `.kg` only when non-zero.
    #[serde(default)]
    pub feedback_count: u64,
    /// Timestamp of the most recent feedback (`- feedback_last_ts_ms`), if any.
    /// NOTE(review): the name suggests milliseconds — confirm the epoch.
    #[serde(default)]
    pub feedback_last_ts_ms: Option<u64>,
}
1746
/// Importance given to a node when none is specified: used as the serde
/// default for `NodeProperties::importance` and by `NodeProperties::default`.
fn default_importance() -> f64 {
    const FALLBACK_IMPORTANCE: f64 = 0.5;
    FALLBACK_IMPORTANCE
}
1750
/// Schema version used when `Metadata::schema_version` is missing from
/// deserialized input; `GraphFile::new` also starts new graphs at this value.
fn default_graph_schema_version() -> u32 {
    const FALLBACK_SCHEMA_VERSION: u32 = 1;
    FALLBACK_SCHEMA_VERSION
}
1754
1755impl Default for NodeProperties {
1756    fn default() -> Self {
1757        Self {
1758            description: String::new(),
1759            domain_area: String::new(),
1760            provenance: String::new(),
1761            confidence: None,
1762            created_at: String::new(),
1763            importance: default_importance(),
1764            key_facts: Vec::new(),
1765            alias: Vec::new(),
1766            valid_from: String::new(),
1767            valid_to: String::new(),
1768            scan: None,
1769            scan_ignore_unknown: None,
1770            feedback_score: 0.0,
1771            feedback_count: 0,
1772            feedback_last_ts_ms: None,
1773        }
1774    }
1775}
1776
/// A relation between two nodes, identified by their ids.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Edge {
    /// Id of the node the edge starts from; `serialize_kg` writes the edge
    /// inside this node's record.
    pub source_id: String,
    /// Relation name connecting source and target.
    pub relation: String,
    /// Id of the node the edge points to.
    pub target_id: String,
    /// Additional edge data; defaults when omitted.
    #[serde(default)]
    pub properties: EdgeProperties,
}
1785
/// Properties attached to an [`Edge`]. Every field is optional on
/// deserialization and falls back to its type's default.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EdgeProperties {
    /// Free-text detail (a `d` line in `.kg` text); empty when unset.
    #[serde(default)]
    pub detail: String,
    /// Start of validity (the `i` line in `.kg` text); empty when unset.
    #[serde(default)]
    pub valid_from: String,
    /// End of validity (the `x` line in `.kg` text); empty when unset.
    #[serde(default)]
    pub valid_to: String,
    /// Feedback score (`- edge_feedback_score`); written to `.kg` only when
    /// non-zero.
    #[serde(default)]
    pub feedback_score: f64,
    /// Feedback count (`- edge_feedback_count`); written to `.kg` only when
    /// non-zero.
    #[serde(default)]
    pub feedback_count: u64,
    /// Timestamp of the most recent feedback (`- edge_feedback_last_ts_ms`),
    /// if any.
    #[serde(default)]
    pub feedback_last_ts_ms: Option<u64>,
    /// Whether the edge is bidirectional; `serialize_kg` writes such an edge
    /// with the `=` operator when its relation is `~`, and with `>` otherwise.
    #[serde(default)]
    pub bidirectional: bool,
    /// Named score components, kept sorted by label; each is written to `.kg`
    /// as a `d <label> <score>` line with six decimal places.
    #[serde(default)]
    pub score_components: BTreeMap<String, f64>,
}
1805
/// A free-form note attached to a node.
///
/// In `.kg` text a note is a `! <id> <node_id>` header followed by its field
/// lines.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Note {
    /// Identifier of the note.
    pub id: String,
    /// Id of the node this note belongs to.
    pub node_id: String,
    /// Note text (the `b` line in `.kg` text).
    #[serde(default)]
    pub body: String,
    /// Tags (the `t` lines in `.kg` text).
    #[serde(default)]
    pub tags: Vec<String>,
    /// Author (the `a` line in `.kg` text); empty when unset.
    #[serde(default)]
    pub author: String,
    /// Creation timestamp (the `e` line in `.kg` text); empty when unset.
    #[serde(default)]
    pub created_at: String,
    /// Provenance (the `p` line in `.kg` text); empty when unset.
    #[serde(default)]
    pub provenance: String,
    /// Source files associated with the note (the `s` lines in `.kg` text).
    #[serde(default)]
    pub source_files: Vec<String>,
}
1823
1824impl GraphFile {
1825    pub fn new(name: &str) -> Self {
1826        Self {
1827            metadata: Metadata {
1828                name: name.to_owned(),
1829                schema_version: default_graph_schema_version(),
1830                version: "1.0".to_owned(),
1831                description: format!("Knowledge graph: {name}"),
1832                node_count: 0,
1833                edge_count: 0,
1834            },
1835            nodes: Vec::new(),
1836            edges: Vec::new(),
1837            notes: Vec::new(),
1838        }
1839    }
1840
1841    pub fn load(path: &Path) -> Result<Self> {
1842        let raw = fs::read_to_string(path)
1843            .with_context(|| format!("failed to read graph: {}", path.display()))?;
1844        let ext = path
1845            .extension()
1846            .and_then(|ext| ext.to_str())
1847            .unwrap_or("json");
1848        let mut graph = if ext == "kg" {
1849            if raw.trim_start().starts_with('{') {
1850                serde_json::from_str(&raw).map_err(|error| {
1851                    anyhow::anyhow!(json_error_detail(
1852                        "invalid legacy JSON payload in .kg file",
1853                        path,
1854                        &raw,
1855                        &error,
1856                    ))
1857                })?
1858            } else {
1859                let graph_name = path
1860                    .file_stem()
1861                    .and_then(|stem| stem.to_str())
1862                    .unwrap_or("graph");
1863                let decompressed = expand_kg_tokens(&raw);
1864                let (graph, warnings) = parse_kg_with_warnings(
1865                    &decompressed,
1866                    graph_name,
1867                    strict_kg_mode(),
1868                )
1869                    .with_context(|| format!("failed to parse .kg graph: {}", path.display()))?;
1870                for warning in warnings {
1871                    let _ = crate::kg_sidecar::append_warning(
1872                        path,
1873                        &format!(
1874                            "ignored invalid graph entry in {}: {warning}",
1875                            path.display()
1876                        ),
1877                    );
1878                }
1879                graph
1880            }
1881        } else {
1882            serde_json::from_str(&raw).map_err(|error| {
1883                anyhow::anyhow!(json_error_detail("invalid JSON", path, &raw, &error))
1884            })?
1885        };
1886        let schema_version_before = graph_schema_version(&graph);
1887        normalize_graph_ids(&mut graph);
1888        let created_graph_info = ensure_graph_info_node(&mut graph);
1889        graph.metadata.schema_version = GRAPH_SCHEMA_VERSION;
1890        graph.refresh_counts();
1891        if created_graph_info || schema_version_before < GRAPH_SCHEMA_VERSION {
1892            graph.save(path)?;
1893        }
1894        Ok(graph)
1895    }
1896
1897    pub fn save(&self, path: &Path) -> Result<()> {
1898        let mut graph = self.clone();
1899        ensure_graph_info_node(&mut graph);
1900        graph.metadata.schema_version = GRAPH_SCHEMA_VERSION;
1901        graph.refresh_counts();
1902        let ext = path
1903            .extension()
1904            .and_then(|ext| ext.to_str())
1905            .unwrap_or("json");
1906        let raw = if ext == "kg" {
1907            let serialized = serialize_kg(&graph);
1908            let (compressed, stats) = compress_kg_text(&serialized, KG_TEXT_COMPRESSION_MIN_LEN);
1909            let saved_bytes = serialized.len().saturating_sub(compressed.len());
1910            let saved_percent = if serialized.is_empty() {
1911                0.0
1912            } else {
1913                (saved_bytes as f64 * 100.0) / serialized.len() as f64
1914            };
1915            if saved_bytes > 0 {
1916                eprintln!(
1917                    "kg compression: {:.1}% saved ({} -> {} bytes, {} dictionary entries)",
1918                    saved_percent,
1919                    stats.original_bytes,
1920                    stats.compressed_bytes.min(stats.original_bytes),
1921                    stats.dictionary_entries
1922                );
1923            }
1924            compressed
1925        } else {
1926            serde_json::to_string_pretty(&graph).context("failed to serialize graph")?
1927        };
1928        atomic_write(path, &raw)?;
1929        backup_graph_if_stale(path, &raw)
1930    }
1931
1932    pub fn refresh_counts(&mut self) {
1933        self.metadata.node_count = self.nodes.len();
1934        self.metadata.edge_count = self.edges.len();
1935    }
1936
1937    pub fn node_by_id(&self, id: &str) -> Option<&Node> {
1938        self.nodes.iter().find(|node| node.id == id)
1939    }
1940
1941    pub fn node_by_id_sorted(&self, id: &str) -> Option<&Node> {
1942        self.nodes
1943            .binary_search_by(|node| node.id.as_str().cmp(id))
1944            .ok()
1945            .and_then(|idx| self.nodes.get(idx))
1946    }
1947
1948    pub fn node_by_id_mut(&mut self, id: &str) -> Option<&mut Node> {
1949        self.nodes.iter_mut().find(|node| node.id == id)
1950    }
1951
1952    pub fn has_edge(&self, source_id: &str, relation: &str, target_id: &str) -> bool {
1953        self.edges.iter().any(|edge| {
1954            edge.source_id == source_id && edge.relation == relation && edge.target_id == target_id
1955        })
1956    }
1957}
1958
1959fn normalize_graph_ids(graph: &mut GraphFile) {
1960    let mut remap: HashMap<String, String> = HashMap::new();
1961    for node in &mut graph.nodes {
1962        let normalized = crate::validate::canonicalize_node_id_for_type(&node.id, &node.r#type)
1963            .unwrap_or_else(|_| crate::validate::normalize_node_id(&node.id));
1964        if normalized != node.id {
1965            remap.insert(node.id.clone(), normalized.clone());
1966            node.id = normalized;
1967        }
1968    }
1969
1970    let known_ids: std::collections::HashSet<&str> =
1971        graph.nodes.iter().map(|node| node.id.as_str()).collect();
1972
1973    for edge in &mut graph.edges {
1974        edge.source_id = remap.get(&edge.source_id).cloned().unwrap_or_else(|| {
1975            if known_ids.contains(edge.source_id.as_str()) {
1976                edge.source_id.clone()
1977            } else {
1978                crate::validate::normalize_node_id(&edge.source_id)
1979            }
1980        });
1981        edge.target_id = remap.get(&edge.target_id).cloned().unwrap_or_else(|| {
1982            if known_ids.contains(edge.target_id.as_str()) {
1983                edge.target_id.clone()
1984            } else {
1985                crate::validate::normalize_node_id(&edge.target_id)
1986            }
1987        });
1988        if edge.properties.bidirectional {
1989            let (source_id, target_id) =
1990                canonicalize_bidirectional_pair(&edge.source_id, &edge.target_id);
1991            edge.source_id = source_id;
1992            edge.target_id = target_id;
1993        }
1994    }
1995
1996    for note in &mut graph.notes {
1997        note.node_id = remap.get(&note.node_id).cloned().unwrap_or_else(|| {
1998            if known_ids.contains(note.node_id.as_str()) {
1999                note.node_id.clone()
2000            } else {
2001                crate::validate::normalize_node_id(&note.node_id)
2002            }
2003        });
2004    }
2005}
2006
2007fn ensure_graph_info_node(graph: &mut GraphFile) -> bool {
2008    if let Some(node) = graph.node_by_id_mut(GRAPH_INFO_NODE_ID) {
2009        let mut changed = false;
2010        if node.r#type != GRAPH_INFO_NODE_TYPE {
2011            node.r#type = GRAPH_INFO_NODE_TYPE.to_owned();
2012            changed = true;
2013        }
2014        if node.name.is_empty() {
2015            node.name = "Graph Metadata".to_owned();
2016            changed = true;
2017        }
2018        if node.properties.description.is_empty() {
2019            node.properties.description =
2020                "Internal graph metadata for cross-graph linking".to_owned();
2021            changed = true;
2022        }
2023        if !node
2024            .properties
2025            .key_facts
2026            .iter()
2027            .any(|fact| fact.starts_with(GRAPH_UUID_FACT_PREFIX))
2028        {
2029            node.properties
2030                .key_facts
2031                .push(format!("{GRAPH_UUID_FACT_PREFIX}{}", generate_graph_uuid()));
2032            changed = true;
2033        }
2034        let schema_fact = format!("{GRAPH_SCHEMA_VERSION_FACT_PREFIX}{GRAPH_SCHEMA_VERSION}");
2035        let had_schema_fact = node
2036            .properties
2037            .key_facts
2038            .iter()
2039            .any(|fact| fact.starts_with(GRAPH_SCHEMA_VERSION_FACT_PREFIX));
2040        if !had_schema_fact {
2041            node.properties.key_facts.push(schema_fact);
2042            changed = true;
2043        } else {
2044            let mut replaced = false;
2045            for fact in &mut node.properties.key_facts {
2046                if fact.starts_with(GRAPH_SCHEMA_VERSION_FACT_PREFIX) {
2047                    if *fact != schema_fact {
2048                        *fact = schema_fact.clone();
2049                        replaced = true;
2050                    }
2051                }
2052            }
2053            if replaced {
2054                changed = true;
2055            }
2056        }
2057        return changed;
2058    }
2059
2060    graph.nodes.push(Node {
2061        id: GRAPH_INFO_NODE_ID.to_owned(),
2062        r#type: GRAPH_INFO_NODE_TYPE.to_owned(),
2063        name: "Graph Metadata".to_owned(),
2064        properties: NodeProperties {
2065            description: "Internal graph metadata for cross-graph linking".to_owned(),
2066            domain_area: "internal_metadata".to_owned(),
2067            provenance: "A".to_owned(),
2068            importance: 1.0,
2069            key_facts: vec![
2070                format!("{GRAPH_UUID_FACT_PREFIX}{}", generate_graph_uuid()),
2071                format!("{GRAPH_SCHEMA_VERSION_FACT_PREFIX}{GRAPH_SCHEMA_VERSION}"),
2072            ],
2073            ..NodeProperties::default()
2074        },
2075        source_files: vec!["DOC .kg/internal/graph_info".to_owned()],
2076    });
2077    true
2078}
2079
2080fn graph_schema_version(graph: &GraphFile) -> u32 {
2081    graph
2082        .node_by_id(GRAPH_INFO_NODE_ID)
2083        .and_then(|node| {
2084            node.properties.key_facts.iter().find_map(|fact| {
2085                fact.strip_prefix(GRAPH_SCHEMA_VERSION_FACT_PREFIX)
2086                    .and_then(|value| value.parse::<u32>().ok())
2087            })
2088        })
2089        .unwrap_or(graph.metadata.schema_version)
2090}
2091
2092fn display_node_id(id: &str, node_type: &str) -> String {
2093    let Some((head, suffix)) = id.split_once(':') else {
2094        return id.to_owned();
2095    };
2096    if head == node_type
2097        || crate::validate::canonical_type_code_for(node_type).is_some_and(|code| code == head)
2098        || crate::validate::TYPE_TO_PREFIX
2099            .iter()
2100            .any(|(typ, prefix)| *typ == node_type && *prefix == head)
2101    {
2102        return suffix.to_owned();
2103    }
2104    id.to_owned()
2105}
2106
2107fn canonical_node_id_for_storage(id: &str) -> String {
2108    let Some((head, suffix)) = id.split_once(':') else {
2109        return id.to_owned();
2110    };
2111    let Some(node_type) = crate::validate::TYPE_TO_PREFIX
2112        .iter()
2113        .find(|(typ, prefix)| {
2114            crate::validate::canonical_type_code_for(typ).is_some_and(|code| code == head)
2115                || *prefix == head
2116        })
2117        .map(|(typ, _)| *typ)
2118    else {
2119        return id.to_owned();
2120    };
2121    crate::validate::canonical_type_code_for(node_type)
2122        .map(|code| format!("{code}:{suffix}"))
2123        .unwrap_or_else(|| id.to_owned())
2124}
2125
/// Produces a 20-character lowercase hexadecimal identifier (10 bytes).
///
/// The bytes come from `/dev/urandom`. If that file cannot be opened or read
/// in full, they are derived from the current time in nanoseconds mixed with
/// the process ID instead.
fn generate_graph_uuid() -> String {
    use std::io::Read;

    let mut entropy = [0u8; 10];
    let urandom_read =
        fs::File::open("/dev/urandom").and_then(|mut device| device.read_exact(&mut entropy));

    if urandom_read.is_err() {
        // Fallback: overwrite the whole buffer with time/pid-derived bytes.
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_nanos();
        let pid = std::process::id() as u128;
        let mixed = nanos ^ (pid << 64) ^ (nanos.rotate_left(17));
        entropy.copy_from_slice(&mixed.to_be_bytes()[6..16]);
    }

    entropy.iter().map(|byte| format!("{byte:02x}")).collect()
}
2149
2150#[cfg(test)]
2151mod tests {
2152    use super::{
2153        compress_kg_text, expand_kg_tokens, GRAPH_INFO_NODE_ID, GRAPH_INFO_NODE_TYPE,
2154        GRAPH_SCHEMA_VERSION, GRAPH_UUID_FACT_PREFIX, GraphFile, KG_TEXT_COMPRESSION_MIN_LEN,
2155        parse_kg,
2156    };
2157
2158    #[test]
2159    fn save_and_load_kg_roundtrip_keeps_core_fields() {
2160        let dir = tempfile::tempdir().expect("temp dir");
2161        let path = dir.path().join("graph.kg");
2162
2163        let mut graph = GraphFile::new("graph");
2164        graph.nodes.push(crate::Node {
2165            id: "concept:refrigerator".to_owned(),
2166            r#type: "Concept".to_owned(),
2167            name: "Lodowka".to_owned(),
2168            properties: crate::NodeProperties {
2169                description: "Urzadzenie chlodzace".to_owned(),
2170                provenance: "U".to_owned(),
2171                created_at: "2026-04-04T12:00:00Z".to_owned(),
2172                importance: 5.0,
2173                key_facts: vec!["A".to_owned(), "b".to_owned()],
2174                alias: vec!["Fridge".to_owned()],
2175                scan: Some(true),
2176                scan_ignore_unknown: Some(true),
2177                ..Default::default()
2178            },
2179            source_files: vec!["docs/fridge.md".to_owned()],
2180        });
2181        graph.edges.push(crate::Edge {
2182            source_id: "concept:refrigerator".to_owned(),
2183            relation: "READS_FROM".to_owned(),
2184            target_id: "datastore:settings".to_owned(),
2185            properties: crate::EdgeProperties {
2186                detail: "runtime read".to_owned(),
2187                valid_from: "2026-04-04T12:00:00Z".to_owned(),
2188                valid_to: "2026-04-05T12:00:00Z".to_owned(),
2189                ..Default::default()
2190            },
2191        });
2192
2193        graph.save(&path).expect("save kg");
2194        let raw = std::fs::read_to_string(&path).expect("read kg");
2195        assert!(raw.contains("@ K:refrigerator"));
2196        assert!(raw.contains("> R D:settings"));
2197
2198        let loaded = GraphFile::load(&path).expect("load kg");
2199        assert_eq!(loaded.nodes.len(), 2);
2200        assert_eq!(loaded.edges.len(), 1);
2201        let node = loaded
2202            .node_by_id("concept:refrigerator")
2203            .expect("domain node");
2204        assert_eq!(node.properties.importance, 5.0);
2205        assert_eq!(node.properties.provenance, "U");
2206        assert_eq!(node.properties.scan, Some(true));
2207        assert_eq!(node.properties.scan_ignore_unknown, Some(true));
2208        assert_eq!(node.name, "Lodowka");
2209        assert_eq!(loaded.edges[0].relation, "READS_FROM");
2210        assert_eq!(loaded.edges[0].properties.detail, "runtime read");
2211        assert_eq!(
2212            loaded.edges[0].properties.valid_from,
2213            "2026-04-04T12:00:00Z"
2214        );
2215        assert_eq!(loaded.edges[0].properties.valid_to, "2026-04-05T12:00:00Z");
2216        assert_eq!(loaded.metadata.schema_version, GRAPH_SCHEMA_VERSION);
2217    }
2218
2219    #[test]
2220    fn load_supports_legacy_json_payload_with_kg_extension() {
2221        let dir = tempfile::tempdir().expect("temp dir");
2222        let path = dir.path().join("legacy.kg");
2223        std::fs::write(
2224            &path,
2225            r#"{
2226  "metadata": {"name": "legacy", "version": "1.0", "description": "x", "node_count": 0, "edge_count": 0},
2227  "nodes": [],
2228  "edges": [],
2229  "notes": []
2230}"#,
2231        )
2232        .expect("write legacy payload");
2233
2234        let loaded = GraphFile::load(&path).expect("load legacy kg");
2235        assert_eq!(loaded.metadata.name, "legacy");
2236        assert_eq!(loaded.metadata.schema_version, GRAPH_SCHEMA_VERSION);
2237        assert_eq!(loaded.nodes.len(), 1);
2238        assert!(loaded.node_by_id(GRAPH_INFO_NODE_ID).is_some());
2239    }
2240
2241    #[test]
2242    fn load_kg_auto_migrates_legacy_id_prefixes() {
2243        let dir = tempfile::tempdir().expect("temp dir");
2244        let path = dir.path().join("legacy-ids.kg");
2245        std::fs::write(
2246            &path,
2247            "@ K:concept:x\nN X\nD Desc\nV 0.5\nP U\nS docs/a.md\n> R datastore:y\n",
2248        )
2249        .expect("write kg");
2250
2251        let loaded = GraphFile::load(&path).expect("load kg");
2252        assert_eq!(loaded.metadata.schema_version, GRAPH_SCHEMA_VERSION);
2253        assert!(loaded.node_by_id("concept:x").is_some());
2254
2255        let persisted = std::fs::read_to_string(&path).expect("read migrated kg");
2256        assert!(persisted.contains("@ K:x"));
2257        assert!(persisted.contains("> R D:y"));
2258        assert!(persisted.contains(&format!("schema_version={GRAPH_SCHEMA_VERSION}")));
2259    }
2260
2261    #[test]
2262    fn load_kg_ignores_invalid_timestamp_format() {
2263        let dir = tempfile::tempdir().expect("temp dir");
2264        let path = dir.path().join("invalid-ts.kg");
2265        std::fs::write(
2266            &path,
2267            "@ K:concept:x\nN X\nD Desc\nE 2026-04-04 12:00:00\nV 4\nP U\n",
2268        )
2269        .expect("write kg");
2270
2271        let loaded = GraphFile::load(&path).expect("invalid timestamp should be ignored");
2272        assert_eq!(loaded.nodes.len(), 2);
2273        assert!(
2274            loaded
2275                .node_by_id("concept:x")
2276                .expect("concept node")
2277                .properties
2278                .created_at
2279                .is_empty()
2280        );
2281    }
2282
2283    #[test]
2284    fn load_kg_ignores_invalid_edge_timestamp_format() {
2285        let dir = tempfile::tempdir().expect("temp dir");
2286        let path = dir.path().join("invalid-edge-ts.kg");
2287        std::fs::write(
2288            &path,
2289            "@ K:concept:x\nN X\nD Desc\nE 2026-04-04T12:00:00Z\nV 4\nP U\nS docs/a.md\n> H concept:y\ni 2026-04-04 12:00:00\n",
2290        )
2291        .expect("write kg");
2292
2293        let loaded = GraphFile::load(&path).expect("invalid edge timestamp should be ignored");
2294        assert_eq!(loaded.edges.len(), 1);
2295        assert!(loaded.edges[0].properties.valid_from.is_empty());
2296    }
2297
2298    #[test]
2299    fn load_kg_preserves_whitespace_and_dedupes_exact_duplicates() {
2300        let dir = tempfile::tempdir().expect("temp dir");
2301        let path = dir.path().join("normalize.kg");
2302        std::fs::write(
2303            &path,
2304            "@ K:concept:x\nN  Name   With   Spaces \nD  Desc   with   spaces \nA Alias\nA Alias\nF fact one\nF FACT   one\nS docs/a.md\nS docs/a.md\nE 2026-04-04T12:00:00Z\nV 4\nP U\n",
2305        )
2306        .expect("write kg");
2307
2308        let loaded = GraphFile::load(&path).expect("load kg");
2309        let node = loaded.node_by_id("concept:x").expect("concept node");
2310        assert_eq!(node.name, " Name   With   Spaces ");
2311        assert_eq!(node.properties.description, " Desc   with   spaces ");
2312        assert_eq!(node.properties.alias.len(), 1);
2313        assert_eq!(node.properties.key_facts.len(), 2);
2314        assert_eq!(node.source_files.len(), 1);
2315    }
2316
2317    #[test]
2318    fn save_and_load_kg_roundtrip_keeps_notes_without_json_fallback() {
2319        let dir = tempfile::tempdir().expect("temp dir");
2320        let path = dir.path().join("graph-notes.kg");
2321
2322        let mut graph = GraphFile::new("graph-notes");
2323        graph.nodes.push(crate::Node {
2324            id: "concept:refrigerator".to_owned(),
2325            r#type: "Concept".to_owned(),
2326            name: "Lodowka".to_owned(),
2327            properties: crate::NodeProperties {
2328                description: "Urzadzenie chlodzace".to_owned(),
2329                provenance: "U".to_owned(),
2330                created_at: "2026-04-04T12:00:00Z".to_owned(),
2331                ..Default::default()
2332            },
2333            source_files: vec!["docs/fridge.md".to_owned()],
2334        });
2335        graph.notes.push(crate::Note {
2336            id: "note:1".to_owned(),
2337            node_id: "concept:refrigerator".to_owned(),
2338            body: "Important maintenance insight".to_owned(),
2339            tags: vec!["Maintenance".to_owned(), "maintenance".to_owned()],
2340            author: "alice".to_owned(),
2341            created_at: "1712345678".to_owned(),
2342            provenance: "U".to_owned(),
2343            source_files: vec!["docs/a.md".to_owned(), "docs/a.md".to_owned()],
2344        });
2345
2346        graph.save(&path).expect("save kg");
2347        let raw = std::fs::read_to_string(&path).expect("read kg");
2348        assert!(raw.contains("! note:1 K:refrigerator"));
2349        assert!(!raw.trim_start().starts_with('{'));
2350
2351        let loaded = GraphFile::load(&path).expect("load kg");
2352        assert_eq!(loaded.notes.len(), 1);
2353        let note = &loaded.notes[0];
2354        assert_eq!(note.id, "note:1");
2355        assert_eq!(note.node_id, "concept:refrigerator");
2356        assert_eq!(note.body, "Important maintenance insight");
2357        assert_eq!(note.tags.len(), 1);
2358        assert_eq!(note.source_files.len(), 1);
2359    }
2360
2361    #[test]
2362    fn save_and_load_kg_roundtrip_preserves_multiline_text_fields() {
2363        let dir = tempfile::tempdir().expect("temp dir");
2364        let path = dir.path().join("graph-multiline.kg");
2365
2366        let mut graph = GraphFile::new("graph-multiline");
2367        graph.nodes.push(crate::Node {
2368            id: "concept:refrigerator".to_owned(),
2369            r#type: "Concept".to_owned(),
2370            name: "Lodowka\nSmart".to_owned(),
2371            properties: crate::NodeProperties {
2372                description: "Linia 1\nLinia 2\\nliteral".to_owned(),
2373                provenance: "user\nimport".to_owned(),
2374                created_at: "2026-04-04T12:00:00Z".to_owned(),
2375                importance: 5.0,
2376                key_facts: vec!["Fakt 1\nFakt 2".to_owned()],
2377                alias: vec!["Alias\nA".to_owned()],
2378                domain_area: "ops\nfield".to_owned(),
2379                ..Default::default()
2380            },
2381            source_files: vec!["docs/fridge\nnotes.md".to_owned()],
2382        });
2383        graph.edges.push(crate::Edge {
2384            source_id: "concept:refrigerator".to_owned(),
2385            relation: "READS_FROM".to_owned(),
2386            target_id: "datastore:settings".to_owned(),
2387            properties: crate::EdgeProperties {
2388                detail: "runtime\nread".to_owned(),
2389                valid_from: "2026-04-04T12:00:00Z".to_owned(),
2390                valid_to: "2026-04-05T12:00:00Z".to_owned(),
2391                ..Default::default()
2392            },
2393        });
2394        graph.notes.push(crate::Note {
2395            id: "note:1".to_owned(),
2396            node_id: "concept:refrigerator".to_owned(),
2397            body: "line1\nline2\\nkeep".to_owned(),
2398            tags: vec!["multi\nline".to_owned()],
2399            author: "alice\nbob".to_owned(),
2400            created_at: "1712345678".to_owned(),
2401            provenance: "manual\nentry".to_owned(),
2402            source_files: vec!["docs/a\nb.md".to_owned()],
2403        });
2404
2405        graph.save(&path).expect("save kg");
2406        let raw = std::fs::read_to_string(&path).expect("read kg");
2407        assert!(raw.contains("@ K:refrigerator"));
2408        assert!(raw.contains("> R D:settings"));
2409        assert!(raw.contains("! note:1 K:refrigerator"));
2410        assert!(raw.contains("N Lodowka\\nSmart"));
2411        assert!(raw.contains("D Linia 1\\nLinia 2\\\\nliteral"));
2412        assert!(raw.contains("- domain_area ops\\nfield"));
2413        assert!(raw.contains("d runtime\\nread"));
2414        assert!(raw.contains("b line1\\nline2\\\\nkeep"));
2415
2416        let loaded = GraphFile::load(&path).expect("load kg");
2417        let node = loaded
2418            .node_by_id("concept:refrigerator")
2419            .expect("domain node");
2420        assert_eq!(node.name, "Lodowka\nSmart");
2421        assert_eq!(node.properties.description, "Linia 1\nLinia 2\\nliteral");
2422        assert_eq!(node.properties.provenance, "user\nimport");
2423        assert_eq!(node.properties.alias, vec!["Alias\nA".to_owned()]);
2424        assert_eq!(node.properties.key_facts, vec!["Fakt 1\nFakt 2".to_owned()]);
2425        assert_eq!(node.properties.domain_area, "ops\nfield");
2426        assert_eq!(node.source_files, vec!["docs/fridge\nnotes.md".to_owned()]);
2427        assert_eq!(loaded.edges[0].properties.detail, "runtime\nread");
2428        let note = &loaded.notes[0];
2429        assert_eq!(note.body, "line1\nline2\\nkeep");
2430        assert_eq!(note.tags, vec!["multi\nline".to_owned()]);
2431        assert_eq!(note.author, "alice\nbob");
2432        assert_eq!(note.provenance, "manual\nentry");
2433        assert_eq!(note.source_files, vec!["docs/a\nb.md".to_owned()]);
2434    }
2435
2436    #[test]
2437    fn compress_kg_text_only_touches_generated_node_blocks() {
2438        let raw = concat!(
2439            "@ GDIR:src\n",
2440            "N alpha beta gamma\n",
2441            "D alpha beta gamma and more\n",
2442            "\n",
2443            "@ K:concept:plain\n",
2444            "N alpha beta gamma\n",
2445            "D alpha beta gamma and more\n",
2446            "E 2026-04-04T12:00:00Z\n",
2447            "V 4\n",
2448            "P U\n",
2449            "S docs/plain.md\n",
2450            "\n",
2451        );
2452
2453        let (compressed, stats) = compress_kg_text(raw, KG_TEXT_COMPRESSION_MIN_LEN);
2454        assert!(stats.dictionary_entries > 0);
2455        assert!(compressed.contains("`1 "));
2456        assert!(compressed.contains("N`1`"));
2457        assert!(compressed.contains("D`1` and more"));
2458
2459        let manual_block = compressed
2460            .split("@ K:concept:plain")
2461            .nth(1)
2462            .expect("manual block");
2463        assert!(!manual_block.contains("`1`"));
2464
2465        let decompressed = expand_kg_tokens(&compressed);
2466        assert_eq!(decompressed, raw);
2467    }
2468
2469    #[test]
2470    fn load_kg_expands_backtick_tokens_before_parsing() {
2471        let dir = tempfile::tempdir().expect("temp dir");
2472        let path = dir.path().join("compressed.kg");
2473        std::fs::write(
2474            &path,
2475            concat!(
2476                "`1 alpha beta gamma\n",
2477                "@ GDIR:src\n",
2478                "N `1`\n",
2479                "D `1` and more\n",
2480                "\n",
2481            ),
2482        )
2483        .expect("write kg");
2484
2485        let loaded = GraphFile::load(&path).expect("load kg");
2486        let node = loaded.node_by_id("GDIR:src").expect("generated node");
2487        assert_eq!(node.name, "alpha beta gamma");
2488        assert_eq!(node.properties.description, "alpha beta gamma and more");
2489    }
2490
2491    #[test]
2492    fn parse_bidirectional_similarity_edge_is_canonical_and_scored() {
2493        let raw = "@ ~:dedupe_b\nN B\nD Desc\nV 0.5\nP U\nS docs/b.md\n= ~ ~:dedupe_a\nd C1 0.11\nd C2 0.83\nd 0.91\n\n@ ~:dedupe_a\nN A\nD Desc\nV 0.5\nP U\nS docs/a.md\n";
2494        let graph = parse_kg(raw, "virt", true).expect("parse kg");
2495
2496        assert_eq!(graph.nodes.len(), 2);
2497        assert_eq!(graph.edges.len(), 1);
2498        let edge = &graph.edges[0];
2499        assert_eq!(edge.relation, "~");
2500        assert_eq!(edge.source_id, "~:dedupe_a");
2501        assert_eq!(edge.target_id, "~:dedupe_b");
2502        assert_eq!(edge.properties.detail, "0.91");
2503        assert!(edge.properties.bidirectional);
2504        assert_eq!(edge.properties.score_components.get("C1"), Some(&0.11));
2505        assert_eq!(edge.properties.score_components.get("C2"), Some(&0.83));
2506    }
2507
2508    #[test]
2509    fn serialize_bidirectional_similarity_edge_uses_equals_operator() {
2510        let dir = tempfile::tempdir().expect("temp dir");
2511        let path = dir.path().join("virt.kg");
2512        let mut graph = GraphFile::new("virt");
2513        graph.nodes.push(crate::Node {
2514            id: "~:dedupe_a".to_owned(),
2515            r#type: "~".to_owned(),
2516            name: "A".to_owned(),
2517            properties: crate::NodeProperties {
2518                description: "Desc".to_owned(),
2519                provenance: "U".to_owned(),
2520                created_at: "2026-04-10T00:00:00Z".to_owned(),
2521                importance: 0.6,
2522                ..Default::default()
2523            },
2524            source_files: vec!["docs/a.md".to_owned()],
2525        });
2526        graph.nodes.push(crate::Node {
2527            id: "~:dedupe_b".to_owned(),
2528            r#type: "~".to_owned(),
2529            name: "B".to_owned(),
2530            properties: crate::NodeProperties {
2531                description: "Desc".to_owned(),
2532                provenance: "U".to_owned(),
2533                created_at: "2026-04-10T00:00:00Z".to_owned(),
2534                importance: 0.6,
2535                ..Default::default()
2536            },
2537            source_files: vec!["docs/b.md".to_owned()],
2538        });
2539        graph.edges.push(crate::Edge {
2540            source_id: "~:dedupe_a".to_owned(),
2541            relation: "~".to_owned(),
2542            target_id: "~:dedupe_b".to_owned(),
2543            properties: crate::EdgeProperties {
2544                detail: "0.75".to_owned(),
2545                bidirectional: true,
2546                score_components: std::collections::BTreeMap::from([
2547                    ("C1".to_owned(), 0.2),
2548                    ("C2".to_owned(), 0.8),
2549                ]),
2550                ..Default::default()
2551            },
2552        });
2553
2554        graph.save(&path).expect("save");
2555        let raw = std::fs::read_to_string(&path).expect("read");
2556        assert!(raw.contains("= ~ ~:dedupe_b"));
2557        assert!(raw.contains("d C1 0.200000"));
2558        assert!(raw.contains("d C2 0.800000"));
2559        assert!(!raw.contains("> ~ ~:dedupe_b"));
2560
2561        let loaded = GraphFile::load(&path).expect("load");
2562        assert_eq!(loaded.edges.len(), 1);
2563        assert!(loaded.edges[0].properties.bidirectional);
2564        assert_eq!(loaded.edges[0].properties.detail, "0.75");
2565        assert_eq!(
2566            loaded.edges[0].properties.score_components.get("C1"),
2567            Some(&0.2)
2568        );
2569        assert_eq!(
2570            loaded.edges[0].properties.score_components.get("C2"),
2571            Some(&0.8)
2572        );
2573    }
2574
2575    #[test]
2576    fn strict_mode_rejects_bidirectional_relation_other_than_similarity() {
2577        let raw = "@ K:concept:a\nN A\nD Desc\nV 0.5\nP U\nS docs/a.md\n= HAS concept:b\n";
2578        let err = parse_kg(raw, "x", true).expect_err("strict mode should reject invalid '='");
2579        assert!(format!("{err:#}").contains("expected '~'"));
2580    }
2581
2582    #[test]
2583    fn strict_mode_rejects_out_of_order_node_fields() {
2584        let raw = "@ K:concept:x\nD Desc\nN Name\nE 2026-04-04T12:00:00Z\nV 4\nP U\nS docs/a.md\n";
2585        let err = parse_kg(raw, "x", true).expect_err("strict mode should fail on field order");
2586        assert!(format!("{err:#}").contains("invalid field order"));
2587    }
2588
2589    #[test]
2590    fn strict_mode_rejects_overlong_name_but_compat_mode_allows_it() {
2591        let long_name = "N ".to_owned() + &"X".repeat(121);
2592        let raw = format!(
2593            "@ K:concept:x\n{}\nD Desc\nE 2026-04-04T12:00:00Z\nV 4\nP U\nS docs/a.md\n",
2594            long_name
2595        );
2596
2597        let strict_err = parse_kg(&raw, "x", true).expect_err("strict mode should fail on length");
2598        assert!(format!("{strict_err:#}").contains("invalid N length"));
2599
2600        parse_kg(&raw, "x", false).expect("compat mode keeps permissive behavior");
2601    }
2602
2603    #[test]
2604    fn save_kg_skips_empty_e_and_p_fields() {
2605        let dir = tempfile::tempdir().expect("temp dir");
2606        let path = dir.path().join("no-empty-ep.kg");
2607
2608        let mut graph = GraphFile::new("graph");
2609        graph.nodes.push(crate::Node {
2610            id: "concept:x".to_owned(),
2611            r#type: "Concept".to_owned(),
2612            name: "X".to_owned(),
2613            properties: crate::NodeProperties {
2614                description: "Desc".to_owned(),
2615                provenance: String::new(),
2616                created_at: String::new(),
2617                ..Default::default()
2618            },
2619            source_files: vec!["docs/a.md".to_owned()],
2620        });
2621
2622        graph.save(&path).expect("save kg");
2623        let raw = std::fs::read_to_string(&path).expect("read kg");
2624        assert!(!raw.contains("\nE \n"));
2625        assert!(!raw.contains("\nP \n"));
2626    }
2627
2628    #[test]
2629    fn load_generates_graph_info_node_when_missing() {
2630        let dir = tempfile::tempdir().expect("temp dir");
2631        let path = dir.path().join("meta.kg");
2632        let raw = "@ K:concept:x\nN X\nD Desc\nV 0.5\nP U\nS docs/a.md\n";
2633        std::fs::write(&path, raw).expect("write kg");
2634
2635        let loaded = GraphFile::load(&path).expect("load kg");
2636        let info = loaded
2637            .node_by_id(GRAPH_INFO_NODE_ID)
2638            .expect("graph info node should be generated");
2639        assert_eq!(info.r#type, GRAPH_INFO_NODE_TYPE);
2640        assert!(
2641            info.properties
2642                .key_facts
2643                .iter()
2644                .any(|fact| fact.starts_with(GRAPH_UUID_FACT_PREFIX))
2645        );
2646
2647        let persisted = std::fs::read_to_string(&path).expect("read persisted kg");
2648        assert!(persisted.contains("graph_info"));
2649        assert!(persisted.contains("graph_uuid="));
2650        assert!(persisted.contains("schema_version="));
2651    }
2652}