1use std::collections::{BTreeMap, HashMap};
2use std::fs;
3use std::io::Write;
4use std::path::{Path, PathBuf};
5use std::time::{SystemTime, UNIX_EPOCH};
6
7use anyhow::{Context, Result};
8use flate2::Compression;
9use flate2::write::GzEncoder;
10use serde::{Deserialize, Serialize};
11
// NOTE(review): none of these constants are referenced in the part of the file
// visible here; the descriptions below are inferred from names and values and
// should be confirmed against their use sites.

/// Presumably the id of the reserved graph-info metadata node.
const GRAPH_INFO_NODE_ID: &str = "^:graph_info";
/// Presumably the node type token of the reserved graph-info node.
const GRAPH_INFO_NODE_TYPE: &str = "^";
/// Presumably the prefix of the fact that stores the graph UUID.
const GRAPH_UUID_FACT_PREFIX: &str = "graph_uuid=";
/// Presumably the schema version written by this build.
const GRAPH_SCHEMA_VERSION: u32 = 2;
/// Presumably the prefix of the fact that stores the schema version.
const GRAPH_SCHEMA_VERSION_FACT_PREFIX: &str = "schema_version=";
/// Presumably the minimum repeated-substring length (in chars) passed to the
/// text compressor as `min_len` — TODO confirm at the call site.
const KG_TEXT_COMPRESSION_MIN_LEN: usize = 7;
18
19fn atomic_write(dest: &Path, data: &str) -> Result<()> {
24 let unique = SystemTime::now()
25 .duration_since(UNIX_EPOCH)
26 .unwrap_or_default()
27 .as_nanos();
28 let tmp = dest.with_extension(format!("tmp.{}.{}", std::process::id(), unique));
29 fs::write(&tmp, data).with_context(|| format!("failed to write tmp: {}", tmp.display()))?;
30 if dest.exists() {
31 let bak = backup_bak_path(dest)?;
32 if should_refresh_bak(&bak)? {
33 fs::copy(dest, &bak)
34 .with_context(|| format!("failed to create backup: {}", bak.display()))?;
35 }
36 }
37 fs::rename(&tmp, dest).with_context(|| format!("failed to rename tmp to {}", dest.display()))
38}
39
/// Age in seconds (5 minutes) at which `should_refresh_bak` reports an
/// existing `.bak` file as due for a refresh.
const BACKUP_BAK_STALE_SECS: u64 = 5 * 60;
/// Minimum spacing in seconds (1 hour) between timestamped `.bck.<ts>.gz`
/// snapshots written by `backup_graph_if_stale`.
const BACKUP_STALE_SECS: u64 = 60 * 60;
42
43fn should_refresh_bak(bak_path: &Path) -> Result<bool> {
44 if !bak_path.exists() {
45 return Ok(true);
46 }
47 let modified = fs::metadata(bak_path)
48 .and_then(|m| m.modified())
49 .with_context(|| format!("failed to read backup mtime: {}", bak_path.display()))?;
50 let age_secs = SystemTime::now()
51 .duration_since(modified)
52 .unwrap_or_default()
53 .as_secs();
54 Ok(age_secs >= BACKUP_BAK_STALE_SECS)
55}
56
57fn backup_graph_if_stale(path: &Path, data: &str) -> Result<()> {
58 let cache_dir = backup_cache_dir(path)?;
59 let stem = match path.file_stem().and_then(|s| s.to_str()) {
60 Some(stem) => stem,
61 None => return Ok(()),
62 };
63 let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("json");
64 let backup_prefix = format!("{stem}.{ext}");
65 let now = SystemTime::now()
66 .duration_since(UNIX_EPOCH)
67 .context("time went backwards")?
68 .as_secs();
69 if let Some(latest) = latest_backup_ts(&cache_dir, &backup_prefix)? {
70 if now.saturating_sub(latest) < BACKUP_STALE_SECS {
71 return Ok(());
72 }
73 }
74
75 let backup_path = cache_dir.join(format!("{backup_prefix}.bck.{now}.gz"));
76 let tmp_path = backup_path.with_extension("tmp");
77 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
78 encoder.write_all(data.as_bytes())?;
79 let encoded = encoder.finish()?;
80 fs::write(&tmp_path, encoded)
81 .with_context(|| format!("failed to write tmp: {}", tmp_path.display()))?;
82 fs::rename(&tmp_path, &backup_path)
83 .with_context(|| format!("failed to rename tmp to {}", backup_path.display()))?;
84 Ok(())
85}
86
87fn backup_cache_dir(path: &Path) -> Result<PathBuf> {
88 let dir = crate::cache_paths::cache_root_for_graph(path);
89 fs::create_dir_all(&dir)
90 .with_context(|| format!("failed to create cache directory: {}", dir.display()))?;
91 Ok(dir)
92}
93
94fn backup_bak_path(dest: &Path) -> Result<PathBuf> {
95 let cache_dir = backup_cache_dir(dest)?;
96 let stem = dest.file_stem().and_then(|s| s.to_str()).unwrap_or("graph");
97 let ext = dest.extension().and_then(|s| s.to_str()).unwrap_or("json");
98 Ok(cache_dir.join(format!("{stem}.{ext}.bak")))
99}
100
101fn latest_backup_ts(dir: &Path, stem: &str) -> Result<Option<u64>> {
102 let prefix = format!("{stem}.bck.");
103 let suffix = ".gz";
104 let mut latest = None;
105 for entry in fs::read_dir(dir).with_context(|| format!("read dir: {}", dir.display()))? {
106 let entry = entry?;
107 let name = entry.file_name();
108 let name = name.to_string_lossy();
109 if !name.starts_with(&prefix) || !name.ends_with(suffix) {
110 continue;
111 }
112 let ts_part = &name[prefix.len()..name.len() - suffix.len()];
113 if let Ok(ts) = ts_part.parse::<u64>() {
114 match latest {
115 Some(current) => {
116 if ts > current {
117 latest = Some(ts);
118 }
119 }
120 None => latest = Some(ts),
121 }
122 }
123 }
124 Ok(latest)
125}
126
/// Maps a full node type name to its one-letter code.
///
/// Unknown names are returned unchanged.
fn node_type_to_code(node_type: &str) -> &str {
    const CODES: &[(&str, &str)] = &[
        ("Feature", "F"),
        ("Concept", "K"),
        ("Interface", "I"),
        ("Process", "P"),
        ("DataStore", "D"),
        ("Attribute", "A"),
        ("Entity", "Y"),
        ("Note", "N"),
        ("Rule", "R"),
        ("Convention", "C"),
        ("Bug", "B"),
        ("Decision", "Z"),
        ("OpenQuestion", "O"),
        ("Claim", "Q"),
        ("Insight", "W"),
        ("Reference", "M"),
        ("Term", "T"),
        ("Status", "S"),
        ("Doubt", "L"),
    ];
    CODES
        .iter()
        .find(|pair| pair.0 == node_type)
        .map_or(node_type, |pair| pair.1)
}
151
152fn encode_node_type_token(node_type: &str) -> String {
153 let code = node_type_to_code(node_type);
154 if code != node_type {
155 return code.to_owned();
156 }
157 if code_to_node_type(node_type) != node_type {
158 return format!("={node_type}");
159 }
160 node_type.to_owned()
161}
162
/// Maps a one-letter node type code to its full type name.
///
/// Unknown codes are returned unchanged.
fn code_to_node_type(code: &str) -> &str {
    const NAMES: &[(&str, &str)] = &[
        ("F", "Feature"),
        ("K", "Concept"),
        ("I", "Interface"),
        ("P", "Process"),
        ("D", "DataStore"),
        ("A", "Attribute"),
        ("Y", "Entity"),
        ("N", "Note"),
        ("R", "Rule"),
        ("C", "Convention"),
        ("B", "Bug"),
        ("Z", "Decision"),
        ("O", "OpenQuestion"),
        ("Q", "Claim"),
        ("W", "Insight"),
        ("M", "Reference"),
        ("T", "Term"),
        ("S", "Status"),
        ("L", "Doubt"),
    ];
    NAMES
        .iter()
        .find(|pair| pair.0 == code)
        .map_or(code, |pair| pair.1)
}
187
188fn decode_node_type_token(token: &str) -> String {
189 token
190 .strip_prefix('=')
191 .map(str::to_owned)
192 .unwrap_or_else(|| code_to_node_type(token).to_owned())
193}
194
/// Maps a relation name to its one-letter code.
///
/// Several relations accept two spellings that share one code (for example
/// `DOCUMENTED_IN` and `DOCUMENTS` both map to `D`). Unknown relations are
/// returned unchanged.
fn relation_to_code(relation: &str) -> &str {
    const CODES: &[(&[&str], &str)] = &[
        (&["DOCUMENTED_IN", "DOCUMENTS"], "D"),
        (&["HAS"], "H"),
        (&["TRIGGERS"], "T"),
        (&["AFFECTED_BY", "AFFECTS"], "A"),
        (&["READS_FROM", "READS"], "R"),
        (&["GOVERNED_BY", "GOVERNS"], "G"),
        (&["DEPENDS_ON"], "O"),
        (&["AVAILABLE_IN"], "I"),
        (&["SUPPORTS"], "S"),
        (&["SUMMARIZES"], "U"),
        (&["RELATED_TO"], "L"),
        (&["CONTRADICTS"], "V"),
        (&["CREATED_BY", "CREATES"], "C"),
    ];
    CODES
        .iter()
        .find(|entry| entry.0.iter().any(|name| *name == relation))
        .map_or(relation, |entry| entry.1)
}
213
/// Maps a one-letter relation code to its canonical relation name.
///
/// Unknown codes are returned unchanged.
fn code_to_relation(code: &str) -> &str {
    const NAMES: &[(&str, &str)] = &[
        ("D", "DOCUMENTED_IN"),
        ("H", "HAS"),
        ("T", "TRIGGERS"),
        ("A", "AFFECTED_BY"),
        ("R", "READS_FROM"),
        ("G", "GOVERNED_BY"),
        ("O", "DEPENDS_ON"),
        ("I", "AVAILABLE_IN"),
        ("S", "SUPPORTS"),
        ("U", "SUMMARIZES"),
        ("L", "RELATED_TO"),
        ("V", "CONTRADICTS"),
        ("C", "CREATED_BY"),
    ];
    NAMES
        .iter()
        .find(|pair| pair.0 == code)
        .map_or(code, |pair| pair.1)
}
232
/// Orders the two endpoint ids of a bidirectional edge so the smaller id
/// (by string comparison) comes first.
fn canonicalize_bidirectional_pair(a: &str, b: &str) -> (String, String) {
    let (lower, upper) = if a <= b { (a, b) } else { (b, a) };
    (lower.to_owned(), upper.to_owned())
}
240
/// Returns `true` when `value` is an uppercase `C` followed by one or more
/// ASCII digits (e.g. `C1`, `C12`).
fn is_score_component_label(value: &str) -> bool {
    value
        .strip_prefix('C')
        .is_some_and(|digits| !digits.is_empty() && digits.chars().all(|ch| ch.is_ascii_digit()))
}
247
/// Returns a copy of `values` sorted by ASCII-lowercased text, with the
/// original (case-sensitive) text as tie-breaker.
fn sort_case_insensitive(values: &[String]) -> Vec<String> {
    let mut sorted = values.to_vec();
    // Tuple keys compare the folded text first and the exact text second.
    sorted.sort_by_cached_key(|value| (value.to_ascii_lowercase(), value.clone()));
    sorted
}
257
/// Reverses `escape_kg_text`: turns `\n`, `\r` and `\\` escape sequences back
/// into newline, carriage return and a single backslash.
///
/// Unknown escapes (`\x`) and a trailing lone backslash are kept verbatim.
fn decode_kg_text(value: &str) -> String {
    let mut decoded = String::new();
    let mut rest = value.chars();
    loop {
        match rest.next() {
            None => break,
            Some('\\') => match rest.next() {
                Some('n') => decoded.push('\n'),
                Some('r') => decoded.push('\r'),
                // `\\` collapses to one backslash; a dangling `\` at the end stays as is.
                Some('\\') | None => decoded.push('\\'),
                Some(unknown) => {
                    decoded.push('\\');
                    decoded.push(unknown);
                }
            },
            Some(plain) => decoded.push(plain),
        }
    }
    decoded
}
279
/// Escapes text so it fits on a single line: backslash becomes `\\`, newline
/// becomes `\n` and carriage return becomes `\r`. Everything else is copied.
fn escape_kg_text(value: &str) -> String {
    value.chars().fold(String::new(), |mut escaped, ch| {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            other => escaped.push(other),
        }
        escaped
    })
}
292
/// Decodes the value part of a text field line by undoing the `\n`, `\r` and
/// `\\` escapes via `decode_kg_text`.
fn parse_text_field(value: &str) -> String {
    decode_kg_text(value)
}
296
297fn push_text_line(out: &mut String, key: &str, value: &str) {
298 out.push_str(key);
299 out.push(' ');
300 out.push_str(&escape_kg_text(value));
301 out.push('\n');
302}
303
/// A repeated substring selected for replacement by a dictionary token.
#[derive(Debug, Clone)]
struct KgCompressionCandidate {
    /// Dictionary token number; assigned as 1-based position after the final
    /// sort in `discover_kg_compression_candidates` (0 until then).
    token: usize,
    /// The repeated text this token stands for.
    value: String,
    /// Line index (within the raw text) of the earliest occurrence.
    first_line: usize,
    /// Column, in chars, of the earliest occurrence on `first_line`.
    first_col: usize,
}
311
/// Size figures reported by `compress_kg_text`.
#[derive(Debug, Default, Clone, Copy)]
struct KgCompressionStats {
    /// Byte length of the input text.
    original_bytes: usize,
    /// Byte length of the tokenized text, recorded even when the compressor
    /// falls back to returning the input unchanged.
    compressed_bytes: usize,
    /// Number of dictionary candidates that were discovered.
    dictionary_entries: usize,
}
318
/// Position of one occurrence of a seed substring.
#[derive(Debug, Clone)]
struct LineOccurrence {
    /// Line index as stored in the `(usize, String)` source-line pairs.
    line_idx: usize,
    /// Start offset of the occurrence within the line, counted in chars.
    col_idx: usize,
}
324
/// Parses a dictionary definition line of the form `` `<digits> <value> ``.
///
/// Returns `(token, value)` when the line starts with a backtick followed by
/// a non-empty run of ASCII digits and a space; everything after that first
/// space is the value (possibly empty). Any other line yields `None`.
fn decode_kg_token_reference_line(line: &str) -> Option<(String, String)> {
    let (token, value) = line.strip_prefix('`')?.split_once(' ')?;
    let is_numeric = !token.is_empty() && token.bytes().all(|byte| byte.is_ascii_digit());
    is_numeric.then(|| (token.to_owned(), value.to_owned()))
}
333
/// Replaces every `` `<digits>` `` reference in `line` with its dictionary
/// value.
///
/// References whose token is not in `dictionary` are copied unchanged. A
/// backtick that does not start a well-formed reference is copied together
/// with any digits after it and the single character that ended the digit run;
/// that character is not re-examined as the start of another reference.
fn expand_kg_tokens_in_line(line: &str, dictionary: &std::collections::HashMap<String, String>) -> String {
    let mut expanded = String::new();
    let mut rest = line;

    while let Some(tick) = rest.find('`') {
        expanded.push_str(&rest[..tick]);

        // Backtick and digits are ASCII, so byte offsets here are char boundaries.
        let after_tick = &rest[tick + 1..];
        let digit_count = after_tick
            .bytes()
            .take_while(|byte| byte.is_ascii_digit())
            .count();
        let (token, tail) = after_tick.split_at(digit_count);

        if !token.is_empty() && tail.starts_with('`') {
            match dictionary.get(token) {
                Some(value) => expanded.push_str(value),
                None => {
                    expanded.push('`');
                    expanded.push_str(token);
                    expanded.push('`');
                }
            }
            rest = &tail[1..];
            continue;
        }

        // Not a reference: emit what was scanned plus the char that stopped the scan.
        expanded.push('`');
        expanded.push_str(token);
        let mut tail_chars = tail.chars();
        if let Some(stopper) = tail_chars.next() {
            expanded.push(stopper);
        }
        rest = tail_chars.as_str();
    }

    expanded.push_str(rest);
    expanded
}
378
379fn expand_kg_tokens(raw: &str) -> String {
380 let mut dictionary = std::collections::HashMap::new();
381 let mut out = String::new();
382
383 for line in raw.lines() {
384 if let Some((token, value)) = decode_kg_token_reference_line(line) {
385 dictionary.insert(token, value);
386 continue;
387 }
388 out.push_str(&expand_kg_tokens_in_line(line, &dictionary));
389 out.push('\n');
390 }
391
392 out
393}
394
/// Extracts the type token from a node header line `@ <type>:<id>`.
///
/// Returns the trimmed text between `"@ "` and the first `:`; `None` when the
/// line does not start with `"@ "` or has no colon.
fn node_header_type_token(line: &str) -> Option<&str> {
    line.strip_prefix("@ ")
        .and_then(|header| header.split_once(':'))
        .map(|(type_token, _id)| type_token.trim())
}
400
401fn is_generated_node_block_header(line: &str) -> bool {
402 node_header_type_token(line)
403 .is_some_and(|token| token.starts_with('G'))
404}
405
406fn collect_generated_text_lines(raw: &str) -> Vec<(usize, String)> {
407 let mut lines = Vec::new();
408 let mut in_block = false;
409 let mut generated_block = false;
410
411 for (idx, line) in raw.lines().enumerate() {
412 let trimmed = line.trim();
413 if trimmed.is_empty() {
414 in_block = false;
415 generated_block = false;
416 continue;
417 }
418
419 if trimmed.starts_with("@ ") {
420 in_block = true;
421 generated_block = is_generated_node_block_header(trimmed);
422 continue;
423 }
424
425 if in_block && generated_block {
426 lines.push((idx, line.to_owned()));
427 }
428 }
429
430 lines
431}
432
433fn extend_repeated_seed(
434 seed: &str,
435 occurrences: &[LineOccurrence],
436 source_lines: &[(usize, String)],
437) -> Option<String> {
438 let seed_chars: Vec<char> = seed.chars().collect();
439 let mut candidate = seed_chars.clone();
440
441 loop {
442 let mut next_char: Option<char> = None;
443
444 for occurrence in occurrences {
445 let (_, line) = source_lines
446 .iter()
447 .find(|(line_idx, _)| *line_idx == occurrence.line_idx)?;
448 let chars: Vec<char> = line.chars().collect();
449 let next_index = occurrence.col_idx + candidate.len();
450 let Some(&ch) = chars.get(next_index) else {
451 return Some(candidate.into_iter().collect());
452 };
453 if ch == '`' {
454 return Some(candidate.into_iter().collect());
455 }
456 match next_char {
457 Some(prev) if prev != ch => return Some(candidate.into_iter().collect()),
458 None => next_char = Some(ch),
459 _ => {}
460 }
461 }
462
463 let Some(ch) = next_char else {
464 return Some(candidate.into_iter().collect());
465 };
466 candidate.push(ch);
467 if candidate.len() > seed_chars.len() + 256 {
468 return Some(candidate.into_iter().collect());
469 }
470 }
471}
472
473fn discover_kg_compression_candidates(
474 source_lines: &[(usize, String)],
475 min_len: usize,
476) -> Vec<KgCompressionCandidate> {
477 let mut seeds: std::collections::HashMap<String, Vec<LineOccurrence>> =
478 std::collections::HashMap::new();
479
480 for (line_idx, line) in source_lines {
481 let chars: Vec<char> = line.chars().collect();
482 if chars.len() < min_len {
483 continue;
484 }
485
486 for start in 0..=chars.len() - min_len {
487 if chars[start..start + min_len].iter().any(|ch| *ch == '`') {
488 continue;
489 }
490 let seed: String = chars[start..start + min_len].iter().collect();
491 seeds.entry(seed).or_default().push(LineOccurrence {
492 line_idx: *line_idx,
493 col_idx: start,
494 });
495 }
496 }
497
498 let mut discovered: std::collections::HashMap<String, KgCompressionCandidate> =
499 std::collections::HashMap::new();
500
501 for (seed, occurrences) in seeds {
502 if occurrences.len() < 2 {
503 continue;
504 }
505
506 let Some(value) = extend_repeated_seed(&seed, &occurrences, source_lines) else {
507 continue;
508 };
509 if value.chars().count() < min_len || value.contains('`') {
510 continue;
511 }
512
513 let first = occurrences
514 .iter()
515 .min_by_key(|occ| (occ.line_idx, occ.col_idx))
516 .expect("at least one occurrence");
517
518 discovered
519 .entry(value.clone())
520 .and_modify(|candidate| {
521 let first_pos = (first.line_idx, first.col_idx);
522 let current_pos = (candidate.first_line, candidate.first_col);
523 if first_pos < current_pos {
524 candidate.first_line = first.line_idx;
525 candidate.first_col = first.col_idx;
526 }
527 })
528 .or_insert(KgCompressionCandidate {
529 token: 0,
530 value,
531 first_line: first.line_idx,
532 first_col: first.col_idx,
533 });
534 }
535
536 let mut candidates: Vec<KgCompressionCandidate> = discovered.into_values().collect();
537 candidates.sort_by(|a, b| {
538 b.value
539 .chars()
540 .count()
541 .cmp(&a.value.chars().count())
542 .then_with(|| a.first_line.cmp(&b.first_line))
543 .then_with(|| a.first_col.cmp(&b.first_col))
544 .then_with(|| a.value.cmp(&b.value))
545 });
546
547 let mut filtered: Vec<KgCompressionCandidate> = Vec::new();
548 'candidate: for candidate in candidates {
549 for kept in &filtered {
550 if kept.value.contains(&candidate.value) {
551 continue 'candidate;
552 }
553 }
554 filtered.push(candidate);
555 }
556
557 filtered.sort_by(|a, b| {
558 a.first_line
559 .cmp(&b.first_line)
560 .then_with(|| b.value.chars().count().cmp(&a.value.chars().count()))
561 .then_with(|| a.first_col.cmp(&b.first_col))
562 .then_with(|| a.value.cmp(&b.value))
563 });
564
565 for (idx, candidate) in filtered.iter_mut().enumerate() {
566 candidate.token = idx + 1;
567 }
568
569 filtered
570}
571
572fn replace_kg_text_with_tokens(line: &str, candidates: &[KgCompressionCandidate]) -> String {
573 let chars: Vec<char> = line.chars().collect();
574 let mut out = String::new();
575 let mut idx = 0;
576
577 while idx < chars.len() {
578 let mut best: Option<&KgCompressionCandidate> = None;
579
580 for candidate in candidates {
581 let candidate_chars: Vec<char> = candidate.value.chars().collect();
582 if idx + candidate_chars.len() > chars.len() {
583 continue;
584 }
585 if chars[idx..idx + candidate_chars.len()] != candidate_chars[..] {
586 continue;
587 }
588 match best {
589 Some(current)
590 if current.value.chars().count() >= candidate_chars.len() => {}
591 _ => best = Some(candidate),
592 }
593 }
594
595 if let Some(candidate) = best {
596 out.push('`');
597 out.push_str(&candidate.token.to_string());
598 out.push('`');
599 idx += candidate.value.chars().count();
600 continue;
601 }
602
603 out.push(chars[idx]);
604 idx += 1;
605 }
606
607 out
608}
609
610fn compress_kg_text(raw: &str, min_len: usize) -> (String, KgCompressionStats) {
611 let source_lines = collect_generated_text_lines(raw);
612 let candidates = discover_kg_compression_candidates(&source_lines, min_len);
613
614 let mut defs_by_line: std::collections::HashMap<usize, Vec<&KgCompressionCandidate>> =
615 std::collections::HashMap::new();
616 for candidate in &candidates {
617 defs_by_line.entry(candidate.first_line).or_default().push(candidate);
618 }
619 for defs in defs_by_line.values_mut() {
620 defs.sort_by(|a, b| {
621 b.value
622 .chars()
623 .count()
624 .cmp(&a.value.chars().count())
625 .then_with(|| a.token.cmp(&b.token))
626 });
627 }
628
629 let compressed_source_lines: std::collections::HashSet<usize> =
630 source_lines.iter().map(|(idx, _)| *idx).collect();
631 let mut compressed = String::new();
632
633 for (idx, line) in raw.lines().enumerate() {
634 if let Some(defs) = defs_by_line.get(&idx) {
635 for def in defs {
636 compressed.push('`');
637 compressed.push_str(&def.token.to_string());
638 compressed.push(' ');
639 compressed.push_str(&def.value);
640 compressed.push('\n');
641 }
642 }
643
644 let rendered = if compressed_source_lines.contains(&idx) {
645 replace_kg_text_with_tokens(line, &candidates)
646 } else {
647 line.to_owned()
648 };
649 compressed.push_str(&rendered);
650 compressed.push('\n');
651 }
652
653 let original_bytes = raw.len();
654 let compressed_bytes = compressed.len();
655 let dictionary_entries = candidates.len();
656
657 (
658 if compressed_bytes < original_bytes {
659 compressed
660 } else {
661 raw.to_owned()
662 },
663 KgCompressionStats {
664 original_bytes,
665 compressed_bytes,
666 dictionary_entries,
667 },
668 )
669}
670
/// Removes entries that equal an earlier entry when compared ASCII
/// case-insensitively. The first spelling wins and order is preserved.
fn dedupe_case_insensitive(values: Vec<String>) -> Vec<String> {
    let mut seen_keys = std::collections::HashSet::new();
    values
        .into_iter()
        // `insert` returns false for a key that is already present.
        .filter(|value| seen_keys.insert(value.to_ascii_lowercase()))
        .collect()
}
682
/// Checks that `value` has the exact shape `YYYY-MM-DDTHH:MM:SSZ` with
/// plausible field ranges.
///
/// Accepted ranges: month 1-12, day 1-31, hour 0-23, minute 0-59, second
/// 0-59. The day is not checked against the month, so e.g. February 31 passes.
fn parse_utc_timestamp(value: &str) -> bool {
    // `0` marks a digit position; every other byte must match literally.
    const SHAPE: &[u8; 20] = b"0000-00-00T00:00:00Z";

    let bytes = value.as_bytes();
    if bytes.len() != SHAPE.len() {
        return false;
    }
    let shape_matches = bytes.iter().zip(SHAPE).all(|(&actual, &expected)| {
        if expected == b'0' {
            actual.is_ascii_digit()
        } else {
            actual == expected
        }
    });
    if !shape_matches {
        return false;
    }

    // Both positions are known ASCII digits at this point.
    let two_digits =
        |at: usize| u32::from(bytes[at] - b'0') * 10 + u32::from(bytes[at + 1] - b'0');
    let month = two_digits(5);
    let day = two_digits(8);
    let hour = two_digits(11);
    let minute = two_digits(14);
    let second = two_digits(17);

    (1..=12).contains(&month)
        && (1..=31).contains(&day)
        && hour <= 23
        && minute <= 59
        && second <= 59
}
724
/// Interprets a loosely written boolean.
///
/// After trimming and ignoring ASCII case, `1`/`true`/`yes`/`on` give
/// `Some(true)`, `0`/`false`/`no`/`off` give `Some(false)`, and anything else
/// gives `None`.
fn parse_boolish(value: &str) -> Option<bool> {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    const FALSY: [&str; 4] = ["0", "false", "no", "off"];

    let word = value.trim();
    let is_one_of = |words: &[&str]| words.iter().any(|known| word.eq_ignore_ascii_case(known));

    if is_one_of(&TRUTHY) {
        Some(true)
    } else if is_one_of(&FALSY) {
        Some(false)
    } else {
        None
    }
}
732
/// Reports whether strict format checking is enabled through the
/// `KG_STRICT_FORMAT` environment variable.
///
/// Strict mode is on only when the variable is set to `1`, `true`, `yes` or
/// `on` (trimmed, ASCII case-insensitive). An unset or unreadable variable and
/// any other value mean off.
fn strict_kg_mode() -> bool {
    match std::env::var("KG_STRICT_FORMAT") {
        Ok(raw) => {
            let flag = raw.trim();
            ["1", "true", "yes", "on"]
                .iter()
                .any(|truthy| flag.eq_ignore_ascii_case(truthy))
        }
        Err(_) => false,
    }
}
742
/// Trims `line` and shortens it to at most 160 characters, appending `...`
/// when anything was cut off.
fn abbreviated_line(line: &str) -> String {
    const MAX_CHARS: usize = 160;
    let mut chars = line.trim().chars();
    let mut out: String = chars.by_ref().take(MAX_CHARS).collect();
    // Anything still left in the iterator is the part that did not fit.
    if chars.next().is_some() {
        out.push_str("...");
    }
    out
}
756
757fn line_fragment(line: &str) -> String {
758 let snippet = abbreviated_line(line);
759 if snippet.is_empty() {
760 "fragment: <empty line>".to_owned()
761 } else {
762 format!("fragment: {snippet}")
763 }
764}
765
766fn json_error_detail(label: &str, path: &Path, raw: &str, error: &serde_json::Error) -> String {
767 let line_no = error.line();
768 let column = error.column();
769 let fragment = raw
770 .lines()
771 .nth(line_no.saturating_sub(1))
772 .map(line_fragment)
773 .unwrap_or_else(|| "fragment: <unavailable>".to_owned());
774 format!(
775 "{label}: {} at line {line_no}, column {column}: {error}\n{fragment}",
776 path.display()
777 )
778}
779
780fn validate_len(
781 line_no: usize,
782 field: &str,
783 value: &str,
784 raw_line: &str,
785 min: usize,
786 max: usize,
787 strict: bool,
788) -> Result<()> {
789 let len = value.chars().count();
790 if strict && (len < min || len > max) {
791 return Err(anyhow::anyhow!(
792 "invalid {field} length at line {line_no}: expected {min}..={max}, got {len}\n{}",
793 line_fragment(raw_line)
794 ));
795 }
796 Ok(())
797}
798
799fn enforce_field_order(
800 line_no: usize,
801 key: &str,
802 rank: u8,
803 last_rank: &mut u8,
804 section: &str,
805 raw_line: &str,
806 strict: bool,
807) -> Result<()> {
808 if strict && rank < *last_rank {
809 return Err(anyhow::anyhow!(
810 "invalid field order at line {line_no}: {key} in {section} block\n{}",
811 line_fragment(raw_line)
812 ));
813 }
814 if rank > *last_rank {
815 *last_rank = rank;
816 }
817 Ok(())
818}
819
/// Returns the value of a `<key> <value>` line.
///
/// - `line == key` gives `Some("")` (field present, no value).
/// - `line` starting with `key` followed by one space gives everything after
///   that space.
/// - Anything else gives `None`.
fn field_value<'a>(line: &'a str, key: &str) -> Option<&'a str> {
    let after_key = line.strip_prefix(key)?;
    if after_key.is_empty() {
        Some(after_key)
    } else {
        after_key.strip_prefix(' ')
    }
}
828
829fn fail_or_warn(strict: bool, warnings: &mut Vec<String>, message: String) -> Result<()> {
830 if strict {
831 Err(anyhow::anyhow!(message))
832 } else {
833 warnings.push(message);
834 Ok(())
835 }
836}
837
/// Test-only convenience wrapper around `parse_kg_with_warnings` that
/// discards the collected warnings and returns just the graph.
#[cfg(test)]
fn parse_kg(raw: &str, graph_name: &str, strict: bool) -> Result<GraphFile> {
    let (graph, _warnings) = parse_kg_with_warnings(raw, graph_name, strict)?;
    Ok(graph)
}
842
843fn parse_kg_with_warnings(
844 raw: &str,
845 graph_name: &str,
846 strict: bool,
847) -> Result<(GraphFile, Vec<String>)> {
848 let mut graph = GraphFile::new(graph_name);
849 let mut warnings = Vec::new();
850 let mut current_node: Option<Node> = None;
851 let mut current_note: Option<Note> = None;
852 let mut current_edge_index: Option<usize> = None;
853 let mut last_node_rank: u8 = 0;
854 let mut last_note_rank: u8 = 0;
855 let mut last_edge_rank: u8 = 0;
856
857 for (idx, line) in raw.lines().enumerate() {
858 let line_no = idx + 1;
859 let raw_line = line.strip_suffix('\r').unwrap_or(line);
860 let trimmed = raw_line.trim();
861 if trimmed.is_empty() || trimmed.starts_with('#') {
862 continue;
863 }
864
865 if let Some(rest) = trimmed.strip_prefix("@ ") {
866 if let Some(note) = current_note.take() {
867 graph.notes.push(note);
868 }
869 if let Some(node) = current_node.take() {
870 graph.nodes.push(node);
871 }
872 let Some((type_code, node_id)) = rest.split_once(':') else {
873 fail_or_warn(
874 strict,
875 &mut warnings,
876 format!("invalid node header at line {line_no}: {trimmed}"),
877 )?;
878 current_edge_index = None;
879 continue;
880 };
881 let decoded_type = decode_node_type_token(type_code.trim());
882 let parsed_id = {
883 let raw_id = node_id.trim();
884 if crate::validate::is_generated_node_type(&decoded_type) {
885 if let Some((head, suffix)) = raw_id.split_once(':') {
886 if head == decoded_type {
887 suffix.to_owned()
888 } else {
889 raw_id.to_owned()
890 }
891 } else {
892 raw_id.to_owned()
893 }
894 } else if type_code.trim().starts_with('=') && raw_id.contains(':') {
895 raw_id.to_owned()
896 } else if raw_id.contains(':') {
897 crate::validate::normalize_node_id(raw_id)
898 } else if code_to_node_type(type_code.trim()) != type_code.trim() {
899 crate::validate::normalize_node_id(&format!("{}:{raw_id}", type_code.trim()))
900 } else {
901 format!("{}:{raw_id}", decoded_type)
902 }
903 };
904 current_node = Some(Node {
905 id: parsed_id,
906 r#type: decoded_type,
907 name: String::new(),
908 properties: NodeProperties::default(),
909 source_files: Vec::new(),
910 });
911 current_edge_index = None;
912 last_node_rank = 0;
913 last_edge_rank = 0;
914 continue;
915 }
916
917 if let Some(rest) = trimmed.strip_prefix("! ") {
918 if let Some(node) = current_node.take() {
919 graph.nodes.push(node);
920 }
921 if let Some(note) = current_note.take() {
922 graph.notes.push(note);
923 }
924 let mut parts = rest.split_whitespace();
925 let Some(id) = parts.next() else {
926 fail_or_warn(
927 strict,
928 &mut warnings,
929 format!("invalid note header at line {line_no}: {trimmed}"),
930 )?;
931 current_edge_index = None;
932 continue;
933 };
934 let Some(node_id) = parts.next() else {
935 fail_or_warn(
936 strict,
937 &mut warnings,
938 format!("invalid note header at line {line_no}: {trimmed}"),
939 )?;
940 current_edge_index = None;
941 continue;
942 };
943 current_note = Some(Note {
944 id: id.to_owned(),
945 node_id: node_id.to_owned(),
946 ..Default::default()
947 });
948 current_edge_index = None;
949 last_note_rank = 0;
950 continue;
951 }
952
953 if let Some(note) = current_note.as_mut() {
954 if let Some(rest) = field_value(raw_line, "b") {
955 enforce_field_order(
956 line_no,
957 "b",
958 1,
959 &mut last_note_rank,
960 "note",
961 raw_line,
962 strict,
963 )?;
964 note.body = parse_text_field(rest);
965 continue;
966 }
967 if let Some(rest) = field_value(raw_line, "t") {
968 enforce_field_order(
969 line_no,
970 "t",
971 2,
972 &mut last_note_rank,
973 "note",
974 raw_line,
975 strict,
976 )?;
977 let value = parse_text_field(rest);
978 if !value.is_empty() {
979 note.tags.push(value);
980 }
981 continue;
982 }
983 if let Some(rest) = field_value(raw_line, "a") {
984 enforce_field_order(
985 line_no,
986 "a",
987 3,
988 &mut last_note_rank,
989 "note",
990 raw_line,
991 strict,
992 )?;
993 note.author = parse_text_field(rest);
994 continue;
995 }
996 if let Some(rest) = field_value(raw_line, "e") {
997 enforce_field_order(
998 line_no,
999 "e",
1000 4,
1001 &mut last_note_rank,
1002 "note",
1003 raw_line,
1004 strict,
1005 )?;
1006 note.created_at = rest.trim().to_owned();
1007 continue;
1008 }
1009 if let Some(rest) = field_value(raw_line, "p") {
1010 enforce_field_order(
1011 line_no,
1012 "p",
1013 5,
1014 &mut last_note_rank,
1015 "note",
1016 raw_line,
1017 strict,
1018 )?;
1019 note.provenance = parse_text_field(rest);
1020 continue;
1021 }
1022 if let Some(rest) = field_value(raw_line, "s") {
1023 enforce_field_order(
1024 line_no,
1025 "s",
1026 6,
1027 &mut last_note_rank,
1028 "note",
1029 raw_line,
1030 strict,
1031 )?;
1032 let value = parse_text_field(rest);
1033 if !value.is_empty() {
1034 note.source_files.push(value);
1035 }
1036 continue;
1037 }
1038 fail_or_warn(
1039 strict,
1040 &mut warnings,
1041 format!("unrecognized note line at {line_no}: {trimmed}"),
1042 )?;
1043 continue;
1044 }
1045
1046 let Some(node) = current_node.as_mut() else {
1047 fail_or_warn(
1048 strict,
1049 &mut warnings,
1050 format!("unexpected line before first node at line {line_no}: {trimmed}"),
1051 )?;
1052 continue;
1053 };
1054
1055 if let Some(rest) = field_value(raw_line, "N") {
1056 enforce_field_order(
1057 line_no,
1058 "N",
1059 1,
1060 &mut last_node_rank,
1061 "node",
1062 raw_line,
1063 strict,
1064 )?;
1065 let value = parse_text_field(rest);
1066 validate_len(line_no, "N", &value, raw_line, 1, 120, strict)?;
1067 node.name = value;
1068 continue;
1069 }
1070 if let Some(rest) = field_value(raw_line, "D") {
1071 enforce_field_order(
1072 line_no,
1073 "D",
1074 2,
1075 &mut last_node_rank,
1076 "node",
1077 raw_line,
1078 strict,
1079 )?;
1080 let value = parse_text_field(rest);
1081 validate_len(line_no, "D", &value, raw_line, 1, 200, strict)?;
1082 node.properties.description = value;
1083 continue;
1084 }
1085 if let Some(rest) = field_value(raw_line, "A") {
1086 enforce_field_order(
1087 line_no,
1088 "A",
1089 3,
1090 &mut last_node_rank,
1091 "node",
1092 raw_line,
1093 strict,
1094 )?;
1095 let value = parse_text_field(rest);
1096 validate_len(line_no, "A", &value, raw_line, 1, 80, strict)?;
1097 node.properties.alias.push(value);
1098 continue;
1099 }
1100 if let Some(rest) = field_value(raw_line, "F") {
1101 enforce_field_order(
1102 line_no,
1103 "F",
1104 4,
1105 &mut last_node_rank,
1106 "node",
1107 raw_line,
1108 strict,
1109 )?;
1110 let value = parse_text_field(rest);
1111 validate_len(line_no, "F", &value, raw_line, 1, 200, strict)?;
1112 node.properties.key_facts.push(value);
1113 continue;
1114 }
1115 if let Some(rest) = field_value(raw_line, "E") {
1116 enforce_field_order(
1117 line_no,
1118 "E",
1119 5,
1120 &mut last_node_rank,
1121 "node",
1122 raw_line,
1123 strict,
1124 )?;
1125 let value = rest.trim();
1126 if !value.is_empty() && !parse_utc_timestamp(value) {
1127 fail_or_warn(
1128 strict,
1129 &mut warnings,
1130 format!(
1131 "invalid E timestamp at line {line_no}: expected YYYY-MM-DDTHH:MM:SSZ\n{}",
1132 line_fragment(raw_line)
1133 ),
1134 )?;
1135 continue;
1136 }
1137 node.properties.created_at = value.to_owned();
1138 continue;
1139 }
1140 if let Some(rest) = field_value(raw_line, "C") {
1141 enforce_field_order(
1142 line_no,
1143 "C",
1144 6,
1145 &mut last_node_rank,
1146 "node",
1147 raw_line,
1148 strict,
1149 )?;
1150 if !rest.trim().is_empty() {
1151 node.properties.confidence = rest.trim().parse::<f64>().ok();
1152 }
1153 continue;
1154 }
1155 if let Some(rest) = field_value(raw_line, "V") {
1156 enforce_field_order(
1157 line_no,
1158 "V",
1159 7,
1160 &mut last_node_rank,
1161 "node",
1162 raw_line,
1163 strict,
1164 )?;
1165 if let Ok(value) = rest.trim().parse::<f64>() {
1166 node.properties.importance = value;
1167 }
1168 continue;
1169 }
1170 if let Some(rest) = field_value(raw_line, "P") {
1171 enforce_field_order(
1172 line_no,
1173 "P",
1174 8,
1175 &mut last_node_rank,
1176 "node",
1177 raw_line,
1178 strict,
1179 )?;
1180 node.properties.provenance = parse_text_field(rest);
1181 continue;
1182 }
1183 if let Some(rest) = field_value(raw_line, "S") {
1184 enforce_field_order(
1185 line_no,
1186 "S",
1187 10,
1188 &mut last_node_rank,
1189 "node",
1190 raw_line,
1191 strict,
1192 )?;
1193 let value = parse_text_field(rest);
1194 validate_len(line_no, "S", &value, raw_line, 1, 200, strict)?;
1195 node.source_files.push(value);
1196 continue;
1197 }
1198
1199 if let Some(rest) = trimmed.strip_prefix("> ") {
1200 let mut parts = rest.split_whitespace();
1201 let Some(relation) = parts.next() else {
1202 fail_or_warn(
1203 strict,
1204 &mut warnings,
1205 format!("missing relation in edge at line {line_no}: {trimmed}"),
1206 )?;
1207 current_edge_index = None;
1208 continue;
1209 };
1210 let Some(target_id) = parts.next() else {
1211 fail_or_warn(
1212 strict,
1213 &mut warnings,
1214 format!("missing target id in edge at line {line_no}: {trimmed}"),
1215 )?;
1216 current_edge_index = None;
1217 continue;
1218 };
1219 graph.edges.push(Edge {
1220 source_id: node.id.clone(),
1221 relation: code_to_relation(relation).to_owned(),
1222 target_id: target_id.to_owned(),
1223 properties: EdgeProperties::default(),
1224 });
1225 current_edge_index = Some(graph.edges.len() - 1);
1226 last_edge_rank = 0;
1227 continue;
1228 }
1229
1230 if let Some(rest) = trimmed.strip_prefix("= ") {
1231 let mut parts = rest.split_whitespace();
1232 let Some(relation) = parts.next() else {
1233 fail_or_warn(
1234 strict,
1235 &mut warnings,
1236 format!("missing relation in bidirectional edge at line {line_no}: {trimmed}"),
1237 )?;
1238 current_edge_index = None;
1239 continue;
1240 };
1241 let Some(target_id) = parts.next() else {
1242 fail_or_warn(
1243 strict,
1244 &mut warnings,
1245 format!("missing target id in bidirectional edge at line {line_no}: {trimmed}"),
1246 )?;
1247 current_edge_index = None;
1248 continue;
1249 };
1250 let relation = code_to_relation(relation).to_owned();
1251 if relation != "~" {
1252 fail_or_warn(
1253 strict,
1254 &mut warnings,
1255 format!(
1256 "invalid bidirectional relation at line {line_no}: expected '~', got '{}'",
1257 relation
1258 ),
1259 )?;
1260 current_edge_index = None;
1261 continue;
1262 }
1263
1264 let target_id = target_id.to_owned();
1265 let (source_id, target_id) = canonicalize_bidirectional_pair(&node.id, &target_id);
1266 graph.edges.push(Edge {
1267 source_id,
1268 relation,
1269 target_id,
1270 properties: EdgeProperties {
1271 bidirectional: true,
1272 ..EdgeProperties::default()
1273 },
1274 });
1275 current_edge_index = Some(graph.edges.len() - 1);
1276 last_edge_rank = 0;
1277 continue;
1278 }
1279
1280 if let Some(rest) = field_value(raw_line, "d") {
1281 enforce_field_order(
1282 line_no,
1283 "d",
1284 1,
1285 &mut last_edge_rank,
1286 "edge",
1287 raw_line,
1288 strict,
1289 )?;
1290 let Some(edge_idx) = current_edge_index else {
1291 fail_or_warn(
1292 strict,
1293 &mut warnings,
1294 format!(
1295 "edge detail without preceding edge at line {line_no}\n{}",
1296 line_fragment(raw_line)
1297 ),
1298 )?;
1299 continue;
1300 };
1301 let trimmed_rest = rest.trim();
1302 let mut parts = trimmed_rest.split_whitespace();
1303 if let (Some(label), Some(raw_score), None) = (parts.next(), parts.next(), parts.next())
1304 {
1305 if is_score_component_label(label) {
1306 let score = raw_score.parse::<f64>().map_err(|_| {
1307 anyhow::anyhow!(
1308 "invalid score component value at line {line_no}: expected number in '{}', got '{}'",
1309 line_fragment(raw_line),
1310 raw_score
1311 )
1312 })?;
1313 graph.edges[edge_idx]
1314 .properties
1315 .score_components
1316 .insert(label.to_owned(), score);
1317 continue;
1318 }
1319 }
1320
1321 let value = parse_text_field(rest);
1322 validate_len(line_no, "d", &value, raw_line, 1, 200, strict)?;
1323 graph.edges[edge_idx].properties.detail = value;
1324 continue;
1325 }
1326
1327 if let Some(rest) = field_value(raw_line, "i") {
1328 enforce_field_order(
1329 line_no,
1330 "i",
1331 2,
1332 &mut last_edge_rank,
1333 "edge",
1334 raw_line,
1335 strict,
1336 )?;
1337 let Some(edge_idx) = current_edge_index else {
1338 fail_or_warn(
1339 strict,
1340 &mut warnings,
1341 format!(
1342 "edge valid_from without preceding edge at line {line_no}\n{}",
1343 line_fragment(raw_line)
1344 ),
1345 )?;
1346 continue;
1347 };
1348 let value = rest.trim();
1349 if !value.is_empty() && !parse_utc_timestamp(value) {
1350 fail_or_warn(
1351 strict,
1352 &mut warnings,
1353 format!(
1354 "invalid i timestamp at line {line_no}: expected YYYY-MM-DDTHH:MM:SSZ\n{}",
1355 line_fragment(raw_line)
1356 ),
1357 )?;
1358 continue;
1359 }
1360 graph.edges[edge_idx].properties.valid_from = value.to_owned();
1361 continue;
1362 }
1363
1364 if let Some(rest) = field_value(raw_line, "x") {
1365 enforce_field_order(
1366 line_no,
1367 "x",
1368 3,
1369 &mut last_edge_rank,
1370 "edge",
1371 raw_line,
1372 strict,
1373 )?;
1374 let Some(edge_idx) = current_edge_index else {
1375 fail_or_warn(
1376 strict,
1377 &mut warnings,
1378 format!(
1379 "edge valid_to without preceding edge at line {line_no}\n{}",
1380 line_fragment(raw_line)
1381 ),
1382 )?;
1383 continue;
1384 };
1385 let value = rest.trim();
1386 if !value.is_empty() && !parse_utc_timestamp(value) {
1387 fail_or_warn(
1388 strict,
1389 &mut warnings,
1390 format!(
1391 "invalid x timestamp at line {line_no}: expected YYYY-MM-DDTHH:MM:SSZ\n{}",
1392 line_fragment(raw_line)
1393 ),
1394 )?;
1395 continue;
1396 }
1397 graph.edges[edge_idx].properties.valid_to = value.to_owned();
1398 continue;
1399 }
1400
1401 if let Some(rest) = field_value(raw_line, "-") {
1402 let (key, value) = rest
1403 .split_once(char::is_whitespace)
1404 .map(|(key, value)| (key.trim(), value))
1405 .unwrap_or((rest.trim(), ""));
1406 let is_edge_custom = matches!(
1407 key,
1408 "edge_feedback_score" | "edge_feedback_count" | "edge_feedback_last_ts_ms"
1409 );
1410 if is_edge_custom {
1411 enforce_field_order(
1412 line_no,
1413 "-",
1414 4,
1415 &mut last_edge_rank,
1416 "edge",
1417 raw_line,
1418 strict,
1419 )?;
1420 } else {
1421 enforce_field_order(
1422 line_no,
1423 "-",
1424 9,
1425 &mut last_node_rank,
1426 "node",
1427 raw_line,
1428 strict,
1429 )?;
1430 }
1431 match key {
1432 "domain_area" => node.properties.domain_area = parse_text_field(value),
1433 "scan" => {
1434 node.properties.scan = parse_boolish(value);
1435 }
1436 "scan_ignore_unknown" => {
1437 node.properties.scan_ignore_unknown = parse_boolish(value);
1438 }
1439 "feedback_score" => {
1440 node.properties.feedback_score = value.trim().parse::<f64>().unwrap_or(0.0)
1441 }
1442 "feedback_count" => {
1443 node.properties.feedback_count = value.trim().parse::<u64>().unwrap_or(0)
1444 }
1445 "feedback_last_ts_ms" => {
1446 node.properties.feedback_last_ts_ms = value.trim().parse::<u64>().ok()
1447 }
1448 "edge_feedback_score" => {
1449 if let Some(edge_idx) = current_edge_index {
1450 graph.edges[edge_idx].properties.feedback_score =
1451 value.trim().parse::<f64>().unwrap_or(0.0);
1452 }
1453 }
1454 "edge_feedback_count" => {
1455 if let Some(edge_idx) = current_edge_index {
1456 graph.edges[edge_idx].properties.feedback_count =
1457 value.trim().parse::<u64>().unwrap_or(0);
1458 }
1459 }
1460 "edge_feedback_last_ts_ms" => {
1461 if let Some(edge_idx) = current_edge_index {
1462 graph.edges[edge_idx].properties.feedback_last_ts_ms =
1463 value.trim().parse::<u64>().ok();
1464 }
1465 }
1466 _ => {}
1467 }
1468 continue;
1469 }
1470
1471 fail_or_warn(
1472 strict,
1473 &mut warnings,
1474 format!("unrecognized line at {line_no}: {trimmed}"),
1475 )?;
1476 }
1477
1478 if let Some(node) = current_node.take() {
1479 graph.nodes.push(node);
1480 }
1481 if let Some(note) = current_note.take() {
1482 graph.notes.push(note);
1483 }
1484
1485 for node in &mut graph.nodes {
1486 node.properties.alias =
1487 sort_case_insensitive(&dedupe_case_insensitive(node.properties.alias.clone()));
1488 node.properties.key_facts =
1489 sort_case_insensitive(&dedupe_case_insensitive(node.properties.key_facts.clone()));
1490 node.source_files =
1491 sort_case_insensitive(&dedupe_case_insensitive(node.source_files.clone()));
1492 }
1493
1494 graph.edges.sort_by(|a, b| {
1495 a.source_id
1496 .cmp(&b.source_id)
1497 .then_with(|| a.relation.cmp(&b.relation))
1498 .then_with(|| a.target_id.cmp(&b.target_id))
1499 .then_with(|| a.properties.bidirectional.cmp(&b.properties.bidirectional))
1500 .then_with(|| a.properties.detail.cmp(&b.properties.detail))
1501 });
1502
1503 for note in &mut graph.notes {
1504 note.tags = sort_case_insensitive(&dedupe_case_insensitive(note.tags.clone()));
1505 note.source_files =
1506 sort_case_insensitive(&dedupe_case_insensitive(note.source_files.clone()));
1507 }
1508 graph.notes.sort_by(|a, b| {
1509 a.id.cmp(&b.id)
1510 .then_with(|| a.node_id.cmp(&b.node_id))
1511 .then_with(|| a.created_at.cmp(&b.created_at))
1512 });
1513
1514 graph.refresh_counts();
1515 Ok((graph, warnings))
1516}
1517
1518fn serialize_kg(graph: &GraphFile) -> String {
1519 let mut out = String::new();
1520 let mut nodes = graph.nodes.clone();
1521 nodes.sort_by(|a, b| a.id.cmp(&b.id));
1522
1523 for node in nodes {
1524 let generated = crate::validate::is_generated_node_type(&node.r#type);
1525 out.push_str(&format!(
1526 "@ {}:{}\n",
1527 encode_node_type_token(&node.r#type),
1528 display_node_id(&node.id, &node.r#type)
1529 ));
1530 if !node.name.is_empty() {
1531 push_text_line(&mut out, "N", &node.name);
1532 }
1533 if !node.properties.description.is_empty() {
1534 push_text_line(&mut out, "D", &node.properties.description);
1535 }
1536
1537 for alias in sort_case_insensitive(&node.properties.alias) {
1538 push_text_line(&mut out, "A", &alias);
1539 }
1540 for fact in sort_case_insensitive(&node.properties.key_facts) {
1541 push_text_line(&mut out, "F", &fact);
1542 }
1543
1544 if !generated {
1545 if !node.properties.created_at.is_empty() {
1546 out.push_str(&format!("E {}\n", node.properties.created_at));
1547 }
1548 if let Some(confidence) = node.properties.confidence {
1549 out.push_str(&format!("C {}\n", confidence));
1550 }
1551 out.push_str(&format!("V {}\n", node.properties.importance));
1552 if !node.properties.provenance.is_empty() {
1553 push_text_line(&mut out, "P", &node.properties.provenance);
1554 }
1555 if !node.properties.domain_area.is_empty() {
1556 out.push_str("- domain_area ");
1557 out.push_str(&escape_kg_text(&node.properties.domain_area));
1558 out.push('\n');
1559 }
1560 if let Some(scan) = node.properties.scan {
1561 out.push_str(&format!("- scan {}\n", scan));
1562 }
1563 if let Some(scan_ignore_unknown) = node.properties.scan_ignore_unknown {
1564 out.push_str(&format!("- scan_ignore_unknown {}\n", scan_ignore_unknown));
1565 }
1566 if node.properties.feedback_score != 0.0 {
1567 out.push_str(&format!(
1568 "- feedback_score {}\n",
1569 node.properties.feedback_score
1570 ));
1571 }
1572 if node.properties.feedback_count != 0 {
1573 out.push_str(&format!(
1574 "- feedback_count {}\n",
1575 node.properties.feedback_count
1576 ));
1577 }
1578 if let Some(ts) = node.properties.feedback_last_ts_ms {
1579 out.push_str(&format!("- feedback_last_ts_ms {}\n", ts));
1580 }
1581
1582 for source in sort_case_insensitive(&node.source_files) {
1583 push_text_line(&mut out, "S", &source);
1584 }
1585 }
1586
1587 let mut edges: Vec<Edge> = graph
1588 .edges
1589 .iter()
1590 .filter(|edge| edge.source_id == node.id)
1591 .cloned()
1592 .collect();
1593 edges.sort_by(|a, b| {
1594 a.relation
1595 .cmp(&b.relation)
1596 .then_with(|| a.target_id.cmp(&b.target_id))
1597 .then_with(|| a.properties.bidirectional.cmp(&b.properties.bidirectional))
1598 .then_with(|| a.properties.detail.cmp(&b.properties.detail))
1599 });
1600
1601 for edge in edges {
1602 let op = if edge.properties.bidirectional && edge.relation == "~" {
1603 "="
1604 } else {
1605 ">"
1606 };
1607 out.push_str(&format!(
1608 "{} {} {}\n",
1609 op,
1610 relation_to_code(&edge.relation),
1611 canonical_node_id_for_storage(&edge.target_id)
1612 ));
1613 for (label, score) in &edge.properties.score_components {
1614 out.push_str(&format!("d {} {:.6}\n", label, score));
1615 }
1616 if !edge.properties.detail.is_empty() {
1617 push_text_line(&mut out, "d", &edge.properties.detail);
1618 }
1619 if !edge.properties.valid_from.is_empty() {
1620 out.push_str(&format!("i {}\n", edge.properties.valid_from));
1621 }
1622 if !edge.properties.valid_to.is_empty() {
1623 out.push_str(&format!("x {}\n", edge.properties.valid_to));
1624 }
1625 if edge.properties.feedback_score != 0.0 {
1626 out.push_str(&format!(
1627 "- edge_feedback_score {}\n",
1628 edge.properties.feedback_score
1629 ));
1630 }
1631 if edge.properties.feedback_count != 0 {
1632 out.push_str(&format!(
1633 "- edge_feedback_count {}\n",
1634 edge.properties.feedback_count
1635 ));
1636 }
1637 if let Some(ts) = edge.properties.feedback_last_ts_ms {
1638 out.push_str(&format!("- edge_feedback_last_ts_ms {}\n", ts));
1639 }
1640 }
1641
1642 out.push('\n');
1643 }
1644
1645 let mut notes = graph.notes.clone();
1646 notes.sort_by(|a, b| {
1647 a.id.cmp(&b.id)
1648 .then_with(|| a.node_id.cmp(&b.node_id))
1649 .then_with(|| a.created_at.cmp(&b.created_at))
1650 });
1651 for note in notes {
1652 out.push_str(&format!(
1653 "! {} {}\n",
1654 note.id,
1655 canonical_node_id_for_storage(¬e.node_id)
1656 ));
1657 push_text_line(&mut out, "b", ¬e.body);
1658 for tag in sort_case_insensitive(¬e.tags) {
1659 push_text_line(&mut out, "t", &tag);
1660 }
1661 if !note.author.is_empty() {
1662 push_text_line(&mut out, "a", ¬e.author);
1663 }
1664 if !note.created_at.is_empty() {
1665 out.push_str(&format!("e {}\n", note.created_at));
1666 }
1667 if !note.provenance.is_empty() {
1668 push_text_line(&mut out, "p", ¬e.provenance);
1669 }
1670 for source in sort_case_insensitive(¬e.source_files) {
1671 push_text_line(&mut out, "s", &source);
1672 }
1673 out.push('\n');
1674 }
1675
1676 out
1677}
1678
1679#[derive(Debug, Clone, Serialize, Deserialize)]
1680pub struct GraphFile {
1681 pub metadata: Metadata,
1682 #[serde(default)]
1683 pub nodes: Vec<Node>,
1684 #[serde(default)]
1685 pub edges: Vec<Edge>,
1686 #[serde(default)]
1687 pub notes: Vec<Note>,
1688}
1689
1690#[derive(Debug, Clone, Serialize, Deserialize)]
1691pub struct Metadata {
1692 pub name: String,
1693 #[serde(default = "default_graph_schema_version")]
1694 pub schema_version: u32,
1695 pub version: String,
1696 pub description: String,
1697 pub node_count: usize,
1698 pub edge_count: usize,
1699}
1700
1701#[derive(Debug, Clone, Serialize, Deserialize)]
1702pub struct Node {
1703 pub id: String,
1704 #[serde(rename = "type")]
1705 pub r#type: String,
1706 pub name: String,
1707 #[serde(default)]
1708 pub properties: NodeProperties,
1709 #[serde(default)]
1710 pub source_files: Vec<String>,
1711}
1712
1713#[derive(Debug, Clone, Serialize, Deserialize)]
1714pub struct NodeProperties {
1715 #[serde(default)]
1716 pub description: String,
1717 #[serde(default)]
1718 pub domain_area: String,
1719 #[serde(default)]
1720 pub provenance: String,
1721 #[serde(default)]
1722 pub confidence: Option<f64>,
1723 #[serde(default)]
1724 pub created_at: String,
1725 #[serde(default = "default_importance")]
1726 pub importance: f64,
1727 #[serde(default)]
1728 pub key_facts: Vec<String>,
1729 #[serde(default)]
1730 pub alias: Vec<String>,
1731 #[serde(default)]
1732 pub valid_from: String,
1733 #[serde(default)]
1734 pub valid_to: String,
1735 #[serde(default)]
1736 pub scan: Option<bool>,
1737 #[serde(default)]
1738 pub scan_ignore_unknown: Option<bool>,
1739 #[serde(default)]
1740 pub feedback_score: f64,
1741 #[serde(default)]
1742 pub feedback_count: u64,
1743 #[serde(default)]
1744 pub feedback_last_ts_ms: Option<u64>,
1745}
1746
/// Importance assigned to a node when none is given.
///
/// Used as the serde default for `NodeProperties::importance` and by
/// `NodeProperties::default()`.
fn default_importance() -> f64 {
    const DEFAULT_IMPORTANCE: f64 = 0.5;
    DEFAULT_IMPORTANCE
}
1750
/// Schema version assumed when the stored metadata does not carry one.
///
/// Serves as the serde default for `Metadata::schema_version` and as the
/// starting version in `GraphFile::new`.
fn default_graph_schema_version() -> u32 {
    const FALLBACK_SCHEMA_VERSION: u32 = 1;
    FALLBACK_SCHEMA_VERSION
}
1754
1755impl Default for NodeProperties {
1756 fn default() -> Self {
1757 Self {
1758 description: String::new(),
1759 domain_area: String::new(),
1760 provenance: String::new(),
1761 confidence: None,
1762 created_at: String::new(),
1763 importance: default_importance(),
1764 key_facts: Vec::new(),
1765 alias: Vec::new(),
1766 valid_from: String::new(),
1767 valid_to: String::new(),
1768 scan: None,
1769 scan_ignore_unknown: None,
1770 feedback_score: 0.0,
1771 feedback_count: 0,
1772 feedback_last_ts_ms: None,
1773 }
1774 }
1775}
1776
1777#[derive(Debug, Clone, Serialize, Deserialize)]
1778pub struct Edge {
1779 pub source_id: String,
1780 pub relation: String,
1781 pub target_id: String,
1782 #[serde(default)]
1783 pub properties: EdgeProperties,
1784}
1785
1786#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1787pub struct EdgeProperties {
1788 #[serde(default)]
1789 pub detail: String,
1790 #[serde(default)]
1791 pub valid_from: String,
1792 #[serde(default)]
1793 pub valid_to: String,
1794 #[serde(default)]
1795 pub feedback_score: f64,
1796 #[serde(default)]
1797 pub feedback_count: u64,
1798 #[serde(default)]
1799 pub feedback_last_ts_ms: Option<u64>,
1800 #[serde(default)]
1801 pub bidirectional: bool,
1802 #[serde(default)]
1803 pub score_components: BTreeMap<String, f64>,
1804}
1805
1806#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1807pub struct Note {
1808 pub id: String,
1809 pub node_id: String,
1810 #[serde(default)]
1811 pub body: String,
1812 #[serde(default)]
1813 pub tags: Vec<String>,
1814 #[serde(default)]
1815 pub author: String,
1816 #[serde(default)]
1817 pub created_at: String,
1818 #[serde(default)]
1819 pub provenance: String,
1820 #[serde(default)]
1821 pub source_files: Vec<String>,
1822}
1823
1824impl GraphFile {
1825 pub fn new(name: &str) -> Self {
1826 Self {
1827 metadata: Metadata {
1828 name: name.to_owned(),
1829 schema_version: default_graph_schema_version(),
1830 version: "1.0".to_owned(),
1831 description: format!("Knowledge graph: {name}"),
1832 node_count: 0,
1833 edge_count: 0,
1834 },
1835 nodes: Vec::new(),
1836 edges: Vec::new(),
1837 notes: Vec::new(),
1838 }
1839 }
1840
1841 pub fn load(path: &Path) -> Result<Self> {
1842 let raw = fs::read_to_string(path)
1843 .with_context(|| format!("failed to read graph: {}", path.display()))?;
1844 let ext = path
1845 .extension()
1846 .and_then(|ext| ext.to_str())
1847 .unwrap_or("json");
1848 let mut graph = if ext == "kg" {
1849 if raw.trim_start().starts_with('{') {
1850 serde_json::from_str(&raw).map_err(|error| {
1851 anyhow::anyhow!(json_error_detail(
1852 "invalid legacy JSON payload in .kg file",
1853 path,
1854 &raw,
1855 &error,
1856 ))
1857 })?
1858 } else {
1859 let graph_name = path
1860 .file_stem()
1861 .and_then(|stem| stem.to_str())
1862 .unwrap_or("graph");
1863 let decompressed = expand_kg_tokens(&raw);
1864 let (graph, warnings) = parse_kg_with_warnings(
1865 &decompressed,
1866 graph_name,
1867 strict_kg_mode(),
1868 )
1869 .with_context(|| format!("failed to parse .kg graph: {}", path.display()))?;
1870 for warning in warnings {
1871 let _ = crate::kg_sidecar::append_warning(
1872 path,
1873 &format!(
1874 "ignored invalid graph entry in {}: {warning}",
1875 path.display()
1876 ),
1877 );
1878 }
1879 graph
1880 }
1881 } else {
1882 serde_json::from_str(&raw).map_err(|error| {
1883 anyhow::anyhow!(json_error_detail("invalid JSON", path, &raw, &error))
1884 })?
1885 };
1886 let schema_version_before = graph_schema_version(&graph);
1887 normalize_graph_ids(&mut graph);
1888 let created_graph_info = ensure_graph_info_node(&mut graph);
1889 graph.metadata.schema_version = GRAPH_SCHEMA_VERSION;
1890 graph.refresh_counts();
1891 if created_graph_info || schema_version_before < GRAPH_SCHEMA_VERSION {
1892 graph.save(path)?;
1893 }
1894 Ok(graph)
1895 }
1896
1897 pub fn save(&self, path: &Path) -> Result<()> {
1898 let mut graph = self.clone();
1899 ensure_graph_info_node(&mut graph);
1900 graph.metadata.schema_version = GRAPH_SCHEMA_VERSION;
1901 graph.refresh_counts();
1902 let ext = path
1903 .extension()
1904 .and_then(|ext| ext.to_str())
1905 .unwrap_or("json");
1906 let raw = if ext == "kg" {
1907 let serialized = serialize_kg(&graph);
1908 let (compressed, stats) = compress_kg_text(&serialized, KG_TEXT_COMPRESSION_MIN_LEN);
1909 let saved_bytes = serialized.len().saturating_sub(compressed.len());
1910 let saved_percent = if serialized.is_empty() {
1911 0.0
1912 } else {
1913 (saved_bytes as f64 * 100.0) / serialized.len() as f64
1914 };
1915 if saved_bytes > 0 {
1916 eprintln!(
1917 "kg compression: {:.1}% saved ({} -> {} bytes, {} dictionary entries)",
1918 saved_percent,
1919 stats.original_bytes,
1920 stats.compressed_bytes.min(stats.original_bytes),
1921 stats.dictionary_entries
1922 );
1923 }
1924 compressed
1925 } else {
1926 serde_json::to_string_pretty(&graph).context("failed to serialize graph")?
1927 };
1928 atomic_write(path, &raw)?;
1929 backup_graph_if_stale(path, &raw)
1930 }
1931
1932 pub fn refresh_counts(&mut self) {
1933 self.metadata.node_count = self.nodes.len();
1934 self.metadata.edge_count = self.edges.len();
1935 }
1936
1937 pub fn node_by_id(&self, id: &str) -> Option<&Node> {
1938 self.nodes.iter().find(|node| node.id == id)
1939 }
1940
1941 pub fn node_by_id_sorted(&self, id: &str) -> Option<&Node> {
1942 self.nodes
1943 .binary_search_by(|node| node.id.as_str().cmp(id))
1944 .ok()
1945 .and_then(|idx| self.nodes.get(idx))
1946 }
1947
1948 pub fn node_by_id_mut(&mut self, id: &str) -> Option<&mut Node> {
1949 self.nodes.iter_mut().find(|node| node.id == id)
1950 }
1951
1952 pub fn has_edge(&self, source_id: &str, relation: &str, target_id: &str) -> bool {
1953 self.edges.iter().any(|edge| {
1954 edge.source_id == source_id && edge.relation == relation && edge.target_id == target_id
1955 })
1956 }
1957}
1958
1959fn normalize_graph_ids(graph: &mut GraphFile) {
1960 let mut remap: HashMap<String, String> = HashMap::new();
1961 for node in &mut graph.nodes {
1962 let normalized = crate::validate::canonicalize_node_id_for_type(&node.id, &node.r#type)
1963 .unwrap_or_else(|_| crate::validate::normalize_node_id(&node.id));
1964 if normalized != node.id {
1965 remap.insert(node.id.clone(), normalized.clone());
1966 node.id = normalized;
1967 }
1968 }
1969
1970 let known_ids: std::collections::HashSet<&str> =
1971 graph.nodes.iter().map(|node| node.id.as_str()).collect();
1972
1973 for edge in &mut graph.edges {
1974 edge.source_id = remap.get(&edge.source_id).cloned().unwrap_or_else(|| {
1975 if known_ids.contains(edge.source_id.as_str()) {
1976 edge.source_id.clone()
1977 } else {
1978 crate::validate::normalize_node_id(&edge.source_id)
1979 }
1980 });
1981 edge.target_id = remap.get(&edge.target_id).cloned().unwrap_or_else(|| {
1982 if known_ids.contains(edge.target_id.as_str()) {
1983 edge.target_id.clone()
1984 } else {
1985 crate::validate::normalize_node_id(&edge.target_id)
1986 }
1987 });
1988 if edge.properties.bidirectional {
1989 let (source_id, target_id) =
1990 canonicalize_bidirectional_pair(&edge.source_id, &edge.target_id);
1991 edge.source_id = source_id;
1992 edge.target_id = target_id;
1993 }
1994 }
1995
1996 for note in &mut graph.notes {
1997 note.node_id = remap.get(¬e.node_id).cloned().unwrap_or_else(|| {
1998 if known_ids.contains(note.node_id.as_str()) {
1999 note.node_id.clone()
2000 } else {
2001 crate::validate::normalize_node_id(¬e.node_id)
2002 }
2003 });
2004 }
2005}
2006
2007fn ensure_graph_info_node(graph: &mut GraphFile) -> bool {
2008 if let Some(node) = graph.node_by_id_mut(GRAPH_INFO_NODE_ID) {
2009 let mut changed = false;
2010 if node.r#type != GRAPH_INFO_NODE_TYPE {
2011 node.r#type = GRAPH_INFO_NODE_TYPE.to_owned();
2012 changed = true;
2013 }
2014 if node.name.is_empty() {
2015 node.name = "Graph Metadata".to_owned();
2016 changed = true;
2017 }
2018 if node.properties.description.is_empty() {
2019 node.properties.description =
2020 "Internal graph metadata for cross-graph linking".to_owned();
2021 changed = true;
2022 }
2023 if !node
2024 .properties
2025 .key_facts
2026 .iter()
2027 .any(|fact| fact.starts_with(GRAPH_UUID_FACT_PREFIX))
2028 {
2029 node.properties
2030 .key_facts
2031 .push(format!("{GRAPH_UUID_FACT_PREFIX}{}", generate_graph_uuid()));
2032 changed = true;
2033 }
2034 let schema_fact = format!("{GRAPH_SCHEMA_VERSION_FACT_PREFIX}{GRAPH_SCHEMA_VERSION}");
2035 let had_schema_fact = node
2036 .properties
2037 .key_facts
2038 .iter()
2039 .any(|fact| fact.starts_with(GRAPH_SCHEMA_VERSION_FACT_PREFIX));
2040 if !had_schema_fact {
2041 node.properties.key_facts.push(schema_fact);
2042 changed = true;
2043 } else {
2044 let mut replaced = false;
2045 for fact in &mut node.properties.key_facts {
2046 if fact.starts_with(GRAPH_SCHEMA_VERSION_FACT_PREFIX) {
2047 if *fact != schema_fact {
2048 *fact = schema_fact.clone();
2049 replaced = true;
2050 }
2051 }
2052 }
2053 if replaced {
2054 changed = true;
2055 }
2056 }
2057 return changed;
2058 }
2059
2060 graph.nodes.push(Node {
2061 id: GRAPH_INFO_NODE_ID.to_owned(),
2062 r#type: GRAPH_INFO_NODE_TYPE.to_owned(),
2063 name: "Graph Metadata".to_owned(),
2064 properties: NodeProperties {
2065 description: "Internal graph metadata for cross-graph linking".to_owned(),
2066 domain_area: "internal_metadata".to_owned(),
2067 provenance: "A".to_owned(),
2068 importance: 1.0,
2069 key_facts: vec![
2070 format!("{GRAPH_UUID_FACT_PREFIX}{}", generate_graph_uuid()),
2071 format!("{GRAPH_SCHEMA_VERSION_FACT_PREFIX}{GRAPH_SCHEMA_VERSION}"),
2072 ],
2073 ..NodeProperties::default()
2074 },
2075 source_files: vec!["DOC .kg/internal/graph_info".to_owned()],
2076 });
2077 true
2078}
2079
2080fn graph_schema_version(graph: &GraphFile) -> u32 {
2081 graph
2082 .node_by_id(GRAPH_INFO_NODE_ID)
2083 .and_then(|node| {
2084 node.properties.key_facts.iter().find_map(|fact| {
2085 fact.strip_prefix(GRAPH_SCHEMA_VERSION_FACT_PREFIX)
2086 .and_then(|value| value.parse::<u32>().ok())
2087 })
2088 })
2089 .unwrap_or(graph.metadata.schema_version)
2090}
2091
2092fn display_node_id(id: &str, node_type: &str) -> String {
2093 let Some((head, suffix)) = id.split_once(':') else {
2094 return id.to_owned();
2095 };
2096 if head == node_type
2097 || crate::validate::canonical_type_code_for(node_type).is_some_and(|code| code == head)
2098 || crate::validate::TYPE_TO_PREFIX
2099 .iter()
2100 .any(|(typ, prefix)| *typ == node_type && *prefix == head)
2101 {
2102 return suffix.to_owned();
2103 }
2104 id.to_owned()
2105}
2106
2107fn canonical_node_id_for_storage(id: &str) -> String {
2108 let Some((head, suffix)) = id.split_once(':') else {
2109 return id.to_owned();
2110 };
2111 let Some(node_type) = crate::validate::TYPE_TO_PREFIX
2112 .iter()
2113 .find(|(typ, prefix)| {
2114 crate::validate::canonical_type_code_for(typ).is_some_and(|code| code == head)
2115 || *prefix == head
2116 })
2117 .map(|(typ, _)| *typ)
2118 else {
2119 return id.to_owned();
2120 };
2121 crate::validate::canonical_type_code_for(node_type)
2122 .map(|code| format!("{code}:{suffix}"))
2123 .unwrap_or_else(|| id.to_owned())
2124}
2125
/// Produces a 20-character lowercase hexadecimal identifier (10 bytes).
///
/// The bytes are read from `/dev/urandom`. When that file cannot be opened or
/// read in full, they are derived from the current time in nanoseconds mixed
/// with the process id instead.
fn generate_graph_uuid() -> String {
    use std::io::Read;

    let mut bytes = [0u8; 10];
    let from_urandom =
        fs::File::open("/dev/urandom").and_then(|mut file| file.read_exact(&mut bytes));

    if from_urandom.is_err() {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_nanos();
        let pid = std::process::id() as u128;
        let mixed = nanos ^ (pid << 64) ^ (nanos.rotate_left(17));
        // Keep the low 10 bytes of the 16-byte big-endian value; this also
        // overwrites anything a partial read left behind.
        bytes.copy_from_slice(&mixed.to_be_bytes()[6..16]);
    }

    bytes.iter().map(|byte| format!("{byte:02x}")).collect()
}
2149
2150#[cfg(test)]
2151mod tests {
2152 use super::{
2153 compress_kg_text, expand_kg_tokens, GRAPH_INFO_NODE_ID, GRAPH_INFO_NODE_TYPE,
2154 GRAPH_SCHEMA_VERSION, GRAPH_UUID_FACT_PREFIX, GraphFile, KG_TEXT_COMPRESSION_MIN_LEN,
2155 parse_kg,
2156 };
2157
2158 #[test]
2159 fn save_and_load_kg_roundtrip_keeps_core_fields() {
2160 let dir = tempfile::tempdir().expect("temp dir");
2161 let path = dir.path().join("graph.kg");
2162
2163 let mut graph = GraphFile::new("graph");
2164 graph.nodes.push(crate::Node {
2165 id: "concept:refrigerator".to_owned(),
2166 r#type: "Concept".to_owned(),
2167 name: "Lodowka".to_owned(),
2168 properties: crate::NodeProperties {
2169 description: "Urzadzenie chlodzace".to_owned(),
2170 provenance: "U".to_owned(),
2171 created_at: "2026-04-04T12:00:00Z".to_owned(),
2172 importance: 5.0,
2173 key_facts: vec!["A".to_owned(), "b".to_owned()],
2174 alias: vec!["Fridge".to_owned()],
2175 scan: Some(true),
2176 scan_ignore_unknown: Some(true),
2177 ..Default::default()
2178 },
2179 source_files: vec!["docs/fridge.md".to_owned()],
2180 });
2181 graph.edges.push(crate::Edge {
2182 source_id: "concept:refrigerator".to_owned(),
2183 relation: "READS_FROM".to_owned(),
2184 target_id: "datastore:settings".to_owned(),
2185 properties: crate::EdgeProperties {
2186 detail: "runtime read".to_owned(),
2187 valid_from: "2026-04-04T12:00:00Z".to_owned(),
2188 valid_to: "2026-04-05T12:00:00Z".to_owned(),
2189 ..Default::default()
2190 },
2191 });
2192
2193 graph.save(&path).expect("save kg");
2194 let raw = std::fs::read_to_string(&path).expect("read kg");
2195 assert!(raw.contains("@ K:refrigerator"));
2196 assert!(raw.contains("> R D:settings"));
2197
2198 let loaded = GraphFile::load(&path).expect("load kg");
2199 assert_eq!(loaded.nodes.len(), 2);
2200 assert_eq!(loaded.edges.len(), 1);
2201 let node = loaded
2202 .node_by_id("concept:refrigerator")
2203 .expect("domain node");
2204 assert_eq!(node.properties.importance, 5.0);
2205 assert_eq!(node.properties.provenance, "U");
2206 assert_eq!(node.properties.scan, Some(true));
2207 assert_eq!(node.properties.scan_ignore_unknown, Some(true));
2208 assert_eq!(node.name, "Lodowka");
2209 assert_eq!(loaded.edges[0].relation, "READS_FROM");
2210 assert_eq!(loaded.edges[0].properties.detail, "runtime read");
2211 assert_eq!(
2212 loaded.edges[0].properties.valid_from,
2213 "2026-04-04T12:00:00Z"
2214 );
2215 assert_eq!(loaded.edges[0].properties.valid_to, "2026-04-05T12:00:00Z");
2216 assert_eq!(loaded.metadata.schema_version, GRAPH_SCHEMA_VERSION);
2217 }
2218
2219 #[test]
2220 fn load_supports_legacy_json_payload_with_kg_extension() {
2221 let dir = tempfile::tempdir().expect("temp dir");
2222 let path = dir.path().join("legacy.kg");
2223 std::fs::write(
2224 &path,
2225 r#"{
2226 "metadata": {"name": "legacy", "version": "1.0", "description": "x", "node_count": 0, "edge_count": 0},
2227 "nodes": [],
2228 "edges": [],
2229 "notes": []
2230}"#,
2231 )
2232 .expect("write legacy payload");
2233
2234 let loaded = GraphFile::load(&path).expect("load legacy kg");
2235 assert_eq!(loaded.metadata.name, "legacy");
2236 assert_eq!(loaded.metadata.schema_version, GRAPH_SCHEMA_VERSION);
2237 assert_eq!(loaded.nodes.len(), 1);
2238 assert!(loaded.node_by_id(GRAPH_INFO_NODE_ID).is_some());
2239 }
2240
2241 #[test]
2242 fn load_kg_auto_migrates_legacy_id_prefixes() {
2243 let dir = tempfile::tempdir().expect("temp dir");
2244 let path = dir.path().join("legacy-ids.kg");
2245 std::fs::write(
2246 &path,
2247 "@ K:concept:x\nN X\nD Desc\nV 0.5\nP U\nS docs/a.md\n> R datastore:y\n",
2248 )
2249 .expect("write kg");
2250
2251 let loaded = GraphFile::load(&path).expect("load kg");
2252 assert_eq!(loaded.metadata.schema_version, GRAPH_SCHEMA_VERSION);
2253 assert!(loaded.node_by_id("concept:x").is_some());
2254
2255 let persisted = std::fs::read_to_string(&path).expect("read migrated kg");
2256 assert!(persisted.contains("@ K:x"));
2257 assert!(persisted.contains("> R D:y"));
2258 assert!(persisted.contains(&format!("schema_version={GRAPH_SCHEMA_VERSION}")));
2259 }
2260
2261 #[test]
2262 fn load_kg_ignores_invalid_timestamp_format() {
2263 let dir = tempfile::tempdir().expect("temp dir");
2264 let path = dir.path().join("invalid-ts.kg");
2265 std::fs::write(
2266 &path,
2267 "@ K:concept:x\nN X\nD Desc\nE 2026-04-04 12:00:00\nV 4\nP U\n",
2268 )
2269 .expect("write kg");
2270
2271 let loaded = GraphFile::load(&path).expect("invalid timestamp should be ignored");
2272 assert_eq!(loaded.nodes.len(), 2);
2273 assert!(
2274 loaded
2275 .node_by_id("concept:x")
2276 .expect("concept node")
2277 .properties
2278 .created_at
2279 .is_empty()
2280 );
2281 }
2282
2283 #[test]
2284 fn load_kg_ignores_invalid_edge_timestamp_format() {
2285 let dir = tempfile::tempdir().expect("temp dir");
2286 let path = dir.path().join("invalid-edge-ts.kg");
2287 std::fs::write(
2288 &path,
2289 "@ K:concept:x\nN X\nD Desc\nE 2026-04-04T12:00:00Z\nV 4\nP U\nS docs/a.md\n> H concept:y\ni 2026-04-04 12:00:00\n",
2290 )
2291 .expect("write kg");
2292
2293 let loaded = GraphFile::load(&path).expect("invalid edge timestamp should be ignored");
2294 assert_eq!(loaded.edges.len(), 1);
2295 assert!(loaded.edges[0].properties.valid_from.is_empty());
2296 }
2297
2298 #[test]
2299 fn load_kg_preserves_whitespace_and_dedupes_exact_duplicates() {
2300 let dir = tempfile::tempdir().expect("temp dir");
2301 let path = dir.path().join("normalize.kg");
2302 std::fs::write(
2303 &path,
2304 "@ K:concept:x\nN Name With Spaces \nD Desc with spaces \nA Alias\nA Alias\nF fact one\nF FACT one\nS docs/a.md\nS docs/a.md\nE 2026-04-04T12:00:00Z\nV 4\nP U\n",
2305 )
2306 .expect("write kg");
2307
2308 let loaded = GraphFile::load(&path).expect("load kg");
2309 let node = loaded.node_by_id("concept:x").expect("concept node");
2310 assert_eq!(node.name, " Name With Spaces ");
2311 assert_eq!(node.properties.description, " Desc with spaces ");
2312 assert_eq!(node.properties.alias.len(), 1);
2313 assert_eq!(node.properties.key_facts.len(), 2);
2314 assert_eq!(node.source_files.len(), 1);
2315 }
2316
/// Saving a graph that contains a note writes the native `.kg` text form (not JSON),
/// and loading it back returns the note with duplicate tags and source files collapsed.
#[test]
fn save_and_load_kg_roundtrip_keeps_notes_without_json_fallback() {
    let workspace = tempfile::tempdir().expect("temp dir");
    let kg_path = workspace.path().join("graph-notes.kg");

    let fridge = crate::Node {
        id: String::from("concept:refrigerator"),
        r#type: String::from("Concept"),
        name: String::from("Lodowka"),
        properties: crate::NodeProperties {
            description: String::from("Urzadzenie chlodzace"),
            provenance: String::from("U"),
            created_at: String::from("2026-04-04T12:00:00Z"),
            ..Default::default()
        },
        source_files: vec![String::from("docs/fridge.md")],
    };
    // Tags differ only by case and the source file is listed twice, so the
    // round trip below is expected to leave one entry of each.
    let maintenance_note = crate::Note {
        id: String::from("note:1"),
        node_id: String::from("concept:refrigerator"),
        body: String::from("Important maintenance insight"),
        tags: vec![String::from("Maintenance"), String::from("maintenance")],
        author: String::from("alice"),
        created_at: String::from("1712345678"),
        provenance: String::from("U"),
        source_files: vec![String::from("docs/a.md"), String::from("docs/a.md")],
    };

    let mut original = GraphFile::new("graph-notes");
    original.nodes.push(fridge);
    original.notes.push(maintenance_note);
    original.save(&kg_path).expect("save kg");

    // The on-disk text must contain the note header line and must not start like JSON.
    let on_disk = std::fs::read_to_string(&kg_path).expect("read kg");
    assert!(on_disk.contains("! note:1 K:refrigerator"));
    assert!(!on_disk.trim_start().starts_with('{'));

    let reloaded = GraphFile::load(&kg_path).expect("load kg");
    assert_eq!(reloaded.notes.len(), 1);
    let restored = &reloaded.notes[0];
    assert_eq!(restored.id, "note:1");
    assert_eq!(restored.node_id, "concept:refrigerator");
    assert_eq!(restored.body, "Important maintenance insight");
    assert_eq!(restored.tags.len(), 1);
    assert_eq!(restored.source_files.len(), 1);
}
2360
/// Text fields containing real newlines and literal backslash-`n` sequences must
/// survive a save/load cycle unchanged. The serialized file is expected to hold a
/// newline as the two characters `\n` and a literal backslash as `\\`.
#[test]
fn save_and_load_kg_roundtrip_preserves_multiline_text_fields() {
    let workspace = tempfile::tempdir().expect("temp dir");
    let kg_path = workspace.path().join("graph-multiline.kg");

    let fridge = crate::Node {
        id: String::from("concept:refrigerator"),
        r#type: String::from("Concept"),
        name: String::from("Lodowka\nSmart"),
        properties: crate::NodeProperties {
            description: String::from("Linia 1\nLinia 2\\nliteral"),
            provenance: String::from("user\nimport"),
            created_at: String::from("2026-04-04T12:00:00Z"),
            importance: 5.0,
            key_facts: vec![String::from("Fakt 1\nFakt 2")],
            alias: vec![String::from("Alias\nA")],
            domain_area: String::from("ops\nfield"),
            ..Default::default()
        },
        source_files: vec![String::from("docs/fridge\nnotes.md")],
    };
    let reads_settings = crate::Edge {
        source_id: String::from("concept:refrigerator"),
        relation: String::from("READS_FROM"),
        target_id: String::from("datastore:settings"),
        properties: crate::EdgeProperties {
            detail: String::from("runtime\nread"),
            valid_from: String::from("2026-04-04T12:00:00Z"),
            valid_to: String::from("2026-04-05T12:00:00Z"),
            ..Default::default()
        },
    };
    let multiline_note = crate::Note {
        id: String::from("note:1"),
        node_id: String::from("concept:refrigerator"),
        body: String::from("line1\nline2\\nkeep"),
        tags: vec![String::from("multi\nline")],
        author: String::from("alice\nbob"),
        created_at: String::from("1712345678"),
        provenance: String::from("manual\nentry"),
        source_files: vec![String::from("docs/a\nb.md")],
    };

    let mut original = GraphFile::new("graph-multiline");
    original.nodes.push(fridge);
    original.edges.push(reads_settings);
    original.notes.push(multiline_note);
    original.save(&kg_path).expect("save kg");

    // Inspect the raw file: block headers first, then the escaped field payloads.
    let on_disk = std::fs::read_to_string(&kg_path).expect("read kg");
    assert!(on_disk.contains("@ K:refrigerator"));
    assert!(on_disk.contains("> R D:settings"));
    assert!(on_disk.contains("! note:1 K:refrigerator"));
    assert!(on_disk.contains("N Lodowka\\nSmart"));
    assert!(on_disk.contains("D Linia 1\\nLinia 2\\\\nliteral"));
    assert!(on_disk.contains("- domain_area ops\\nfield"));
    assert!(on_disk.contains("d runtime\\nread"));
    assert!(on_disk.contains("b line1\\nline2\\\\nkeep"));

    // Reload and compare every text field against what was stored.
    let reloaded = GraphFile::load(&kg_path).expect("load kg");

    let restored_node = reloaded
        .node_by_id("concept:refrigerator")
        .expect("domain node");
    assert_eq!(restored_node.name, "Lodowka\nSmart");
    assert_eq!(
        restored_node.properties.description,
        "Linia 1\nLinia 2\\nliteral"
    );
    assert_eq!(restored_node.properties.provenance, "user\nimport");
    assert_eq!(
        restored_node.properties.alias,
        vec![String::from("Alias\nA")]
    );
    assert_eq!(
        restored_node.properties.key_facts,
        vec![String::from("Fakt 1\nFakt 2")]
    );
    assert_eq!(restored_node.properties.domain_area, "ops\nfield");
    assert_eq!(
        restored_node.source_files,
        vec![String::from("docs/fridge\nnotes.md")]
    );

    assert_eq!(reloaded.edges[0].properties.detail, "runtime\nread");

    let restored_note = &reloaded.notes[0];
    assert_eq!(restored_note.body, "line1\nline2\\nkeep");
    assert_eq!(restored_note.tags, vec![String::from("multi\nline")]);
    assert_eq!(restored_note.author, "alice\nbob");
    assert_eq!(restored_note.provenance, "manual\nentry");
    assert_eq!(restored_note.source_files, vec![String::from("docs/a\nb.md")]);
}
2435
/// Compression must introduce dictionary tokens only inside the `@ GDIR:` block,
/// leave the `@ K:` block untouched, and be reversible via `expand_kg_tokens`.
#[test]
fn compress_kg_text_only_touches_generated_node_blocks() {
    // Both blocks repeat the same phrase, so each would be a candidate for a token.
    let input = concat!(
        "@ GDIR:src\n",
        "N alpha beta gamma\n",
        "D alpha beta gamma and more\n",
        "\n",
        "@ K:concept:plain\n",
        "N alpha beta gamma\n",
        "D alpha beta gamma and more\n",
        "E 2026-04-04T12:00:00Z\n",
        "V 4\n",
        "P U\n",
        "S docs/plain.md\n",
        "\n",
    );

    let (packed, stats) = compress_kg_text(input, KG_TEXT_COMPRESSION_MIN_LEN);

    // A dictionary entry exists and is referenced from the generated block.
    assert!(stats.dictionary_entries > 0);
    assert!(packed.contains("`1 "));
    assert!(packed.contains("N`1`"));
    assert!(packed.contains("D`1` and more"));

    // The text following the manual block's header must carry no token reference.
    let mut around_manual_header = packed.split("@ K:concept:plain");
    let _before_header = around_manual_header.next();
    let after_header = around_manual_header.next().expect("manual block");
    assert!(!after_header.contains("`1`"));

    // Expanding the tokens restores the input exactly.
    let restored = expand_kg_tokens(&packed);
    assert_eq!(restored, input);
}
2468
/// A file that begins with a `` `1 … `` dictionary line and references `` `1` `` inside
/// node fields must load with the referenced text substituted into those fields.
#[test]
fn load_kg_expands_backtick_tokens_before_parsing() {
    let compressed_text = "`1 alpha beta gamma\n@ GDIR:src\nN `1`\nD `1` and more\n\n";

    let workspace = tempfile::tempdir().expect("temp dir");
    let kg_path = workspace.path().join("compressed.kg");
    std::fs::write(&kg_path, compressed_text).expect("write kg");

    let graph = GraphFile::load(&kg_path).expect("load kg");
    let generated = graph.node_by_id("GDIR:src").expect("generated node");

    assert_eq!(generated.name, "alpha beta gamma");
    assert_eq!(
        generated.properties.description,
        "alpha beta gamma and more"
    );
}
2490
/// A `= ~` edge declared on `~:dedupe_b` pointing at `~:dedupe_a` must parse into a
/// single bidirectional edge stored as a -> b, with the bare `d` value as its detail
/// and the named `d` values collected as score components.
#[test]
fn parse_bidirectional_similarity_edge_is_canonical_and_scored() {
    let kg_text = concat!(
        "@ ~:dedupe_b\n",
        "N B\n",
        "D Desc\n",
        "V 0.5\n",
        "P U\n",
        "S docs/b.md\n",
        "= ~ ~:dedupe_a\n",
        "d C1 0.11\n",
        "d C2 0.83\n",
        "d 0.91\n",
        "\n",
        "@ ~:dedupe_a\n",
        "N A\n",
        "D Desc\n",
        "V 0.5\n",
        "P U\n",
        "S docs/a.md\n",
    );

    let parsed = parse_kg(kg_text, "virt", true).expect("parse kg");
    assert_eq!(parsed.nodes.len(), 2);
    assert_eq!(parsed.edges.len(), 1);

    let similarity = &parsed.edges[0];
    assert_eq!(similarity.relation, "~");
    // Endpoints come back swapped relative to the declaration order in the text.
    assert_eq!(similarity.source_id, "~:dedupe_a");
    assert_eq!(similarity.target_id, "~:dedupe_b");

    let props = &similarity.properties;
    assert_eq!(props.detail, "0.91");
    assert!(props.bidirectional);
    assert_eq!(props.score_components.get("C1"), Some(&0.11));
    assert_eq!(props.score_components.get("C2"), Some(&0.83));
}
2507
/// A bidirectional `~` edge must be written with the `=` operator (never `>`), its
/// score components emitted as `d <name> <value>` lines with six decimals, and all of
/// it must load back unchanged.
#[test]
fn serialize_bidirectional_similarity_edge_uses_equals_operator() {
    // Both endpoints share every property except id, name and source file.
    let similarity_node = |id: &str, name: &str, source: &str| crate::Node {
        id: id.to_owned(),
        r#type: "~".to_owned(),
        name: name.to_owned(),
        properties: crate::NodeProperties {
            description: "Desc".to_owned(),
            provenance: "U".to_owned(),
            created_at: "2026-04-10T00:00:00Z".to_owned(),
            importance: 0.6,
            ..Default::default()
        },
        source_files: vec![source.to_owned()],
    };

    let workspace = tempfile::tempdir().expect("temp dir");
    let kg_path = workspace.path().join("virt.kg");

    let mut original = GraphFile::new("virt");
    original
        .nodes
        .push(similarity_node("~:dedupe_a", "A", "docs/a.md"));
    original
        .nodes
        .push(similarity_node("~:dedupe_b", "B", "docs/b.md"));

    let components = std::collections::BTreeMap::from([
        ("C1".to_owned(), 0.2),
        ("C2".to_owned(), 0.8),
    ]);
    original.edges.push(crate::Edge {
        source_id: "~:dedupe_a".to_owned(),
        relation: "~".to_owned(),
        target_id: "~:dedupe_b".to_owned(),
        properties: crate::EdgeProperties {
            detail: "0.75".to_owned(),
            bidirectional: true,
            score_components: components,
            ..Default::default()
        },
    });

    original.save(&kg_path).expect("save");

    // Raw text: `=` operator present, `>` form absent, components at six decimals.
    let on_disk = std::fs::read_to_string(&kg_path).expect("read");
    assert!(on_disk.contains("= ~ ~:dedupe_b"));
    assert!(on_disk.contains("d C1 0.200000"));
    assert!(on_disk.contains("d C2 0.800000"));
    assert!(!on_disk.contains("> ~ ~:dedupe_b"));

    // Round trip: the single edge keeps its flag, detail and both components.
    let reloaded = GraphFile::load(&kg_path).expect("load");
    assert_eq!(reloaded.edges.len(), 1);
    let restored = &reloaded.edges[0].properties;
    assert!(restored.bidirectional);
    assert_eq!(restored.detail, "0.75");
    assert_eq!(restored.score_components.get("C1"), Some(&0.2));
    assert_eq!(restored.score_components.get("C2"), Some(&0.8));
}
2574
/// In strict mode the `=` operator followed by a relation other than `~` must be
/// rejected with an error mentioning `expected '~'`.
#[test]
fn strict_mode_rejects_bidirectional_relation_other_than_similarity() {
    let kg_text = concat!(
        "@ K:concept:a\n",
        "N A\n",
        "D Desc\n",
        "V 0.5\n",
        "P U\n",
        "S docs/a.md\n",
        "= HAS concept:b\n",
    );

    let failure = parse_kg(kg_text, "x", true).expect_err("strict mode should reject invalid '='");

    // `{:#}` renders the alternate form of the error before searching it.
    let rendered = format!("{failure:#}");
    assert!(rendered.contains("expected '~'"));
}
2581
/// In strict mode a node whose `D` line precedes its `N` line must be rejected with an
/// error mentioning `invalid field order`.
#[test]
fn strict_mode_rejects_out_of_order_node_fields() {
    // `D` comes before `N` here; every other line follows the order used elsewhere.
    let kg_text = concat!(
        "@ K:concept:x\n",
        "D Desc\n",
        "N Name\n",
        "E 2026-04-04T12:00:00Z\n",
        "V 4\n",
        "P U\n",
        "S docs/a.md\n",
    );

    let failure = parse_kg(kg_text, "x", true).expect_err("strict mode should fail on field order");

    let rendered = format!("{failure:#}");
    assert!(rendered.contains("invalid field order"));
}
2588
/// A 121-character `N` value is rejected in strict mode with an `invalid N length`
/// error, while the same text parses successfully in non-strict mode.
#[test]
fn strict_mode_rejects_overlong_name_but_compat_mode_allows_it() {
    let oversized_name = "X".repeat(121);
    let kg_text = format!(
        "@ K:concept:x\nN {}\nD Desc\nE 2026-04-04T12:00:00Z\nV 4\nP U\nS docs/a.md\n",
        oversized_name
    );

    // Strict parsing: must fail and name the offending field.
    let strict_failure =
        parse_kg(&kg_text, "x", true).expect_err("strict mode should fail on length");
    let rendered = format!("{strict_failure:#}");
    assert!(rendered.contains("invalid N length"));

    // Non-strict parsing: the same input is accepted.
    parse_kg(&kg_text, "x", false).expect("compat mode keeps permissive behavior");
}
2602
/// A node whose `created_at` and `provenance` are empty strings must not produce bare
/// `E ` or `P ` lines in the saved file.
#[test]
fn save_kg_skips_empty_e_and_p_fields() {
    let bare_node = crate::Node {
        id: String::from("concept:x"),
        r#type: String::from("Concept"),
        name: String::from("X"),
        properties: crate::NodeProperties {
            description: String::from("Desc"),
            // Explicitly empty: these are the two fields under test.
            provenance: String::new(),
            created_at: String::new(),
            ..Default::default()
        },
        source_files: vec![String::from("docs/a.md")],
    };

    let workspace = tempfile::tempdir().expect("temp dir");
    let kg_path = workspace.path().join("no-empty-ep.kg");

    let mut original = GraphFile::new("graph");
    original.nodes.push(bare_node);
    original.save(&kg_path).expect("save kg");

    let on_disk = std::fs::read_to_string(&kg_path).expect("read kg");
    assert!(!on_disk.contains("\nE \n"));
    assert!(!on_disk.contains("\nP \n"));
}
2627
/// Loading a file with no graph-info node must yield a graph that contains one (with a
/// `graph_uuid=` fact), and the file on disk must afterwards contain the graph-info
/// markers as well.
#[test]
fn load_generates_graph_info_node_when_missing() {
    // Only a single concept node is written: no graph-info block.
    let kg_text = concat!(
        "@ K:concept:x\n",
        "N X\n",
        "D Desc\n",
        "V 0.5\n",
        "P U\n",
        "S docs/a.md\n",
    );
    let workspace = tempfile::tempdir().expect("temp dir");
    let kg_path = workspace.path().join("meta.kg");
    std::fs::write(&kg_path, kg_text).expect("write kg");

    let graph = GraphFile::load(&kg_path).expect("load kg");

    // In-memory graph: the info node exists, has the reserved type and a uuid fact.
    let info_node = graph
        .node_by_id(GRAPH_INFO_NODE_ID)
        .expect("graph info node should be generated");
    assert_eq!(info_node.r#type, GRAPH_INFO_NODE_TYPE);
    let has_uuid_fact = info_node
        .properties
        .key_facts
        .iter()
        .any(|fact| fact.starts_with(GRAPH_UUID_FACT_PREFIX));
    assert!(has_uuid_fact);

    // On-disk file: re-read after the load and look for the generated markers.
    let on_disk = std::fs::read_to_string(&kg_path).expect("read persisted kg");
    assert!(on_disk.contains("graph_info"));
    assert!(on_disk.contains("graph_uuid="));
    assert!(on_disk.contains("schema_version="));
}
2652}