Skip to main content

seshat_scanner/
documentation.rs

1//! Documentation ingestion for the knowledge graph.
2//!
3//! Parses structured information from documentation files into
4//! [`KnowledgeNode`]s that enrich the knowledge graph. Supports:
5//!
6//! - **Markdown** (`.md`): H1/H2 sections extracted as Fact nodes (sub-headings, lists and prose become the section's content)
7//! - **JSON Schema** (`.json`): data structure definitions extracted as Fact nodes
8//! - **OpenAPI** (`.yaml`, `.yml`): endpoint definitions extracted as Fact nodes
9//!
10//! All documentation-sourced nodes are tagged with `"source": "documentation"`
11//! in their `ext_data` field. No NLP or prose-level convention extraction is
12//! performed — only structured information is extracted.
13
14use std::path::{Path, PathBuf};
15
16use seshat_core::{BranchId, KnowledgeNature, KnowledgeNode, KnowledgeWeight, NodeId};
17
18use crate::error::ScanError;
19
/// Kind of documentation file the scanner knows how to ingest.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
    /// Markdown documentation (`.md`).
    Markdown,
    /// JSON Schema definition (`.json`).
    JsonSchema,
    /// OpenAPI specification (`.yaml` / `.yml`).
    OpenApi,
}

impl DocType {
    /// Map a file extension (without the leading dot) to its [`DocType`].
    ///
    /// The extension is lower-cased before the lookup, so `"MD"` and `"md"`
    /// are treated alike. Extensions outside the supported set yield `None`.
    pub fn from_extension(ext: &str) -> Option<Self> {
        // Every recognised extension paired with the format it selects.
        const KNOWN: [(&str, DocType); 4] = [
            ("md", DocType::Markdown),
            ("json", DocType::JsonSchema),
            ("yaml", DocType::OpenApi),
            ("yml", DocType::OpenApi),
        ];

        let lowered = ext.to_lowercase();
        KNOWN
            .iter()
            .find(|(known, _)| *known == lowered.as_str())
            .map(|&(_, doc_type)| doc_type)
    }
}
44
45/// Result of parsing a single documentation file.
46#[derive(Debug, Clone)]
47pub struct DocumentationResult {
48    /// The path to the documentation file (relative to project root).
49    pub path: PathBuf,
50    /// The type of documentation file.
51    pub doc_type: DocType,
52    /// Knowledge nodes extracted from this file.
53    pub nodes: Vec<KnowledgeNode>,
54}
55
56/// Parse a documentation file and extract structured knowledge nodes.
57///
58/// # Arguments
59///
60/// * `path` - Relative path from the project root.
61/// * `content` - The raw file content as a string.
62/// * `branch_id` - The branch identifier for the knowledge graph nodes.
63///
64/// # Returns
65///
66/// A [`DocumentationResult`] containing the extracted knowledge nodes, or a
67/// [`ScanError::DocumentationError`] if the file cannot be parsed.
68pub fn parse_documentation(
69    path: &Path,
70    content: &str,
71    branch_id: &BranchId,
72) -> Result<DocumentationResult, ScanError> {
73    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
74
75    let doc_type = DocType::from_extension(ext).ok_or_else(|| ScanError::DocumentationError {
76        path: path.to_path_buf(),
77        reason: format!("Unsupported documentation extension: {ext}"),
78    })?;
79
80    let nodes = match doc_type {
81        DocType::Markdown => parse_markdown(path, content, branch_id),
82        DocType::JsonSchema => parse_json_schema(path, content, branch_id)?,
83        DocType::OpenApi => parse_openapi(path, content, branch_id)?,
84    };
85
86    Ok(DocumentationResult {
87        path: path.to_path_buf(),
88        doc_type,
89        nodes,
90    })
91}
92
93// ---------------------------------------------------------------------------
94// Markdown parsing
95// ---------------------------------------------------------------------------
96
97/// Parse Markdown content and extract H1/H2 sections as knowledge nodes.
98///
99/// Each H1 or H2 heading starts a new section node. Everything between two
100/// H1/H2 headings — including H3-H6 sub-headings, list items, and prose —
101/// is collected as the section's `content` in `ext_data` rather than
102/// generating separate nodes. This prevents a single large Markdown file
103/// from producing thousands of noisy nodes in the knowledge graph.
104///
105/// Files with no H1/H2 headings produce no nodes (prose-only files are
106/// intentionally skipped).
107fn parse_markdown(path: &Path, content: &str, branch_id: &BranchId) -> Vec<KnowledgeNode> {
108    /// Emit a completed section as a [`KnowledgeNode`], if `section` is `Some`.
109    fn flush_section(
110        counter: &mut i64,
111        nodes: &mut Vec<KnowledgeNode>,
112        section: Option<(String, u32, Vec<String>)>,
113        path: &Path,
114        branch_id: &BranchId,
115    ) {
116        let Some((title, level, body_lines)) = section else {
117            return;
118        };
119        // Trim trailing blank lines from body.
120        let body = body_lines
121            .iter()
122            .map(String::as_str)
123            .collect::<Vec<_>>()
124            .join("\n")
125            .trim_end()
126            .to_owned();
127        *counter += 1;
128        nodes.push(make_doc_node(
129            NodeId(*counter),
130            branch_id,
131            KnowledgeNature::Fact,
132            KnowledgeWeight::Info,
133            title,
134            serde_json::json!({
135                "source": "documentation",
136                "doc_type": "markdown",
137                "file": path.to_string_lossy(),
138                "element": "section",
139                "level": level,
140                "content": body,
141            }),
142        ));
143    }
144
145    let mut nodes = Vec::new();
146    let mut node_counter: i64 = 0;
147
148    // Current open section: (heading_text, heading_level, content_lines).
149    let mut current: Option<(String, u32, Vec<String>)> = None;
150
151    for line in content.lines() {
152        let trimmed = line.trim();
153
154        if let Some(heading) = parse_heading(trimmed) {
155            // Only H1 and H2 open new section nodes; H3+ are body content.
156            if heading.level <= 2 {
157                // Flush the previous section.
158                flush_section(
159                    &mut node_counter,
160                    &mut nodes,
161                    current.take(),
162                    path,
163                    branch_id,
164                );
165                current = Some((heading.text, heading.level, Vec::new()));
166                continue;
167            }
168        }
169
170        // Everything else (H3+, lists, prose, blank lines) is body content
171        // of the current section.
172        if let Some((_, _, ref mut body)) = current {
173            body.push(line.to_owned());
174        }
175        // Lines before the first H1/H2 are silently discarded.
176    }
177
178    // Flush the final section.
179    flush_section(&mut node_counter, &mut nodes, current, path, branch_id);
180
181    nodes
182}
183
/// A parsed Markdown heading.
struct HeadingInfo {
    /// Heading depth: the number of leading `#` characters (1 through 6).
    level: u32,
    /// Heading text with surrounding whitespace and any closing `#` sequence
    /// removed. Never empty.
    text: String,
}

/// Try to parse a line as a Markdown ATX heading (`# Heading`).
///
/// `line` is expected to have its leading whitespace already removed. The
/// line is accepted when it starts with one to six `#` characters followed by
/// a space or a tab. An optional closing sequence (`## Title ##`) is stripped
/// when it is separated from the text by whitespace, so `# C#` keeps its
/// trailing `#`.
///
/// Returns `None` when the line is not a heading, when it has more than six
/// `#` characters, or when no heading text remains (empty headings are not
/// reported).
fn parse_heading(line: &str) -> Option<HeadingInfo> {
    let rest = line.trim_start_matches('#');
    // '#' is a single byte, so the byte difference is the number of hashes.
    let hashes = line.len() - rest.len();
    if !(1..=6).contains(&hashes) {
        return None;
    }

    // The opening sequence must be followed by a space or tab.
    if !rest.starts_with(|c: char| c == ' ' || c == '\t') {
        return None;
    }

    let mut text = rest.trim();

    // Drop an optional closing sequence of `#` characters. It only counts as
    // a closer when preceded by whitespace (or when nothing else is left).
    let before_closer = text.trim_end_matches('#');
    if before_closer.len() != text.len()
        && (before_closer.is_empty()
            || before_closer.ends_with(|c: char| c == ' ' || c == '\t'))
    {
        text = before_closer.trim_end();
    }

    if text.is_empty() {
        return None;
    }

    Some(HeadingInfo {
        level: u32::try_from(hashes).ok()?,
        text: text.to_string(),
    })
}
217
218// ---------------------------------------------------------------------------
219// JSON Schema parsing
220// ---------------------------------------------------------------------------
221
222/// Parse a JSON Schema file and extract data structure definitions.
223///
224/// Extracts the schema title/description and all property definitions as
225/// Fact/Info knowledge nodes.
226fn parse_json_schema(
227    path: &Path,
228    content: &str,
229    branch_id: &BranchId,
230) -> Result<Vec<KnowledgeNode>, ScanError> {
231    let value: serde_json::Value =
232        serde_json::from_str(content).map_err(|e| ScanError::DocumentationError {
233            path: path.to_path_buf(),
234            reason: format!("Invalid JSON: {e}"),
235        })?;
236
237    // Verify this looks like a JSON Schema (has "$schema", "type", or "properties")
238    let obj = value
239        .as_object()
240        .ok_or_else(|| ScanError::DocumentationError {
241            path: path.to_path_buf(),
242            reason: "JSON Schema must be an object".to_string(),
243        })?;
244
245    let is_schema = obj.contains_key("$schema")
246        || obj.contains_key("properties")
247        || (obj.contains_key("type") && obj.contains_key("title"));
248
249    if !is_schema {
250        return Ok(Vec::new());
251    }
252
253    let mut nodes = Vec::new();
254    let mut node_counter: i64 = 0;
255
256    // Extract the root schema definition
257    let schema_title = obj
258        .get("title")
259        .and_then(|v| v.as_str())
260        .unwrap_or("Untitled Schema");
261
262    let schema_description = obj
263        .get("description")
264        .and_then(|v| v.as_str())
265        .unwrap_or("");
266
267    let description = if schema_description.is_empty() {
268        format!("JSON Schema: {schema_title}")
269    } else {
270        format!("JSON Schema: {schema_title} — {schema_description}")
271    };
272
273    node_counter += 1;
274    nodes.push(make_doc_node(
275        NodeId(node_counter),
276        branch_id,
277        KnowledgeNature::Fact,
278        KnowledgeWeight::Info,
279        description,
280        serde_json::json!({
281            "source": "documentation",
282            "doc_type": "json_schema",
283            "file": path.to_string_lossy(),
284            "element": "schema",
285            "schema_title": schema_title,
286        }),
287    ));
288
289    // Extract properties as individual nodes
290    if let Some(properties) = obj.get("properties").and_then(|v| v.as_object()) {
291        let required: Vec<&str> = obj
292            .get("required")
293            .and_then(|v| v.as_array())
294            .map(|arr| arr.iter().filter_map(|v| v.as_str()).collect())
295            .unwrap_or_default();
296
297        for (prop_name, prop_value) in properties {
298            let prop_type = prop_value
299                .get("type")
300                .and_then(|v| v.as_str())
301                .unwrap_or("unknown");
302            let prop_desc = prop_value
303                .get("description")
304                .and_then(|v| v.as_str())
305                .unwrap_or("");
306            let is_required = required.contains(&prop_name.as_str());
307
308            let desc = if prop_desc.is_empty() {
309                format!(
310                    "Property: {prop_name} ({prop_type}{})",
311                    if is_required { ", required" } else { "" }
312                )
313            } else {
314                format!(
315                    "Property: {prop_name} ({prop_type}{}) — {prop_desc}",
316                    if is_required { ", required" } else { "" }
317                )
318            };
319
320            node_counter += 1;
321            nodes.push(make_doc_node(
322                NodeId(node_counter),
323                branch_id,
324                KnowledgeNature::Fact,
325                KnowledgeWeight::Info,
326                desc,
327                serde_json::json!({
328                    "source": "documentation",
329                    "doc_type": "json_schema",
330                    "file": path.to_string_lossy(),
331                    "element": "property",
332                    "schema_title": schema_title,
333                    "property_name": prop_name,
334                    "property_type": prop_type,
335                    "required": is_required,
336                }),
337            ));
338        }
339    }
340
341    // Extract definitions/$defs as additional type nodes
342    let defs = obj
343        .get("definitions")
344        .or_else(|| obj.get("$defs"))
345        .and_then(|v| v.as_object());
346
347    if let Some(definitions) = defs {
348        for (def_name, def_value) in definitions {
349            let def_desc = def_value
350                .get("description")
351                .and_then(|v| v.as_str())
352                .unwrap_or("");
353            let def_type = def_value
354                .get("type")
355                .and_then(|v| v.as_str())
356                .unwrap_or("object");
357
358            let desc = if def_desc.is_empty() {
359                format!("Definition: {def_name} ({def_type})")
360            } else {
361                format!("Definition: {def_name} ({def_type}) — {def_desc}")
362            };
363
364            node_counter += 1;
365            nodes.push(make_doc_node(
366                NodeId(node_counter),
367                branch_id,
368                KnowledgeNature::Fact,
369                KnowledgeWeight::Info,
370                desc,
371                serde_json::json!({
372                    "source": "documentation",
373                    "doc_type": "json_schema",
374                    "file": path.to_string_lossy(),
375                    "element": "definition",
376                    "definition_name": def_name,
377                    "definition_type": def_type,
378                }),
379            ));
380        }
381    }
382
383    Ok(nodes)
384}
385
386// ---------------------------------------------------------------------------
387// OpenAPI parsing
388// ---------------------------------------------------------------------------
389
390/// Parse an OpenAPI specification and extract endpoint definitions.
391///
392/// Extracts each path + method combination as a Fact/Info knowledge node.
393fn parse_openapi(
394    path: &Path,
395    content: &str,
396    branch_id: &BranchId,
397) -> Result<Vec<KnowledgeNode>, ScanError> {
398    let value: serde_norway::Value =
399        serde_norway::from_str(content).map_err(|e| ScanError::DocumentationError {
400            path: path.to_path_buf(),
401            reason: format!("Invalid YAML: {e}"),
402        })?;
403
404    // Verify this looks like an OpenAPI spec
405    let mapping = value
406        .as_mapping()
407        .ok_or_else(|| ScanError::DocumentationError {
408            path: path.to_path_buf(),
409            reason: "OpenAPI spec must be a YAML mapping".to_string(),
410        })?;
411
412    let has_openapi = mapping.contains_key(yaml_key("openapi"));
413    let has_swagger = mapping.contains_key(yaml_key("swagger"));
414
415    if !has_openapi && !has_swagger {
416        return Ok(Vec::new());
417    }
418
419    let mut nodes = Vec::new();
420    let mut node_counter: i64 = 0;
421
422    // Extract API title from info.title
423    let api_title = yaml_get_mapping(mapping, "info")
424        .and_then(|m| yaml_get_str(m, "title"))
425        .unwrap_or("Untitled API");
426
427    let api_version = yaml_get_mapping(mapping, "info")
428        .and_then(|m| yaml_get_str(m, "version"))
429        .unwrap_or("");
430
431    let api_desc = if api_version.is_empty() {
432        format!("API: {api_title}")
433    } else {
434        format!("API: {api_title} (v{api_version})")
435    };
436
437    node_counter += 1;
438    nodes.push(make_doc_node(
439        NodeId(node_counter),
440        branch_id,
441        KnowledgeNature::Fact,
442        KnowledgeWeight::Info,
443        api_desc,
444        serde_json::json!({
445            "source": "documentation",
446            "doc_type": "openapi",
447            "file": path.to_string_lossy(),
448            "element": "api",
449            "api_title": api_title,
450            "api_version": api_version,
451        }),
452    ));
453
454    // Extract paths/endpoints
455    if let Some(paths) = yaml_get_mapping(mapping, "paths") {
456        let http_methods = [
457            "get", "post", "put", "delete", "patch", "options", "head", "trace",
458        ];
459
460        for (path_key, path_value) in paths {
461            let endpoint_path = match path_key.as_str() {
462                Some(p) => p,
463                None => continue,
464            };
465
466            let methods = match path_value.as_mapping() {
467                Some(m) => m,
468                None => continue,
469            };
470
471            for method_name in &http_methods {
472                let method_key = serde_norway::Value::String(method_name.to_string());
473                if let Some(method_value) = methods.get(&method_key) {
474                    let method_map = method_value.as_mapping();
475
476                    let summary = method_map
477                        .and_then(|m| yaml_get_str(m, "summary"))
478                        .unwrap_or("");
479
480                    let operation_id = method_map
481                        .and_then(|m| yaml_get_str(m, "operationId"))
482                        .unwrap_or("");
483
484                    let method_upper = method_name.to_uppercase();
485                    let desc = if summary.is_empty() {
486                        format!("Endpoint: {method_upper} {endpoint_path}")
487                    } else {
488                        format!("Endpoint: {method_upper} {endpoint_path} — {summary}")
489                    };
490
491                    // Extract response codes
492                    let response_codes: Vec<String> = method_map
493                        .and_then(|m| yaml_get_mapping(m, "responses"))
494                        .map(|responses| {
495                            responses
496                                .keys()
497                                .filter_map(|k| k.as_str().map(String::from))
498                                .collect()
499                        })
500                        .unwrap_or_default();
501
502                    // Extract tags
503                    let tags: Vec<String> = method_map
504                        .and_then(|m| yaml_get_seq(m, "tags"))
505                        .map(|seq| {
506                            seq.iter()
507                                .filter_map(|v| v.as_str().map(String::from))
508                                .collect()
509                        })
510                        .unwrap_or_default();
511
512                    node_counter += 1;
513                    nodes.push(make_doc_node(
514                        NodeId(node_counter),
515                        branch_id,
516                        KnowledgeNature::Fact,
517                        KnowledgeWeight::Info,
518                        desc,
519                        serde_json::json!({
520                            "source": "documentation",
521                            "doc_type": "openapi",
522                            "file": path.to_string_lossy(),
523                            "element": "endpoint",
524                            "api_title": api_title,
525                            "path": endpoint_path,
526                            "method": method_upper,
527                            "operation_id": operation_id,
528                            "response_codes": response_codes,
529                            "tags": tags,
530                        }),
531                    ));
532                }
533            }
534        }
535    }
536
537    // Extract component schemas (OpenAPI 3.x)
538    if let Some(schemas) =
539        yaml_get_mapping(mapping, "components").and_then(|m| yaml_get_mapping(m, "schemas"))
540    {
541        for (schema_key, schema_value) in schemas {
542            let schema_name = match schema_key.as_str() {
543                Some(n) => n,
544                None => continue,
545            };
546
547            let schema_map = schema_value.as_mapping();
548
549            let schema_type = schema_map
550                .and_then(|m| yaml_get_str(m, "type"))
551                .unwrap_or("object");
552
553            let schema_desc = schema_map
554                .and_then(|m| yaml_get_str(m, "description"))
555                .unwrap_or("");
556
557            let desc = if schema_desc.is_empty() {
558                format!("Schema: {schema_name} ({schema_type})")
559            } else {
560                format!("Schema: {schema_name} ({schema_type}) — {schema_desc}")
561            };
562
563            node_counter += 1;
564            nodes.push(make_doc_node(
565                NodeId(node_counter),
566                branch_id,
567                KnowledgeNature::Fact,
568                KnowledgeWeight::Info,
569                desc,
570                serde_json::json!({
571                    "source": "documentation",
572                    "doc_type": "openapi",
573                    "file": path.to_string_lossy(),
574                    "element": "schema",
575                    "api_title": api_title,
576                    "schema_name": schema_name,
577                    "schema_type": schema_type,
578                }),
579            ));
580        }
581    }
582
583    // Extract Swagger 2.0 definitions
584    if let Some(definitions) = yaml_get_mapping(mapping, "definitions") {
585        for (def_key, def_value) in definitions {
586            let def_name = match def_key.as_str() {
587                Some(n) => n,
588                None => continue,
589            };
590
591            let def_map = def_value.as_mapping();
592
593            let def_type = def_map
594                .and_then(|m| yaml_get_str(m, "type"))
595                .unwrap_or("object");
596
597            let def_desc = def_map
598                .and_then(|m| yaml_get_str(m, "description"))
599                .unwrap_or("");
600
601            let desc = if def_desc.is_empty() {
602                format!("Schema: {def_name} ({def_type})")
603            } else {
604                format!("Schema: {def_name} ({def_type}) — {def_desc}")
605            };
606
607            node_counter += 1;
608            nodes.push(make_doc_node(
609                NodeId(node_counter),
610                branch_id,
611                KnowledgeNature::Fact,
612                KnowledgeWeight::Info,
613                desc,
614                serde_json::json!({
615                    "source": "documentation",
616                    "doc_type": "openapi",
617                    "file": path.to_string_lossy(),
618                    "element": "schema",
619                    "api_title": api_title,
620                    "schema_name": def_name,
621                    "schema_type": def_type,
622                }),
623            ));
624        }
625    }
626
627    Ok(nodes)
628}
629
630// ---------------------------------------------------------------------------
631// Helpers
632// ---------------------------------------------------------------------------
633
634/// Create a `serde_norway::Value::String` key for YAML mapping lookups.
635fn yaml_key(key: &str) -> serde_norway::Value {
636    serde_norway::Value::String(key.to_string())
637}
638
639/// Get a string value from a YAML mapping by key.
640fn yaml_get_str<'a>(mapping: &'a serde_norway::Mapping, key: &str) -> Option<&'a str> {
641    mapping.get(yaml_key(key)).and_then(|v| v.as_str())
642}
643
644/// Get a nested mapping from a YAML mapping by key.
645fn yaml_get_mapping<'a>(
646    mapping: &'a serde_norway::Mapping,
647    key: &str,
648) -> Option<&'a serde_norway::Mapping> {
649    mapping.get(yaml_key(key)).and_then(|v| v.as_mapping())
650}
651
652/// Get a nested sequence from a YAML mapping by key.
653fn yaml_get_seq<'a>(
654    mapping: &'a serde_norway::Mapping,
655    key: &str,
656) -> Option<&'a serde_norway::Sequence> {
657    mapping.get(yaml_key(key)).and_then(|v| v.as_sequence())
658}
659
660/// Create a documentation-sourced knowledge node with standard fields.
661fn make_doc_node(
662    id: NodeId,
663    branch_id: &BranchId,
664    nature: KnowledgeNature,
665    weight: KnowledgeWeight,
666    description: String,
667    ext_data: serde_json::Value,
668) -> KnowledgeNode {
669    KnowledgeNode {
670        id,
671        branch_id: branch_id.clone(),
672        nature,
673        weight,
674        confidence: 1.0,
675        adoption_count: 1,
676        total_count: 1,
677        description,
678        ext_data: Some(ext_data),
679    }
680}
681
682// ---------------------------------------------------------------------------
683// Tests
684// ---------------------------------------------------------------------------
685
686#[cfg(test)]
687mod tests {
688    use super::*;
689    use seshat_core::BranchId;
690
691    fn branch() -> BranchId {
692        BranchId::from("test")
693    }
694
695    // -----------------------------------------------------------------------
696    // DocType detection
697    // -----------------------------------------------------------------------
698
699    #[test]
700    fn doc_type_from_extension_markdown() {
701        assert_eq!(DocType::from_extension("md"), Some(DocType::Markdown));
702    }
703
704    #[test]
705    fn doc_type_from_extension_json() {
706        assert_eq!(DocType::from_extension("json"), Some(DocType::JsonSchema));
707    }
708
709    #[test]
710    fn doc_type_from_extension_yaml() {
711        assert_eq!(DocType::from_extension("yaml"), Some(DocType::OpenApi));
712        assert_eq!(DocType::from_extension("yml"), Some(DocType::OpenApi));
713    }
714
715    #[test]
716    fn doc_type_from_extension_unknown() {
717        assert_eq!(DocType::from_extension("rs"), None);
718        assert_eq!(DocType::from_extension("txt"), None);
719    }
720
721    #[test]
722    fn doc_type_case_insensitive() {
723        assert_eq!(DocType::from_extension("MD"), Some(DocType::Markdown));
724        assert_eq!(DocType::from_extension("YAML"), Some(DocType::OpenApi));
725        assert_eq!(DocType::from_extension("Json"), Some(DocType::JsonSchema));
726    }
727
728    // -----------------------------------------------------------------------
729    // parse_documentation dispatch
730    // -----------------------------------------------------------------------
731
732    #[test]
733    fn parse_documentation_unsupported_extension() {
734        let result = parse_documentation(Path::new("file.txt"), "content", &branch());
735        assert!(result.is_err());
736        let err = result.unwrap_err();
737        assert!(matches!(err, ScanError::DocumentationError { .. }));
738    }
739
740    #[test]
741    fn parse_documentation_routes_to_markdown() {
742        // "# Hello\n- item" is one H1 section → one node
743        let content = "# Hello\n- item";
744        let result = parse_documentation(Path::new("README.md"), content, &branch()).unwrap();
745        assert_eq!(result.doc_type, DocType::Markdown);
746        assert_eq!(result.nodes.len(), 1);
747    }
748
749    #[test]
750    fn parse_documentation_routes_to_json_schema() {
751        let content = r#"{"$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "title": "Test"}"#;
752        let result = parse_documentation(Path::new("schema.json"), content, &branch()).unwrap();
753        assert_eq!(result.doc_type, DocType::JsonSchema);
754        assert!(!result.nodes.is_empty());
755    }
756
757    #[test]
758    fn parse_documentation_routes_to_openapi() {
759        let content = "openapi: '3.0.0'\ninfo:\n  title: Test\n  version: '1.0'\npaths: {}";
760        let result = parse_documentation(Path::new("api.yaml"), content, &branch()).unwrap();
761        assert_eq!(result.doc_type, DocType::OpenApi);
762        assert!(!result.nodes.is_empty());
763    }
764
765    // -----------------------------------------------------------------------
766    // Markdown: section-based parsing (H1/H2 = one node, body = content)
767    // -----------------------------------------------------------------------
768
769    #[test]
770    fn markdown_extracts_h1_h2_as_sections() {
771        // H3 (Subsection) must NOT generate its own node — it is body of Section.
772        let content = "# Title\n\nSome text\n\n## Section\n\nMore text\n\n### Subsection";
773        let nodes = parse_markdown(Path::new("doc.md"), content, &branch());
774
775        assert_eq!(nodes.len(), 2, "only H1 and H2 create nodes");
776        assert_eq!(nodes[0].description, "Title");
777        assert_eq!(nodes[1].description, "Section");
778
779        // Levels stored in ext_data
780        assert_eq!(nodes[0].ext_data.as_ref().unwrap()["level"], 1);
781        assert_eq!(nodes[1].ext_data.as_ref().unwrap()["level"], 2);
782
783        // H3 sub-heading is part of Section's content
784        let section_content = nodes[1].ext_data.as_ref().unwrap()["content"]
785            .as_str()
786            .unwrap();
787        assert!(
788            section_content.contains("### Subsection"),
789            "H3 should appear in H2 section content"
790        );
791    }
792
793    #[test]
794    fn markdown_heading_requires_space() {
795        let content = "#NoSpace\n# Has Space";
796        let nodes = parse_markdown(Path::new("doc.md"), content, &branch());
797        assert_eq!(nodes.len(), 1);
798        assert_eq!(nodes[0].description, "Has Space");
799    }
800
801    #[test]
802    fn markdown_heading_max_level() {
803        // H6 opens a section; H7 is invalid → treated as body content.
804        // But H6 > 2, so it is body too. Only H1/H2 create nodes.
805        let content = "# Top\n###### H6 content\n####### H7 content";
806        let nodes = parse_markdown(Path::new("doc.md"), content, &branch());
807        assert_eq!(nodes.len(), 1);
808        assert_eq!(nodes[0].description, "Top");
809        let body = nodes[0].ext_data.as_ref().unwrap()["content"]
810            .as_str()
811            .unwrap();
812        assert!(body.contains("H6 content"));
813    }
814
815    #[test]
816    fn markdown_list_items_are_body_content() {
817        // Lists under an H1 must appear in content, not as separate nodes.
818        let content = "# Section\n- First item\n- Second item\n* Third item";
819        let nodes = parse_markdown(Path::new("doc.md"), content, &branch());
820
821        assert_eq!(nodes.len(), 1, "only one node for the H1 section");
822        assert_eq!(nodes[0].description, "Section");
823
824        let body = nodes[0].ext_data.as_ref().unwrap()["content"]
825            .as_str()
826            .unwrap();
827        assert!(body.contains("First item"));
828        assert!(body.contains("Second item"));
829        assert!(body.contains("Third item"));
830    }
831
832    #[test]
833    fn markdown_multiple_h2_sections() {
834        let content = "# Doc\n\npreamble\n\n## Section A\n- item A\n## Section B\n- item B";
835        let nodes = parse_markdown(Path::new("doc.md"), content, &branch());
836
837        // H1 + H2 A + H2 B = 3 nodes
838        assert_eq!(nodes.len(), 3);
839        assert_eq!(nodes[0].description, "Doc");
840        assert_eq!(nodes[1].description, "Section A");
841        assert_eq!(nodes[2].description, "Section B");
842
843        let body_a = nodes[1].ext_data.as_ref().unwrap()["content"]
844            .as_str()
845            .unwrap();
846        assert!(body_a.contains("item A"));
847        assert!(!body_a.contains("item B"));
848    }
849
850    #[test]
851    fn markdown_orphan_content_before_first_heading_discarded() {
852        // Content before the first H1/H2 produces no node.
853        let content = "some preamble\n# First heading\nbody";
854        let nodes = parse_markdown(Path::new("doc.md"), content, &branch());
855        assert_eq!(nodes.len(), 1);
856        assert_eq!(nodes[0].description, "First heading");
857    }
858
859    #[test]
860    fn markdown_all_nodes_tagged_with_source() {
861        let content = "# Heading\n- Item\n## Sub\ntext";
862        let nodes = parse_markdown(Path::new("doc.md"), content, &branch());
863        for node in &nodes {
864            let ext = node.ext_data.as_ref().unwrap();
865            assert_eq!(ext["source"], "documentation");
866            assert_eq!(ext["doc_type"], "markdown");
867            assert_eq!(ext["element"], "section");
868        }
869    }
870
871    #[test]
872    fn markdown_empty_content() {
873        let content = "";
874        let nodes = parse_markdown(Path::new("empty.md"), content, &branch());
875        assert!(nodes.is_empty());
876    }
877
878    #[test]
879    fn markdown_prose_only_no_structured_content() {
880        // No H1/H2 → no nodes.
881        let content = "This is just a paragraph.\nWith no headings or lists.";
882        let nodes = parse_markdown(Path::new("prose.md"), content, &branch());
883        assert!(nodes.is_empty());
884    }
885
886    // -----------------------------------------------------------------------
887    // JSON Schema: basic
888    // -----------------------------------------------------------------------
889
890    #[test]
891    fn json_schema_extracts_title_and_properties() {
892        let content = r#"{
893            "$schema": "http://json-schema.org/draft-07/schema#",
894            "title": "User",
895            "description": "A user account",
896            "type": "object",
897            "required": ["id", "email"],
898            "properties": {
899                "id": {"type": "integer", "description": "Unique identifier"},
900                "email": {"type": "string", "description": "Email address"},
901                "name": {"type": "string"}
902            }
903        }"#;
904
905        let nodes = parse_json_schema(Path::new("user.json"), content, &branch()).unwrap();
906
907        // 1 schema node + 3 property nodes
908        assert_eq!(nodes.len(), 4);
909        assert!(nodes[0].description.contains("User"));
910        assert!(nodes[0].description.contains("A user account"));
911
912        // Check properties
913        let id_node = nodes.iter().find(|n| n.description.contains("id")).unwrap();
914        assert!(id_node.description.contains("integer"));
915        assert!(id_node.description.contains("required"));
916
917        let email_node = nodes
918            .iter()
919            .find(|n| n.description.contains("email"))
920            .unwrap();
921        assert!(email_node.description.contains("required"));
922
923        let name_node = nodes
924            .iter()
925            .find(|n| n.description.contains("name") && !n.description.contains("User"))
926            .unwrap();
927        assert!(!name_node.description.contains("required"));
928    }
929
930    #[test]
931    fn json_schema_extracts_definitions() {
932        let content = r#"{
933            "$schema": "http://json-schema.org/draft-07/schema#",
934            "title": "API",
935            "type": "object",
936            "definitions": {
937                "Address": {
938                    "type": "object",
939                    "description": "A postal address"
940                },
941                "PhoneNumber": {
942                    "type": "string"
943                }
944            }
945        }"#;
946
947        let nodes = parse_json_schema(Path::new("api.json"), content, &branch()).unwrap();
948
949        // 1 schema + 2 definitions
950        assert_eq!(nodes.len(), 3);
951
952        let addr = nodes
953            .iter()
954            .find(|n| n.description.contains("Address"))
955            .unwrap();
956        assert!(addr.description.contains("A postal address"));
957
958        let phone = nodes
959            .iter()
960            .find(|n| n.description.contains("PhoneNumber"))
961            .unwrap();
962        assert!(phone.description.contains("string"));
963    }
964
965    #[test]
966    fn json_schema_extracts_defs_key() {
967        let content = r#"{
968            "$schema": "https://json-schema.org/draft/2020-12/schema",
969            "title": "Modern",
970            "type": "object",
971            "$defs": {
972                "Color": {"type": "string", "description": "A color value"}
973            }
974        }"#;
975
976        let nodes = parse_json_schema(Path::new("modern.json"), content, &branch()).unwrap();
977        assert_eq!(nodes.len(), 2);
978        assert!(nodes[1].description.contains("Color"));
979    }
980
981    #[test]
982    fn json_schema_not_a_schema() {
983        let content = r#"{"name": "John", "age": 30}"#;
984        let nodes = parse_json_schema(Path::new("data.json"), content, &branch()).unwrap();
985        assert!(nodes.is_empty());
986    }
987
988    #[test]
989    fn json_schema_invalid_json() {
990        let result = parse_json_schema(Path::new("bad.json"), "not json", &branch());
991        assert!(result.is_err());
992    }
993
994    #[test]
995    fn json_schema_not_object() {
996        let result = parse_json_schema(Path::new("array.json"), "[1,2,3]", &branch());
997        assert!(result.is_err());
998    }
999
1000    #[test]
1001    fn json_schema_all_nodes_tagged_with_source() {
1002        let content = r#"{
1003            "$schema": "http://json-schema.org/draft-07/schema#",
1004            "title": "T",
1005            "type": "object",
1006            "properties": {"x": {"type": "string"}}
1007        }"#;
1008        let nodes = parse_json_schema(Path::new("t.json"), content, &branch()).unwrap();
1009        for node in &nodes {
1010            let ext = node.ext_data.as_ref().unwrap();
1011            assert_eq!(ext["source"], "documentation");
1012            assert_eq!(ext["doc_type"], "json_schema");
1013        }
1014    }
1015
1016    // -----------------------------------------------------------------------
1017    // OpenAPI: basic
1018    // -----------------------------------------------------------------------
1019
1020    #[test]
1021    fn openapi_extracts_api_info_and_endpoints() {
1022        let content = r#"
1023openapi: '3.0.0'
1024info:
1025  title: Pet Store
1026  version: '1.0.0'
1027paths:
1028  /pets:
1029    get:
1030      summary: List all pets
1031      operationId: listPets
1032      tags:
1033        - pets
1034      responses:
1035        '200':
1036          description: A list of pets
1037    post:
1038      summary: Create a pet
1039      operationId: createPet
1040      responses:
1041        '201':
1042          description: Pet created
1043  /pets/{petId}:
1044    get:
1045      summary: Get a pet by ID
1046      operationId: showPetById
1047      responses:
1048        '200':
1049          description: A single pet
1050        '404':
1051          description: Pet not found
1052"#;
1053
1054        let nodes = parse_openapi(Path::new("api.yaml"), content, &branch()).unwrap();
1055
1056        // 1 API node + 3 endpoint nodes
1057        assert_eq!(nodes.len(), 4);
1058
1059        let api_node = &nodes[0];
1060        assert!(api_node.description.contains("Pet Store"));
1061        assert!(api_node.description.contains("v1.0.0"));
1062
1063        // Check endpoints
1064        let get_pets = nodes
1065            .iter()
1066            .find(|n| n.description.contains("GET /pets") && !n.description.contains("{petId}"))
1067            .unwrap();
1068        assert!(get_pets.description.contains("List all pets"));
1069
1070        let post_pets = nodes
1071            .iter()
1072            .find(|n| n.description.contains("POST /pets"))
1073            .unwrap();
1074        assert!(post_pets.description.contains("Create a pet"));
1075
1076        let get_pet = nodes
1077            .iter()
1078            .find(|n| n.description.contains("GET /pets/{petId}"))
1079            .unwrap();
1080        assert!(get_pet.description.contains("Get a pet by ID"));
1081
1082        // Check ext_data for endpoint
1083        let ext = get_pets.ext_data.as_ref().unwrap();
1084        assert_eq!(ext["source"], "documentation");
1085        assert_eq!(ext["operation_id"], "listPets");
1086        assert_eq!(ext["tags"], serde_json::json!(["pets"]));
1087        assert_eq!(ext["response_codes"], serde_json::json!(["200"]));
1088    }
1089
1090    #[test]
1091    fn openapi_extracts_component_schemas() {
1092        let content = r#"
1093openapi: '3.0.0'
1094info:
1095  title: Test API
1096  version: '1.0'
1097paths: {}
1098components:
1099  schemas:
1100    Pet:
1101      type: object
1102      description: A pet in the store
1103    Error:
1104      type: object
1105      description: An error response
1106"#;
1107
1108        let nodes = parse_openapi(Path::new("api.yml"), content, &branch()).unwrap();
1109
1110        // 1 API + 2 schemas
1111        assert_eq!(nodes.len(), 3);
1112
1113        let pet = nodes
1114            .iter()
1115            .find(|n| n.description.contains("Pet"))
1116            .unwrap();
1117        assert!(pet.description.contains("A pet in the store"));
1118
1119        let error = nodes
1120            .iter()
1121            .find(|n| n.description.contains("Error"))
1122            .unwrap();
1123        assert!(error.description.contains("An error response"));
1124    }
1125
1126    #[test]
1127    fn openapi_swagger_2_definitions() {
1128        let content = r#"
1129swagger: '2.0'
1130info:
1131  title: Legacy API
1132  version: '0.1'
1133paths:
1134  /users:
1135    get:
1136      summary: List users
1137      responses:
1138        '200':
1139          description: OK
1140definitions:
1141  User:
1142    type: object
1143    description: A user object
1144"#;
1145
1146        let nodes = parse_openapi(Path::new("legacy.yaml"), content, &branch()).unwrap();
1147
1148        // 1 API + 1 endpoint + 1 definition
1149        assert_eq!(nodes.len(), 3);
1150
1151        let user = nodes
1152            .iter()
1153            .find(|n| n.description.contains("User"))
1154            .unwrap();
1155        assert!(user.description.contains("A user object"));
1156    }
1157
1158    #[test]
1159    fn openapi_not_an_api_spec() {
1160        let content = "name: John\nage: 30";
1161        let nodes = parse_openapi(Path::new("data.yaml"), content, &branch()).unwrap();
1162        assert!(nodes.is_empty());
1163    }
1164
1165    #[test]
1166    fn openapi_invalid_yaml() {
1167        let result = parse_openapi(Path::new("bad.yaml"), "{{invalid yaml", &branch());
1168        assert!(result.is_err());
1169    }
1170
1171    #[test]
1172    fn openapi_not_mapping() {
1173        let result = parse_openapi(Path::new("list.yaml"), "- item1\n- item2", &branch());
1174        assert!(result.is_err());
1175    }
1176
1177    #[test]
1178    fn openapi_all_nodes_tagged_with_source() {
1179        let content = r#"
1180openapi: '3.0.0'
1181info:
1182  title: T
1183  version: '1'
1184paths:
1185  /x:
1186    get:
1187      summary: X
1188      responses:
1189        '200':
1190          description: OK
1191"#;
1192        let nodes = parse_openapi(Path::new("api.yaml"), content, &branch()).unwrap();
1193        for node in &nodes {
1194            let ext = node.ext_data.as_ref().unwrap();
1195            assert_eq!(ext["source"], "documentation");
1196            assert_eq!(ext["doc_type"], "openapi");
1197        }
1198    }
1199
1200    #[test]
1201    fn openapi_endpoint_without_summary() {
1202        let content = r#"
1203openapi: '3.0.0'
1204info:
1205  title: Minimal
1206  version: '1'
1207paths:
1208  /health:
1209    get:
1210      responses:
1211        '200':
1212          description: OK
1213"#;
1214        let nodes = parse_openapi(Path::new("api.yaml"), content, &branch()).unwrap();
1215        let endpoint = nodes
1216            .iter()
1217            .find(|n| n.description.contains("GET /health"))
1218            .unwrap();
1219        // No summary means just method + path
1220        assert_eq!(endpoint.description, "Endpoint: GET /health");
1221    }
1222
1223    // -----------------------------------------------------------------------
1224    // Node properties
1225    // -----------------------------------------------------------------------
1226
1227    #[test]
1228    fn all_nodes_are_facts_with_info_weight() {
1229        let md = "# Title\n- Item";
1230        let md_nodes = parse_markdown(Path::new("doc.md"), md, &branch());
1231        for node in &md_nodes {
1232            assert_eq!(node.nature, KnowledgeNature::Fact);
1233            assert_eq!(node.weight, KnowledgeWeight::Info);
1234            assert!((node.confidence - 1.0).abs() < f64::EPSILON);
1235        }
1236    }
1237
1238    #[test]
1239    fn documentation_result_contains_correct_path() {
1240        let result = parse_documentation(Path::new("docs/README.md"), "# Hi", &branch()).unwrap();
1241        assert_eq!(result.path, Path::new("docs/README.md"));
1242    }
1243}