Skip to main content

kbolt_core/ingest/
code.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
7pub struct CodeExtractor;
8
9impl Extractor for CodeExtractor {
10    fn supports(&self) -> &[&str] {
11        &[
12            "rs", "py", "js", "ts", "tsx", "jsx", "go", "java", "kt", "c", "cpp", "cc", "h", "hpp",
13            "cs", "rb", "php", "swift",
14        ]
15    }
16
17    fn profile_key(&self) -> &'static str {
18        "code"
19    }
20
21    fn extract(&self, path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
22        if let Err(err) = std::str::from_utf8(bytes) {
23            return Err(kbolt_types::KboltError::InvalidInput(format!(
24                "non-utf8 code input: {err}"
25            ))
26            .into());
27        }
28
29        let language = path
30            .extension()
31            .and_then(|value| value.to_str())
32            .map(|value| value.trim().trim_start_matches('.').to_ascii_lowercase())
33            .unwrap_or_default();
34
35        let mut attrs = HashMap::new();
36        if !language.is_empty() {
37            attrs.insert("language".to_string(), language);
38        }
39
40        let block = ExtractedBlock {
41            text: String::from_utf8_lossy(bytes).to_string(),
42            offset: 0,
43            length: bytes.len(),
44            kind: BlockKind::CodeFence,
45            heading_path: Vec::new(),
46            attrs,
47        };
48
49        Ok(ExtractedDocument {
50            blocks: vec![block],
51            metadata: HashMap::new(),
52            title: None,
53        })
54    }
55}
56
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::code::CodeExtractor;
    use crate::ingest::extract::{BlockKind, Extractor};

    /// Valid UTF-8 source produces a single code block that spans every input
    /// byte and is tagged with the language taken from the file extension.
    #[test]
    fn extracts_code_block_with_full_span_and_language_attr() {
        let source = b"fn alpha() {}\nfn beta() {}\n";
        let input_path = Path::new("src/lib.rs");
        let extractor = CodeExtractor;

        let doc = extractor.extract(input_path, source).expect("extract code");

        assert_eq!(extractor.profile_key(), "code");
        assert_eq!(doc.blocks.len(), 1);

        // Exactly one block exists (checked above), so indexing cannot panic.
        let block = &doc.blocks[0];
        assert_eq!(block.kind, BlockKind::CodeFence);
        assert_eq!(block.offset, 0);
        assert_eq!(block.length, source.len());

        let language = block.attrs.get("language").map(|value| value.as_str());
        assert_eq!(language, Some("rs"));
    }

    /// Bytes that are not valid UTF-8 are rejected with a descriptive error.
    #[test]
    fn rejects_non_utf8_code_bytes() {
        let invalid: &[u8] = &[0xff, 0xfe, 0xfd];
        let extractor = CodeExtractor;

        let err = extractor
            .extract(Path::new("src/lib.rs"), invalid)
            .expect_err("invalid utf8 should fail");

        let message = err.to_string();
        assert!(message.contains("non-utf8 code input"));
    }
}