kbolt_core/ingest/
code.rs1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
/// Extractor for source-code files.
///
/// The type carries no state, so it is free to copy, default-construct, and print in
/// diagnostics.
#[derive(Debug, Clone, Copy, Default)]
pub struct CodeExtractor;
8
9impl Extractor for CodeExtractor {
10 fn supports(&self) -> &[&str] {
11 &[
12 "rs", "py", "js", "ts", "tsx", "jsx", "go", "java", "kt", "c", "cpp", "cc", "h", "hpp",
13 "cs", "rb", "php", "swift",
14 ]
15 }
16
17 fn profile_key(&self) -> &'static str {
18 "code"
19 }
20
21 fn extract(&self, path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
22 if let Err(err) = std::str::from_utf8(bytes) {
23 return Err(kbolt_types::KboltError::InvalidInput(format!(
24 "non-utf8 code input: {err}"
25 ))
26 .into());
27 }
28
29 let language = path
30 .extension()
31 .and_then(|value| value.to_str())
32 .map(|value| value.trim().trim_start_matches('.').to_ascii_lowercase())
33 .unwrap_or_default();
34
35 let mut attrs = HashMap::new();
36 if !language.is_empty() {
37 attrs.insert("language".to_string(), language);
38 }
39
40 let block = ExtractedBlock {
41 text: String::from_utf8_lossy(bytes).to_string(),
42 offset: 0,
43 length: bytes.len(),
44 kind: BlockKind::CodeFence,
45 heading_path: Vec::new(),
46 attrs,
47 };
48
49 Ok(ExtractedDocument {
50 blocks: vec![block],
51 metadata: HashMap::new(),
52 title: None,
53 })
54 }
55}
56
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::code::CodeExtractor;
    use crate::ingest::extract::{BlockKind, Extractor};

    /// Valid UTF-8 input yields exactly one code-fence block that starts at offset zero,
    /// spans every input byte, and records the file extension as its `language` attribute.
    #[test]
    fn extracts_code_block_with_full_span_and_language_attr() {
        let input: &[u8] = b"fn alpha() {}\nfn beta() {}\n";
        let extractor = CodeExtractor;

        let document = extractor
            .extract(Path::new("src/lib.rs"), input)
            .expect("extract code");

        assert_eq!(extractor.profile_key(), "code");
        assert_eq!(document.blocks.len(), 1);

        let block = &document.blocks[0];
        assert_eq!(block.kind, BlockKind::CodeFence);
        assert_eq!(block.offset, 0);
        assert_eq!(block.length, input.len());

        let language = block.attrs.get("language").map(String::as_str);
        assert_eq!(language, Some("rs"));
    }

    /// Bytes that are not valid UTF-8 are rejected with the extractor's error message.
    #[test]
    fn rejects_non_utf8_code_bytes() {
        let invalid = [0xff_u8, 0xfe, 0xfd];

        let failure = CodeExtractor
            .extract(Path::new("src/lib.rs"), &invalid)
            .expect_err("invalid utf8 should fail");

        let message = failure.to_string();
        assert!(message.contains("non-utf8 code input"));
    }
}