Skip to main content

lean_ctx/core/
content_chunk.rs

1//! Universal content chunk — the atomic unit of the Context Cortex.
2//!
3//! Extends the existing `CodeChunk` (BM25) with a source dimension so that
4//! external data (GitHub issues, Jira tickets, DB schemas, wiki pages) flows
5//! through the same pipeline as code: BM25, embeddings, graph, knowledge.
6//!
7//! Design principles:
8//!   - Backward-compatible: `From<ContentChunk> for CodeChunk` preserves the
9//!     existing BM25 pipeline without changes.
10//!   - Source-aware: `ContentSource` tags where data came from.
11//!   - Reference-carrying: `references` links chunks to code files for
12//!     cross-source graph edges.
13//!
14//! Scientific basis: Neocortical column architecture (Mountcastle) — every
15//! data source is a "column" processing different input through the same
16//! computational template.
17
18use serde::{Deserialize, Serialize};
19
20use super::bm25_index::{ChunkKind, CodeChunk};
21
22/// Where a content chunk originated.
23#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
24#[serde(tag = "type", rename_all = "snake_case")]
25pub enum ContentSource {
26    /// Local filesystem (default, backward-compatible with CodeChunk).
27    #[default]
28    File,
29    /// External data provider (GitHub, Jira, Confluence, etc.).
30    Provider {
31        provider_id: String,
32        resource_type: String,
33    },
34    /// Shell command output.
35    Shell { command: String },
36    /// Knowledge system fact.
37    Knowledge { category: String },
38}
39
40/// A universal content chunk that can represent code, issues, DB schemas,
41/// wiki pages, or any other data source.
42#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct ContentChunk {
44    pub file_path: String,
45    pub symbol_name: String,
46    pub kind: ChunkKind,
47    pub start_line: usize,
48    pub end_line: usize,
49    pub content: String,
50    #[serde(default)]
51    pub tokens: Vec<String>,
52    pub token_count: usize,
53
54    #[serde(default)]
55    pub source: ContentSource,
56
57    /// URIs or file paths that this chunk references (for cross-source graph edges).
58    #[serde(default)]
59    pub references: Vec<String>,
60
61    /// Provider-specific structured metadata.
62    #[serde(default, skip_serializing_if = "Option::is_none")]
63    pub metadata: Option<serde_json::Value>,
64}
65
66impl ContentChunk {
67    pub fn from_provider(
68        provider_id: &str,
69        resource_type: &str,
70        item_id: &str,
71        title: &str,
72        kind: ChunkKind,
73        content: String,
74        references: Vec<String>,
75        metadata: Option<serde_json::Value>,
76    ) -> Self {
77        let tokens = super::bm25_index::tokenize_for_index(&content);
78        let token_count = tokens.len();
79        Self {
80            file_path: format!("{provider_id}://{resource_type}/{item_id}"),
81            symbol_name: title.to_string(),
82            kind,
83            start_line: 0,
84            end_line: 0,
85            content,
86            tokens,
87            token_count,
88            source: ContentSource::Provider {
89                provider_id: provider_id.to_string(),
90                resource_type: resource_type.to_string(),
91            },
92            references,
93            metadata,
94        }
95    }
96
97    pub fn is_external(&self) -> bool {
98        !matches!(self.source, ContentSource::File)
99    }
100
101    pub fn provider_id(&self) -> Option<&str> {
102        match &self.source {
103            ContentSource::Provider { provider_id, .. } => Some(provider_id),
104            _ => None,
105        }
106    }
107}
108
109impl From<ContentChunk> for CodeChunk {
110    fn from(c: ContentChunk) -> Self {
111        Self {
112            file_path: c.file_path,
113            symbol_name: c.symbol_name,
114            kind: c.kind,
115            start_line: c.start_line,
116            end_line: c.end_line,
117            content: c.content,
118            tokens: c.tokens,
119            token_count: c.token_count,
120        }
121    }
122}
123
124impl From<CodeChunk> for ContentChunk {
125    fn from(c: CodeChunk) -> Self {
126        Self {
127            file_path: c.file_path,
128            symbol_name: c.symbol_name,
129            kind: c.kind,
130            start_line: c.start_line,
131            end_line: c.end_line,
132            content: c.content,
133            tokens: c.tokens,
134            token_count: c.token_count,
135            source: ContentSource::File,
136            references: Vec::new(),
137            metadata: None,
138        }
139    }
140}
141
142// ---------------------------------------------------------------------------
143// Chunk extraction helpers for external data
144// ---------------------------------------------------------------------------
145
146/// Extract file path references from freeform text (issue bodies, PR descriptions).
147/// Looks for patterns like `src/auth.rs`, `lib/handler.ts`, `path/to/file.ext`.
148pub fn extract_file_references(text: &str) -> Vec<String> {
149    static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
150        regex::Regex::new(r"(?:^|[\s`\(\[])([a-zA-Z0-9_\-./]+\.[a-zA-Z]{1,10})(?:[\s`\)\],:;.]|$)")
151            .expect("file ref regex")
152    });
153
154    let mut refs: Vec<String> = RE
155        .captures_iter(text)
156        .filter_map(|cap| {
157            let path = cap.get(1)?.as_str();
158            if path.contains('/')
159                && !path.starts_with("http")
160                && !path.starts_with("www.")
161                && !path.contains('@')
162            {
163                Some(path.to_string())
164            } else {
165                None
166            }
167        })
168        .collect();
169    refs.sort();
170    refs.dedup();
171    refs
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// A provider chunk exposes its provider identity and pseudo-URI, and
    /// converting it to `CodeChunk` keeps the path and title.
    #[test]
    fn content_chunk_to_code_chunk_roundtrip() {
        let cc = ContentChunk::from_provider(
            "github",
            "issues",
            "123",
            "Auth token expiry",
            ChunkKind::Other,
            "Token expires after 1h".into(),
            vec!["src/auth.rs".into()],
            None,
        );

        assert!(cc.is_external());
        assert_eq!(cc.provider_id(), Some("github"));
        assert_eq!(cc.file_path, "github://issues/123");

        let code_chunk: CodeChunk = cc.into();
        assert_eq!(code_chunk.file_path, "github://issues/123");
        assert_eq!(code_chunk.symbol_name, "Auth token expiry");
    }

    /// Lifting a `CodeChunk` yields a file-sourced chunk with no references.
    #[test]
    fn code_chunk_to_content_chunk() {
        let code = CodeChunk {
            file_path: "src/main.rs".into(),
            symbol_name: "main".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: "fn main() {}".into(),
            tokens: vec!["main".into()],
            token_count: 1,
        };

        let cc: ContentChunk = code.into();
        assert!(!cc.is_external());
        assert_eq!(cc.source, ContentSource::File);
        assert!(cc.references.is_empty());
    }

    /// Paths embedded in prose are found, including one followed by a
    /// sentence-ending period.
    #[test]
    fn extract_file_refs_from_issue_body() {
        let body = "The bug is in src/auth/handler.rs and affects lib/db.ts.\n\
                     See also tests/auth_test.rs for the failing test.";
        let refs = extract_file_references(body);
        assert!(refs.contains(&"src/auth/handler.rs".to_string()));
        assert!(refs.contains(&"lib/db.ts".to_string()));
        assert!(refs.contains(&"tests/auth_test.rs".to_string()));
    }

    /// Neither the `https://` URL nor the scheme-less `www.` URL may be
    /// reported, so the result must be empty. (The previous assertion only
    /// looked for the substring "http" and never checked the `www.` case.)
    #[test]
    fn extract_file_refs_ignores_urls() {
        let body = "See https://github.com/foo/bar.git and www.example.com/page.html";
        let refs = extract_file_references(body);
        assert!(
            refs.is_empty(),
            "URLs must not be reported as file references: {refs:?}"
        );
    }

    /// A path mentioned twice is returned once.
    #[test]
    fn extract_file_refs_deduplicates() {
        let body = "Changed src/auth.rs and also src/auth.rs again";
        let refs = extract_file_references(body);
        assert_eq!(refs.iter().filter(|r| *r == "src/auth.rs").count(), 1);
    }

    /// `ContentSource::default()` is the `File` variant.
    #[test]
    fn default_source_is_file() {
        assert_eq!(ContentSource::default(), ContentSource::File);
    }

    /// The provider variant serializes with a snake_case `"type"` tag next to
    /// its fields.
    #[test]
    fn provider_source_serializes_with_tag() {
        let src = ContentSource::Provider {
            provider_id: "jira".into(),
            resource_type: "issues".into(),
        };
        let json = serde_json::to_string(&src).unwrap();
        assert!(json.contains("\"type\":\"provider\""));
        assert!(json.contains("\"provider_id\":\"jira\""));
    }
}