1use serde::{Deserialize, Serialize};
19
20use super::bm25_index::{ChunkKind, CodeChunk};
21
22#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
24#[serde(tag = "type", rename_all = "snake_case")]
25pub enum ContentSource {
26 #[default]
28 File,
29 Provider {
31 provider_id: String,
32 resource_type: String,
33 },
34 Shell { command: String },
36 Knowledge { category: String },
38}
39
40#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct ContentChunk {
44 pub file_path: String,
45 pub symbol_name: String,
46 pub kind: ChunkKind,
47 pub start_line: usize,
48 pub end_line: usize,
49 pub content: String,
50 #[serde(default)]
51 pub tokens: Vec<String>,
52 pub token_count: usize,
53
54 #[serde(default)]
55 pub source: ContentSource,
56
57 #[serde(default)]
59 pub references: Vec<String>,
60
61 #[serde(default, skip_serializing_if = "Option::is_none")]
63 pub metadata: Option<serde_json::Value>,
64}
65
66impl ContentChunk {
67 pub fn from_provider(
68 provider_id: &str,
69 resource_type: &str,
70 item_id: &str,
71 title: &str,
72 kind: ChunkKind,
73 content: String,
74 references: Vec<String>,
75 metadata: Option<serde_json::Value>,
76 ) -> Self {
77 let tokens = super::bm25_index::tokenize_for_index(&content);
78 let token_count = tokens.len();
79 Self {
80 file_path: format!("{provider_id}://{resource_type}/{item_id}"),
81 symbol_name: title.to_string(),
82 kind,
83 start_line: 0,
84 end_line: 0,
85 content,
86 tokens,
87 token_count,
88 source: ContentSource::Provider {
89 provider_id: provider_id.to_string(),
90 resource_type: resource_type.to_string(),
91 },
92 references,
93 metadata,
94 }
95 }
96
97 pub fn is_external(&self) -> bool {
98 !matches!(self.source, ContentSource::File)
99 }
100
101 pub fn provider_id(&self) -> Option<&str> {
102 match &self.source {
103 ContentSource::Provider { provider_id, .. } => Some(provider_id),
104 _ => None,
105 }
106 }
107}
108
109impl From<ContentChunk> for CodeChunk {
110 fn from(c: ContentChunk) -> Self {
111 Self {
112 file_path: c.file_path,
113 symbol_name: c.symbol_name,
114 kind: c.kind,
115 start_line: c.start_line,
116 end_line: c.end_line,
117 content: c.content,
118 tokens: c.tokens,
119 token_count: c.token_count,
120 }
121 }
122}
123
124impl From<CodeChunk> for ContentChunk {
125 fn from(c: CodeChunk) -> Self {
126 Self {
127 file_path: c.file_path,
128 symbol_name: c.symbol_name,
129 kind: c.kind,
130 start_line: c.start_line,
131 end_line: c.end_line,
132 content: c.content,
133 tokens: c.tokens,
134 token_count: c.token_count,
135 source: ContentSource::File,
136 references: Vec::new(),
137 metadata: None,
138 }
139 }
140}
141
142pub fn extract_file_references(text: &str) -> Vec<String> {
149 static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
150 regex::Regex::new(r"(?:^|[\s`\(\[])([a-zA-Z0-9_\-./]+\.[a-zA-Z]{1,10})(?:[\s`\)\],:;.]|$)")
151 .expect("file ref regex")
152 });
153
154 let mut refs: Vec<String> = RE
155 .captures_iter(text)
156 .filter_map(|cap| {
157 let path = cap.get(1)?.as_str();
158 if path.contains('/')
159 && !path.starts_with("http")
160 && !path.starts_with("www.")
161 && !path.contains('@')
162 {
163 Some(path.to_string())
164 } else {
165 None
166 }
167 })
168 .collect();
169 refs.sort();
170 refs.dedup();
171 refs
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// A provider chunk keeps its synthetic path and title when converted to a `CodeChunk`.
    #[test]
    fn content_chunk_to_code_chunk_roundtrip() {
        let chunk = ContentChunk::from_provider(
            "github",
            "issues",
            "123",
            "Auth token expiry",
            ChunkKind::Other,
            String::from("Token expires after 1h"),
            vec![String::from("src/auth.rs")],
            None,
        );

        assert!(chunk.is_external());
        assert_eq!(chunk.provider_id(), Some("github"));
        assert_eq!(chunk.file_path, "github://issues/123");

        let narrowed = CodeChunk::from(chunk);
        assert_eq!(narrowed.file_path, "github://issues/123");
        assert_eq!(narrowed.symbol_name, "Auth token expiry");
    }

    /// A `CodeChunk` becomes a file-sourced chunk with no references.
    #[test]
    fn code_chunk_to_content_chunk() {
        let original = CodeChunk {
            file_path: String::from("src/main.rs"),
            symbol_name: String::from("main"),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 10,
            content: String::from("fn main() {}"),
            tokens: vec![String::from("main")],
            token_count: 1,
        };

        let widened = ContentChunk::from(original);
        assert!(!widened.is_external());
        assert_eq!(widened.source, ContentSource::File);
        assert!(widened.references.is_empty());
    }

    /// Paths mentioned in running prose are picked up.
    #[test]
    fn extract_file_refs_from_issue_body() {
        let body = concat!(
            "The bug is in src/auth/handler.rs and affects lib/db.ts.\n",
            "See also tests/auth_test.rs for the failing test.",
        );
        let found = extract_file_references(body);

        for expected in ["src/auth/handler.rs", "lib/db.ts", "tests/auth_test.rs"] {
            assert!(found.iter().any(|r| r == expected));
        }
    }

    /// Nothing containing `http` is reported as a file reference.
    #[test]
    fn extract_file_refs_ignores_urls() {
        let body = "See https://github.com/foo/bar.git and www.example.com/page.html";
        let found = extract_file_references(body);
        assert!(found.iter().all(|r| !r.contains("http")));
    }

    /// A path mentioned twice is reported once.
    #[test]
    fn extract_file_refs_deduplicates() {
        let body = "Changed src/auth.rs and also src/auth.rs again";
        let found = extract_file_references(body);
        let hits = found.iter().filter(|r| r.as_str() == "src/auth.rs").count();
        assert_eq!(hits, 1);
    }

    /// `ContentSource::default()` is the `File` variant.
    #[test]
    fn default_source_is_file() {
        assert!(matches!(ContentSource::default(), ContentSource::File));
    }

    /// The provider variant serializes with a snake_case `type` tag next to its fields.
    #[test]
    fn provider_source_serializes_with_tag() {
        let source = ContentSource::Provider {
            provider_id: String::from("jira"),
            resource_type: String::from("issues"),
        };
        let json = serde_json::to_string(&source).unwrap();

        for needle in ["\"type\":\"provider\"", "\"provider_id\":\"jira\""] {
            assert!(json.contains(needle));
        }
    }
}
258}